Failed epoch doesn't stop rest of task's epochs
Opened this issue · 0 comments
When running multiple tasks with multiple epochs, if a single epoch of one task fails (e.g. an unhandled exception in a solver), the console output suggests that the task is no longer running (red X, 100%, timer stopped).
However, it appears that other epochs of the failed task continue to run (unexpected).
Once any other running tasks complete, any epochs of failed tasks which are still running are stopped. And if any other epochs of the failed task do succeed, they are not written to the log, so it seems like wasted effort.
Here's a repro. I've attached a screen recording of it; the right-hand side shows the running containers via `watch docker ps -q`.
Screen.Recording.2024-10-23.at.11.24.27.mov
import asyncio
import logging
from inspect_ai import Task, task, eval
from inspect_ai.dataset import MemoryDataset, Sample
from inspect_ai.solver import Generate, TaskState, solver
# Module-level logger, named after this module, used by the solvers below.
logger = logging.getLogger(__name__)
# Observed behaviour:
# * 4 containers get created immediately (as expected)
# * task_fail_once epoch 1 fails (as expected)
# * The console output shows task_fail_once as failed, with 100% progress and timer
# stopped:
# ╭─ eval: 1/2 tasks complete ─────────────────────────────────────────────────────────╮
# │ sleep_duration: 10, epochs: 2, max_samples: 4, max_tasks: 2 │
# │ │
# │ ⠿ task_sleep mockllm/model ━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━ 27% 0:00:05 │
# │ ✗ task_fail_once mockllm/model ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:01 │
# │ │
# │ subprocesses: 0/16 HTTP rate limits: 0 │
# ╰────────────────────────────────────────────────────────────────────────────────────╯
# * task_fail_once epoch 1's container is deleted (as expected)
# * task_fail_once epoch 2's container sticks around, and solvers keep getting run (not
# expected given console output)
# * once task_sleep completes, task_fail_once is stopped.
# Expected behaviour:
# Either the console output should convey that there is still some progress with
# task_fail_once, or all epochs for that task should be stopped.
def main():
    """Run the two repro tasks through ``eval`` with two epochs each."""
    repro_tasks = [
        # This task just sleeps for 10s.
        task_sleep(10),
        # This task fails immediately on the first epoch that runs, but sleeps
        # for 20s on the other one.
        task_fail_once(20),
    ]
    eval(
        tasks=repro_tasks,
        model="mockllm/model",
        max_tasks=2,
        max_samples=4,
        epochs=2,
        log_level="info",
    )
@task
def task_sleep(sleep_duration: int):
    """Build a docker-sandboxed task that only sleeps.

    The solver chain repeats one ``sleep_solver`` instance ``sleep_duration``
    times; each step sleeps for one second.
    """
    one_second_step = sleep_solver()
    samples = [Sample(input="task_sleep", target="Target")]
    return Task(
        dataset=MemoryDataset(samples),
        sandbox="docker",
        solver=[one_second_step] * sleep_duration,
    )
@task
def task_fail_once(sleep_duration: int):
    """Build a docker-sandboxed task whose first solver fails exactly once.

    The solver chain is one ``fail_once_solver`` followed by a single
    ``sleep_solver`` instance repeated ``sleep_duration`` times.
    """
    failing_step = fail_once_solver()
    one_second_step = sleep_solver()
    samples = [Sample(input="task_fail_once", target="Target")]
    return Task(
        dataset=MemoryDataset(samples),
        sandbox="docker",
        solver=[failing_step, *([one_second_step] * sleep_duration)],
    )
@solver
def sleep_solver():
    """Return a solver that sleeps for one second and logs its epoch and input."""

    async def solve(state: TaskState, generate: Generate):
        await asyncio.sleep(1)
        # Logged after the sleep so each message marks a completed step.
        progress_message = f"epoch {state.epoch} for task {state.input}"
        logger.info(progress_message)
        return state

    return solve
# Process-wide flag: set once the deliberate failure has been raised.
has_failed = False


@solver
def fail_once_solver():
    """Return a solver that raises on its first invocation only.

    The first call logs an error and raises ``RuntimeError``; every later
    call returns the state unchanged.
    """

    async def solve(state: TaskState, generate: Generate):
        global has_failed
        if has_failed:
            # The one deliberate failure already happened; pass through.
            return state
        has_failed = True
        logger.error("Deliberate failure")
        raise RuntimeError("This solver fails just once.")

    return solve
# Run the repro only when this file is executed as a script.
if __name__ == "__main__":
    main()
inspect_ai version 0.3.42.dev43+g1c13832e
Happy to try out any potential fixes! Thanks a lot.