    def evaluate(
        self,
        batch: list[TerminalBenchTask],
        candidate: dict[str, str],
        capture_traces: bool = False,
    ) -> EvaluationBatch:
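        """Score one candidate instruction prompt against a batch of terminal-bench tasks.

        Runs the agent once over the whole batch, then reads back per-task
        results. Returns an EvaluationBatch whose outputs, scores, and
        trajectories are aligned index-for-index with ``batch``. Note that
        ``capture_traces`` is currently unused: trajectories are always collected.
        """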
        outputs = []
        scores = []
        trajectories = []
        # Timestamped run id so repeated GEPA evaluations write to distinct runs/ directories.
        example_run_id = f"temp_gepa_run_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        # The model name is taken from the first task and applied to the whole run.
        example_model_name = batch[0].model_name
        # Run the terminal-bench agent on every task in the batch with the candidate
        # instruction prompt; per-task logs land under runs/{example_run_id}/.
        run_agent_tb(
            [task.task_id for task in batch],
            example_run_id,
            example_model_name,
            instruction_prompt=candidate["instruction_prompt"],
            n_concurrent=self.n_concurrent,
            prompt_template_path=self.instruction_prompt_path,
        )
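        # Collect per-task results once the batch run has finished. A failure while
        # reading results is scored as 0 rather than re-raised, so a single broken
        # task cannot abort the rest of the evaluation batch.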
        for example in batch:
            try:
                success, score, failed_reason, messages = get_results(example.task_id, example_run_id)
            except Exception as e:
                print(f"Error running example {example.task_id} {example_run_id}: {e}")
                success = False
                score = 0
                failed_reason = str(e)
                messages = []
            outputs.append(
                f"Terminal Bench outputs are omitted. Please see runs/{example_run_id}/{example.task_id}/ for detailed logging."
            )
            scores.append(score)
            # Bundle the agent transcript, the evaluated prompt, and the outcome
            # into one trajectory entry per task.
            trajectories.append(
                {
                    "messages": messages,
                    "instruction_prompt": candidate["instruction_prompt"],
                    "failed_reason": failed_reason,
                    "success": success,
                }
            )
        return EvaluationBatch(
            outputs=outputs,
            scores=scores,
            trajectories=trajectories,
        )
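
    # Usage sketch (illustrative only): scoring a single candidate prompt with this
    # adapter. The enclosing class name TerminalBenchAdapter, its constructor
    # arguments, and keyword construction of TerminalBenchTask are assumptions;
    # only evaluate()'s signature and the EvaluationBatch fields above are taken
    # from this code.
    #
    #   adapter = TerminalBenchAdapter(
    #       n_concurrent=4,
    #       instruction_prompt_path="prompts/instruction_template.txt",
    #   )
    #   batch = [TerminalBenchTask(task_id="hello-world", model_name="gpt-4o-mini")]
    #   result = adapter.evaluate(batch, {"instruction_prompt": "You are a careful terminal agent."})
    #   mean_score = sum(result.scores) / len(result.scores)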