def evaluate(
    self,
    batch: list[DefaultDataInst],
    candidate: dict[str, str],
    capture_traces: bool = False,
) -> EvaluationBatch[DefaultTrajectory, DefaultRolloutOutput]:
    """Run the candidate prompt over ``batch`` and score every response.

    For each item a two-message chat is built: the system message is the
    first value stored in ``candidate`` and the user message is the item's
    ``"input"`` field formatted as a string. When ``self.model`` is a string,
    all chats are sent through ``self.litellm.batch_completion``; otherwise
    ``self.model`` is called once per chat and its return value is used as
    the response. Each response is then passed to ``self.evaluator``.

    Args:
        batch: Data instances to evaluate; each must provide an ``"input"`` key.
        candidate: Mapping of component texts. Only the first value is used,
            as the system prompt.
        capture_traces: When true, a trajectory (data, response, feedback) is
            recorded per item; otherwise ``trajectories`` is ``None``.

    Returns:
        An ``EvaluationBatch`` holding per-item outputs and scores, the
        optional trajectories, and the per-item objective scores (``None``
        unless the evaluator supplied them for every item).

    Raises:
        ValueError: If ``candidate`` is empty, or if the evaluator returned
            objective scores for some items but not for others.
    """
    if not candidate:
        # A bare next(iter(...)) would leak StopIteration, which is confusing
        # and is silently swallowed/re-wrapped inside generator contexts.
        raise ValueError("candidate must contain at least one component text.")
    system_content = next(iter(candidate.values()))

    # One independent message list per batch item.
    litellm_requests: list[list[ChatMessage]] = [
        [
            {"role": "system", "content": system_content},
            {"role": "user", "content": f"{data['input']}"},
        ]
        for data in batch
    ]

    responses: list[str]
    if isinstance(self.model, str):
        responses = []
        for resp in self.litellm.batch_completion(
            model=self.model,
            messages=litellm_requests,
            max_workers=self.max_litellm_workers,
            **self.litellm_batch_completion_kwargs,
        ):
            # NOTE(review): batch_completion appears to return an exception
            # object in place of a response for a failed request (confirm for
            # the pinned litellm version). Re-raise it so the real failure is
            # reported instead of an AttributeError on `.choices`.
            if isinstance(resp, Exception):
                raise resp
            # Message content may be None (e.g. tool-call or filtered
            # completions); treat that as an empty response rather than crash.
            content = resp.choices[0].message.content
            responses.append((content or "").strip())
    else:
        responses = [self.model(messages) for messages in litellm_requests]

    outputs: list[DefaultRolloutOutput] = []
    scores: list[float] = []
    objective_scores: list[dict[str, float] | None] = []
    trajectories: list[DefaultTrajectory] | None = [] if capture_traces else None

    # strict=True guards against the model returning a different number of
    # responses than there were requests.
    for data, assistant_response in zip(batch, responses, strict=True):
        eval_result = self.evaluator(data, assistant_response)

        output: DefaultRolloutOutput = {"full_assistant_response": assistant_response}
        outputs.append(output)
        scores.append(eval_result.score)
        objective_scores.append(eval_result.objective_scores)

        if trajectories is not None:
            trajectories.append(
                {
                    "data": data,
                    "full_assistant_response": assistant_response,
                    "feedback": eval_result.feedback,
                }
            )

    # Objective scores are forwarded only when every item has them; a mix of
    # present and missing entries is rejected.
    has_missing = any(x is None for x in objective_scores)
    has_present = any(x is not None for x in objective_scores)
    if has_missing and has_present:
        raise ValueError("Objective scores must either be all None or all not None.")

    objective_scores_arg: list[dict[str, float]] | None = None
    if has_present:
        objective_scores_arg = cast(list[dict[str, float]], objective_scores)

    return EvaluationBatch(
        outputs=outputs,
        scores=scores,
        trajectories=trajectories,
        objective_scores=objective_scores_arg,
    )