Example Eval - Anthropic Claude

import anthropic
# Shared Anthropic client used by claude_agent below. It is constructed with
# no arguments, so credentials are presumably read from the environment
# (e.g. ANTHROPIC_API_KEY) -- confirm against the SDK documentation.
ant = anthropic.Anthropic()

def claude_agent(case):
    """Answer one evaluation case with Claude and return its output.

    Args:
        case: Evaluation case; only its ``query`` attribute is read and it
            is sent as the single user message.

    Returns:
        A dict with ``"output"`` (the concatenated text of the response)
        and ``"metadata"`` (``{"model": <model reported by the response>}``).
    """
    msg = ant.messages.create(
        model="claude-haiku-4-5",
        max_tokens=1024,
        system="You are a helpful customer support agent.",
        messages=[{"role": "user", "content": case.query}],
    )
    # The response content is a list of blocks that may be empty or may
    # contain non-text blocks, so gather every text block rather than
    # indexing [0] (which raises on an empty list and drops later text).
    output = "".join(
        block.text for block in msg.content if block.type == "text"
    )
    return {"output": output, "metadata": {"model": msg.model}}

# Evaluation pipeline: start a run for the dataset and subject, hand
# claude_agent to execute() (presumably called once per dataset case --
# confirm against the evaluation SDK), then finalize and analyze. The value
# returned by the last call in the chain is bound to `report`.
report = (
    client.evaluations.run(
        dataset_id="...",
        subject={
            "kind": "custom_agent",
            "displayName": "Claude Haiku",
            "framework": "anthropic",
        },
    )
    .execute(claude_agent)
    .finalize()
    .analyze()
)

Full example: anthropic_eval