Source code for agent_framework_evaluator.cli

from __future__ import annotations

import argparse
import json
import os
import sys
import webbrowser
from pathlib import Path

from agent_framework_evaluator.evaluation import CASE_NO_CALLBACKS_POSTFIX
from agent_framework_evaluator.runtime.session_runner import SessionRunner

DEFAULT_ENV_ARGUMENT = ".env"


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Agent evaluator and debugger.")
    subparsers = parser.add_subparsers(dest="command", required=True)

    web = subparsers.add_parser("web")
    web.add_argument(
        "--env",
        default=DEFAULT_ENV_ARGUMENT,
        help="Path to .env (default: ./.env; overridable in the UI).",
    )
    web.add_argument(
        "--agent",
        default=None,
        help="Default agent id for the web UI (datalist still overridable).",
    )
    web.add_argument(
        "--initializer",
        default=None,
        metavar="NAME",
        help="Default initializer .py (under AGENT_EVAL_INITIALIZER_DIR from .env); overridable in the UI.",
    )
    web.add_argument(
        "--agent-model-override",
        default=None,
        help="Default model override for the tested agent (or all agents when paired with --agent-model-override-scope).",
    )
    web.add_argument(
        "--agent-model-override-scope",
        choices=("root_only", "all_agents"),
        default="root_only",
        help="Scope for --agent-model-override in the UI defaults.",
    )
    web.add_argument("--host", default="127.0.0.1")
    web.add_argument("--port", type=int, default=8123)
    web.add_argument(
        "--no-open-browser",
        action="store_false",
        dest="open_browser",
        help="Do not launch the browser automatically.",
    )
    web.set_defaults(open_browser=True)

    run = subparsers.add_parser("run")
    run.add_argument("--env", default=DEFAULT_ENV_ARGUMENT)
    run.add_argument("--agent", default=None)
    run.add_argument("--setup")
    run.add_argument("--prompt")
    run.add_argument("--prompt-file")
    run.add_argument("--agent-model-override")
    run.add_argument(
        "--agent-model-override-scope",
        choices=("root_only", "all_agents"),
        default="root_only",
    )
    run.add_argument("--output")
    run.add_argument(
        "--trace-jsonl",
        metavar="PATH",
        default=None,
        help="Append unified trace events to a JSONL file.",
    )
    run.add_argument(
        "--trace-llm-dir",
        metavar="DIR",
        default=None,
        help="Write llm-channel events to per-agent logs under DIR.",
    )

    evaluate = subparsers.add_parser(
        "evaluate",
        help="Run and evaluate test cases without the web UI.",
    )
    evaluate.add_argument("--env", default=DEFAULT_ENV_ARGUMENT, help="Path to .env file (default: ./.env).")
    src = evaluate.add_mutually_exclusive_group(required=True)
    src.add_argument(
        "--initializer",
        metavar="PATH",
        help="Initializer .py; runs all cases unless --case is set.",
    )
    src.add_argument(
        "--case-file",
        metavar="PATH",
        help="Standalone case .md file (no initializer required).",
    )
    evaluate.add_argument(
        "--case",
        metavar="N",
        type=int,
        default=None,
        help="Select a single case by 0-based index (requires --initializer).",
    )
    evaluate.add_argument(
        "--agent",
        default=None,
        help="Agent id to run (default: initializer DEFAULT_AGENT or 'root').",
    )
    evaluate.add_argument("--agent-model-override")
    evaluate.add_argument(
        "--agent-model-override-scope",
        choices=("root_only", "all_agents"),
        default="root_only",
    )
    evaluate.add_argument("--output", metavar="FILE", help="Write full JSON result to file.")
    evaluate.add_argument(
        "--verbose",
        action="store_true",
        help="Batch only: include per-case run result in addition to summary table.",
    )
    return parser
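
# Illustrative invocations (a sketch only: the installed console-script name for this
# entry point is not shown in this module, so the examples call main() directly;
# "cases_init.py" is a hypothetical initializer name):
#
#     main(["web", "--env", ".env", "--port", "8123", "--no-open-browser"])
#     main(["run", "--agent", "root", "--prompt", "Hello", "--trace-jsonl", "trace.jsonl"])
#     main(["evaluate", "--initializer", "cases_init.py", "--case", "0"])
#
# All flags mirror the parser built above.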

def _ensure_default_env_argument(args: argparse.Namespace) -> None:
    """Treat omitted/empty ``--env`` as if the caller had passed ``--env .env``."""
    if hasattr(args, "env") and not getattr(args, "env", None):
        args.env = DEFAULT_ENV_ARGUMENT


def _cmd_web(args: argparse.Namespace) -> int:
    import uvicorn

    os.environ["AGENT_EVAL_DEFAULT_ENV_PATH"] = str(args.env)
    for key in (
        "AGENT_EVAL_DEFAULT_AGENT",
        "AGENT_EVAL_DEFAULT_INITIALIZER",
        "AGENT_EVAL_DEFAULT_AGENT_MODEL_OVERRIDE",
        "AGENT_EVAL_DEFAULT_AGENT_MODEL_OVERRIDE_SCOPE",
    ):
        os.environ.pop(key, None)
    if args.agent:
        os.environ["AGENT_EVAL_DEFAULT_AGENT"] = args.agent
    if args.initializer:
        os.environ["AGENT_EVAL_DEFAULT_INITIALIZER"] = args.initializer
    if args.agent_model_override:
        os.environ["AGENT_EVAL_DEFAULT_AGENT_MODEL_OVERRIDE"] = args.agent_model_override
        os.environ["AGENT_EVAL_DEFAULT_AGENT_MODEL_OVERRIDE_SCOPE"] = (
            args.agent_model_override_scope
        )
    url = f"http://{args.host}:{args.port}/"
    if args.open_browser:
        webbrowser.open(url)
    uvicorn.run(
        "agent_framework_evaluator.app:app",
        host=args.host,
        port=args.port,
        factory=False,
    )
    return 0


def _cmd_run(args: argparse.Namespace) -> int:
    setup_module = None
    if args.setup:
        from agent_framework_evaluator.runtime.setup_loader import load_setup_module

        setup_module = load_setup_module(Path(args.setup))

    agent_id: str | None = args.agent
    if agent_id is None and setup_module is not None:
        agent_id = getattr(setup_module, "DEFAULT_AGENT", None)
    if agent_id is None:
        print("error: provide --agent or set DEFAULT_AGENT in --setup script", file=sys.stderr)
        return 2

    if args.prompt_file:
        prompt = Path(args.prompt_file).read_text(encoding="utf-8")
    elif args.prompt:
        prompt = args.prompt
    elif setup_module is not None and hasattr(setup_module, "get_prompt_template"):
        prompt = setup_module.get_prompt_template()
    else:
        print(
            "error: provide --prompt, --prompt-file, or get_prompt_template() in --setup script",
            file=sys.stderr,
        )
        return 2

    from agent_framework.tracing import CompositeRuntimeTracer
    from agent_framework.tracing_subscribers.jsonl_subscriber import JsonlTraceSubscriber
    from agent_framework.tracing_subscribers.llm_trace_file_subscriber import LlmTraceFileSubscriber

    subs: list[object] = []
    if args.trace_jsonl:
        subs.append(JsonlTraceSubscriber(Path(args.trace_jsonl)))
    if args.trace_llm_dir:
        subs.append(LlmTraceFileSubscriber(Path(args.trace_llm_dir)))
    merged_tracer = CompositeRuntimeTracer(subscribers=subs) if subs else None

    runner = SessionRunner(args.env)
    try:
        result = runner.run_once(
            agent_id=agent_id,
            prompt=prompt,
            setup_path=Path(args.setup) if args.setup else None,
            runtime_tracer=merged_tracer,
            agent_model_override=args.agent_model_override,
            agent_model_override_scope=args.agent_model_override_scope,
        )
        usage_summary = getattr(runner, "_last_usage_summary", None)
    except Exception as exc:
        print(str(exc), file=sys.stderr)
        return 1

    payload = {
        "status": result["status"],
        "message": result["message"],
        "usage_summary": usage_summary or {"session_totals": {}, "agents": {}, "runs": {}},
    }
    text = json.dumps(payload, indent=2, ensure_ascii=False)
    if args.output:
        Path(args.output).write_text(text + "\n", encoding="utf-8")
    else:
        print(text)
    return 0
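
# The "run" command emits a single JSON document (to stdout, or to --output). Its
# top-level shape, as assembled in _cmd_run above, is sketched here; the concrete
# values are illustrative placeholders, not real output:
#
#     {
#       "status": "...",
#       "message": "...",
#       "usage_summary": {"session_totals": {}, "agents": {}, "runs": {}}
#     }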

def _cmd_evaluate(args: argparse.Namespace) -> int:
    from agent_framework_evaluator.case_markdown import parse_case_markdown_file
    from agent_framework_evaluator.evaluation import (
        run_code_evaluations,
        run_evaluation,
        select_agent_result_field,
    )
    from agent_framework_evaluator.initializer_catalog import (
        load_initializer_default_agent,
        load_initializer_default_eval_model,
        load_raw_test_cases,
        resolve_env_path,
        resolve_setup_path_for_run,
    )

    env_file = resolve_env_path(args.env)

    def _run_single_case(
        *,
        agent_id: str,
        prompt: str,
        criteria: str,
        result_field: str,
        code_evaluators: list,
        flags: set,
        setup_path: "Path | None",
        eval_model: "str | tuple | None",
    ) -> dict[str, object]:
        runner = SessionRunner(args.env)
        run_result = runner.run_once(
            agent_id=agent_id,
            prompt=prompt.rstrip() + CASE_NO_CALLBACKS_POSTFIX,
            setup_path=setup_path,
            agent_model_override=args.agent_model_override,
            agent_model_override_scope=args.agent_model_override_scope,
        )
        usage_summary = getattr(runner, "_last_usage_summary", None)
        selected = select_agent_result_field(run_result, result_field)
        if selected is None:
            print(
                f"error: result_field '{result_field}' not present in agent result",
                file=sys.stderr,
            )
            sys.exit(1)
        llm = run_evaluation(
            env_path=env_file,
            evaluator_prompt=criteria,
            agent_message=selected,
            model_override=eval_model if eval_model else None,
        )
        llm["score"] = min(10.0, max(0.0, float(llm["score"])))
        code_results = run_code_evaluations(
            code_evaluators, prompt=prompt, agent_message=selected, flags=flags
        )
        parts = [float(llm["score"])] + [float(r["score"]) for r in code_results if r is not None]
        average = sum(parts) / len(parts)
        return {
            "run_result": run_result,
            "llm_result": llm,
            "code_results": code_results,
            "average_score": average,
            "selected_payload": selected,
            "result_field": result_field,
            "usage_summary": usage_summary or {"session_totals": {}, "agents": {}, "runs": {}},
            "session_usage_totals": (
                dict((usage_summary or {}).get("session_totals") or {})
                if isinstance(usage_summary, dict)
                else {}
            ),
        }
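
    # Scoring: _run_single_case clamps the LLM evaluator score to 0..10 and averages
    # it, unweighted, with any code-evaluator scores; the batch summary below treats
    # an average of 7.0 or higher as PASS.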
    if args.case_file:
        # --case-file: standalone .md, no initializer required.
        case_path = Path(args.case_file)
        if not case_path.exists():
            print(f"error: case file not found: {case_path}", file=sys.stderr)
            return 1
        case = parse_case_markdown_file(path=case_path, evaluator_registry={})
        if case is None:
            print(f"error: could not parse case file: {case_path}", file=sys.stderr)
            return 1
        fm_agent = case.get("fm_agent")
        fm_initializer = case.get("fm_initializer")
        # Conflict: both the CLI and the frontmatter specify an agent, and they disagree.
        if args.agent and fm_agent and args.agent != fm_agent:
            print(
                f"info: skipping {case_path.name} — frontmatter agent={fm_agent!r} "
                f"does not match --agent {args.agent!r}",
                file=sys.stderr,
            )
            return 0
        # No analogous guard is needed for the initializer: --case-file and
        # --initializer are mutually exclusive, so only the frontmatter value
        # (if any) can apply here.
        agent_id = args.agent or fm_agent or "root"
        setup_path = (
            resolve_setup_path_for_run(env_file, fm_initializer) if fm_initializer else None
        )
        result = _run_single_case(
            agent_id=agent_id,
            prompt=case["prompt"],
            criteria=str(case.get("evaluation_criteria", "") or ""),
            result_field=str(case.get("result_field", "message") or "message"),
            code_evaluators=case.get("code_evaluators", []),
            flags=case.get("flags", set()),
            setup_path=setup_path,
            eval_model=None,
        )
        text = json.dumps(result, indent=2, ensure_ascii=False, default=str)
        if args.output:
            Path(args.output).write_text(text + "\n", encoding="utf-8")
        else:
            print(text)
        return 0

    # --initializer path
    initializer = args.initializer
    cases = load_raw_test_cases(env_file, initializer)
    if not cases:
        print(f"error: no cases found for initializer: {initializer}", file=sys.stderr)
        return 1
    setup_path = resolve_setup_path_for_run(env_file, initializer)
    default_agent = args.agent or load_initializer_default_agent(env_file, initializer) or "root"
    eval_model = load_initializer_default_eval_model(env_file, initializer)

    if args.case is not None:
        # Single case selected by 0-based index.
        if args.case >= len(cases):
            print(
                f"error: --case {args.case} out of range (0..{len(cases)-1})",
                file=sys.stderr,
            )
            return 1
        case = cases[args.case]
        result = _run_single_case(
            agent_id=default_agent,
            prompt=str(case.get("prompt", "")),
            criteria=str(case.get("evaluation_criteria", "") or ""),
            result_field=str(case.get("result_field", "message") or "message"),
            code_evaluators=case.get("code_evaluators", []),
            flags=case.get("flags", set()),
            setup_path=setup_path,
            eval_model=eval_model,
        )
        text = json.dumps(result, indent=2, ensure_ascii=False, default=str)
        if args.output:
            Path(args.output).write_text(text + "\n", encoding="utf-8")
        else:
            print(text)
        return 0

    # Full batch over every case in the initializer.
    batch_results: list[dict[str, object]] = []
    for i, case in enumerate(cases):
        title = str(case.get("title", f"Case {i}"))
        print(f"[{i+1}/{len(cases)}] {title} …", flush=True)
        try:
            result = _run_single_case(
                agent_id=default_agent,
                prompt=str(case.get("prompt", "")),
                criteria=str(case.get("evaluation_criteria", "") or ""),
                result_field=str(case.get("result_field", "message") or "message"),
                code_evaluators=case.get("code_evaluators", []),
                flags=case.get("flags", set()),
                setup_path=setup_path,
                eval_model=eval_model,
            )
            avg = result["average_score"]
            verdict = "PASS" if float(avg) >= 7.0 else "FAIL"
            totals = result.get("session_usage_totals") or {}
            tokens = totals.get("total_tokens", "—") if isinstance(totals, dict) else "—"
            print(f" score={float(avg):.1f} tokens={tokens} {verdict}")
            if args.verbose:
                print(
                    f" run_result={json.dumps(result['run_result'], ensure_ascii=False, default=str)}"
                )
            batch_results.append({"case_index": i, "title": title, **result})
        except Exception as exc:
            print(f" ERROR: {exc}", file=sys.stderr)
            batch_results.append({"case_index": i, "title": title, "error": str(exc)})

    if args.output:
        text = json.dumps(batch_results, indent=2, ensure_ascii=False, default=str)
        Path(args.output).write_text(text + "\n", encoding="utf-8")
        print(f"\nFull results written to {args.output}")

    all_errored = batch_results and all("error" in r for r in batch_results)
    return 1 if all_errored else 0
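
# Example batch progress output from "evaluate" (format taken from the prints in
# _cmd_evaluate above; the title, score, and token count are made-up values):
#
#     [1/3] Greets the user politely …
#      score=8.5 tokens=1234 PASS
#
# Exit status is 1 only when every case in the batch errored (or a validation step
# failed earlier); individual FAIL verdicts do not change the exit code.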

def main(argv: list[str] | None = None) -> int:
    parser = build_parser()
    args = parser.parse_args(argv)
    _ensure_default_env_argument(args)
    if args.command == "web":
        return _cmd_web(args)
    if args.command == "run":
        return _cmd_run(args)
    if args.command == "evaluate":
        return _cmd_evaluate(args)
    return 0
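
# A typical entry-point hook, shown only as a sketch (it is not part of this
# listing; the package may instead expose main() via a console-script entry point):
#
#     if __name__ == "__main__":
#         raise SystemExit(main())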