# Source code for agent_framework_evaluator.case_markdown

"""Load evaluator test cases from markdown files (``---``-separated frontmatter, prompt, criteria).

Use :class:`MarkdownCaseLoader` from an initializer's ``get_test_cases()`` so cases live in
``*.md`` files next to the initializer module, e.g. ``eval/deck-review-01.md`` with glob
``eval/*.md`` relative to ``deck-review.py``.
"""

from __future__ import annotations

import logging
import re
from collections.abc import Callable, Mapping
from pathlib import Path
from typing import Any

from agent_framework.file_reference import DefaultFileReferenceResolver, FileReferenceResolver, expand_file_refs

_LOGGER = logging.getLogger(__name__)

_SECTION_RE = re.compile(r"^---\s*$", re.MULTILINE)


def parse_simple_frontmatter(text: str) -> dict[str, str]:
    """Parse flat ``key: value`` frontmatter lines into a dict of strings.

    Blank lines, lines starting with ``#`` and lines without a ``:`` are
    ignored. Every other line is split at its first colon; key and value are
    stripped of surrounding whitespace and entries with an empty key are
    dropped. A repeated key keeps the last value seen. Nesting is not
    supported; for nested YAML use ``yaml.safe_load`` on the block.

    One DEBUG record carrying a copy of the parsed mapping is emitted before
    returning.
    """
    fields: dict[str, str] = {}
    for entry in map(str.strip, text.splitlines()):
        # Skip blanks and comment lines.
        if not entry or entry[0] == "#":
            continue
        name, colon, value = entry.partition(":")
        name = name.strip()
        # ``colon`` is empty when the line had no separator at all.
        if colon and name:
            fields[name] = value.strip()
    _LOGGER.debug(
        "Parsed evaluator case markdown frontmatter.",
        extra={
            "trace_kind": "evaluator.case_markdown.frontmatter_parsed",
            "trace_title": "Evaluator case frontmatter parsed",
            "trace_payload": {"frontmatter": dict(fields)},
        },
    )
    return fields
def _normalise_initializer_ref(ref: str) -> str: """Return the stem of an initializer ref for comparison (strips path and .py suffix).""" return Path(ref).stem
def parse_case_markdown_file(
    *,
    path: Path,
    evaluator_registry: Mapping[str, Callable[..., Any]],
    resolver: FileReferenceResolver | None = None,
) -> dict[str, Any] | None:
    """Parse one case file; return case metadata, prompt, criteria, and evaluator hooks.

    The file is split on lines that contain only ``---``. The text between the
    first and second separator is the frontmatter, between the second and third
    the prompt, and after the third the evaluation criteria. Anything before the
    first separator, and any segment after a fourth separator, is ignored.

    Frontmatter keys read: ``title`` (defaults to the file stem),
    ``code_evaluator`` (comma-separated names looked up in
    ``evaluator_registry``), ``result_field`` (defaults to ``"message"``),
    ``flags`` (comma-separated), ``agent`` and ``initializer``.

    Args:
        path: Markdown case file to read.
        evaluator_registry: Maps evaluator names to callables. Names that are
            missing or map to a non-callable are skipped with a warning.
        resolver: Passed to ``expand_file_refs`` for the prompt; a
            ``DefaultFileReferenceResolver`` is created when ``None``.

    Returns:
        A dict with keys ``title``, ``prompt``, ``evaluation_criteria``,
        ``code_evaluators``, ``result_field``, ``flags``, ``fm_agent`` and
        ``fm_initializer``; or ``None`` (after logging a warning) when the file
        cannot be read or decoded, or has fewer than three separator lines.
    """
    try:
        # ``utf-8-sig`` reads plain UTF-8 unchanged but drops a leading BOM; with a
        # BOM left in place the first ``---`` line would not match the separator
        # pattern and the whole file would be rejected.
        raw = path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeDecodeError) as exc:
        # A file that is not valid UTF-8 is as unusable as an unreadable one; skip
        # it with a warning instead of letting the decode error abort the caller.
        _LOGGER.warning("Case file %s: cannot read (%s).", path, exc)
        return None

    parts = _SECTION_RE.split(raw)
    if len(parts) < 4:
        _LOGGER.warning(
            "Case file %s: expected three lines containing only --- (frontmatter, prompt block, criteria block). "
            "Got %d segment(s) after split; need at least 4. Common mistake: escaped \\--- from an editor export — "
            "use a plain --- line.",
            path,
            len(parts),
        )
        return None

    # parts[0] is whatever precedes the first separator (normally empty).
    fm = parse_simple_frontmatter(parts[1].strip())
    title = fm.get("title", path.stem)
    eval_names_raw = fm.get("code_evaluator", "").strip()
    result_field = fm.get("result_field", "message").strip() or "message"
    flags: set[str] = {f.strip() for f in fm.get("flags", "").split(",") if f.strip()}
    fm_agent: str | None = fm.get("agent", "").strip() or None
    fm_initializer: str | None = fm.get("initializer", "").strip() or None

    prompt = parts[2].strip()
    criteria = parts[3].strip()

    # File references in the prompt are expanded relative to the case file's directory.
    _resolver = resolver if resolver is not None else DefaultFileReferenceResolver()
    prompt = expand_file_refs(prompt, _resolver, base_dir=path.parent)

    code_evaluators: list[Callable[..., Any]] = []
    for eval_name in [n.strip() for n in eval_names_raw.split(",") if n.strip()]:
        fn = evaluator_registry.get(eval_name)
        if fn is not None and callable(fn):
            code_evaluators.append(fn)
        else:
            _LOGGER.warning(
                "Case file %s: frontmatter code_evaluator=%r is not registered on this initializer module.",
                path,
                eval_name,
            )

    return {
        "title": title,
        "prompt": prompt,
        "evaluation_criteria": criteria,
        "code_evaluators": code_evaluators,
        "result_field": result_field,
        "flags": flags,
        "fm_agent": fm_agent,
        "fm_initializer": fm_initializer,
    }
class MarkdownCaseLoader:
    """Discover ``*.md`` cases under ``base_dir`` with a glob; cache invalidates on path/mtime changes.

    Pass ``initializer_ref`` to automatically skip cases whose ``initializer`` frontmatter
    field is set to a different initializer (stem comparison, so ``foo.py`` matches ``foo``).
    Cases with no ``initializer`` frontmatter field always match.

    The cache key is built only from the resolved paths and modification times of the
    files matched by the glob.
    NOTE(review): files that a prompt pulls in through the resolver are presumably not
    part of that key, so editing them alone would not invalidate the cache - confirm.
    """

    def __init__(
        self,
        base_dir: Path,
        glob_pattern: str,
        evaluator_registry: Mapping[str, Callable[..., Any]] | None = None,
        *,
        initializer_ref: str | None = None,
        resolver: FileReferenceResolver | None = None,
    ) -> None:
        """Store the discovery settings; no files are touched until ``get_test_cases``.

        Args:
            base_dir: Directory the glob is evaluated against (resolved once here).
            glob_pattern: Pattern passed to ``Path.glob`` on ``base_dir``.
            evaluator_registry: Name-to-callable mapping handed to the case parser.
            initializer_ref: Optional initializer name or path used to filter cases.
            resolver: Optional file-reference resolver handed to the case parser.
        """
        self._base = base_dir.resolve()
        self._glob = glob_pattern
        # Test for None, not truthiness: an empty registry supplied by the caller must
        # be kept by reference so evaluators registered on it later are still seen.
        self._reg: Mapping[str, Callable[..., Any]] = (
            evaluator_registry if evaluator_registry is not None else {}
        )
        self._initializer_stem = _normalise_initializer_ref(initializer_ref) if initializer_ref else None
        self._resolver = resolver
        self._cache: list[dict[str, Any]] | None = None
        self._cache_key: frozenset[tuple[str, float]] | None = None

    @staticmethod
    def _mtime_or_sentinel(path: Path) -> float:
        """Return ``path``'s mtime, or ``-1.0`` when it cannot be stat'ed.

        A dangling symlink or a file removed right after globbing must not crash
        discovery; the sentinel keeps the entry in the cache key so the parser gets
        to report the unreadable file once per cache refresh.
        """
        try:
            return path.stat().st_mtime
        except OSError:
            return -1.0

    def get_test_cases(self) -> list[dict[str, Any]]:
        """Return the parsed cases matching the glob, re-parsing only when files changed.

        Files the parser rejects (it returns ``None``) are skipped. When both the case
        and this loader name an initializer and the stems differ, the case is skipped
        with a DEBUG message.

        Returns:
            A new list on every call, so callers may reorder or extend it without
            corrupting the cache. The case dicts themselves are shared with the cache.
        """
        files = sorted(self._base.glob(self._glob))
        key = frozenset((str(p.resolve()), self._mtime_or_sentinel(p)) for p in files)
        if self._cache is not None and key == self._cache_key:
            return list(self._cache)

        parsed: list[dict[str, Any]] = []
        for f in files:
            row = parse_case_markdown_file(path=f, evaluator_registry=self._reg, resolver=self._resolver)
            if row is None:
                continue
            fm_init = row.get("fm_initializer")
            if fm_init and self._initializer_stem:
                if _normalise_initializer_ref(fm_init) != self._initializer_stem:
                    _LOGGER.debug(
                        "Skipping case %s: initializer %r does not match %r.",
                        f.name,
                        fm_init,
                        self._initializer_stem,
                    )
                    continue
            parsed.append(row)

        self._cache = parsed
        self._cache_key = key
        return list(self._cache)