from typing import Tuple, Dict, Any from collections import Counter from difflib import SequenceMatcher from pydantic import BaseModel from enum import Enum import json from typing import Optional, List import re from rapidfuzz import fuzz, distance from cuga.backend.tools_env.registry.mcp_manager.adapter import sanitize_tool_name class ToolCall(BaseModel): """ Basic model for a tool call """ name: str args: Dict class ScoringMethod(str, Enum): EXACT = "exact" SEQUENCE_MATCHER = "sequence_matcher" JACCARD = "jaccard" COSINE = "cosine" FUZZY_PARTIAL = "fuzzy_partial" FUZZY_TOKEN_SET = "fuzzy_token_set" JARO_WINKLER = "jaro_winkler" LEVENSHTEIN_NORM = "levenshtein_norm" class ToolCallMismatchType(str, Enum): ARGS_MISMATCH = "args_mismatch" NAME_MISMATCH = "name_mismatch" MISSING = "missing" UNEXPECTED = "unexpected" class ToolCallMismatch(BaseModel): tool_name: str type: ToolCallMismatchType expected: Optional[ToolCall] = None actual: Optional[ToolCall] = None class TestScore(BaseModel): """ Basic model for test score """ keyword_score: float tool_call_score: float response_score: float response_scoring_type: ScoringMethod class TestScoreDetails(BaseModel): """ Detailed artifacts to inspect why a test scored the way it did. """ missing_keywords: List[str] expected_keywords: List[str] expected_tool_calls: List[ToolCall] tool_call_mismatches: List[ToolCallMismatch] response_expected: str response_actual: str response_scoring_type: ScoringMethod def _normalize_tokens(s: str) -> List[str]: return [t for t in "".join(ch.lower() if ch.isalnum() else " " for ch in s).split() if t] def _jaccard(a_tokens: List[str], b_tokens: List[str]) -> float: a, b = set(a_tokens), set(b_tokens) if not a and not b: return 1.0 return len(a & b) / max(1, len(a | b)) def _cosine_tf(a_tokens: List[str], b_tokens: List[str]) -> float: if not a_tokens and not b_tokens: return 1.0 from collections import Counter as C ca, cb = C(a_tokens), C(b_tokens) dot = sum(ca[k] * cb.get(k, 0) for k in ca) na = sum(v * v for v in ca.values()) ** 0.5 nb = sum(v * v for v in cb.values()) ** 0.5 return (dot / (na * nb)) if na and nb else 0.0 def _sequence_matcher(a: str, b: str) -> float: if not a and not b: return 1.0 return SequenceMatcher(None, a, b).ratio() # ========== 1) Keyword scoring ========== def score_keywords(answer: str, expected_keywords: List[str]) -> Tuple[float, List[str]]: """ Calculate how many expected keywords appear in the given text. Matching is case-insensitive and ignores punctuation/formatting. Returns: (score, missing_keywords) """ if not expected_keywords: return 1.0, [] # Normalize the text: lowercase + remove punctuation normalized_text = re.sub(r"[^a-z0-9]+", " ", answer.lower()) missing_keywords = [] for kw in expected_keywords: normalized_kw = re.sub(r"[^a-z0-9]+", " ", kw.lower()).strip() if normalized_kw not in normalized_text: missing_keywords.append(kw) found = len(expected_keywords) - len(missing_keywords) score = found / len(expected_keywords) return round(score, 4), missing_keywords # ========== 2) Response proximity ========== def score_response( actual: str, expected: str, method: ScoringMethod = ScoringMethod.SEQUENCE_MATCHER ) -> Tuple[float, ScoringMethod]: if method == ScoringMethod.EXACT: return (1.0 if actual == expected else 0.0), ScoringMethod.EXACT if method == ScoringMethod.SEQUENCE_MATCHER: return _sequence_matcher(actual, expected), ScoringMethod.SEQUENCE_MATCHER if method in {ScoringMethod.JACCARD, ScoringMethod.COSINE}: toks_a, toks_b = _normalize_tokens(actual), _normalize_tokens(expected) if method == ScoringMethod.JACCARD: return _jaccard(toks_a, toks_b), ScoringMethod.JACCARD return _cosine_tf(toks_a, toks_b), ScoringMethod.COSINE if method == ScoringMethod.FUZZY_PARTIAL: # robust to extra prefixes/suffixes; good for “expected snippet within longer response” return round(fuzz.partial_ratio(expected, actual) / 100.0, 4), ScoringMethod.FUZZY_PARTIAL if method == ScoringMethod.FUZZY_TOKEN_SET: # ignores word order and duplicates—great for rephrased responses return round(fuzz.token_set_ratio(expected, actual) / 100.0, 4), ScoringMethod.FUZZY_TOKEN_SET if method == ScoringMethod.JARO_WINKLER: # typo-friendly; higher for small transpositions; normalized 0..1 jw = distance.JaroWinkler.normalized_similarity(expected, actual) return round(float(jw), 4), ScoringMethod.JARO_WINKLER if method == ScoringMethod.LEVENSHTEIN_NORM: # classic edit distance normalized to 0..1 similarity sim = distance.Levenshtein.normalized_similarity(expected, actual) return round(float(sim), 4), ScoringMethod.LEVENSHTEIN_NORM # default return _sequence_matcher(actual, expected), ScoringMethod.SEQUENCE_MATCHER def _canon_args(d: Dict[str, Any]) -> str: return json.dumps(d, sort_keys=True, separators=(",", ":")) def _canon(tc: ToolCall) -> Tuple[str, str]: return (tc.name, _canon_args(tc.args)) def _key(tc: ToolCall) -> Tuple[str, str]: """Canonical key (name + normalized args) for comparing tool calls.""" return _canon(tc) def score_tool_calls_exact( actual: List[ToolCall], expected: List[ToolCall], ) -> Tuple[float, List[ToolCallMismatch]]: """ Exact multiset match of (name, args). Scoring: matched = sum over keys of min(count_actual, count_expected) unexpected_count = sum((c_act - c_exp).values()) # extras in actual expected_count = len(expected) score = 1.0 if (expected_count == 0 and unexpected_count == 0) else matched / (expected_count + unexpected_count) Mismatches (typed): - ARGS_MISMATCH (same tool name, different args) - NAME_MISMATCH (different tool used instead of expected) - MISSING (expected not called) - UNEXPECTED (called but not expected) """ # sanitize tool names for tool_call in expected: tool_call.name = sanitize_tool_name(tool_call.name) exp_keys = [_key(tc) for tc in expected] act_keys = [_key(tc) for tc in actual] c_exp, c_act = Counter(exp_keys), Counter(act_keys) matched = sum(min(c_exp[k], c_act.get(k, 0)) for k in c_exp) unexpected_count = sum((c_act - c_exp).values()) expected_count = len(expected) if expected_count == 0 and unexpected_count == 0: score = 1.0 else: denom = expected_count + unexpected_count score = (matched / denom) if denom else 1.0 # Build unmatched lists for detailed mismatch reporting def expand_unmatched( counter_a: Counter, counter_b: Counter, source_list: List[ToolCall] ) -> List[ToolCall]: leftover = counter_a - counter_b need: List[ToolCall] = [] by_key: Dict[Tuple[str, str], List[ToolCall]] = {} for tc in source_list: by_key.setdefault(_key(tc), []).append(tc) for k, cnt in leftover.items(): pool = by_key.get(k, []) need.extend(pool[:cnt]) return need unmatched_expected = expand_unmatched(c_exp, c_act, expected) unmatched_actual = expand_unmatched(c_act, c_exp, actual) mismatches: List[ToolCallMismatch] = [] # 1) Flag args mismatches first (same tool name, different args) # Greedy, deterministic (left-to-right). used_a = set() still_ue: List[ToolCall] = [] for e in unmatched_expected: found_ai = None for ai, a in enumerate(unmatched_actual): if ai in used_a: continue if a.name == e.name and a.args != e.args: mismatches.append( ToolCallMismatch( tool_name=e.name, type=ToolCallMismatchType.ARGS_MISMATCH, expected=e.model_dump(), actual=a.model_dump(), ) ) used_a.add(ai) found_ai = ai break if found_ai is None: still_ue.append(e) still_ua: List[ToolCall] = [a for ai, a in enumerate(unmatched_actual) if ai not in used_a] unmatched_expected, unmatched_actual = still_ue, still_ua # 2) Pair remaining as name mismatches (A instead of B) # Pair in order to stay deterministic. for e, a in zip(unmatched_expected, unmatched_actual): mismatches.append( ToolCallMismatch( tool_name=e.name, # expected tool name type=ToolCallMismatchType.NAME_MISMATCH, expected=e.model_dump(), actual=a.model_dump(), ) ) # 3) Any leftovers after pairing are pure missing/unexpected if len(unmatched_expected) > len(unmatched_actual): for e in unmatched_expected[len(unmatched_actual) :]: mismatches.append( ToolCallMismatch( tool_name=e.name, type=ToolCallMismatchType.MISSING, expected=e.model_dump(), actual=None, ) ) elif len(unmatched_actual) > len(unmatched_expected): for a in unmatched_actual[len(unmatched_expected) :]: mismatches.append( ToolCallMismatch( tool_name=a.name, type=ToolCallMismatchType.UNEXPECTED, expected=None, actual=a.model_dump(), ) ) return round(score, 4), mismatches # ========== Orchestrators ========== def evaluate_test( expected_keywords: List[str], tool_calls: List[ToolCall], expected_tool_calls: List[ToolCall], response: str, expected_response: str, response_scoring_type: ScoringMethod = ScoringMethod.FUZZY_TOKEN_SET, ) -> TestScore: """ Backward-compatible: returns only TestScore. """ kw_score, _missing_keywords = score_keywords(response, expected_keywords) tc_score, _tc_mismatches = score_tool_calls_exact(tool_calls, expected_tool_calls) resp_score, resp_method = score_response(response, expected_response, method=response_scoring_type) return TestScore( keyword_score=round(kw_score, 4), tool_call_score=round(tc_score, 4), response_score=round(resp_score, 4), response_scoring_type=resp_method, ) def evaluate_test_and_details( expected_keywords: List[str], tool_calls: List[ToolCall], expected_tool_calls: List[ToolCall], response: str, expected_response: str, response_scoring_type: ScoringMethod = ScoringMethod.FUZZY_TOKEN_SET, ) -> Tuple[TestScore, TestScoreDetails]: kw_score, missing_keywords = score_keywords(response, expected_keywords) tc_score, tc_mismatches = score_tool_calls_exact(tool_calls, expected_tool_calls) resp_score, resp_method = score_response(response, expected_response, method=response_scoring_type) score = TestScore( keyword_score=round(kw_score, 4), tool_call_score=round(tc_score, 4), response_score=round(resp_score, 4), response_scoring_type=resp_method, ) details = TestScoreDetails( expected_keywords=expected_keywords, missing_keywords=missing_keywords, expected_tool_calls=expected_tool_calls, tool_call_mismatches=tc_mismatches, response_expected=expected_response, response_actual=response, response_scoring_type=resp_method, ) return score, details