- """MCP Server Evaluation Harness
- This script evaluates MCP servers by running test questions against them using Claude.
- """

import argparse
import asyncio
import json
import re
import sys
import time
import traceback
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any

from anthropic import Anthropic

from connections import create_connection
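
# Note: the local `connections` module is expected to provide create_connection(...)
# returning an async context manager with `await list_tools()` and
# `await call_tool(name, input)`; this interface is inferred from how the
# connection object is used below, not from the module itself.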
- EVALUATION_PROMPT = """You are an AI assistant with access to tools.
- When given a task, you MUST:
- 1. Use the available tools to complete the task
- 2. Provide summary of each step in your approach, wrapped in <summary> tags
- 3. Provide feedback on the tools provided, wrapped in <feedback> tags
- 4. Provide your final response, wrapped in <response> tags
- Summary Requirements:
- - In your <summary> tags, you must explain:
- - The steps you took to complete the task
- - Which tools you used, in what order, and why
- - The inputs you provided to each tool
- - The outputs you received from each tool
- - A summary for how you arrived at the response
- Feedback Requirements:
- - In your <feedback> tags, provide constructive feedback on the tools:
- - Comment on tool names: Are they clear and descriptive?
- - Comment on input parameters: Are they well-documented? Are required vs optional parameters clear?
- - Comment on descriptions: Do they accurately describe what the tool does?
- - Comment on any errors encountered during tool usage: Did the tool fail to execute? Did the tool return too many tokens?
- - Identify specific areas for improvement and explain WHY they would help
- - Be specific and actionable in your suggestions
- Response Requirements:
- - Your response should be concise and directly address what was asked
- - Always wrap your final response in <response> tags
- - If you cannot solve the task return <response>NOT_FOUND</response>
- - For numeric responses, provide just the number
- - For IDs, provide just the ID
- - For names or text, provide the exact text requested
- - Your response should go last"""

def parse_evaluation_file(file_path: Path) -> list[dict[str, Any]]:
    """Parse XML evaluation file with qa_pair elements."""
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        evaluations = []
        for qa_pair in root.findall(".//qa_pair"):
            question_elem = qa_pair.find("question")
            answer_elem = qa_pair.find("answer")
            if question_elem is not None and answer_elem is not None:
                evaluations.append({
                    "question": (question_elem.text or "").strip(),
                    "answer": (answer_elem.text or "").strip(),
                })
        return evaluations
    except Exception as e:
        print(f"Error parsing evaluation file {file_path}: {e}")
        return []
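
# Sketch of the XML structure parse_evaluation_file() expects. Only the <qa_pair>,
# <question>, and <answer> element names are taken from the parser above; the root
# element name and the sample values are illustrative (any root element works,
# since the lookup uses ".//qa_pair").
#
# <evaluations>
#   <qa_pair>
#     <question>How many open tickets are assigned to the infra team?</question>
#     <answer>7</answer>
#   </qa_pair>
# </evaluations>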

def extract_xml_content(text: str, tag: str) -> str | None:
    """Extract content from XML tags."""
    pattern = rf"<{tag}>(.*?)</{tag}>"
    matches = re.findall(pattern, text, re.DOTALL)
    return matches[-1].strip() if matches else None

async def agent_loop(
    client: Anthropic,
    model: str,
    question: str,
    tools: list[dict[str, Any]],
    connection: Any,
) -> tuple[str, dict[str, Any]]:
    """Run the agent loop with MCP tools."""
    messages = [{"role": "user", "content": question}]

    response = await asyncio.to_thread(
        client.messages.create,
        model=model,
        max_tokens=4096,
        system=EVALUATION_PROMPT,
        messages=messages,
        tools=tools,
    )
    messages.append({"role": "assistant", "content": response.content})

    tool_metrics = {}
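    # tool_metrics accumulates per-tool usage in the form
    #   {"<tool name>": {"count": <int>, "durations": [<seconds per call>, ...]}},
    # matching how the loop below fills it in.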

    while response.stop_reason == "tool_use":
        # The Messages API requires a tool_result for every tool_use block in the
        # assistant turn, so handle parallel tool calls rather than only the first one.
        tool_results = []
        for tool_use in (block for block in response.content if block.type == "tool_use"):
            tool_name = tool_use.name
            tool_input = tool_use.input

            tool_start_ts = time.time()
            try:
                tool_result = await connection.call_tool(tool_name, tool_input)
                tool_response = json.dumps(tool_result) if isinstance(tool_result, (dict, list)) else str(tool_result)
            except Exception as e:
                tool_response = f"Error executing tool {tool_name}: {str(e)}\n"
                tool_response += traceback.format_exc()
            tool_duration = time.time() - tool_start_ts

            tool_metrics.setdefault(tool_name, {"count": 0, "durations": []})
            tool_metrics[tool_name]["count"] += 1
            tool_metrics[tool_name]["durations"].append(tool_duration)

            tool_results.append({
                "type": "tool_result",
                "tool_use_id": tool_use.id,
                "content": tool_response,
            })

        messages.append({"role": "user", "content": tool_results})

        response = await asyncio.to_thread(
            client.messages.create,
            model=model,
            max_tokens=4096,
            system=EVALUATION_PROMPT,
            messages=messages,
            tools=tools,
        )
        messages.append({"role": "assistant", "content": response.content})

    # Default to an empty string rather than None so downstream XML extraction
    # always receives a str.
    response_text = next(
        (block.text for block in response.content if hasattr(block, "text")),
        "",
    )
    return response_text, tool_metrics

async def evaluate_single_task(
    client: Anthropic,
    model: str,
    qa_pair: dict[str, Any],
    tools: list[dict[str, Any]],
    connection: Any,
    task_index: int,
) -> dict[str, Any]:
    """Evaluate a single QA pair with the given tools."""
    start_time = time.time()
    print(f"Task {task_index + 1}: Running task with question: {qa_pair['question']}")

    response, tool_metrics = await agent_loop(client, model, qa_pair["question"], tools, connection)
    response_value = extract_xml_content(response, "response")
    summary = extract_xml_content(response, "summary")
    feedback = extract_xml_content(response, "feedback")
    duration_seconds = time.time() - start_time

    return {
        "question": qa_pair["question"],
        "expected": qa_pair["answer"],
        "actual": response_value,
        "score": int(response_value == qa_pair["answer"]) if response_value else 0,
        "total_duration": duration_seconds,
        "tool_calls": tool_metrics,
        "num_tool_calls": sum(len(metrics["durations"]) for metrics in tool_metrics.values()),
        "summary": summary,
        "feedback": feedback,
    }
- REPORT_HEADER = """
- # Evaluation Report
- ## Summary
- - **Accuracy**: {correct}/{total} ({accuracy:.1f}%)
- - **Average Task Duration**: {average_duration_s:.2f}s
- - **Average Tool Calls per Task**: {average_tool_calls:.2f}
- - **Total Tool Calls**: {total_tool_calls}
- ---
- """
- TASK_TEMPLATE = """
- ### Task {task_num}
- **Question**: {question}
- **Ground Truth Answer**: `{expected_answer}`
- **Actual Answer**: `{actual_answer}`
- **Correct**: {correct_indicator}
- **Duration**: {total_duration:.2f}s
- **Tool Calls**: {tool_calls}
- **Summary**
- {summary}
- **Feedback**
- {feedback}
- ---
- """

async def run_evaluation(
    eval_path: Path,
    connection: Any,
    model: str = "claude-3-7-sonnet-20250219",
) -> str:
    """Run evaluation with MCP server tools."""
    print("🚀 Starting Evaluation")
    client = Anthropic()

    tools = await connection.list_tools()
    print(f"📋 Loaded {len(tools)} tools from MCP server")

    qa_pairs = parse_evaluation_file(eval_path)
    print(f"📋 Loaded {len(qa_pairs)} evaluation tasks")

    results = []
    for i, qa_pair in enumerate(qa_pairs):
        print(f"Processing task {i + 1}/{len(qa_pairs)}")
        result = await evaluate_single_task(client, model, qa_pair, tools, connection, i)
        results.append(result)

    correct = sum(r["score"] for r in results)
    accuracy = (correct / len(results)) * 100 if results else 0
    average_duration_s = sum(r["total_duration"] for r in results) / len(results) if results else 0
    average_tool_calls = sum(r["num_tool_calls"] for r in results) / len(results) if results else 0
    total_tool_calls = sum(r["num_tool_calls"] for r in results)

    report = REPORT_HEADER.format(
        correct=correct,
        total=len(results),
        accuracy=accuracy,
        average_duration_s=average_duration_s,
        average_tool_calls=average_tool_calls,
        total_tool_calls=total_tool_calls,
    )
    report += "".join([
        TASK_TEMPLATE.format(
            task_num=i + 1,
            question=qa_pair["question"],
            expected_answer=qa_pair["answer"],
            actual_answer=result["actual"] or "N/A",
            correct_indicator="✅" if result["score"] else "❌",
            total_duration=result["total_duration"],
            tool_calls=json.dumps(result["tool_calls"], indent=2),
            summary=result["summary"] or "N/A",
            feedback=result["feedback"] or "N/A",
        )
        for i, (qa_pair, result) in enumerate(zip(qa_pairs, results))
    ])
    return report

def parse_headers(header_list: list[str]) -> dict[str, str]:
    """Parse header strings in format 'Key: Value' into a dictionary."""
    headers = {}
    if not header_list:
        return headers
    for header in header_list:
        if ":" in header:
            key, value = header.split(":", 1)
            headers[key.strip()] = value.strip()
        else:
            print(f"Warning: Ignoring malformed header: {header}")
    return headers

def parse_env_vars(env_list: list[str]) -> dict[str, str]:
    """Parse environment variable strings in format 'KEY=VALUE' into a dictionary."""
    env = {}
    if not env_list:
        return env
    for env_var in env_list:
        if "=" in env_var:
            key, value = env_var.split("=", 1)
            env[key.strip()] = value.strip()
        else:
            print(f"Warning: Ignoring malformed environment variable: {env_var}")
    return env
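
# For example (values are illustrative only):
#   parse_headers(["Authorization: Bearer abc123"]) -> {"Authorization": "Bearer abc123"}
#   parse_env_vars(["API_KEY=abc123", "DEBUG=1"])   -> {"API_KEY": "abc123", "DEBUG": "1"}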

async def main():
    parser = argparse.ArgumentParser(
        description="Evaluate MCP servers using test questions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Evaluate a local stdio MCP server
  python evaluation.py -t stdio -c python -a my_server.py eval.xml

  # Evaluate an SSE MCP server
  python evaluation.py -t sse -u https://example.com/mcp -H "Authorization: Bearer token" eval.xml

  # Evaluate an HTTP MCP server with a custom model
  python evaluation.py -t http -u https://example.com/mcp -m claude-3-5-sonnet-20241022 eval.xml
""",
    )
    parser.add_argument("eval_file", type=Path, help="Path to evaluation XML file")
    parser.add_argument("-t", "--transport", choices=["stdio", "sse", "http"], default="stdio", help="Transport type (default: stdio)")
    parser.add_argument("-m", "--model", default="claude-3-7-sonnet-20250219", help="Claude model to use (default: claude-3-7-sonnet-20250219)")

    stdio_group = parser.add_argument_group("stdio options")
    stdio_group.add_argument("-c", "--command", help="Command to run MCP server (stdio only)")
    stdio_group.add_argument("-a", "--args", nargs="+", help="Arguments for the command (stdio only)")
    stdio_group.add_argument("-e", "--env", nargs="+", help="Environment variables in KEY=VALUE format (stdio only)")

    remote_group = parser.add_argument_group("sse/http options")
    remote_group.add_argument("-u", "--url", help="MCP server URL (sse/http only)")
    remote_group.add_argument("-H", "--header", nargs="+", dest="headers", help="HTTP headers in 'Key: Value' format (sse/http only)")

    parser.add_argument("-o", "--output", type=Path, help="Output file for evaluation report (default: stdout)")

    args = parser.parse_args()

    if not args.eval_file.exists():
        print(f"Error: Evaluation file not found: {args.eval_file}")
        sys.exit(1)

    headers = parse_headers(args.headers) if args.headers else None
    env_vars = parse_env_vars(args.env) if args.env else None

    try:
        connection = create_connection(
            transport=args.transport,
            command=args.command,
            args=args.args,
            env=env_vars,
            url=args.url,
            headers=headers,
        )
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)

    print(f"🔗 Connecting to MCP server via {args.transport}...")
    async with connection:
        print("✅ Connected successfully")
        report = await run_evaluation(args.eval_file, connection, args.model)

        if args.output:
            args.output.write_text(report)
            print(f"\n✅ Report saved to {args.output}")
        else:
            print("\n" + report)


if __name__ == "__main__":
    asyncio.run(main())