|
|
|
|
|
""" |
|
|
Test specific multimedia GAIA questions
|
|
""" |
|
|
|
|
|
import os |
|
|
from dotenv import load_dotenv |
|
|
from app import BasicAgent |
|
|
|
|
|
|
|
|
# Load variables from a local .env file (when present) into the process
# environment so that ANTHROPIC_API_KEY can be read with os.getenv() later.
load_dotenv()
|
|
|
|
|
def test_specific_questions() -> None:
    """Run a fixed set of GAIA-style questions through ``BasicAgent``.

    The agent is configured with the key read from the ``ANTHROPIC_API_KEY``
    environment variable; if that variable is unset or empty, an error is
    printed and the function returns without asking anything.

    Grading rules, applied per question:

    * For the file-based question types (``excel``, ``pdf``, ``code``) an
      answer containing ``"Unable to determine"`` counts as correct, and is
      reported as a correctly handled inaccessible file.
    * Otherwise the answer is correct when the expected text occurs in it,
      compared case-insensitively.

    Progress for every question and a final ``Score: x/y (z%)`` summary are
    printed to stdout. Nothing is returned and nothing is asserted.
    """
    agent = BasicAgent()
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found in environment variables")
        return

    agent.set_api_key(api_key)

    # Status markers (a check mark and a cross mark) are written as escapes
    # so the source stays pure ASCII and survives encoding round-trips.
    ok_mark = "\u2705"
    fail_mark = "\u274c"

    # Question types that refer to an attachment.
    file_types = ("excel", "pdf", "code")

    test_cases = [
        {
            "question": "What is 2 + 2?",
            "expected": "4",
            "type": "simple",
        },
        {
            "question": (
                "In the video https://www.youtube.com/watch?v=1htKBjuUWec, "
                'Verma claims the existence of "a "moat" in the education '
                "system that provides a systemic advantage for those who know "
                'about it and can get into the pipeline." Verma\'s "moat" is '
                "a well-known advantage for students. What is the "
                "four-letter abbreviation used to describe this systemic "
                "advantage?"
            ),
            "expected": "STEM",
            "type": "youtube",
        },
        {
            "question": "Tell me the amount of sales in the sales sheet for the attached excel file.",
            "expected": "Unable to determine",
            "type": "excel",
        },
        {
            "question": 'How many times is the word "therefore" used in the attached PDF?',
            "expected": "Unable to determine",
            "type": "pdf",
        },
        {
            "question": "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?",
            "expected": "Unable to determine",
            "type": "code",
        },
    ]

    correct = 0
    for i, test_case in enumerate(test_cases, 1):
        question = test_case["question"]
        expected = test_case["expected"]
        q_type = test_case["type"]

        print(f"\nTest {i} ({q_type}): {question[:80]}...")
        print(f"Expected: {expected}")

        # Deliberately broad: one failing question must not abort the
        # remaining ones; the error is reported and the case scores zero.
        try:
            answer = agent(question)
            print(f"Got: {answer}")

            # Grade on a text form of the reply so that a None or non-string
            # answer is scored as incorrect instead of raising TypeError.
            answer_text = "" if answer is None else str(answer)

            if q_type in file_types and "Unable to determine" in answer_text:
                print(f"{ok_mark} Correctly handled inaccessible file")
                correct += 1
            elif expected.lower() in answer_text.lower():
                print(f"{ok_mark} Correct answer")
                correct += 1
            else:
                print(f"{fail_mark} Incorrect answer")
        except Exception as e:
            print(f"{fail_mark} Error: {e}")

    total = len(test_cases)
    # Guard the division so an emptied case list reports 0% instead of raising.
    percentage = correct / total * 100 if total else 0.0
    separator = "=" * 80

    print(f"\n{separator}")
    print(f"Score: {correct}/{total} ({percentage:.0f}%)")
    print(separator)
|
|
|
|
|
# Run the question checks only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_specific_questions()