|
|
|
|
|
""" |
|
|
Run GAIA evaluation test |
|
|
""" |
|
|
|
|
|
import os |
|
|
from dotenv import load_dotenv |
|
|
from app import BasicAgent |
|
|
|
|
|
|
|
|
# Run python-dotenv's loader at import time, before test_gaia_questions()
# looks up ANTHROPIC_API_KEY via os.getenv.
# NOTE(review): presumably the key lives in a local .env file - confirm.
load_dotenv()
|
|
|
|
|
# Sample questions sent to the agent, in the order they are asked.
_SAMPLE_QUESTIONS = (
    "How many lightning strikes occur on Earth each second? Round your answer to the nearest integer.",
    "In Audre Lorde's poem 'Diaspora', she repeats, \"home is\" three times. The last line ends \"and I am...\" what?",
    "On April 1, 2024, the French National Railway Company (SNCF) published an April Fool's joke on X (formerly Twitter) about a new model of train. What is the name of this model?",
    "In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of \"a \"moat\" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline.\" Verma's \"moat\" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?",
    "Whose X account (formerly Twitter) is this: @lbcmjc?",
    "What is the current population of Gabon?",
    "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?",
    "In a park, there are three gardens: one with 5 tulips and 3 daisies, one with 6 marigolds and 4 petunias, and one with 8 hydrangeas, 2 jasmines, and twice as many roses as the first two gardens combined. How many flowers are there in total?",
    "What is the name of the only Israeli pitcher to ever play in the major leagues?",
    "When would a purple lightsaber be needed for the August 16, 2024, Lego Star Wars release?",
    "What is the sum of the first 20 terms of the arithmetic sequence where the first term is 5 and the common difference is 3?",
    "What percentage of Gabon is covered by forests?",
    "When did the Khorezm People's Soviet Republic cease to exist?",
    "As of January 2024, what is the latest OS update for iPad mini (5th generation)?",
    "Tell me the amount of sales in the sales sheet for the attached excel file.",
    "How many times is the word \"therefore\" used in the attached PDF?",
    "What item came in first on the Official Monster Raving Loony Party's 2019 manifesto?",
    "What is the hexadecimal value of the unicode character for 'Brain' emoji?",
    "What was the score of the Women's Handball World Championship match between Argentina and Austria on 4 December 2023?",
    "Which record producer is quoted in the Wikipedia article on James Blake's album \"Friends That Break Your Heart\"?",
)

# Answers at or above this many characters are not counted towards the score.
_MAX_ANSWER_CHARS = 100


def _is_plausible_answer(answer):
    """Return True if *answer* passes the scoring heuristic.

    An answer counts when it is truthy, does not contain the word "error"
    (case-insensitive) and is shorter than ``_MAX_ANSWER_CHARS`` characters.
    Non-string answers are converted with ``str()`` first so that the check
    itself cannot raise.
    """
    if not answer:
        return False
    text = answer if isinstance(answer, str) else str(answer)
    return "error" not in text.lower() and len(text) < _MAX_ANSWER_CHARS


def test_gaia_questions():
    """Test with GAIA questions.

    Reads ``ANTHROPIC_API_KEY`` from the environment, hands it to a
    ``BasicAgent`` and asks the agent every sample question, printing each
    question and answer followed by a final score line.

    The score is a heuristic, not an accuracy figure: no reference answers
    are compared; an answer is counted when it is non-empty, short and does
    not mention an error (see ``_is_plausible_answer``).

    Returns:
        None. If the API key is missing, an error is printed and the
        function returns before the agent is created.
    """
    # Check the key first so that no agent is built for a run that cannot
    # proceed.
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found in environment variables")
        return

    agent = BasicAgent()
    agent.set_api_key(api_key)

    questions = _SAMPLE_QUESTIONS
    plausible = 0

    for i, question in enumerate(questions, 1):
        print(f"\nQuestion {i}: {question}")
        try:
            answer = agent(question)
        except Exception as e:
            # Best-effort run: one failing question must not abort the rest.
            print(f"Error: {e}")
        else:
            print(f"Answer: {answer}")
            if _is_plausible_answer(answer):
                plausible += 1

    total = len(questions)
    # Guard the division so an empty question list reports 0.0%.
    percentage = plausible / total * 100 if total else 0.0

    print(f"\n{'='*80}")
    print(f"Final Score: {plausible}/{total} ({percentage:.1f}%)")
    print(f"{'='*80}")
|
|
|
|
|
# Run the evaluation only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    test_gaia_questions()