"""
Diagnose why accuracy drops at the end of training.
Issues to investigate:
1. Evaluation task generation (are the eval tasks consistent across checkpoints?)
2. Forgetting over time
3. Evaluation timing (before/after learning, before/after time advance)
"""
import random

import numpy as np

from mock_student import MockStudentAgent
from mock_task_generator import MockTaskGenerator

def diagnose_evaluation():
"""Check if evaluation tasks are consistent."""
print("=" * 70)
print("DIAGNOSING ACCURACY DROP")
print("=" * 70)
generator = MockTaskGenerator(seed=42)
student = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
topics = generator.get_available_topics()
# Create FIXED eval set
fixed_eval_tasks = [
generator.generate_task(topic, 'medium')
for topic in topics
for _ in range(3)
]
print(f"\n1. Fixed eval set created: {len(fixed_eval_tasks)} tasks")
    # Check whether a fresh generator with the same seed reproduces a task.
    # Note: `generator` has already emitted the eval set above, so its RNG
    # state differs from a brand-new generator's.
    print("\n2. Checking task consistency...")
    task1 = generator.generate_task('history', 'medium')
    fresh_generator = MockTaskGenerator(seed=42)
    task2 = fresh_generator.generate_task('history', 'medium')
    print(f"   Same seed, same topic: {'SAME' if task1.question == task2.question else 'DIFFERENT'}")
    # Simulate training and track accuracy on the FIXED eval set.
    print("\n3. Simulating training with FIXED eval set...")
    accuracies = []
    time_points = []
    for iteration in range(500):
        # Deterministic per-iteration choice of topic and difficulty.
        rng = random.Random(42 + iteration)
        topic = rng.choice(topics)
        difficulty = rng.choice(['easy', 'medium', 'hard'])
        task = generator.generate_task(topic, difficulty)
        student.learn(task)
        student.advance_time(1.0)

        # Evaluate on the FIXED set every 50 iterations.
        if iteration % 50 == 0:
            acc = student.evaluate(fixed_eval_tasks)
            accuracies.append(acc)
            time_points.append(student.current_time)
            print(f"   Iteration {iteration:3d}, Time: {student.current_time:5.1f}, Acc: {acc:.3f}")

    print(f"\n   Accuracy trend: {accuracies[0]:.3f} → {accuracies[-1]:.3f}")
    # Now check what happens when eval tasks are REGENERATED at each checkpoint.
    print("\n4. Simulating with REGENERATED eval tasks each time...")
    student2 = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    generator2 = MockTaskGenerator(seed=42)
    accuracies2 = []
    for iteration in range(500):
        # Mirror the training schedule from step 3; without reseeding here,
        # the two runs would train on different task sequences.
        rng = random.Random(42 + iteration)
        topic = rng.choice(topics)
        difficulty = rng.choice(['easy', 'medium', 'hard'])
        task = generator2.generate_task(topic, difficulty)
        student2.learn(task)
        student2.advance_time(1.0)

        if iteration % 50 == 0:
            # Regenerate the eval tasks at every checkpoint.
            new_eval_tasks = [
                generator2.generate_task(t, 'medium')
                for t in topics
                for _ in range(3)
            ]
            acc = student2.evaluate(new_eval_tasks)
            accuracies2.append(acc)

    print(f"\n   Accuracy trend: {accuracies2[0]:.3f} → {accuracies2[-1]:.3f}")
    # Check the forgetting effect in isolation.
    print("\n5. Checking forgetting effect...")
    student3 = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    generator3 = MockTaskGenerator(seed=42)

    # Train intensively: 100 passes over every topic.
    for _ in range(100):
        for topic in topics:
            task = generator3.generate_task(topic, 'easy')
            student3.learn(task)

    # Evaluate immediately, then again after a long idle period.
    eval_tasks = [generator3.generate_task(t, 'medium') for t in topics for _ in range(3)]
    acc_before = student3.evaluate(eval_tasks)
    student3.advance_time(100.0)
    acc_after = student3.evaluate(eval_tasks)
    print(f"   After intensive training: {acc_before:.3f}")
    print(f"   After 100 time units pass: {acc_after:.3f}")
    print(f"   Forgetting: {acc_before - acc_after:.3f}")
    # Check the retention formula directly.
    print("\n6. Retention calculation at different time points:")
    base_skill = 1.0  # perfect skill
    forgetting_rate = 0.05
    for t in [0, 50, 100, 200, 500]:
        retention = np.exp(-forgetting_rate * t)
        effective_skill = base_skill * retention
        accuracy = 0.25 + 0.75 * effective_skill  # 0.25 floor is presumably chance on 4-option tasks
        print(f"   Time={t:3d}: retention={retention:.3f}, accuracy={accuracy:.3f}")

if __name__ == "__main__":
    diagnose_evaluation()