# Scraped from a Hugging Face Space file view (Space status: paused).
# File size: 4,518 bytes; commit a52f96d. Original file had 129 lines.
"""
Diagnose why accuracy drops at the end of training.
Issues to investigate:
1. Evaluation task generation (are they consistent?)
2. Forgetting over time
3. Evaluation timing (before/after learning, before/after time advance)
"""
import numpy as np
from mock_student import MockStudentAgent
from mock_task_generator import MockTaskGenerator
def _build_eval_set(generator, topics, tasks_per_topic=3, difficulty='medium'):
    """Generate an evaluation set with ``tasks_per_topic`` tasks per topic.

    Args:
        generator: Object exposing ``generate_task(topic, difficulty)``.
        topics: Iterable of topic identifiers accepted by ``generator``.
        tasks_per_topic: How many tasks to draw for each topic.
        difficulty: Difficulty label forwarded to ``generate_task``.

    Returns:
        A list of tasks, grouped by topic in the order ``topics`` yields them.
    """
    return [
        generator.generate_task(topic, difficulty)
        for topic in topics
        for _ in range(tasks_per_topic)
    ]


def _train_one_step(student, generator, topics, iteration):
    """Teach ``student`` one randomly chosen task, then advance time by 1.0.

    The topic and difficulty are drawn from an RNG seeded with
    ``42 + iteration``, so any two simulations that call this helper with the
    same ``iteration`` follow the same topic/difficulty schedule.
    """
    import random  # function-scope: the module does not import it at top level

    rng = random.Random(42 + iteration)
    topic = rng.choice(topics)
    difficulty = rng.choice(['easy', 'medium', 'hard'])
    student.learn(generator.generate_task(topic, difficulty))
    student.advance_time(1.0)


def diagnose_evaluation():
    """Print a six-step diagnosis of the late-training accuracy drop.

    Steps, in the order they are printed:
      1. Build a fixed evaluation set (3 'medium' tasks per topic).
      2. Check that two generators created with the same seed produce the
         same first task.
      3. Train for 500 iterations, evaluating on the fixed set every 50.
      4. Repeat the training with a fresh student/generator, but regenerate
         the evaluation set at every checkpoint.
      5. Train intensively, then compare accuracy before and after advancing
         the student's clock by 100 time units.
      6. Tabulate ``exp(-forgetting_rate * t)`` retention at several times.

    Takes no arguments and returns ``None``; all results go to stdout.
    """
    banner = "=" * 70
    print(banner)
    print("DIAGNOSING ACCURACY DROP")
    print(banner)

    generator = MockTaskGenerator(seed=42)
    student = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    topics = generator.get_available_topics()

    # --- 1. Fixed evaluation set, generated once and reused below ---------
    fixed_eval_tasks = _build_eval_set(generator, topics)
    print(f"\n1. Fixed eval set created: {len(fixed_eval_tasks)} tasks")

    # --- 2. Same seed => same task? ----------------------------------------
    # Both generators are fresh so each one is producing its *first* task.
    # Comparing against `generator` (which has already produced the fixed
    # eval set) would not isolate the effect of the seed.
    # NOTE(review): assumes 'history' is one of the available topics.
    print("\n2. Checking task consistency...")
    task1 = MockTaskGenerator(seed=42).generate_task('history', 'medium')
    task2 = MockTaskGenerator(seed=42).generate_task('history', 'medium')
    verdict = 'SAME' if task1.question == task2.question else 'DIFFERENT'
    print(f" Same seed, same topic: {verdict}")

    # --- 3. Training evaluated against the FIXED set -----------------------
    print("\n3. Simulating training with FIXED eval set...")
    accuracies = []
    for iteration in range(500):
        _train_one_step(student, generator, topics, iteration)
        # Checkpoints fall on iterations 0, 50, ..., 450, so the last
        # recorded accuracy is taken 50 steps before training ends.
        if iteration % 50 == 0:
            acc = student.evaluate(fixed_eval_tasks)
            accuracies.append(acc)
            print(f" Iteration {iteration:3d}, Time: {student.current_time:5.1f}, Acc: {acc:.3f}")
    print(f"\n Accuracy trend: {accuracies[0]:.3f} → {accuracies[-1]:.3f}")

    # --- 4. Training evaluated against a REGENERATED set -------------------
    print("\n4. Simulating with REGENERATED eval tasks each time...")
    student2 = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    generator2 = MockTaskGenerator(seed=42)
    accuracies2 = []
    for iteration in range(500):
        # Same per-iteration seed as step 3, so both simulations draw the
        # same topic/difficulty schedule and differ only in the eval set.
        _train_one_step(student2, generator2, topics, iteration)
        if iteration % 50 == 0:
            new_eval_tasks = _build_eval_set(generator2, topics)
            accuracies2.append(student2.evaluate(new_eval_tasks))
    print(f"\n Accuracy trend: {accuracies2[0]:.3f} → {accuracies2[-1]:.3f}")

    # --- 5. Forgetting after a long idle period ----------------------------
    print("\n5. Checking forgetting effect...")
    student3 = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    generator3 = MockTaskGenerator(seed=42)
    # Intensive training: 100 passes over every topic with no time advance.
    for _ in range(100):
        for topic in topics:
            student3.learn(generator3.generate_task(topic, 'easy'))
    eval_tasks = _build_eval_set(generator3, topics)
    acc_before = student3.evaluate(eval_tasks)
    student3.advance_time(100.0)
    acc_after = student3.evaluate(eval_tasks)
    print(f" After intensive training: {acc_before:.3f}")
    print(f" After 100 time units pass: {acc_after:.3f}")
    print(f" Forgetting: {acc_before - acc_after:.3f}")

    # --- 6. Closed-form retention curve ------------------------------------
    print("\n6. Retention calculation at different time points:")
    base_skill = 1.0  # Perfect skill
    forgetting_rate = 0.05
    for elapsed in [0, 50, 100, 200, 500]:
        retention = np.exp(-forgetting_rate * elapsed)
        effective_skill = base_skill * retention
        # NOTE(review): 0.25 looks like a 4-way multiple-choice chance
        # floor; confirm against MockStudentAgent's accuracy model.
        accuracy = 0.25 + 0.75 * effective_skill
        print(f" Time={elapsed:3d}: retention={retention:.3f}, accuracy={accuracy:.3f}")
if __name__ == "__main__":
    # Run the diagnosis only when executed as a script, not when imported.
    diagnose_evaluation()
|