File size: 4,518 Bytes
a52f96d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
Diagnose why accuracy drops at the end of training.

Issues to investigate:
1. Evaluation task generation (are the generated tasks consistent across runs?)
2. Forgetting over time
3. Evaluation timing (before/after learning, before/after time advance)
"""

import random

import numpy as np

from mock_student import MockStudentAgent
from mock_task_generator import MockTaskGenerator

def _build_eval_tasks(generator, topics, per_topic=3):
    """Generate ``per_topic`` 'medium' tasks for every topic, in topic order."""
    return [
        generator.generate_task(topic, 'medium')
        for topic in topics
        for _ in range(per_topic)
    ]


def _pick_training_choice(topics, iteration):
    """Return the ``(topic, difficulty)`` pair used at a training iteration.

    The choice is drawn from a fresh ``random.Random(42 + iteration)``, so it
    depends only on ``iteration`` and ``topics``. Every experiment that calls
    this helper therefore follows the same training schedule.
    """
    rng = random.Random(42 + iteration)
    topic = rng.choice(topics)
    difficulty = rng.choice(['easy', 'medium', 'hard'])
    return topic, difficulty


def diagnose_evaluation():
    """Print a six-step diagnostic of why accuracy drops late in training.

    Steps, all reported via ``print``:
      1. Build a fixed evaluation set (3 'medium' tasks per topic).
      2. Compare one task from the already-used generator with one from a
         fresh generator created with the same seed.
      3. Train for 500 iterations, evaluating on the fixed set every 50.
      4. Repeat the training with a fresh student/generator, regenerating the
         evaluation set at every evaluation point.
      5. Train intensively, then compare accuracy before and after advancing
         the student's time by 100 units.
      6. Tabulate an exponential retention curve for several elapsed times.

    Returns:
        None. All results are written to stdout.
    """
    print("=" * 70)
    print("DIAGNOSING ACCURACY DROP")
    print("=" * 70)

    generator = MockTaskGenerator(seed=42)
    student = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)

    topics = generator.get_available_topics()

    # Create FIXED eval set
    fixed_eval_tasks = _build_eval_tasks(generator, topics)

    print(f"\n1. Fixed eval set created: {len(fixed_eval_tasks)} tasks")

    # Check if regenerating tasks gives same tasks
    # NOTE(review): task1 comes from a generator that has already produced the
    # fixed eval set, while task2 comes from a fresh one. A "DIFFERENT" result
    # may therefore reflect generator state rather than the seed - confirm
    # against MockTaskGenerator.
    print("\n2. Checking task consistency...")
    task1 = generator.generate_task('history', 'medium')
    fresh_generator = MockTaskGenerator(seed=42)
    task2 = fresh_generator.generate_task('history', 'medium')
    print(f"   Same seed, same topic: {'SAME' if task1.question == task2.question else 'DIFFERENT'}")

    # Simulate training and track accuracy
    print("\n3. Simulating training with FIXED eval set...")
    accuracies = []
    time_points = []

    for iteration in range(500):
        # Random learning
        topic, difficulty = _pick_training_choice(topics, iteration)

        task = generator.generate_task(topic, difficulty)
        student.learn(task)
        student.advance_time(1.0)

        # Evaluate on FIXED set
        if iteration % 50 == 0:
            acc = student.evaluate(fixed_eval_tasks)
            accuracies.append(acc)
            time_points.append(student.current_time)
            print(f"   Iteration {iteration:3d}, Time: {student.current_time:5.1f}, Acc: {acc:.3f}")

    print(f"\n   Accuracy trend: {accuracies[0]:.3f} → {accuracies[-1]:.3f}")

    # Now check what happens with REGENERATED eval tasks
    print("\n4. Simulating with REGENERATED eval tasks each time...")
    student2 = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    regen_generator = MockTaskGenerator(seed=42)
    accuracies2 = []

    for iteration in range(500):
        # Use the same per-iteration schedule as experiment 3, so the two
        # experiments pick the same topic and difficulty at every step.
        topic, difficulty = _pick_training_choice(topics, iteration)

        task = regen_generator.generate_task(topic, difficulty)
        student2.learn(task)
        student2.advance_time(1.0)

        if iteration % 50 == 0:
            # Regenerate eval tasks
            new_eval_tasks = _build_eval_tasks(regen_generator, topics)
            acc = student2.evaluate(new_eval_tasks)
            accuracies2.append(acc)

    print(f"\n   Accuracy trend: {accuracies2[0]:.3f} → {accuracies2[-1]:.3f}")

    # Check forgetting effect
    print("\n5. Checking forgetting effect...")
    student3 = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    generator3 = MockTaskGenerator(seed=42)

    # Train intensively (no advance_time calls between learning steps)
    for _ in range(100):
        for topic in topics:
            task = generator3.generate_task(topic, 'easy')
            student3.learn(task)

    # Evaluate immediately
    eval_tasks = _build_eval_tasks(generator3, topics)
    acc_before = student3.evaluate(eval_tasks)

    # Advance time significantly, then re-evaluate on the same tasks
    student3.advance_time(100.0)
    acc_after = student3.evaluate(eval_tasks)

    print(f"   After intensive training: {acc_before:.3f}")
    print(f"   After 100 time units pass: {acc_after:.3f}")
    print(f"   Forgetting: {acc_before - acc_after:.3f}")

    # Check retention formula
    # NOTE(review): this formula presumably mirrors the one used inside
    # MockStudentAgent (exponential decay, 0.25 accuracy floor) - verify there.
    print("\n6. Retention calculation at different time points:")
    base_skill = 1.0  # Perfect skill
    forgetting_rate = 0.05

    for elapsed in [0, 50, 100, 200, 500]:
        retention = np.exp(-forgetting_rate * elapsed)
        effective_skill = base_skill * retention
        accuracy = 0.25 + 0.75 * effective_skill
        print(f"   Time={elapsed:3d}: retention={retention:.3f}, accuracy={accuracy:.3f}")

# Run the full diagnostic only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    diagnose_evaluation()