Spaces:
Paused
Paused
"""
Diagnose why accuracy drops at the end of training.

Issues to investigate:
1. Evaluation task generation (are the generated tasks consistent?)
2. Forgetting over time
3. Evaluation timing (before/after learning, before/after time advance)
"""
| import numpy as np | |
| from mock_student import MockStudentAgent | |
| from mock_task_generator import MockTaskGenerator | |
def _build_eval_tasks(generator, topics, per_topic=3, difficulty='medium'):
    """Return ``per_topic`` tasks of ``difficulty`` for every topic, in topic order.

    Makes ``len(topics) * per_topic`` calls to ``generator.generate_task``.
    """
    return [
        generator.generate_task(topic, difficulty)
        for topic in topics
        for _ in range(per_topic)
    ]


def _train_step(student, generator, topics, iteration, base_seed=42):
    """Teach ``student`` one randomly chosen task, then advance its clock by 1.0.

    The topic and difficulty are drawn from an RNG seeded with
    ``base_seed + iteration``, so two runs that pass the same iteration
    number always pick the same (topic, difficulty) pair.
    """
    # Local import (as in the original code) so this block does not depend on
    # a top-level ``import random`` being present in the file.
    import random

    rng = random.Random(base_seed + iteration)
    topic = rng.choice(topics)
    difficulty = rng.choice(['easy', 'medium', 'hard'])
    student.learn(generator.generate_task(topic, difficulty))
    student.advance_time(1.0)


def diagnose_evaluation():
    """Print a series of diagnostics probing why accuracy drops late in training.

    Steps, each reported on stdout:
      1. Build a fixed evaluation set (3 medium tasks per topic).
      2. Check that two freshly seeded generators produce the same first task.
      3. Train for 500 steps, evaluating on the fixed set every 50 steps.
      4. Repeat the same training schedule with a new student, but regenerate
         the evaluation set at every evaluation point.
      5. Train intensively, then measure accuracy before and after 100 time
         units pass with no learning.
      6. Tabulate an exponential retention curve at several time points.

    Takes no arguments and returns None.
    """
    print("=" * 70)
    print("DIAGNOSING ACCURACY DROP")
    print("=" * 70)

    generator = MockTaskGenerator(seed=42)
    student = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    topics = generator.get_available_topics()

    # 1. Fixed eval set, created once and reused for every evaluation in step 3.
    fixed_eval_tasks = _build_eval_tasks(generator, topics)
    print(f"\n1. Fixed eval set created: {len(fixed_eval_tasks)} tasks")

    # 2. Compare two *fresh* generators with the same seed. Drawing the first
    # task from ``generator`` would be unfair: it has already produced the
    # whole eval set above, so a stateful generator would report DIFFERENT
    # even when seeding is perfectly deterministic.
    print("\n2. Checking task consistency...")
    task1 = MockTaskGenerator(seed=42).generate_task('history', 'medium')
    task2 = MockTaskGenerator(seed=42).generate_task('history', 'medium')
    print(f" Same seed, same topic: {'SAME' if task1.question == task2.question else 'DIFFERENT'}")

    # 3. Train and periodically evaluate on the fixed set.
    print("\n3. Simulating training with FIXED eval set...")
    accuracies = []
    for iteration in range(500):
        _train_step(student, generator, topics, iteration)
        if iteration % 50 == 0:
            acc = student.evaluate(fixed_eval_tasks)
            accuracies.append(acc)
            print(f" Iteration {iteration:3d}, Time: {student.current_time:5.1f}, Acc: {acc:.3f}")
    print(f"\n Accuracy trend: {accuracies[0]:.3f} → {accuracies[-1]:.3f}")

    # 4. Same schedule, but with a regenerated eval set at each checkpoint.
    # The per-iteration seeding inside _train_step gives this run the same
    # (topic, difficulty) sequence as step 3; previously it kept drawing from
    # the RNG left over from step 3's final iteration, so the two runs were
    # not comparable.
    print("\n4. Simulating with REGENERATED eval tasks each time...")
    student2 = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    generator2 = MockTaskGenerator(seed=42)
    accuracies2 = []
    for iteration in range(500):
        _train_step(student2, generator2, topics, iteration)
        if iteration % 50 == 0:
            # Eval tasks come from the same generator that feeds training.
            new_eval_tasks = _build_eval_tasks(generator2, topics)
            accuracies2.append(student2.evaluate(new_eval_tasks))
    print(f"\n Accuracy trend: {accuracies2[0]:.3f} → {accuracies2[-1]:.3f}")

    # 5. Isolate forgetting: train hard, then let time pass without learning.
    print("\n5. Checking forgetting effect...")
    student3 = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05, seed=42)
    generator3 = MockTaskGenerator(seed=42)
    for _ in range(100):
        for topic in topics:
            student3.learn(generator3.generate_task(topic, 'easy'))

    # The same eval tasks are used before and after the time jump, so any
    # difference between the two accuracies is not caused by task changes.
    eval_tasks = _build_eval_tasks(generator3, topics)
    acc_before = student3.evaluate(eval_tasks)
    student3.advance_time(100.0)
    acc_after = student3.evaluate(eval_tasks)
    print(f" After intensive training: {acc_before:.3f}")
    print(f" After 100 time units pass: {acc_after:.3f}")
    print(f" Forgetting: {acc_before - acc_after:.3f}")

    # 6. Closed-form retention table: exponential decay of a perfect skill,
    # mapped onto accuracy as 0.25 + 0.75 * skill.
    # NOTE(review): presumably this mirrors MockStudentAgent's internal model
    # (0.25 looks like a 4-choice guessing floor) - confirm against that class.
    print("\n6. Retention calculation at different time points:")
    base_skill = 1.0  # Perfect skill
    forgetting_rate = 0.05
    for elapsed in [0, 50, 100, 200, 500]:
        retention = np.exp(-forgetting_rate * elapsed)
        effective_skill = base_skill * retention
        accuracy = 0.25 + 0.75 * effective_skill
        print(f" Time={elapsed:3d}: retention={retention:.3f}, accuracy={accuracy:.3f}")
# Script entry point: run the diagnostics only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    diagnose_evaluation()