"""Main training loop for Teacher Agent system."""

import random
from typing import Dict, Tuple

import numpy as np

from mock_student import MockStudentAgent
from mock_task_generator import MockTaskGenerator
from teacher_agent import TeacherAgent, compute_reward


def train_teacher(
    num_iterations: int = 500, verbose: bool = True, seed: int = 42
) -> Tuple[Dict, TeacherAgent, MockStudentAgent]:
    """
    Train teacher agent with mock student.
    
    Args:
        num_iterations: Number of training iterations
        verbose: Whether to print progress
        seed: Random seed
    
    Returns:
        Tuple of (history dict, teacher agent, student agent)
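    
    Example (illustrative usage sketch, not from the original module):
        >>> hist, teacher, student = train_teacher(num_iterations=100, verbose=False)
        >>> len(hist['student_accuracies'])
        100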
    """
    # Initialize components
    generator = MockTaskGenerator(seed=seed)
    teacher = TeacherAgent(exploration_bonus=2.0, task_generator=generator)  # Pass generator for dynamic action space
    student = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.01, seed=seed)  # Reduced forgetting rate
    
    # Create evaluation set (held-out tasks for measuring student performance)
    eval_tasks = []
    for topic in generator.get_available_topics():
        for _ in range(3):  # 3 tasks per topic
            eval_tasks.append(generator.generate_task(topic, 'medium'))
    
    if verbose:
        print("=" * 70)
        print("TEACHER AGENT TRAINING")
        print("=" * 70)
        print(f"Iterations: {num_iterations}")
        print(f"Evaluation tasks: {len(eval_tasks)}")
        print(f"Action space: {teacher.num_actions} actions")
        print("=" * 70)
    
    # Track metrics
    history = {
        'iterations': [],
        'student_accuracies': [],
        'teacher_rewards': [],
        'actions': [],
        'topics': [],
        'difficulties': [],
        'is_reviews': []
    }
    
    for iteration in range(num_iterations):
        # 1. Get student state
        student_state = student.get_state()
        
        # 2. Teacher selects action
        action = teacher.select_action(student_state)
        
        # 3. Generate task
        # Reviews revisit the chosen topic at a fixed 'medium' difficulty;
        # new material uses the difficulty the teacher selected.
        if action.is_review:
            task = generator.generate_task(action.topic, 'medium')
        else:
            task = generator.generate_task(action.topic, action.difficulty)
        
        # 4. Evaluate student BEFORE learning
        accuracy_before = student.evaluate(eval_tasks)
        
        # 5. Student learns from the task (learn() also reports correctness, unused here)
        student.learn(task)
        
        # 6. Evaluate student AFTER learning
        accuracy_after = student.evaluate(eval_tasks)
        
        # 7. Compute reward for teacher
        reward = compute_reward(
            accuracy_before, 
            accuracy_after, 
            action.difficulty, 
            action.is_review
        )
        
        # 8. Update teacher's policy
        teacher.update(action, reward)
        
        # 9. Time passes (for forgetting)
        student.advance_time(1.0)
        
        # 10. Log metrics
        history['iterations'].append(iteration)
        history['student_accuracies'].append(accuracy_after)
        history['teacher_rewards'].append(reward)
        history['actions'].append(action)
        history['topics'].append(action.topic)
        history['difficulties'].append(action.difficulty)
        history['is_reviews'].append(action.is_review)
        
        # 11. Print progress
        if verbose and (iteration % 50 == 0 or iteration == num_iterations - 1):
            # Average reward over the most recent (up to 50) iterations;
            # the slice is never empty, since the reward is logged before this point.
            window = min(50, iteration + 1)
            avg_reward = np.mean(history['teacher_rewards'][-window:])
            
            print(f"Iteration {iteration:3d} | "
                  f"Student Acc: {accuracy_after:.3f} | "
                  f"Avg Reward: {avg_reward:.3f} | "
                  f"Action: {action.topic[:3]}-{action.difficulty[:2]}-{'R' if action.is_review else 'N'}")
    
    if verbose:
        print("=" * 70)
        print(f"Final accuracy: {history['student_accuracies'][-1]:.3f}")
        print(f"Average reward: {np.mean(history['teacher_rewards']):.3f}")
        print("=" * 70)
    
    return history, teacher, student


def train_baseline_random(num_iterations: int = 500, seed: int = 42) -> Dict:
    """Train with random teacher (baseline)."""
    rng = random.Random(seed)
    
    student = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.01, seed=seed)  # match train_teacher's student for a fair comparison
    generator = MockTaskGenerator(seed=seed)
    
    topics = generator.get_available_topics()
    difficulties = generator.get_available_difficulties()
    
    eval_tasks = [
        generator.generate_task(topic, 'medium')
        for topic in topics
        for _ in range(3)
    ]
    
    history = {
        'iterations': [],
        'student_accuracies': [],
        'teacher_rewards': [],
        'actions': [],
        'topics': [],
        'difficulties': [],
        'is_reviews': []
    }
    
    for iteration in range(num_iterations):
        # Random action: uniform over topics and difficulties, 30% chance of a review
        topic = rng.choice(topics)
        difficulty = rng.choice(difficulties)
        is_review = rng.random() < 0.3
        
        task = generator.generate_task(topic, 'medium' if is_review else difficulty)  # reviews use 'medium', as in train_teacher
        
        accuracy_before = student.evaluate(eval_tasks)
        student.learn(task)
        accuracy_after = student.evaluate(eval_tasks)
        
        reward = compute_reward(accuracy_before, accuracy_after, difficulty, is_review)
        
        student.advance_time(1.0)
        
        history['iterations'].append(iteration)
        history['student_accuracies'].append(accuracy_after)
        history['teacher_rewards'].append(reward)
        history['actions'].append((topic, difficulty, is_review))  # tuple stand-in; no Action object here
        history['topics'].append(topic)
        history['difficulties'].append(difficulty)
        history['is_reviews'].append(is_review)
    
    return history


def train_baseline_fixed(num_iterations: int = 500, seed: int = 42) -> Dict:
    """Train with fixed curriculum (easy→medium→hard, sequential topics)."""
    student = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.01, seed=seed)  # match train_teacher's student for a fair comparison
    generator = MockTaskGenerator(seed=seed)
    
    topics = generator.get_available_topics()
    difficulties = ['easy', 'medium', 'hard']
    
    eval_tasks = [
        generator.generate_task(topic, 'medium')
        for topic in topics
        for _ in range(3)
    ]
    
    history = {
        'iterations': [],
        'student_accuracies': [],
        'teacher_rewards': [],
        'actions': [],
        'topics': [],
        'difficulties': [],
        'is_reviews': []
    }
    
    # Fixed curriculum: for each topic, step through the difficulties in order.
    # max(1, ...) guards against a zero phase length (and a ZeroDivisionError
    # below) when num_iterations < len(topics) * len(difficulties).
    phase_length = max(1, num_iterations // (len(topics) * len(difficulties)))
    
    for iteration in range(num_iterations):
        # Map the iteration to a curriculum phase: difficulty advances first,
        # then the topic (both wrap around if iterations remain).
        phase = iteration // phase_length
        topic_idx = (phase // len(difficulties)) % len(topics)
        diff_idx = phase % len(difficulties)
        
        topic = topics[topic_idx]
        difficulty = difficulties[diff_idx]
        
        task = generator.generate_task(topic, difficulty)
        
        accuracy_before = student.evaluate(eval_tasks)
        student.learn(task)
        accuracy_after = student.evaluate(eval_tasks)
        
        reward = compute_reward(accuracy_before, accuracy_after, difficulty, False)
        
        student.advance_time(1.0)
        
        history['iterations'].append(iteration)
        history['student_accuracies'].append(accuracy_after)
        history['teacher_rewards'].append(reward)
        history['actions'].append((topic, difficulty, False))  # tuple stand-in; no Action object here
        history['topics'].append(topic)
        history['difficulties'].append(difficulty)
        history['is_reviews'].append(False)
    
    return history
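

# Illustrative helper (a sketch, not part of the original pipeline): one way
# to compare runs from train_teacher and the two baselines above. All three
# history dicts share the keys used here.
# Example: summarize_history(train_baseline_random(200), "random")
def summarize_history(history: Dict, label: str) -> None:
    """Print a one-line summary of a training run."""
    accs = history['student_accuracies']
    rewards = history['teacher_rewards']
    window = min(50, len(accs))
    print(f"{label:>8} | final acc: {accs[-1]:.3f} | "
          f"mean acc (last {window}): {np.mean(accs[-window:]):.3f} | "
          f"mean reward: {np.mean(rewards):.3f}")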


if __name__ == "__main__":
    # Train teacher agent
    print("\n" + "=" * 70)
    print("TRAINING TEACHER AGENT")
    print("=" * 70)
    history, teacher, student = train_teacher(num_iterations=500, verbose=True)
    
    # Print statistics
    stats = teacher.get_statistics()
    print(f"\nTeacher Statistics:")
    print(f"  Total actions tried: {stats['total_pulls']}")
    print(f"  Unique actions: {np.sum(stats['action_counts'] > 0)}/{stats['total_pulls']}")