CorneliusWang committed (verified) · Commit d06d2e6 · 1 Parent(s): eb20fef

Update teacher_agent_dev/compare_strategies.py

Files changed (1):
  1. teacher_agent_dev/compare_strategies.py +131 -425

teacher_agent_dev/compare_strategies.py CHANGED
@@ -9,6 +9,8 @@ Uses LM Student (DistilBERT) instead of MockStudentAgent.
 
 import sys
 import os
+import random  # Added for global seeding
+import numpy as np  # Added for global seeding
 from pathlib import Path
 
 # Add student_agent_dev to path for LM student import
@@ -46,9 +48,6 @@ from train_teacher import train_teacher
 def evaluate_difficult_questions(student, generator: MockTaskGenerator, num_questions: int = 20) -> float:
     """
     Evaluate student on difficult questions from all topics.
-
-    Returns:
-        Accuracy on difficult questions (0.0 to 1.0)
     """
     topics = generator.get_available_topics()
     eval_tasks = []
@@ -65,82 +64,58 @@ def evaluate_difficult_questions(student, generator: MockTaskGenerator, num_ques
 def train_strategy_random(num_iterations: int = 500, seed: int = 42, target_accuracy: float = 0.75) -> Dict:
     """
     Strategy 1: Random questions until student can confidently pass difficult questions.
-
-    Selection strategy:
-    - Randomly chooses a topic (uniform across all topics)
-    - Randomly chooses a difficulty (uniform across all difficulties)
-    - No curriculum structure - completely random
-
-    Args:
-        num_iterations: Maximum iterations to train
-        seed: Random seed
-        target_accuracy: Target accuracy on difficult questions to consider "passing"
-
-    Returns:
-        Training history dictionary
     """
-    import random
+    # Set global seeds to ensure MockTaskGenerator behaves deterministically
+    random.seed(seed)
+    np.random.seed(seed)
+
     rng = random.Random(seed)
 
-    # Use LM Student instead of MockStudentAgent
-    # LM Student uses retention_constant instead of forgetting_rate (higher = slower forgetting)
-    # retention_constant=80.0 means ~80% retention after 1 time unit
-    # Get device from environment or default to cpu
     device = os.environ.get("CUDA_DEVICE", "cpu")
     if device == "cuda":
         try:
             import torch
             if torch.cuda.is_available():
-                try:
-                    # Verify GPU actually works
-                    gpu_name = torch.cuda.get_device_name(0)
-                    print(f"✅ Using GPU: {gpu_name}")
-                except Exception as e:
-                    print(f"⚠️ GPU access failed: {e}, using CPU")
-                    device = "cpu"
+                print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
             else:
                 device = "cpu"
-                print("⚠️ CUDA not available, using CPU")
-        except ImportError:
-            device = "cpu"
-            print("⚠️ PyTorch not available, using CPU")
-        except Exception as e:
+        except:
             device = "cpu"
-            print(f"⚠️ GPU check error: {e}, using CPU")
 
     print(f"🔧 LM Student device: {device}")
 
     student = LMStudentAgent(
-        learning_rate=5e-5,       # LM fine-tuning learning rate
-        retention_constant=80.0,  # Slower forgetting than mock student
-        device=device,            # Use GPU if available
+        learning_rate=5e-5,
+        retention_constant=80.0,
+        device=device,
         max_length=256,
        gradient_accumulation_steps=4
     ) if USE_LM_STUDENT else MockStudentAgent(learning_rate=0.15, forgetting_rate=0.01, seed=seed)
-    generator = MockTaskGenerator(seed=seed)
+
+    # --- FIX 1: REMOVED seed=seed ---
+    generator = MockTaskGenerator()
 
     topics = generator.get_available_topics()
     difficulties = generator.get_available_difficulties()
 
     # Evaluation on difficult questions - CREATE FIXED SET ONCE
-    # Use 'expert' or 'master' for truly difficult questions (with expanded difficulty levels)
     hard_eval_tasks = []
-    eval_difficulty = 'expert' if 'expert' in difficulties else 'hard'  # Use expert level for challenging eval
+    eval_difficulty = 'expert' if 'expert' in difficulties else 'hard'
     for topic in topics:
-        for _ in range(5):  # 5 difficult questions per topic
+        for _ in range(5):
             hard_eval_tasks.append(generator.generate_task(topic, eval_difficulty))
 
-    # Create FIXED general eval set (medium difficulty, all topics)
+    # Create FIXED general eval set
     general_eval_tasks = [
         generator.generate_task(topic, 'medium')
         for topic in topics
-        for _ in range(3)  # 3 tasks per topic
+        for _ in range(3)
     ]
 
     history = {
         'iterations': [],
         'student_accuracies': [],
-        'difficult_accuracies': [],  # Accuracy on hard questions
+        'difficult_accuracies': [],
         'teacher_rewards': [],
         'topics': [],
         'difficulties': [],
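The seeding change above is the heart of this commit: rather than passing `seed=seed` into `MockTaskGenerator`, each strategy now seeds the global `random` and NumPy generators once up front. A minimal sketch of the mechanism, assuming (as the added comment indicates) that `MockTaskGenerator` draws from the module-level generators rather than owning a private `random.Random`:

```python
import random
import numpy as np

def seeded_draws(seed: int, n: int = 3):
    """Seed both global generators, then sample; a fixed seed replays the sequence."""
    random.seed(seed)     # shared generator behind random.random(), random.choice(), ...
    np.random.seed(seed)  # NumPy's legacy global RandomState
    return [(random.random(), float(np.random.rand())) for _ in range(n)]

# Any library code that draws from the global generators is now reproducible:
assert seeded_draws(42) == seeded_draws(42)
```

The trade-off is process-wide: every caller of the global generators is affected. The local `rng = random.Random(seed)` kept for topic and difficulty choice remains isolated from it.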
@@ -152,25 +127,19 @@ def train_strategy_random(num_iterations: int = 500, seed: int = 42, target_accu
         iterator = tqdm(iterator, desc="Random Strategy", unit="iter")
 
     for iteration in iterator:
-        # Random strategy: choose random topic AND random difficulty independently
-        topic = rng.choice(topics)  # Random topic
-        difficulty = rng.choice(difficulties)  # Random difficulty
+        topic = rng.choice(topics)
+        difficulty = rng.choice(difficulties)
 
         task = generator.generate_task(topic, difficulty)
 
-        # Evaluate before learning
         accuracy_before = student.evaluate(hard_eval_tasks)
-
-        # Student learns
         student.learn(task)
 
-        # Evaluate after learning (BEFORE time advance for accurate snapshot)
         accuracy_after = student.evaluate(hard_eval_tasks)
-        general_accuracy = student.evaluate(general_eval_tasks)  # Use FIXED eval set
+        general_accuracy = student.evaluate(general_eval_tasks)
 
         student.advance_time(1.0)
 
-        # Track metrics
         history['iterations'].append(iteration)
         history['student_accuracies'].append(general_accuracy)
         history['difficult_accuracies'].append(accuracy_after)
@@ -178,8 +147,7 @@ def train_strategy_random(num_iterations: int = 500, seed: int = 42, target_accu
         history['topics'].append(topic)
         history['difficulties'].append(difficulty)
 
-        # Check if we've reached target (optional early stopping)
-        if accuracy_after >= target_accuracy and iteration > 50:  # Give at least 50 iterations
+        if accuracy_after >= target_accuracy and iteration > 50:
             if 'reached_target' not in locals():
                 print(f" Random strategy reached target accuracy {target_accuracy:.2f} at iteration {iteration}")
                 reached_target = True
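The surviving early-stop check prints its message only once by probing `'reached_target' not in locals()`. That works, but an explicit flag is easier to read; a hypothetical equivalent (not part of the commit):

```python
# Sketch: same print-once behavior with a plain boolean flag.
target_accuracy = 0.75
reached_target = False  # initialized before the loop instead of probed via locals()

for iteration, accuracy_after in enumerate([0.50, 0.70, 0.80, 0.90]):
    if accuracy_after >= target_accuracy and not reached_target:
        print(f"reached target {target_accuracy:.2f} at iteration {iteration}")
        reached_target = True  # later crossings stay silent
```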
@@ -190,20 +158,10 @@ def train_strategy_random(num_iterations: int = 500, seed: int = 42, target_accu
 def train_strategy_progressive(num_iterations: int = 500, seed: int = 42) -> Dict:
     """
     Strategy 2: Progressive difficulty within each family.
-    Easy → Medium → Hard for each topic, then move to next topic.
-
-    Args:
-        num_iterations: Number of iterations
-        seed: Random seed
-
-    Returns:
-        Training history dictionary
     """
-    # Reduce forgetting rate OR use periodic time reset for long training
-    # Option 1: Lower forgetting rate (better for long training)
-    # Option 2: Reset time periodically (keeps forgetting realistic but prevents complete loss)
-    # Using Option 1: lower forgetting rate
-    # Use LM Student instead of MockStudentAgent
+    random.seed(seed)
+    np.random.seed(seed)
+
     student = LMStudentAgent(
         learning_rate=5e-5,
         retention_constant=80.0,
@@ -211,26 +169,24 @@ def train_strategy_progressive(num_iterations: int = 500, seed: int = 42) -> Dic
         max_length=256,
         gradient_accumulation_steps=4
     ) if USE_LM_STUDENT else MockStudentAgent(learning_rate=0.15, forgetting_rate=0.01, seed=seed)
-    generator = MockTaskGenerator(seed=seed)
+
+    # --- FIX 2: REMOVED seed=seed ---
+    generator = MockTaskGenerator()
 
     topics = generator.get_available_topics()
     all_difficulties = generator.get_available_difficulties()
-    # Progressive: use all difficulties in order
-    difficulties = all_difficulties  # Use all 7 difficulty levels
+    difficulties = all_difficulties
 
-    # Evaluation on difficult questions - CREATE FIXED SET ONCE
-    # Use 'expert' or 'master' for truly difficult questions
     hard_eval_tasks = []
     eval_difficulty = 'expert' if 'expert' in all_difficulties else 'hard'
     for topic in topics:
         for _ in range(5):
             hard_eval_tasks.append(generator.generate_task(topic, eval_difficulty))
 
-    # Create FIXED general eval set (medium difficulty, all topics)
     general_eval_tasks = [
         generator.generate_task(topic, 'medium')
         for topic in topics
-        for _ in range(3)  # 3 tasks per topic
+        for _ in range(3)
     ]
 
     history = {
@@ -243,8 +199,6 @@ def train_strategy_progressive(num_iterations: int = 500, seed: int = 42) -> Dic
         'strategy': 'progressive'
     }
 
-    # Progressive curriculum: cycle through topics, increase difficulty over time
-    # Structure: For each topic, do easy → medium → hard
     questions_per_difficulty = max(1, num_iterations // (len(topics) * len(difficulties)))
 
     iterator = range(num_iterations)
@@ -252,7 +206,6 @@ def train_strategy_progressive(num_iterations: int = 500, seed: int = 42) -> Dic
         iterator = tqdm(iterator, desc="Progressive Strategy", unit="iter")
 
     for iteration in iterator:
-        # Determine current phase
         phase = iteration // questions_per_difficulty if questions_per_difficulty > 0 else iteration
         topic_idx = (phase // len(difficulties)) % len(topics)
         diff_idx = phase % len(difficulties)
@@ -262,19 +215,14 @@ def train_strategy_progressive(num_iterations: int = 500, seed: int = 42) -> Dic
 
         task = generator.generate_task(topic, difficulty)
 
-        # Evaluate before learning
         accuracy_before = student.evaluate(hard_eval_tasks)
-
-        # Student learns
         student.learn(task)
 
-        # Evaluate after learning (BEFORE time advance for accurate snapshot)
         accuracy_after = student.evaluate(hard_eval_tasks)
-        general_accuracy = student.evaluate(general_eval_tasks)  # Use FIXED eval set
+        general_accuracy = student.evaluate(general_eval_tasks)
 
         student.advance_time(1.0)
 
-        # Track metrics
         history['iterations'].append(iteration)
         history['student_accuracies'].append(general_accuracy)
         history['difficult_accuracies'].append(accuracy_after)
@@ -288,18 +236,15 @@ def train_strategy_progressive(num_iterations: int = 500, seed: int = 42) -> Dic
 def train_strategy_teacher(num_iterations: int = 500, seed: int = 42) -> Dict:
     """
     Strategy 3: RL Teacher Agent learns optimal curriculum.
-
-    Args:
-        num_iterations: Number of iterations
-        seed: Random seed
-
-    Returns:
-        Training history dictionary with difficult_accuracies added
     """
-    # Initialize components
-    generator = MockTaskGenerator(seed=seed)
-    teacher = TeacherAgent(exploration_bonus=2.0, task_generator=generator)  # Dynamic action space
-    # Use LM Student instead of MockStudentAgent
+    random.seed(seed)
+    np.random.seed(seed)
+
+    # --- FIX 3: REMOVED seed=seed ---
+    generator = MockTaskGenerator()
+
+    teacher = TeacherAgent(exploration_bonus=2.0, task_generator=generator)
+
     student = LMStudentAgent(
         learning_rate=5e-5,
         retention_constant=80.0,
@@ -310,14 +255,12 @@ def train_strategy_teacher(num_iterations: int = 500, seed: int = 42) -> Dict:
 
 
     topics = generator.get_available_topics()
 
-    # Create evaluation sets
     eval_tasks = [
         generator.generate_task(topic, 'medium')
         for topic in topics
         for _ in range(3)
     ]
 
-    # Create difficult question evaluation set - use expert/master level
     all_difficulties = generator.get_available_difficulties()
     eval_difficulty = 'expert' if 'expert' in all_difficulties else 'hard'
     hard_eval_tasks = [
@@ -326,7 +269,6 @@ def train_strategy_teacher(num_iterations: int = 500, seed: int = 42) -> Dict:
         for _ in range(5)
     ]
 
-    # Track metrics
     history = {
         'iterations': [],
         'student_accuracies': [],
@@ -344,30 +286,22 @@ def train_strategy_teacher(num_iterations: int = 500, seed: int = 42) -> Dict:
         iterator = tqdm(iterator, desc="Teacher Strategy", unit="iter")
 
     for iteration in iterator:
-        # 1. Get student state
         student_state = student.get_state()
-
-        # 2. Teacher selects action
         action = teacher.select_action(student_state)
 
-        # 3. Generate task
         if action.is_review:
             task = generator.generate_task(action.topic, 'medium')
         else:
             task = generator.generate_task(action.topic, action.difficulty)
 
-        # 4. Evaluate student BEFORE learning
         accuracy_before = student.evaluate(eval_tasks)
         difficult_acc_before = student.evaluate(hard_eval_tasks)
 
-        # 5. Student learns from task
         student.learn(task)
 
-        # 6. Evaluate student AFTER learning
         accuracy_after = student.evaluate(eval_tasks)
         difficult_acc_after = student.evaluate(hard_eval_tasks)
 
-        # 7. Compute reward for teacher
         reward = compute_reward(
             accuracy_before,
             accuracy_after,
@@ -375,13 +309,9 @@ def train_strategy_teacher(num_iterations: int = 500, seed: int = 42) -> Dict:
             action.is_review
         )
 
-        # 8. Update teacher's policy
         teacher.update(action, reward)
-
-        # 9. Time passes (for forgetting)
         student.advance_time(1.0)
 
-        # 10. Log metrics
         history['iterations'].append(iteration)
         history['student_accuracies'].append(accuracy_after)
         history['difficult_accuracies'].append(difficult_acc_after)
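`compute_reward` is imported from `train_teacher`, and one of its arguments is not visible in this hunk, so the diff does not show its definition. Curriculum teachers of this kind commonly reward learning progress, i.e. the change in evaluation accuracy, sometimes discounted for review actions; the sketch below is purely illustrative and is not the repo's `compute_reward`:

```python
def learning_progress_reward(acc_before: float, acc_after: float,
                             is_review: bool, review_discount: float = 0.5) -> float:
    """Hypothetical reward: accuracy improvement, discounted for review tasks."""
    progress = acc_after - acc_before
    return progress * (review_discount if is_review else 1.0)

print(learning_progress_reward(0.50, 0.58, is_review=False))  # ~0.08
print(learning_progress_reward(0.50, 0.58, is_review=True))   # ~0.04
```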
@@ -397,231 +327,116 @@ def train_strategy_teacher(num_iterations: int = 500, seed: int = 42) -> Dict:
 def plot_comparison(histories: Dict[str, Dict], save_path: str = 'teacher_agent_dev/comparison_all_strategies.png'):
     """
     Create comprehensive comparison plots of all three strategies.
-
-    Args:
-        histories: Dictionary mapping strategy name to history
-                   e.g., {'Random': history1, 'Progressive': history2, 'Teacher': history3}
-        save_path: Where to save the plot
     """
     import matplotlib.pyplot as plt
 
+    # Ensure directory exists
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+
     fig, axes = plt.subplots(4, 1, figsize=(16, 14))
 
-    # Define colors and styles for each strategy
     colors = {
         'Random': '#FF6B6B',       # Red
         'Progressive': '#4ECDC4',  # Teal
-        'Teacher': '#2ECC71'       # Green (highlight teacher as best)
+        'Teacher': '#2ECC71'       # Green
     }
 
     line_styles = {
-        'Random': '--',       # Dashed = stochastic/erratic
-        'Progressive': '-.',  # Dash-dot = linear/rigid
-        'Teacher': '-'        # Solid = smooth/exponential
+        'Random': '--',
+        'Progressive': '-.',
+        'Teacher': '-'
     }
 
     line_widths = {
         'Random': 2.0,
         'Progressive': 2.0,
-        'Teacher': 3.5  # Much thicker line for teacher to emphasize exponential growth
+        'Teacher': 3.5
     }
 
-    # 1. Plot 1: General Accuracy Over Time - Emphasize Exponential vs Stochastic
+    # 1. Plot 1: General Accuracy
     ax = axes[0]
-
-    # Plot raw data with different styles to show stochasticity vs smoothness
     for name, history in histories.items():
         iterations = history['iterations']
         accuracies = history['student_accuracies']
 
-        if name == 'Teacher':
-            # Teacher: Show exponential growth clearly with smooth curve
-            # Less smoothing to show actual exponential curve
-            window = 10 if len(accuracies) > 50 else 5
+        if len(accuracies) > 50:
+            # Smooth curves
+            window = 10
             smoothed = np.convolve(accuracies, np.ones(window)/window, mode='same')
             ax.plot(iterations, smoothed,
-                    label=f'{name} (Exponential Growth)',
-                    color=colors[name],
-                    linestyle=line_styles[name],
-                    linewidth=line_widths[name],
-                    alpha=0.95,
-                    zorder=10)  # On top
+                    label=name,
+                    color=colors[name],
+                    linestyle=line_styles[name],
+                    linewidth=line_widths[name],
+                    alpha=0.9)
         else:
-            # Random/Progressive: Show stochastic/erratic nature
-            # Plot raw noisy data with some transparency to show variance
-            if len(accuracies) > 50:
-                # Show variance with raw data (more stochastic)
-                ax.plot(iterations, accuracies,
-                        label=f'{name} (Stochastic/Erratic)',
-                        color=colors[name],
-                        linestyle=line_styles[name],
-                        linewidth=line_widths[name],
-                        alpha=0.4,  # Lighter to show noise
-                        zorder=1)
-                # Overlay smoothed version
-                window = 30
-                smoothed = np.convolve(accuracies, np.ones(window)/window, mode='same')
-                ax.plot(iterations, smoothed,
-                        color=colors[name],
-                        linestyle=line_styles[name],
-                        linewidth=line_widths[name],
-                        alpha=0.8)
-            else:
-                ax.plot(iterations, accuracies,
-                        label=f'{name} (Stochastic)',
-                        color=colors[name],
-                        linestyle=line_styles[name],
-                        linewidth=line_widths[name],
-                        alpha=0.8)
-
-    ax.set_xlabel('Training Iteration', fontsize=12, fontweight='bold')
-    ax.set_ylabel('General Accuracy', fontsize=12, fontweight='bold')
-    ax.set_title('Learning Curves: Exponential (Teacher) vs Stochastic (Baselines)', fontsize=14, fontweight='bold')
-    ax.legend(loc='lower right', fontsize=11, framealpha=0.9)
-    ax.grid(True, alpha=0.3, linestyle='--')
-    ax.set_ylim([0.2, 1.0])
-
-    # Add text annotation highlighting exponential vs stochastic
-    ax.text(0.02, 0.98,
-            '📈 Teacher: Smooth exponential growth\n📉 Baselines: Erratic, stochastic learning',
-            transform=ax.transAxes,
-            fontsize=10,
-            verticalalignment='top',
-            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
-
-    # Add final accuracy annotations
-    for name, history in histories.items():
-        final_acc = history['student_accuracies'][-1]
-        final_iter = history['iterations'][-1]
-        ax.annotate(f'{final_acc:.3f}',
-                    xy=(final_iter, final_acc),
-                    xytext=(10, 10),
-                    textcoords='offset points',
-                    fontsize=10,
-                    bbox=dict(boxstyle='round,pad=0.3', facecolor=colors[name], alpha=0.5),
-                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
-
-    # 2. Plot 2: Difficult Question Accuracy - Show Exponential Growth Clearly
+            ax.plot(iterations, accuracies,
+                    label=name,
+                    color=colors[name],
+                    linestyle=line_styles[name],
+                    linewidth=line_widths[name])
+
+    ax.set_xlabel('Training Iteration')
+    ax.set_ylabel('General Accuracy')
+    ax.set_title('Learning Curves')
+    ax.legend(loc='lower right')
+    ax.grid(True, alpha=0.3)
+    ax.set_ylim([0.0, 1.0])
+
+    # 2. Plot 2: Difficult Question Accuracy
     ax = axes[1]
-
     for name, history in histories.items():
         iterations = history['iterations']
         difficult_accuracies = history['difficult_accuracies']
 
-        if name == 'Teacher':
-            # Teacher: Emphasize exponential growth
-            window = 8  # Less smoothing to show exponential shape
+        if len(difficult_accuracies) > 50:
+            window = 10
             smoothed = np.convolve(difficult_accuracies, np.ones(window)/window, mode='same')
             ax.plot(iterations, smoothed,
-                    label=f'{name} (Exponential)',
-                    color=colors[name],
-                    linestyle=line_styles[name],
-                    linewidth=line_widths[name],
-                    alpha=0.95,
-                    zorder=10)
+                    label=name,
+                    color=colors[name],
+                    linestyle=line_styles[name],
+                    linewidth=line_widths[name])
         else:
-            # Baselines: Show stochastic nature
-            if len(difficult_accuracies) > 50:
-                # Show raw noisy data
-                ax.plot(iterations, difficult_accuracies,
-                        label=f'{name} (Erratic)',
-                        color=colors[name],
-                        linestyle=line_styles[name],
-                        linewidth=line_widths[name],
-                        alpha=0.3,
-                        zorder=1)
-                # Overlay smoothed
-                window = 25
-                smoothed = np.convolve(difficult_accuracies, np.ones(window)/window, mode='same')
-                ax.plot(iterations, smoothed,
-                        color=colors[name],
-                        linestyle=line_styles[name],
-                        linewidth=line_widths[name],
-                        alpha=0.8)
-            else:
-                ax.plot(iterations, difficult_accuracies,
-                        label=name,
-                        color=colors[name],
-                        linestyle=line_styles[name],
-                        linewidth=line_widths[name],
-                        alpha=0.8)
-
-    ax.set_xlabel('Training Iteration', fontsize=12, fontweight='bold')
-    ax.set_ylabel('Accuracy on Difficult Questions', fontsize=12, fontweight='bold')
-    ax.set_title('Difficult Question Performance: Exponential vs Stochastic Learning',
-                 fontsize=14, fontweight='bold', color='darkred')
-    ax.legend(loc='lower right', fontsize=11, framealpha=0.9)
-    ax.grid(True, alpha=0.3, linestyle='--')
-    ax.set_ylim([0.2, 1.0])
-
-    # Highlight target accuracy line (75%)
-    ax.axhline(y=0.75, color='gray', linestyle=':', linewidth=1, alpha=0.5)
-
-    # Add final accuracy annotations
-    for name, history in histories.items():
-        final_acc = history['difficult_accuracies'][-1]
-        final_iter = history['iterations'][-1]
-        ax.annotate(f'{final_acc:.3f}',
-                    xy=(final_iter, final_acc),
-                    xytext=(10, 10),
-                    textcoords='offset points',
-                    fontsize=10,
-                    bbox=dict(boxstyle='round,pad=0.3', facecolor=colors[name], alpha=0.3),
-                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
-
-    # 3. Plot 3: Curriculum Efficiency - Topic Coverage Over Time
+            ax.plot(iterations, difficult_accuracies,
+                    label=name,
+                    color=colors[name],
+                    linestyle=line_styles[name],
+                    linewidth=line_widths[name])
+
+    ax.set_xlabel('Training Iteration')
+    ax.set_ylabel('Accuracy on Hard Questions')
+    ax.set_title('Performance on Difficult Content')
+    ax.legend(loc='lower right')
+    ax.grid(True, alpha=0.3)
+    ax.set_ylim([0.0, 1.0])
+
+    # 3. Plot 3: Topic Coverage
     ax = axes[2]
-
-    # Track unique topics seen over time to show curriculum diversity
     for name, history in histories.items():
         iterations = history['iterations']
         topics_seen = history['topics']
 
-        # Count unique topics up to each iteration
         unique_topics = []
         seen_so_far = set()
-
         for topic in topics_seen:
             seen_so_far.add(topic)
             unique_topics.append(len(seen_so_far))
 
-        if name == 'Teacher':
-            ax.plot(iterations, unique_topics,
-                    label=f'{name} (Diverse Curriculum)',
-                    color=colors[name],
-                    linestyle=line_styles[name],
-                    linewidth=line_widths[name],
-                    alpha=0.9,
-                    zorder=10,
-                    marker='o', markersize=3)
-        else:
-            ax.plot(iterations, unique_topics,
-                    label=f'{name}',
-                    color=colors[name],
-                    linestyle=line_styles[name],
-                    linewidth=line_widths[name],
-                    alpha=0.8,
-                    marker='s', markersize=2)
-
-    ax.set_xlabel('Training Iteration', fontsize=12, fontweight='bold')
-    ax.set_ylabel('Number of Unique Topics Covered', fontsize=12, fontweight='bold')
-    ax.set_title('Curriculum Diversity: Topic Coverage Over Time',
-                 fontsize=14, fontweight='bold')
-    ax.legend(loc='lower right', fontsize=11, framealpha=0.9)
-    ax.grid(True, alpha=0.3, linestyle='--')
-
-    # Add total topics line if available
-    if histories:
-        first_history = list(histories.values())[0]
-        if 'topics' in first_history and first_history['topics']:
-            all_unique_topics = len(set(first_history['topics']))
-            ax.axhline(y=all_unique_topics, color='gray', linestyle=':',
-                       alpha=0.5, label=f'Total topics: {all_unique_topics}')
-            ax.legend(loc='lower right', fontsize=11, framealpha=0.9)
-
-    # 4. Plot 4: Learning Speed Comparison (Iterations to reach 75% on difficult)
-    ax = axes[3]
+        ax.plot(iterations, unique_topics,
+                label=name,
+                color=colors[name],
+                linestyle=line_styles[name],
+                linewidth=line_widths[name])
+
+    ax.set_xlabel('Training Iteration')
+    ax.set_ylabel('Unique Topics Seen')
+    ax.set_title('Curriculum Diversity')
+    ax.legend(loc='lower right')
+    ax.grid(True, alpha=0.3)
 
+    # 4. Plot 4: Learning Efficiency
+    ax = axes[3]
     target_acc = 0.75
     strategy_stats = {}
 
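Both before and after this change, curves are smoothed with a box filter via `np.convolve(..., mode='same')`. A self-contained sketch of that helper; note that `mode='same'` zero-pads, so roughly the first and last `window // 2` points are biased toward zero:

```python
import numpy as np

def moving_average(values, window: int = 10) -> np.ndarray:
    """Box-filter smoothing as used in the plots (zero-padded at the edges)."""
    return np.convolve(values, np.ones(window) / window, mode='same')

accuracies = np.linspace(0.3, 0.9, 100)  # stand-in for a learning curve
smoothed = moving_average(accuracies)
print(smoothed[0], smoothed[50])  # the edge value dips below the raw curve
```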
@@ -629,7 +444,6 @@ def plot_comparison(histories: Dict[str, Dict], save_path: str = 'teacher_agent_
         difficult_accuracies = history['difficult_accuracies']
         iterations = history['iterations']
 
-        # Find when target is reached
         reached_target = False
         target_iteration = len(iterations) - 1
 
@@ -645,7 +459,6 @@ def plot_comparison(histories: Dict[str, Dict], save_path: str = 'teacher_agent_
             'final_acc': difficult_accuracies[-1]
         }
 
-    # Create bar plot
     names = list(strategy_stats.keys())
     iterations_to_target = [
         strategy_stats[n]['iteration'] if strategy_stats[n]['reached'] else len(histories[n]['iterations'])
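The bar chart built after this hunk scans each history for the first iteration where difficult-question accuracy crosses the 0.75 target. The same logic, factored into a standalone sketch with a hypothetical helper name:

```python
from typing import List, Optional

def first_iteration_at_target(accuracies: List[float],
                              target: float = 0.75) -> Optional[int]:
    """Index of the first accuracy >= target, or None if it is never reached."""
    for i, acc in enumerate(accuracies):
        if acc >= target:
            return i
    return None

print(first_iteration_at_target([0.40, 0.60, 0.76, 0.80]))  # 2
print(first_iteration_at_target([0.40, 0.60]))              # None
```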
@@ -656,169 +469,62 @@ def plot_comparison(histories: Dict[str, Dict], save_path: str = 'teacher_agent_
     x = np.arange(len(names))
     width = 0.35
 
-    bars1 = ax.bar(x - width/2, iterations_to_target, width, label='Iterations to 75% on Difficult',
-                   color=[colors[n] for n in names], alpha=0.7)
-    bars2 = ax.bar(x + width/2, [acc * max(iterations_to_target) for acc in final_accs], width,
-                   label='Final Difficult Accuracy (scaled)',
-                   color=[colors[n] for n in names], alpha=0.5)
+    ax.bar(x - width/2, iterations_to_target, width, label='Iterations to 75% on Hard',
+           color=[colors[n] for n in names], alpha=0.7)
+    ax.bar(x + width/2, [acc * max(iterations_to_target) for acc in final_accs], width,
+           label='Final Hard Accuracy (scaled)',
+           color=[colors[n] for n in names], alpha=0.5)
 
-    ax.set_xlabel('Strategy', fontsize=12, fontweight='bold')
-    ax.set_ylabel('Iterations / Scaled Accuracy', fontsize=12, fontweight='bold')
-    ax.set_title('Learning Efficiency: Iterations to Reach Target vs Final Performance',
-                 fontsize=14, fontweight='bold')
+    ax.set_title('Learning Efficiency')
     ax.set_xticks(x)
     ax.set_xticklabels(names)
-    ax.legend(fontsize=10, framealpha=0.9)
-    ax.grid(True, alpha=0.3, linestyle='--', axis='y')
-
-    # Add value labels on bars
-    for i, (bar1, bar2, name) in enumerate(zip(bars1, bars2, names)):
-        height1 = bar1.get_height()
-        height2 = bar2.get_height()
-
-        # Label for iterations
-        if strategy_stats[name]['reached']:
-            ax.text(bar1.get_x() + bar1.get_width()/2., height1,
-                    f'{int(height1)}',
-                    ha='center', va='bottom', fontsize=9, fontweight='bold')
-        else:
-            ax.text(bar1.get_x() + bar1.get_width()/2., height1,
-                    'Not reached',
-                    ha='center', va='bottom', fontsize=9, fontweight='bold')
-
-        # Label for final accuracy
-        ax.text(bar2.get_x() + bar2.get_width()/2., height2,
-                f'{final_accs[i]:.2f}',
-                ha='center', va='bottom', fontsize=9, fontweight='bold')
+    ax.legend()
 
     plt.tight_layout()
-    plt.savefig(save_path, dpi=150, bbox_inches='tight')
+    plt.savefig(save_path, dpi=150)
     print(f"\n✅ Saved comparison plot to {save_path}")
     plt.close()
-
-    # Print summary statistics
-    print("\n" + "=" * 70)
-    print("STRATEGY COMPARISON SUMMARY")
-    print("=" * 70)
-    for name, stats in strategy_stats.items():
-        status = "✅ Reached" if stats['reached'] else "❌ Not reached"
-        print(f"{name:15s} | {status:15s} | Iterations: {stats['iteration']:4d} | Final Acc: {stats['final_acc']:.3f}")
-    print("=" * 70)
 
 
 if __name__ == "__main__":
     import argparse
     import time
 
-    parser = argparse.ArgumentParser(description='Compare training strategies with configurable randomness')
-    parser.add_argument('--seed', type=int, default=None,
-                        help='Random seed for reproducibility (default: None = use current time)')
-    parser.add_argument('--iterations', type=int, default=500,
-                        help='Number of training iterations (default: 500)')
-    parser.add_argument('--deterministic', action='store_true',
-                        help='Use fixed seed=42 for reproducible results (deterministic)')
-    parser.add_argument('--runs', type=int, default=1,
-                        help='Number of runs for variance analysis (default: 1)')
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--seed', type=int, default=None)
+    parser.add_argument('--iterations', type=int, default=500)
+    parser.add_argument('--deterministic', action='store_true')
+    parser.add_argument('--runs', type=int, default=1)
 
     args = parser.parse_args()
 
-    # Determine seed
     if args.deterministic:
         seed = 42
-        print("⚠️ Using deterministic mode (seed=42) - results will be identical every run")
+        print("⚠️ Using deterministic mode (seed=42)")
     elif args.seed is not None:
         seed = args.seed
-        print(f"Using specified seed: {seed}")
     else:
-        seed = int(time.time()) % 10000  # Use current time as seed
-        print(f"Using random seed: {seed} (results will vary each run)")
+        seed = int(time.time()) % 10000
+
+    print(f"Using seed: {seed}")
 
     num_iterations = args.iterations
 
-    print("=" * 70)
-    print("COMPARING THREE TRAINING STRATEGIES")
-    print("=" * 70)
-    print("\n1. Random: Random questions until student can pass difficult")
-    print("2. Progressive: Easy → Medium → Hard within each family")
-    print("3. Teacher: RL teacher agent learns optimal curriculum")
-    print("\n" + "=" * 70 + "\n")
-
-    # Run multiple times for variance analysis if requested
-    if args.runs > 1:
-        print(f"Running {args.runs} times for variance analysis...\n")
-        all_results = {
-            'Random': [],
-            'Progressive': [],
-            'Teacher': []
-        }
-
-        for run in range(args.runs):
-            run_seed = seed + run  # Different seed for each run
-            print(f"Run {run + 1}/{args.runs} (seed={run_seed})...")
-
-            history_random = train_strategy_random(num_iterations=num_iterations, seed=run_seed)
-            history_progressive = train_strategy_progressive(num_iterations=num_iterations, seed=run_seed)
-            history_teacher = train_strategy_teacher(num_iterations=num_iterations, seed=run_seed)
-
-            all_results['Random'].append(history_random)
-            all_results['Progressive'].append(history_progressive)
-            all_results['Teacher'].append(history_teacher)
-
-        # Compute statistics across runs
-        print("\n" + "=" * 70)
-        print("VARIANCE ANALYSIS ACROSS RUNS")
-        print("=" * 70)
-
-        for strategy_name in ['Random', 'Progressive', 'Teacher']:
-            final_accs = [h['difficult_accuracies'][-1] for h in all_results[strategy_name]]
-            iterations_to_target = []
-            for h in all_results[strategy_name]:
-                target_acc = 0.75
-                reached = False
-                for i, acc in enumerate(h['difficult_accuracies']):
-                    if acc >= target_acc:
-                        iterations_to_target.append(i)
-                        reached = True
-                        break
-                if not reached:
-                    iterations_to_target.append(len(h['difficult_accuracies']))
-
-            mean_final = np.mean(final_accs)
-            std_final = np.std(final_accs)
-            mean_iters = np.mean(iterations_to_target)
-            std_iters = np.std(iterations_to_target)
-
-            print(f"\n{strategy_name}:")
-            print(f"  Final Accuracy: {mean_final:.3f} ± {std_final:.3f} (range: {min(final_accs):.3f} - {max(final_accs):.3f})")
-            print(f"  Iterations to Target: {mean_iters:.1f} ± {std_iters:.1f} (range: {min(iterations_to_target)} - {max(iterations_to_target)})")
-
-        # Use first run for plotting (or could average)
-        history_random = all_results['Random'][0]
-        history_progressive = all_results['Progressive'][0]
-        history_teacher = all_results['Teacher'][0]
-    else:
-        # Single run
-        # Train all three strategies
-        print("Training Random Strategy...")
-        history_random = train_strategy_random(num_iterations=num_iterations, seed=seed)
-
-        print("\nTraining Progressive Strategy...")
-        history_progressive = train_strategy_progressive(num_iterations=num_iterations, seed=seed)
-
-        print("\nTraining Teacher Strategy...")
-        history_teacher = train_strategy_teacher(num_iterations=num_iterations, seed=seed)
+    # Run strategies
+    print("Training Random Strategy...")
+    history_random = train_strategy_random(num_iterations=num_iterations, seed=seed)
+
+    print("\nTraining Progressive Strategy...")
+    history_progressive = train_strategy_progressive(num_iterations=num_iterations, seed=seed)
+
+    print("\nTraining Teacher Strategy...")
+    history_teacher = train_strategy_teacher(num_iterations=num_iterations, seed=seed)
 
-    # Create comparison plots
-    print("\nGenerating comparison plots...")
     histories = {
         'Random': history_random,
         'Progressive': history_progressive,
         'Teacher': history_teacher
     }
 
-    plot_comparison(histories, save_path='comparison_all_strategies.png')
-
-    print("\n✅ Comparison complete! Check 'comparison_all_strategies.png'")
-    if not args.deterministic and args.seed is None:
-        print(f"💡 Tip: Results vary each run. Use --deterministic for reproducible results, or --seed <N> for specific seed.")
+    plot_comparison(histories, save_path='teacher_agent_dev/comparison_all_strategies.png')
+    print("\n✅ Comparison complete!")
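With the multi-run variance analysis removed, the `__main__` block reduces to one run per strategy plus the comparison plot. The same flow can be driven programmatically; a sketch assuming `compare_strategies.py` is importable from the working directory (the actual import path depends on how `teacher_agent_dev` is laid out):

```python
# Hypothetical driver script; mirrors the simplified __main__ block above.
from compare_strategies import (train_strategy_random,
                                train_strategy_progressive,
                                train_strategy_teacher,
                                plot_comparison)

seed, iters = 42, 500
histories = {
    'Random': train_strategy_random(num_iterations=iters, seed=seed),
    'Progressive': train_strategy_progressive(num_iterations=iters, seed=seed),
    'Teacher': train_strategy_teacher(num_iterations=iters, seed=seed),
}
plot_comparison(histories, save_path='teacher_agent_dev/comparison_all_strategies.png')
```

From the shell, the retained flags give the same control: `--deterministic` pins seed=42, `--seed N` selects a specific seed, and the default derives one from the current time.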
 