"""Teacher Agent using Upper Confidence Bound (UCB) bandit algorithm.""" import numpy as np from typing import Dict, List from interfaces import TeacherAction, StudentState, TeacherAgentInterface def compute_reward( accuracy_before: float, accuracy_after: float, difficulty: str, is_review: bool ) -> float: """ Compute reward for teacher action. Reward structure: - Base: improvement in accuracy - Bonus: harder tasks encourage pushing boundaries - Bonus: successful reviews (spaced repetition) - Penalty: wasted reviews (student still remembers perfectly) """ improvement = accuracy_after - accuracy_before # Bonus for harder tasks (encourage pushing boundaries) - expanded for all 7 levels difficulty_bonus_map = { 'trivial': 0.2, 'easy': 0.5, 'medium': 1.0, 'hard': 2.0, 'expert': 3.0, 'master': 4.0, 'grandmaster': 5.0 } difficulty_bonus = difficulty_bonus_map.get(difficulty, 1.0) # Bonus for successful reviews (spaced repetition) review_bonus = 1.0 if (is_review and improvement > 0) else 0.0 # Penalty for wasted reviews (student still remembers perfectly) review_penalty = -0.5 if (is_review and accuracy_after > 0.9) else 0.0 return improvement + difficulty_bonus + review_bonus + review_penalty class TeacherAgent(TeacherAgentInterface): """ Teacher Agent using UCB (Upper Confidence Bound) bandit algorithm. Action space: Dynamically determined from task generator - Topics: From MockTaskGenerator (15 topics) - Difficulties: From MockTaskGenerator (7 difficulties: trivial→grandmaster) - Options: 2 (new vs review) UCB formula: UCB(a) = estimated_reward(a) + exploration_bonus × sqrt(log(total_pulls) / pulls(a)) Balances exploration (trying new actions) vs exploitation (using known-good actions). """ def __init__(self, exploration_bonus: float = 2.0, task_generator=None): """ Initialize teacher agent with dynamic action space. Args: exploration_bonus: Controls exploration vs exploitation balance. Higher = more exploration (try new actions) Lower = more exploitation (use known-good actions) task_generator: Optional MockTaskGenerator to get topics/difficulties. If None, uses default expanded set. """ self.exploration_bonus = exploration_bonus # Define action space dynamically if task_generator: self.topics = task_generator.get_available_topics() self.difficulties = task_generator.get_available_difficulties() else: # Default expanded set self.topics = [ 'history', 'science', 'literature', 'geography', 'current_events', 'mathematics', 'programming', 'philosophy', 'art', 'music', 'biology', 'chemistry', 'physics', 'economics', 'psychology' ] self.difficulties = ['trivial', 'easy', 'medium', 'hard', 'expert', 'master', 'grandmaster'] self.review_options = [False, True] # False = new, True = review # Create all action combinations self.actions = [ (topic, diff, review) for topic in self.topics for diff in self.difficulties for review in self.review_options ] self.num_actions = len(self.actions) # Now 15 topics × 7 difficulties × 2 = 210 actions # Track statistics per action self.action_counts = np.zeros(self.num_actions, dtype=np.float64) self.action_rewards = np.zeros(self.num_actions, dtype=np.float64) self.total_pulls = 0 def select_action(self, student_state: StudentState) -> TeacherAction: """ Select next action using UCB algorithm. 

class TeacherAgent(TeacherAgentInterface):
    """
    Teacher Agent using UCB (Upper Confidence Bound) bandit algorithm.

    Action space: Dynamically determined from task generator
    - Topics: From MockTaskGenerator (15 topics)
    - Difficulties: From MockTaskGenerator (7 difficulties: trivial→grandmaster)
    - Options: 2 (new vs review)

    UCB formula:
        UCB(a) = estimated_reward(a) + exploration_bonus * sqrt(log(total_pulls) / pulls(a))

    Balances exploration (trying new actions) vs exploitation (using known-good actions).
    """

    def __init__(self, exploration_bonus: float = 2.0, task_generator=None):
        """
        Initialize teacher agent with dynamic action space.

        Args:
            exploration_bonus: Controls exploration vs exploitation balance.
                Higher = more exploration (try new actions)
                Lower = more exploitation (use known-good actions)
            task_generator: Optional MockTaskGenerator to get topics/difficulties.
                If None, uses default expanded set.
        """
        self.exploration_bonus = exploration_bonus

        # Define action space dynamically
        if task_generator:
            self.topics = task_generator.get_available_topics()
            self.difficulties = task_generator.get_available_difficulties()
        else:
            # Default expanded set
            self.topics = [
                'history', 'science', 'literature', 'geography', 'current_events',
                'mathematics', 'programming', 'philosophy', 'art', 'music',
                'biology', 'chemistry', 'physics', 'economics', 'psychology'
            ]
            self.difficulties = [
                'trivial', 'easy', 'medium', 'hard', 'expert', 'master', 'grandmaster'
            ]

        self.review_options = [False, True]  # False = new, True = review

        # Create all action combinations
        self.actions = [
            (topic, diff, review)
            for topic in self.topics
            for diff in self.difficulties
            for review in self.review_options
        ]
        self.num_actions = len(self.actions)  # Default: 15 topics × 7 difficulties × 2 = 210 actions

        # Track statistics per action
        self.action_counts = np.zeros(self.num_actions, dtype=np.float64)
        self.action_rewards = np.zeros(self.num_actions, dtype=np.float64)
        self.total_pulls = 0

    def select_action(self, student_state: StudentState) -> TeacherAction:
        """
        Select next action using UCB algorithm.

        For each action:
        - If never tried: select it (cold start)
        - Otherwise: compute UCB score and select highest
        """
        # Cold start: try each action at least once
        untried_actions = [
            i for i in range(self.num_actions) if self.action_counts[i] == 0
        ]

        if untried_actions:
            action_idx = self.total_pulls % len(untried_actions)
            selected_idx = untried_actions[action_idx]
        else:
            # All actions tried - use UCB
            ucb_scores = self._compute_ucb_scores()
            selected_idx = int(np.argmax(ucb_scores))

        return self._index_to_action(selected_idx)

    def _compute_ucb_scores(self) -> np.ndarray:
        """Compute UCB score for each action."""
        scores = np.zeros(self.num_actions)

        for i in range(self.num_actions):
            if self.action_counts[i] == 0:
                # Never tried - give high score for exploration
                scores[i] = float('inf')
            else:
                # Estimated reward (average so far)
                estimated_reward = self.action_rewards[i] / self.action_counts[i]

                # Exploration bonus: sqrt(log(total_pulls) / pulls(action))
                exploration_term = np.sqrt(
                    np.log(max(1, self.total_pulls)) / self.action_counts[i]
                )

                # UCB score = estimated reward + exploration bonus
                scores[i] = estimated_reward + self.exploration_bonus * exploration_term

        return scores

    def update(self, action: TeacherAction, reward: float):
        """
        Update teacher policy based on reward.

        Accumulates the total reward per action; the per-action mean is
        recovered in _compute_ucb_scores as action_rewards / action_counts.
        """
        action_idx = self._action_to_index(action)

        # Update statistics
        self.action_counts[action_idx] += 1
        self.action_rewards[action_idx] += reward
        self.total_pulls += 1

    def _action_to_index(self, action: TeacherAction) -> int:
        """Convert TeacherAction to integer index."""
        try:
            topic_idx = self.topics.index(action.topic)
            diff_idx = self.difficulties.index(action.difficulty)
            review_idx = int(action.is_review)

            # Encode: topic * (diffs * reviews) + diff * reviews + review
            index = (
                topic_idx * (len(self.difficulties) * len(self.review_options))
                + diff_idx * len(self.review_options)
                + review_idx
            )
            return index
        except (ValueError, AttributeError):
            raise ValueError(f"Invalid action: {action}")

    def _index_to_action(self, index: int) -> TeacherAction:
        """Convert integer index to TeacherAction."""
        if not (0 <= index < self.num_actions):
            raise ValueError(f"Invalid action index: {index}")

        # Decode: index -> (topic, difficulty, review)
        review_idx = index % len(self.review_options)
        diff_idx = (index // len(self.review_options)) % len(self.difficulties)
        topic_idx = index // (len(self.difficulties) * len(self.review_options))

        return TeacherAction(
            topic=self.topics[topic_idx],
            difficulty=self.difficulties[diff_idx],
            is_review=bool(review_idx)
        )

    def get_statistics(self) -> Dict:
        """Get teacher statistics for visualization."""
        return {
            'action_counts': self.action_counts.copy(),
            'action_rewards': self.action_rewards.copy(),
            'actions': self.actions.copy(),
            'topics': self.topics.copy(),
            'difficulties': self.difficulties.copy(),
            'review_options': self.review_options.copy(),
            'total_pulls': self.total_pulls
        }
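
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the agent's API). The
# loop uses hypothetical accuracy numbers, and passes student_state=None only
# because this UCB agent ignores the student state; a real caller would pass a
# proper StudentState from the interfaces module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    teacher = TeacherAgent(exploration_bonus=2.0)
    print(f"Action space size: {teacher.num_actions}")  # 15 * 7 * 2 = 210 by default

    for _ in range(5):
        action = teacher.select_action(student_state=None)  # state unused by UCB
        reward = compute_reward(
            accuracy_before=0.5,          # hypothetical pre-task accuracy
            accuracy_after=0.6,           # hypothetical post-task accuracy
            difficulty=action.difficulty,
            is_review=action.is_review,
        )
        teacher.update(action, reward)

    stats = teacher.get_statistics()
    print(f"Total pulls: {stats['total_pulls']}")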