"""Unit tests for Teacher Agent system."""
import sys
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from mock_student import MockStudentAgent
from mock_task_generator import MockTaskGenerator
from teacher_agent import TeacherAgent
from interfaces import TeacherAction


def test_mock_student_learning():
    """Test that mock student learns."""
    print("Testing student learning...", end=" ")
    student = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.05)
    generator = MockTaskGenerator()
    # Test learning
    topic = 'history'
    tasks = [generator.generate_task(topic, 'easy') for _ in range(20)]
    accuracies = []
    for task in tasks:
        eval_tasks = [generator.generate_task(topic, 'easy') for _ in range(10)]
        acc = student.evaluate(eval_tasks)
        accuracies.append(acc)
        student.learn(task)
    # Student should improve
    improvement = accuracies[-1] - accuracies[0]
    assert improvement > 0.1, f"Student should improve! Improvement: {improvement:.3f}"
    print("✅ PASSED")
    print(f" Initial accuracy: {accuracies[0]:.3f}")
    print(f" Final accuracy: {accuracies[-1]:.3f}")
    print(f" Improvement: {improvement:.3f}")


def test_mock_student_forgetting():
    """Test that mock student forgets over time."""
    print("Testing student forgetting...", end=" ")
    student = MockStudentAgent(learning_rate=0.15, forgetting_rate=0.1)
    generator = MockTaskGenerator()
    # Train on one topic
    topic = 'science'
    for _ in range(30):
        task = generator.generate_task(topic, 'easy')
        student.learn(task)
    # Measure accuracy
    eval_tasks = [generator.generate_task(topic, 'easy') for _ in range(10)]
    acc_before = student.evaluate(eval_tasks)
    # Time passes without practice
    student.advance_time(50.0)
    acc_after = student.evaluate(eval_tasks)
    # Student should forget
    assert acc_after < acc_before - 0.05, f"Student should forget! Before: {acc_before:.3f}, After: {acc_after:.3f}"
    print("✅ PASSED")
    print(f" Accuracy before forgetting: {acc_before:.3f}")
    print(f" Accuracy after 50 time units: {acc_after:.3f}")
    print(f" Forgetting: {acc_before - acc_after:.3f}")


def test_mock_student_initial_accuracy():
    """Test that student starts at ~25% accuracy (random guessing)."""
    print("Testing initial student accuracy...", end=" ")
    student = MockStudentAgent()
    generator = MockTaskGenerator()
    # Evaluate on many tasks
    eval_tasks = [generator.generate_task('history', 'easy') for _ in range(100)]
    initial_acc = student.evaluate(eval_tasks)
    # Should be around 25% (random guessing on 4-choice MCQ)
    assert 0.15 < initial_acc < 0.35, f"Initial accuracy should be ~25%! Got: {initial_acc:.3f}"
    print("✅ PASSED")
    print(f" Initial accuracy: {initial_acc:.3f} (~25% expected)")


def test_teacher_exploration():
    """Test that teacher explores all actions."""
    print("Testing teacher exploration...", end=" ")
    teacher = TeacherAgent(exploration_bonus=5.0)  # High exploration
    # Create minimal student state (MockStudentAgent is imported at module level)
    student = MockStudentAgent()
    actions_tried = set()
    for _ in range(100):
        student_state = student.get_state()
        action = teacher.select_action(student_state)
        actions_tried.add((action.topic, action.difficulty, action.is_review))
        teacher.update(action, 0.0)  # Neutral reward
    # Teacher should explore many actions (now 15 topics × 7 difficulties × 2 = 210 actions)
    expected_actions = 15 * 7 * 2  # topics × difficulties × review options
    assert len(actions_tried) > 20, f"Teacher should explore many actions! Only tried: {len(actions_tried)}"
    print("✅ PASSED")
    print(f" Unique actions tried: {len(actions_tried)}/{expected_actions}")


def test_teacher_exploitation():
    """Test that teacher exploits good actions."""
    print("Testing teacher exploitation...", end=" ")
    teacher = TeacherAgent(exploration_bonus=0.1)  # Very low exploration
    student = MockStudentAgent()
    # Manually set one action to be very good
    best_action = TeacherAction(topic='history', difficulty='easy', is_review=False)
    best_action_idx = teacher._action_to_index(best_action)
    # First, try all actions once (cold start)
    for i in range(teacher.num_actions):
        test_action = teacher._index_to_action(i)
        if i == best_action_idx:
            teacher.update(test_action, 100.0)  # Very high reward
        else:
            teacher.update(test_action, 0.0)  # Low reward
    # Now the teacher should prefer the best action
    selections = []
    for _ in range(50):  # More samples for better statistics
        student_state = student.get_state()
        action = teacher.select_action(student_state)
        idx = teacher._action_to_index(action)
        selections.append(idx == best_action_idx)
    # Even with residual exploration, the best action should be picked often
    exploit_rate = sum(selections) / len(selections)
    assert exploit_rate > 0.3, f"Teacher should exploit good actions! Exploit rate: {exploit_rate:.2f}"
    print("✅ PASSED")
    print(f" Best action selection rate: {exploit_rate:.2f}")


def test_teacher_action_encoding():
    """Test that action encoding/decoding works correctly."""
    print("Testing action encoding/decoding...", end=" ")
    teacher = TeacherAgent()
    # Test all actions
    for idx in range(teacher.num_actions):
        action1 = teacher._index_to_action(idx)
        idx2 = teacher._action_to_index(action1)
        action2 = teacher._index_to_action(idx2)
        assert idx == idx2, f"Encoding mismatch! {idx} != {idx2}"
        assert action1.topic == action2.topic, "Topic mismatch"
        assert action1.difficulty == action2.difficulty, "Difficulty mismatch"
        assert action1.is_review == action2.is_review, "Review flag mismatch"
    print("✅ PASSED")
    print(f" Tested {teacher.num_actions} actions")


def test_task_generator():
    """Test that task generator creates valid tasks."""
    print("Testing task generator...", end=" ")
    generator = MockTaskGenerator()
    topics = generator.get_available_topics()
    difficulties = generator.get_available_difficulties()
    # Check that we have topics and difficulties (exact count may vary after expansion)
    assert len(topics) >= 5, f"Should have at least 5 topics, got {len(topics)}"
    assert len(difficulties) >= 3, f"Should have at least 3 difficulties, got {len(difficulties)}"
    # Generate tasks for all combinations
    for topic in topics:
        for difficulty in difficulties:
            task = generator.generate_task(topic, difficulty)
            assert len(task.choices) == 4, "Should have 4 choices"
            assert 0 <= task.answer < 4, "Answer should be valid index"
            assert task.topic == topic, "Topic should match"
            assert task.difficulty == difficulty, "Difficulty should match"
    print("✅ PASSED")
    print(f" Generated tasks for {len(topics)} topics × {len(difficulties)} difficulties")


def run_all_tests():
    """Run all tests."""
    print("=" * 70)
    print("RUNNING TESTS")
    print("=" * 70)
    print()
    tests = [
        test_task_generator,
        test_mock_student_initial_accuracy,
        test_mock_student_learning,
        test_mock_student_forgetting,
        test_teacher_action_encoding,
        test_teacher_exploration,
        test_teacher_exploitation,
    ]
    passed = 0
    failed = 0
    for test_func in tests:
        try:
            test_func()
            passed += 1
        except AssertionError as e:
            print(f"❌ FAILED: {e}")
            failed += 1
        except Exception as e:
            print(f"❌ ERROR: {e}")
            import traceback
            traceback.print_exc()
            failed += 1
        print()
    print("=" * 70)
    print(f"TESTS COMPLETE: {passed} passed, {failed} failed")
    print("=" * 70)
    return failed == 0
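

# Note: since this file and its functions follow pytest's test_* naming
# convention, `pytest test_teacher.py` will also discover and run them
# individually; the print-based harness below is kept for standalone runs.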
if __name__ == "__main__":
    success = run_all_tests()
    sys.exit(0 if success else 1)