File size: 18,944 Bytes
631eb6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
# Compatibility fix for huggingface_hub - MUST BE AT TOP
import sys
try:
    from huggingface_hub import snapshot_download
except ImportError:
    try:
        from huggingface_hub import cached_download as snapshot_download
    except ImportError:
        from huggingface_hub import hf_hub_download as snapshot_download
        
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer
import logging
from typing import List, Dict, Set, Tuple, Optional
import time
import re

# Configure logging: set the root logger to INFO at import time and create
# this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the sentence-transformer model used to embed course descriptions.
# Loading happens once, at import time. On failure the error is logged and
# MODEL_LOADED is set to False instead of re-raising; note that `model` is
# then left undefined, so code must check MODEL_LOADED before using it.
try:
    model = SentenceTransformer('all-MiniLM-L6-v2')
    MODEL_LOADED = True
    logger.info("Sentence transformer model loaded successfully")
except Exception as e:
    logger.error(f"Failed to load sentence transformer model: {e}")
    MODEL_LOADED = False

# In-memory embedding cache, keyed by course ID (see
# get_course_embeddings_batch). _clear_old_cache() empties the whole cache
# once more than CACHE_TTL seconds have passed since the last clear.
embedding_cache = {}
last_cache_clear = time.time()
CACHE_TTL = 3600  # seconds (one hour)

# Weights applied to the three score components in recommend_courses.
# The values below sum to 1.0.
SCORING_WEIGHTS = {
    'semantic_similarity': 0.5,
    'popularity': 0.2,
    'category_relevance': 0.3  # Increased weight for category relevance
}

# Hand-maintained category relationship table.
#
# Each key is a lowercase category name. Its value maps a related category
# name to a relatedness score; the scores below range from 0.6 to 1.0, with
# higher meaning more closely related.
#
# Alternate spellings of one category ('fullstack' / 'full stack',
# 'web development' / 'web dev', 'cybersecurity' / 'cyber security',
# 'aiml' / 'ai/ml' / 'ai ml' / 'artificial intelligence', 'react' / 'reactjs',
# 'node.js' / 'nodejs') appear as separate keys carrying identical relation
# maps.
RELATED_CATEGORIES = {
    # All keys and related names are written in lowercase
    'fullstack': {'web development': 1.0, 'frontend': 0.9, 'backend': 0.9, 'javascript': 0.8, 'react': 0.7, 'node.js': 0.7, 'php': 0.8},
    'full stack': {'web development': 1.0, 'frontend': 0.9, 'backend': 0.9, 'javascript': 0.8, 'react': 0.7, 'node.js': 0.7, 'php': 0.8},
    'php': {'web development': 0.9, 'backend': 0.8, 'fullstack': 0.7, 'mysql': 0.7, 'laravel': 0.6},
    'web development': {'fullstack': 1.0, 'frontend': 0.8, 'backend': 0.8, 'javascript': 0.9, 'html': 0.7, 'php': 0.8},
    'web dev': {'fullstack': 1.0, 'frontend': 0.8, 'backend': 0.8, 'javascript': 0.9, 'html': 0.7, 'php': 0.8},
    'frontend': {'web development': 0.9, 'html': 0.8, 'css': 0.8, 'javascript': 0.9, 'react': 0.8},
    'backend': {'web development': 0.9, 'node.js': 0.8, 'python': 0.7, 'database': 0.8, 'api': 0.7, 'php': 0.8},
    'cybersecurity': {'networking': 0.8, 'linux': 0.7, 'python': 0.6, 'ethical hacking': 0.9, 'security': 0.9},
    'cyber security': {'networking': 0.8, 'linux': 0.7, 'python': 0.6, 'ethical hacking': 0.9, 'security': 0.9},
    'aiml': {'python': 0.9, 'machine learning': 0.8, 'ai': 0.9, 'deep learning': 0.8, 'data science': 0.7},
    'ai/ml': {'python': 0.9, 'machine learning': 0.8, 'ai': 0.9, 'deep learning': 0.8, 'data science': 0.7},
    'ai ml': {'python': 0.9, 'machine learning': 0.8, 'ai': 0.9, 'deep learning': 0.8, 'data science': 0.7},
    'artificial intelligence': {'python': 0.9, 'machine learning': 0.8, 'ai': 0.9, 'deep learning': 0.8, 'data science': 0.7},
    'machine learning': {'data science': 0.9, 'python': 0.8, 'ai': 0.7, 'deep learning': 0.8},
    'data science': {'python': 0.9, 'machine learning': 0.8, 'statistics': 0.7, 'sql': 0.6},
    'mobile development': {'javascript': 0.7, 'react native': 0.9, 'flutter': 0.8, 'ios': 0.7},
    'devops': {'linux': 0.8, 'docker': 0.9, 'aws': 0.7, 'ci/cd': 0.8},
    'blockchain': {'javascript': 0.7, 'web3': 0.9, 'solidity': 0.8, 'cryptocurrency': 0.7},
    'javascript': {'web development': 0.9, 'frontend': 0.8, 'node.js': 0.7, 'react': 0.8},
    'python': {'data science': 0.8, 'backend': 0.7, 'machine learning': 0.8, 'automation': 0.6},
    'react': {'javascript': 0.9, 'frontend': 0.8, 'web development': 0.7},
    'reactjs': {'javascript': 0.9, 'frontend': 0.8, 'web development': 0.7},
    'node.js': {'javascript': 0.9, 'backend': 0.8, 'web development': 0.7},
    'nodejs': {'javascript': 0.9, 'backend': 0.8, 'web development': 0.7},
    'html': {'web development': 0.8, 'frontend': 0.9, 'css': 0.8},
    'css': {'web development': 0.8, 'frontend': 0.9, 'html': 0.8},
    'sql': {'database': 0.9, 'backend': 0.7, 'data science': 0.6},
    'java': {'backend': 0.8, 'spring': 0.9, 'enterprise': 0.7},
}

def normalize_category_name(category: str) -> str:
    """Return the canonical lowercase spelling of a category name.

    The name is lowercased and stripped of surrounding whitespace; if the
    result is one of a few known alternate spellings it is replaced by the
    canonical name. A falsy input (``None`` or ``""``) yields ``""``.
    """
    if not category:
        return ""

    lowered = category.lower()
    cleaned = lowered.strip()

    # Known alternate spellings, grouped under the name they collapse to.
    canonical_to_aliases = (
        ('aiml', ('ai/ml', 'ai ml', 'artificial intelligence')),
        ('fullstack', ('full stack',)),
        ('web development', ('web dev',)),
        ('cybersecurity', ('cyber security',)),
        ('node.js', ('nodejs',)),
        ('react', ('reactjs',)),
    )
    for canonical, aliases in canonical_to_aliases:
        if cleaned in aliases:
            return canonical

    # Not a known alias: the cleaned name is already canonical.
    return cleaned

def _clear_old_cache():
    """Empty the embedding cache when its time-to-live has run out.

    Does nothing unless more than ``CACHE_TTL`` seconds have elapsed since
    ``last_cache_clear``; otherwise clears ``embedding_cache`` and restarts
    the timer.
    """
    global last_cache_clear

    now = time.time()
    if now - last_cache_clear <= CACHE_TTL:
        return  # still within the TTL window

    embedding_cache.clear()
    last_cache_clear = now
    logger.info("Embedding cache cleared")

def get_course_embeddings_batch(courses: List[Dict]) -> Dict[str, np.ndarray]:
    """Return description embeddings for the given courses, using the cache.

    Embeddings are cached in ``embedding_cache`` under the course ID, so only
    courses whose ID is not cached yet are sent to the model, all in a single
    ``model.encode`` call. A course that appears more than once in
    ``courses`` is encoded only once.

    Args:
        courses: Course dicts. Each must have an ``'id'``; a missing or empty
            ``'description'`` is replaced by a placeholder text.

    Returns:
        Dict mapping each requested course ID to its embedding.

    Raises:
        RuntimeError: If the sentence-transformer model failed to load.
    """
    if not MODEL_LOADED:
        raise RuntimeError("AI model not loaded")

    _clear_old_cache()

    # Text to embed for every uncached course, keyed by ID so that repeated
    # courses in the input are not encoded twice.
    pending = {}
    for course in courses:
        course_id = course['id']
        if course_id in embedding_cache or course_id in pending:
            continue
        pending[course_id] = course.get('description', '') or 'No description available'

    if pending:
        logger.info(f"Generating embeddings for {len(pending)} courses")
        embeddings = model.encode(list(pending.values()))

        # `pending` iterates in the same order as the list passed to encode().
        for course_id, embedding in zip(pending, embeddings):
            embedding_cache[course_id] = embedding

    # Every requested course that now has a cached embedding.
    return {
        course['id']: embedding_cache[course['id']]
        for course in courses
        if course['id'] in embedding_cache
    }

def get_related_categories_with_scores(enrolled_categories: Set[str]) -> Dict[str, float]:
    """Score the categories related to those a student is enrolled in.

    Every enrolled category is normalised and looked up in
    ``RELATED_CATEGORIES``. If it is a known key, its related categories are
    taken with their table scores. Otherwise every known key that contains
    the category, or is contained in it, contributes its related categories
    at 70% of the table score, since a substring match is less certain.

    Categories the student is already enrolled in are never returned; that
    comparison is done on normalised names, so it ignores case and known
    alternate spellings. Blank category names are ignored.

    Args:
        enrolled_categories: Category names of the enrolled courses.

    Returns:
        Dict mapping related category name to its best score over all
        enrolled categories.
    """
    normalized_enrolled = {normalize_category_name(cat) for cat in enrolled_categories}
    # A blank name is a substring of every string, so it would "partially
    # match" every known category below.
    normalized_enrolled.discard("")

    partial_match_confidence = 0.7  # discount applied to substring matches
    related_scores: Dict[str, float] = {}

    def _merge(relations: Dict[str, float], confidence: float) -> None:
        # Fold one relation map into the result, keeping the best score seen.
        for related_cat, score in relations.items():
            if normalize_category_name(related_cat) in normalized_enrolled:
                continue
            weighted = score * confidence
            if related_cat not in related_scores or weighted > related_scores[related_cat]:
                related_scores[related_cat] = weighted

    # Sorted so the key order of the result does not depend on set ordering.
    for category in sorted(normalized_enrolled):
        if category in RELATED_CATEGORIES:
            _merge(RELATED_CATEGORIES[category], 1.0)
            continue

        # Unknown category: fall back to substring matching against the table.
        for known_category, relations in RELATED_CATEGORIES.items():
            if known_category in category or category in known_category:
                _merge(relations, partial_match_confidence)

    return related_scores

def calculate_category_relevance(course_category: str,
                               enrolled_categories: Set[str],
                               related_categories: Dict[str, float]) -> float:
    """Calculate how relevant a course category is to enrolled categories.

    All names are normalised first. Two names "match" when one contains the
    other as a substring.

    Args:
        course_category: Category of the candidate course.
        enrolled_categories: Categories the student is enrolled in.
        related_categories: Related category name -> score, as returned by
            ``get_related_categories_with_scores``.

    Returns:
        1.0 if the course category matches an enrolled category; otherwise
        the highest score among matching related categories; 0.0 if nothing
        matches or the course category is blank.
    """
    normalized_course_category = normalize_category_name(course_category)
    if not normalized_course_category:
        # A blank name is a substring of everything and would otherwise count
        # as a perfect match.
        return 0.0

    def _matches(other: str) -> bool:
        # Blank names are skipped for the same empty-substring reason.
        return bool(other) and (
            other in normalized_course_category
            or normalized_course_category in other
        )

    # Direct match with enrolled categories
    if any(_matches(normalize_category_name(cat)) for cat in enrolled_categories):
        return 1.0

    # Best-scoring related category, independent of dict ordering
    return max(
        (
            score
            for related_cat, score in related_categories.items()
            if _matches(normalize_category_name(related_cat))
        ),
        default=0.0,
    )

def recommend_courses(enrolled_courses, all_courses, top_n=5):
    """Recommend courses based on enrolled courses using multi-factor scoring.

    Every candidate (a course in ``all_courses`` the student is not enrolled
    in) receives a weighted sum, using ``SCORING_WEIGHTS``, of:

    * semantic similarity: mean cosine similarity between the candidate's
      description embedding and the enrolled courses' embeddings;
    * popularity: ``enrollment_count`` min-max normalised over the
      candidates (0.5 when all candidates share the same count);
    * category relevance: see ``calculate_category_relevance``.

    Candidates are sorted by that score and passed through
    ``_apply_diversity_boost``. With no enrolled courses, the most popular
    courses are returned instead. If scoring raises, the error is logged and
    ``_fallback_recommendations`` supplies the result.

    Args:
        enrolled_courses: List of course dicts the student is enrolled in;
            each needs ``'id'`` and ``'category'``.
        all_courses: List of all available course dicts; each needs ``'id'``
            and ``'category'``. ``'description'``, ``'enrollment_count'`` and
            ``'title'`` are optional.
        top_n: Number of recommendations to return. Zero or a negative value
            yields an empty list.

    Returns:
        List of recommended course IDs, best first.

    Raises:
        RuntimeError: If the sentence-transformer model failed to load.
    """
    if not MODEL_LOADED:
        raise RuntimeError("AI model not loaded")

    # Without this guard a negative top_n would slice from the END of the
    # list below. None is left alone so existing slice semantics still apply.
    if top_n is not None and top_n <= 0:
        return []

    def _enrollment(course):
        # A missing or None count is treated as 0 so that sorting and
        # min/max never compare None with a number.
        return course.get('enrollment_count', 0) or 0

    if not enrolled_courses:
        # If no enrolled courses, return popular courses
        sorted_courses = sorted(all_courses, key=_enrollment, reverse=True)
        return [course['id'] for course in sorted_courses[:top_n]]

    try:
        # Get enrolled categories and related categories with scores
        enrolled_categories = set(course['category'] for course in enrolled_courses)
        related_categories = get_related_categories_with_scores(enrolled_categories)
        enrolled_ids = set(course['id'] for course in enrolled_courses)

        logger.info(f"Enrolled categories: {enrolled_categories}")
        logger.info(f"Related categories: {list(related_categories.keys())}")

        # Filter out enrolled courses
        available_courses = [course for course in all_courses
                             if course['id'] not in enrolled_ids]

        if not available_courses:
            logger.warning("No available courses to recommend")
            return []

        # Get embeddings for all courses in one batch
        embeddings = get_course_embeddings_batch(enrolled_courses + available_courses)
        enrolled_embeddings = [embeddings[course['id']] for course in enrolled_courses
                               if course['id'] in embeddings]

        # Range of enrollment counts, used for min-max normalisation
        enrollment_counts = [_enrollment(course) for course in available_courses]
        max_enrollment = max(enrollment_counts)
        min_enrollment = min(enrollment_counts)
        enrollment_range = max_enrollment - min_enrollment

        # Entries are (course, combined, semantic, popularity, relevance);
        # _apply_diversity_boost relies on this 5-tuple layout.
        scored_courses = []
        for course in available_courses:
            if course['id'] not in embeddings:
                continue

            course_embedding = embeddings[course['id']]

            # Semantic similarity: mean similarity to the enrolled courses
            semantic_score = 0.0
            if enrolled_embeddings:
                similarities = cosine_similarity([course_embedding], enrolled_embeddings)[0]
                semantic_score = float(np.mean(similarities))

            # Popularity, normalised to 0-1
            if enrollment_range > 0:
                popularity_score = (_enrollment(course) - min_enrollment) / enrollment_range
            else:
                popularity_score = 0.5  # Default if all courses have same enrollment

            category_relevance = calculate_category_relevance(
                course['category'], enrolled_categories, related_categories
            )

            combined_score = (
                semantic_score * SCORING_WEIGHTS['semantic_similarity'] +
                popularity_score * SCORING_WEIGHTS['popularity'] +
                category_relevance * SCORING_WEIGHTS['category_relevance']
            )

            scored_courses.append((course, combined_score, semantic_score, popularity_score, category_relevance))

        # Sort by combined score, best first
        scored_courses.sort(key=lambda entry: entry[1], reverse=True)

        # Apply diversity boost
        final_recommendations = _apply_diversity_boost(scored_courses, top_n)

        # Log the courses that are actually returned. The diversity pass can
        # pick a different set than the plain top-N of scored_courses, so the
        # scores are looked up per returned course.
        scores_by_id = {}
        for entry in scored_courses:
            scores_by_id.setdefault(entry[0]['id'], entry[1:])

        logger.info("=== Recommendation Details ===")
        for rank, course in enumerate(final_recommendations, start=1):
            combined_score, semantic_score, popularity_score, category_relevance = scores_by_id[course['id']]
            # .get(): a missing title must not abort an otherwise valid result
            logger.info(f"{rank}. {course.get('title', course['id'])} (Category: {course['category']})")
            logger.info(f"   Score: {combined_score:.3f} (Semantic: {semantic_score:.3f}, Popularity: {popularity_score:.3f}, Category: {category_relevance:.3f})")

        return [course['id'] for course in final_recommendations]

    except Exception as e:
        # Top-level boundary: keep the traceback in the log, then degrade to
        # the simpler strategy rather than failing the request.
        logger.exception(f"Error generating recommendations: {e}")
        return _fallback_recommendations(enrolled_courses, all_courses, top_n)

def _apply_diversity_boost(scored_courses: List[Tuple], top_n: int) -> List[Dict]:
    """Pick up to ``top_n`` courses while spreading them across categories.

    Args:
        scored_courses: Tuples of ``(course, combined_score, semantic_score,
            popularity_score, category_relevance)``, already sorted best
            first. Each course dict needs ``'id'`` and ``'category'``.
        top_n: Maximum number of courses to return.

    Returns:
        Selected course dicts. The first pass walks the list in order and
        skips a course whose category is already represented unless its
        category relevance is at least 0.5. If that leaves fewer than
        ``top_n`` courses, the skipped ones are appended in score order. No
        course ID is returned twice.
    """
    if top_n <= 0:
        return []

    selected_courses = []
    selected_ids = set()  # O(1) membership instead of scanning the list
    selected_categories = set()

    for course, _combined, _semantic, _popularity, category_relevance in scored_courses:
        if len(selected_courses) >= top_n:
            break

        course_id = course['id']
        if course_id in selected_ids:
            continue  # duplicate entry for a course already chosen

        current_category = normalize_category_name(course['category'])

        # If we already have this category, skip unless it's highly relevant
        if current_category in selected_categories and category_relevance < 0.5:
            continue

        selected_courses.append(course)
        selected_ids.add(course_id)
        selected_categories.add(current_category)

    # Not enough picks: backfill with the best remaining courses regardless
    # of category. This loop is a no-op when the quota is already met.
    for entry in scored_courses:
        if len(selected_courses) >= top_n:
            break
        course = entry[0]
        if course['id'] not in selected_ids:
            selected_courses.append(course)
            selected_ids.add(course['id'])

    return selected_courses

def _fallback_recommendations(enrolled_courses: List[Dict],
                            all_courses: List[Dict], top_n: int) -> List[str]:
    """Fallback recommendation strategy when the main algorithm fails.

    Uses only categories and enrollment counts, with no embeddings:

    1. If at least ``top_n`` unenrolled courses share a category with an
       enrolled course, return the most popular of those.
    2. Otherwise add courses whose category contains a related category name
       and return the most popular of the combined set.
    3. If nothing matched at all, return the most popular unenrolled courses.

    Category comparisons are done on normalised names, so they ignore case
    and known alternate spellings. Courses with a missing or blank category
    simply never match.

    Args:
        enrolled_courses: Course dicts the student is enrolled in.
        all_courses: All available course dicts.
        top_n: Maximum number of IDs to return; non-positive yields ``[]``.

    Returns:
        List of course IDs with no duplicates, most popular first.
    """
    logger.info("Using fallback recommendation strategy")

    # Without this guard a negative top_n would slice from the end of a list.
    if top_n <= 0:
        return []

    def _popularity(course):
        # Missing or None counts sort as 0.
        return course.get('enrollment_count', 0) or 0

    def _top_ids(candidates):
        ranked = sorted(candidates, key=_popularity, reverse=True)
        return [course['id'] for course in ranked[:top_n]]

    # This runs as error recovery, so tolerate courses without a category.
    enrolled_categories = {
        course.get('category') for course in enrolled_courses if course.get('category')
    }
    normalized_enrolled = {normalize_category_name(cat) for cat in enrolled_categories}
    enrolled_ids = set(course['id'] for course in enrolled_courses)

    # Unenrolled courses, first occurrence of each ID only, paired with their
    # normalised category so it is computed once.
    seen_ids = set(enrolled_ids)
    available = []
    for course in all_courses:
        if course['id'] in seen_ids:
            continue
        seen_ids.add(course['id'])
        available.append((course, normalize_category_name(course.get('category'))))

    # Priority 1: Same categories, sorted by popularity
    category_matches = [
        course for course, category in available
        if category and category in normalized_enrolled
    ]
    if len(category_matches) >= top_n:
        return _top_ids(category_matches)

    # Priority 2: Include related categories. Courses already matched above
    # are excluded so that no ID is returned twice.
    related_categories_map = get_related_categories_with_scores(enrolled_categories)
    matched_ids = {course['id'] for course in category_matches}
    related_matches = [
        course for course, category in available
        if course['id'] not in matched_ids
        and any(related_cat in category for related_cat in related_categories_map)
    ]

    all_matches = category_matches + related_matches
    if all_matches:
        return _top_ids(all_matches)

    # Priority 3: Most popular courses overall
    return _top_ids([course for course, _category in available])

# Legacy functions for backward compatibility
def get_course_embeddings(courses):
    """Legacy alias kept for older callers; delegates to get_course_embeddings_batch."""
    embeddings_by_id = get_course_embeddings_batch(courses)
    return embeddings_by_id

def get_related_categories(enrolled_categories):
    """Legacy wrapper: return only the related category names, without scores."""
    category_set = set(enrolled_categories)
    scores_by_category = get_related_categories_with_scores(category_set)
    return [name for name in scores_by_category]

def rank_within_category(category_courses, enrolled_courses, all_courses, top_n):
    """Legacy function for backward compatibility - simplified version.

    Runs ``recommend_courses`` over the courses of ``all_courses`` whose ID
    appears in ``category_courses`` and returns the recommended courses as
    dicts taken from ``category_courses``.

    Args:
        category_courses: Candidate course dicts to rank; each needs ``'id'``.
        enrolled_courses: Course dicts the student is enrolled in.
        all_courses: All available course dicts.
        top_n: Number of courses to return.

    Returns:
        List of course dicts from ``category_courses``, best first; empty if
        ``category_courses`` is empty.
    """
    if not category_courses:
        return []

    # Index the candidates by ID once. Filtering by ID is a constant-time
    # lookup per course, whereas `course in category_courses` compares whole
    # dicts against every list element.
    course_map = {course['id']: course for course in category_courses}
    all_courses_filtered = [course for course in all_courses if course['id'] in course_map]

    recommendations = recommend_courses(enrolled_courses, all_courses_filtered, top_n)

    # Convert back to course objects
    return [course_map[course_id] for course_id in recommendations if course_id in course_map]

def rank_other_courses(other_courses, enrolled_courses, all_courses, top_n):
    """Legacy function for backward compatibility - simplified version.

    Runs ``recommend_courses`` over the courses of ``all_courses`` whose ID
    appears in ``other_courses`` and returns the recommended courses as dicts
    taken from ``other_courses``.

    Args:
        other_courses: Candidate course dicts to rank; each needs ``'id'``.
        enrolled_courses: Course dicts the student is enrolled in.
        all_courses: All available course dicts.
        top_n: Number of courses to return.

    Returns:
        List of course dicts from ``other_courses``, best first; empty if
        ``other_courses`` is empty or ``top_n`` is not positive.
    """
    if not other_courses or top_n <= 0:
        return []

    # Index the candidates by ID once. Filtering by ID is a constant-time
    # lookup per course, whereas `course in other_courses` compares whole
    # dicts against every list element.
    course_map = {course['id']: course for course in other_courses}
    all_courses_filtered = [course for course in all_courses if course['id'] in course_map]

    recommendations = recommend_courses(enrolled_courses, all_courses_filtered, top_n)

    # Convert back to course objects
    return [course_map[course_id] for course_id in recommendations if course_id in course_map]