Vennilavan committed
Commit 631eb6a · verified · 1 Parent(s): e03b1b6

Upload 9 files

Files changed (9)
  1. .dockerignore +33 -0
  2. .env +8 -0
  3. Dockerfile +21 -0
  4. app.py +207 -0
  5. recommendation.py +400 -0
  6. requirements.txt +12 -0
  7. summarizer.py +28 -0
  8. transcriber.py +17 -0
  9. utils.py +42 -0
.dockerignore ADDED
@@ -0,0 +1,33 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ env/
+ venv/
+ pip-log.txt
+ pip-delete-this-directory.txt
+ .tox
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.log
+ .git
+ .mypy_cache/
+ .pytest_cache/
+ .history
+ .DS_Store
+ *.mp4
+ *.avi
+ *.mov
+ *.mkv
+ *.wav
+ temp_*
+ test_*
+ .gitignore
+ README.md
+ Dockerfile
+ .dockerignore
.env ADDED
@@ -0,0 +1,8 @@
+ # Python Server Environment Variables
+
+ # Server URLs
+ FRONTEND_URL=https://vortex-ai-powered-lms.vercel.app
+ BACKEND_URL=https://vortex-ai-powered-lms-production.up.railway.app
+
+
+
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ FROM python:3.9-slim
+
+ WORKDIR /code
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Expose port
+ EXPOSE 7860
+
+ # Start the application
+ CMD uvicorn app:app --host 0.0.0.0 --port 7860
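For local testing, the image could be built and run with something like `docker build -t video-summarizer .` followed by `docker run -p 7860:7860 video-summarizer` (the `video-summarizer` tag is an arbitrary name, not part of this commit). Port 7860 is the default port a Hugging Face Space expects the container to listen on, which is presumably why it is exposed here.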
app.py ADDED
@@ -0,0 +1,207 @@
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Query
+ from fastapi.middleware.cors import CORSMiddleware
+ import uvicorn
+ import os
+ import tempfile
+ import aiofiles
+ from datetime import datetime
+ import traceback
+ import logging
+ from typing import List, Dict, Any
+ import httpx
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI(title="Video Summarizer API")
+
+ # Load environment variables
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # Get URLs from environment
+ FRONTEND_URL = os.getenv('FRONTEND_URL')
+ BACKEND_URL = os.getenv('BACKEND_URL', 'http://localhost:5000')
+
+ # CORS middleware (filter out unset URLs so None never reaches allow_origins)
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[url for url in (FRONTEND_URL, BACKEND_URL) if url],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Import processing functions with error handling
+ try:
+     from transcriber import extract_audio, transcribe_audio
+     from summarizer import summarize_text
+     from recommendation import recommend_courses
+     from utils import chunked_summarize
+     DEPENDENCIES_LOADED = True
+     logger.info("All AI dependencies loaded successfully")
+ except ImportError as e:
+     logger.error(f"Import error: {e}")
+     DEPENDENCIES_LOADED = False
+
+ @app.get("/")
+ async def root():
+     return {"message": "Video Summarizer API", "status": "running"}
+
+ @app.get("/health")
+ async def health_check():
+     status = "healthy" if DEPENDENCIES_LOADED else "missing_dependencies"
+     return {
+         "status": status,
+         "service": "python-video-processor",
+         "dependencies_loaded": DEPENDENCIES_LOADED
+     }
+
+ @app.post("/process-video")
+ async def process_video(video: UploadFile = File(...)):
+     if not DEPENDENCIES_LOADED:
+         raise HTTPException(
+             status_code=500,
+             detail="Required AI dependencies not loaded. Check server logs."
+         )
+
+     temp_video_path = None
+     audio_path = "temp_audio.wav"
+
+     try:
+         # Validate file type
+         allowed_extensions = {'.mp4', '.avi', '.mov', '.mkv', '.wmv'}
+         file_extension = os.path.splitext(video.filename)[1].lower()
+         if file_extension not in allowed_extensions:
+             raise HTTPException(
+                 status_code=400,
+                 detail=f"Invalid video format. Allowed: {', '.join(allowed_extensions)}"
+             )
+
+         # Create temporary file (basename guards against path traversal in the upload name)
+         temp_video_path = f"temp_{os.path.basename(video.filename)}"
+
+         # Save uploaded file
+         logger.info(f"Saving uploaded file: {video.filename}")
+         async with aiofiles.open(temp_video_path, 'wb') as out_file:
+             content = await video.read()
+             await out_file.write(content)
+
+         start_time = datetime.now()
+
+         # 1. Extract audio
+         logger.info("Step 1: Extracting audio from video...")
+         if not os.path.exists(temp_video_path):
+             raise HTTPException(status_code=500, detail="Video file not found after upload")
+
+         extract_audio(temp_video_path, audio_path)
+
+         if not os.path.exists(audio_path):
+             raise HTTPException(status_code=500, detail="Audio extraction failed")
+
+         # 2. Transcribe audio
+         logger.info("Step 2: Transcribing audio...")
+         transcript = transcribe_audio(audio_path, model_size="base")
+         logger.info(f"Transcript length: {len(transcript)} characters")
+
+         if not transcript or len(transcript.strip()) < 10:
+             raise HTTPException(status_code=500, detail="Transcription failed or too short")
+
+         # 3. Summarize text with chunking
+         logger.info("Step 3: Generating summary...")
+         final_summary = chunked_summarize(
+             text=transcript,
+             summarize_func=lambda text: summarize_text(text, model_name="facebook/bart-large-cnn"),
+             max_chunk_size=1500
+         )
+
+         if not final_summary or len(final_summary.strip()) < 10:
+             raise HTTPException(status_code=500, detail="Summary generation failed")
+
+         processing_time = (datetime.now() - start_time).total_seconds()
+
+         logger.info(f"Processing completed in {processing_time:.2f} seconds")
+
+         return {
+             "success": True,
+             "summary": final_summary,
+             "transcript": transcript,
+             "processing_time": processing_time
+         }
+
+     except Exception as e:
+         logger.error(f"Error processing video: {str(e)}")
+         logger.error(traceback.format_exc())
+         raise HTTPException(
+             status_code=500,
+             detail=f"Processing failed: {str(e)}"
+         )
+
+     finally:
+         # Cleanup temporary files
+         try:
+             if temp_video_path and os.path.exists(temp_video_path):
+                 os.remove(temp_video_path)
+                 logger.info(f"Cleaned up: {temp_video_path}")
+             if os.path.exists(audio_path):
+                 os.remove(audio_path)
+                 logger.info(f"Cleaned up: {audio_path}")
+         except Exception as cleanup_error:
+             logger.error(f"Cleanup error: {cleanup_error}")
+
+ @app.post("/recommend-courses")
+ async def get_course_recommendations(
+     enrolled_courses: List[Dict[str, Any]],
+     all_courses: List[Dict[str, Any]],
+     top_n: int = Query(5, description="Number of recommendations to return")
+ ):
+     """
+     Get course recommendations based on enrolled courses using AI semantic similarity
+     """
+     if not DEPENDENCIES_LOADED:
+         raise HTTPException(
+             status_code=500,
+             detail="Required AI dependencies not loaded. Check server logs."
+         )
+
+     try:
+         logger.info(f"Generating recommendations for {len(enrolled_courses)} enrolled courses from {len(all_courses)} total courses")
+
+         recommended_ids = recommend_courses(enrolled_courses, all_courses, top_n)
+
+         # Get the recommended course details
+         recommended_courses = [course for course in all_courses if course['id'] in recommended_ids]
+
+         logger.info(f"Successfully generated {len(recommended_courses)} recommendations")
+
+         return {
+             "success": True,
+             "recommendations": recommended_courses,
+             "count": len(recommended_courses)
+         }
+
+     except Exception as e:
+         logger.error(f"Error generating recommendations: {str(e)}")
+         logger.error(traceback.format_exc())
+         raise HTTPException(
+             status_code=500,
+             detail=f"Recommendation generation failed: {str(e)}"
+         )
+
+ if __name__ == "__main__":
+     logger.info("Starting Python Video Summarizer Server...")
+     logger.info("Dependencies loaded: %s", DEPENDENCIES_LOADED)
+
+     if not DEPENDENCIES_LOADED:
+         logger.error("CRITICAL: AI dependencies not loaded. Video processing will not work!")
+         logger.error("Please check that openai-whisper, transformers, and torch are installed.")
+
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run(
+         "app:app",
+         host="0.0.0.0",
+         port=port,
+         reload=False
+     )
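For context, here is a minimal client sketch (not part of the upload) showing how the two endpoints could be called. It assumes the API is reachable at http://localhost:7860 and that a local `sample.mp4` exists; the JSON keys follow the endpoint signatures above, where the two list parameters become top-level body fields and `top_n` is a query parameter.

```python
# Hypothetical client sketch -- file names and course data are illustrative.
import httpx

BASE = "http://localhost:7860"  # assumed local deployment

# Summarize a video (multipart upload; long timeout since model inference is slow)
with open("sample.mp4", "rb") as f:
    resp = httpx.post(f"{BASE}/process-video",
                      files={"video": ("sample.mp4", f, "video/mp4")},
                      timeout=600.0)
resp.raise_for_status()
print(resp.json()["summary"])

# Request course recommendations (both course lists go in the JSON body)
payload = {
    "enrolled_courses": [{"id": "c1", "title": "Intro to Python",
                          "category": "python",
                          "description": "Learn Python basics",
                          "enrollment_count": 120}],
    "all_courses": [{"id": "c2", "title": "Machine Learning 101",
                     "category": "machine learning",
                     "description": "Supervised learning fundamentals",
                     "enrollment_count": 300}],
}
resp = httpx.post(f"{BASE}/recommend-courses", params={"top_n": 3}, json=payload)
print(resp.json()["recommendations"])
```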
recommendation.py ADDED
@@ -0,0 +1,400 @@
+ # Compatibility fix for huggingface_hub - MUST BE AT TOP
+ import sys
+ try:
+     from huggingface_hub import snapshot_download
+ except ImportError:
+     try:
+         from huggingface_hub import cached_download as snapshot_download
+     except ImportError:
+         from huggingface_hub import hf_hub_download as snapshot_download
+
+ from transformers import pipeline
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ import logging
+ from typing import List, Dict, Set, Tuple, Optional
+ import time
+ import re
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Initialize the sentence transformer model for semantic similarity
+ try:
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     MODEL_LOADED = True
+     logger.info("Sentence transformer model loaded successfully")
+ except Exception as e:
+     logger.error(f"Failed to load sentence transformer model: {e}")
+     MODEL_LOADED = False
+
+ # Embedding cache for performance
+ embedding_cache = {}
+ last_cache_clear = time.time()
+ CACHE_TTL = 3600  # Clear cache every hour
+
+ # Configurable weights for scoring
+ SCORING_WEIGHTS = {
+     'semantic_similarity': 0.5,
+     'popularity': 0.2,
+     'category_relevance': 0.3  # Increased weight for category relevance
+ }
+
+ # Enhanced category relationships with case-insensitive matching
+ RELATED_CATEGORIES = {
+     # Standardized category names (lowercase)
+     'fullstack': {'web development': 1.0, 'frontend': 0.9, 'backend': 0.9, 'javascript': 0.8, 'react': 0.7, 'node.js': 0.7, 'php': 0.8},
+     'full stack': {'web development': 1.0, 'frontend': 0.9, 'backend': 0.9, 'javascript': 0.8, 'react': 0.7, 'node.js': 0.7, 'php': 0.8},
+     'php': {'web development': 0.9, 'backend': 0.8, 'fullstack': 0.7, 'mysql': 0.7, 'laravel': 0.6},
+     'web development': {'fullstack': 1.0, 'frontend': 0.8, 'backend': 0.8, 'javascript': 0.9, 'html': 0.7, 'php': 0.8},
+     'web dev': {'fullstack': 1.0, 'frontend': 0.8, 'backend': 0.8, 'javascript': 0.9, 'html': 0.7, 'php': 0.8},
+     'frontend': {'web development': 0.9, 'html': 0.8, 'css': 0.8, 'javascript': 0.9, 'react': 0.8},
+     'backend': {'web development': 0.9, 'node.js': 0.8, 'python': 0.7, 'database': 0.8, 'api': 0.7, 'php': 0.8},
+     'cybersecurity': {'networking': 0.8, 'linux': 0.7, 'python': 0.6, 'ethical hacking': 0.9, 'security': 0.9},
+     'cyber security': {'networking': 0.8, 'linux': 0.7, 'python': 0.6, 'ethical hacking': 0.9, 'security': 0.9},
+     'aiml': {'python': 0.9, 'machine learning': 0.8, 'ai': 0.9, 'deep learning': 0.8, 'data science': 0.7},
+     'ai/ml': {'python': 0.9, 'machine learning': 0.8, 'ai': 0.9, 'deep learning': 0.8, 'data science': 0.7},
+     'ai ml': {'python': 0.9, 'machine learning': 0.8, 'ai': 0.9, 'deep learning': 0.8, 'data science': 0.7},
+     'artificial intelligence': {'python': 0.9, 'machine learning': 0.8, 'ai': 0.9, 'deep learning': 0.8, 'data science': 0.7},
+     'machine learning': {'data science': 0.9, 'python': 0.8, 'ai': 0.7, 'deep learning': 0.8},
+     'data science': {'python': 0.9, 'machine learning': 0.8, 'statistics': 0.7, 'sql': 0.6},
+     'mobile development': {'javascript': 0.7, 'react native': 0.9, 'flutter': 0.8, 'ios': 0.7},
+     'devops': {'linux': 0.8, 'docker': 0.9, 'aws': 0.7, 'ci/cd': 0.8},
+     'blockchain': {'javascript': 0.7, 'web3': 0.9, 'solidity': 0.8, 'cryptocurrency': 0.7},
+     'javascript': {'web development': 0.9, 'frontend': 0.8, 'node.js': 0.7, 'react': 0.8},
+     'python': {'data science': 0.8, 'backend': 0.7, 'machine learning': 0.8, 'automation': 0.6},
+     'react': {'javascript': 0.9, 'frontend': 0.8, 'web development': 0.7},
+     'reactjs': {'javascript': 0.9, 'frontend': 0.8, 'web development': 0.7},
+     'node.js': {'javascript': 0.9, 'backend': 0.8, 'web development': 0.7},
+     'nodejs': {'javascript': 0.9, 'backend': 0.8, 'web development': 0.7},
+     'html': {'web development': 0.8, 'frontend': 0.9, 'css': 0.8},
+     'css': {'web development': 0.8, 'frontend': 0.9, 'html': 0.8},
+     'sql': {'database': 0.9, 'backend': 0.7, 'data science': 0.6},
+     'java': {'backend': 0.8, 'spring': 0.9, 'enterprise': 0.7},
+ }
+
+ def normalize_category_name(category: str) -> str:
+     """Normalize category name to lowercase and handle common variations"""
+     if not category:
+         return ""
+
+     # Convert to lowercase and strip whitespace
+     normalized = category.lower().strip()
+
+     # Handle common variations
+     variations = {
+         'ai/ml': 'aiml',
+         'ai ml': 'aiml',
+         'artificial intelligence': 'aiml',
+         'full stack': 'fullstack',
+         'web dev': 'web development',
+         'cyber security': 'cybersecurity',
+         'nodejs': 'node.js',
+         'reactjs': 'react'
+     }
+
+     return variations.get(normalized, normalized)
+
+ def _clear_old_cache():
+     """Clear cache if TTL has expired"""
+     global last_cache_clear
+     current_time = time.time()
+     if current_time - last_cache_clear > CACHE_TTL:
+         embedding_cache.clear()
+         last_cache_clear = current_time
+         logger.info("Embedding cache cleared")
+
+ def get_course_embeddings_batch(courses: List[Dict]) -> Dict[str, np.ndarray]:
+     """Generate embeddings for multiple courses with caching"""
+     if not MODEL_LOADED:
+         raise Exception("AI model not loaded")
+
+     _clear_old_cache()
+
+     # Find courses that need embedding
+     courses_to_embed = []
+     course_ids_to_embed = []
+
+     for course in courses:
+         course_id = course['id']
+         if course_id not in embedding_cache:
+             courses_to_embed.append(course)
+             course_ids_to_embed.append(course_id)
+
+     # Generate embeddings for new courses
+     if courses_to_embed:
+         descriptions = [course.get('description', '') or 'No description available'
+                         for course in courses_to_embed]
+
+         logger.info(f"Generating embeddings for {len(courses_to_embed)} courses")
+         embeddings = model.encode(descriptions)
+
+         # Cache the new embeddings
+         for course_id, embedding in zip(course_ids_to_embed, embeddings):
+             embedding_cache[course_id] = embedding
+
+     # Return all requested embeddings
+     result = {}
+     for course in courses:
+         course_id = course['id']
+         if course_id in embedding_cache:
+             result[course_id] = embedding_cache[course_id]
+
+     return result
+
+ def get_related_categories_with_scores(enrolled_categories: Set[str]) -> Dict[str, float]:
+     """
+     Get related categories with similarity scores based on enrolled categories
+     """
+     related_scores = {}
+
+     for category in enrolled_categories:
+         normalized_category = normalize_category_name(category)
+
+         # Try exact match first
+         if normalized_category in RELATED_CATEGORIES:
+             for related_cat, score in RELATED_CATEGORIES[normalized_category].items():
+                 if related_cat not in enrolled_categories:
+                     if related_cat in related_scores:
+                         related_scores[related_cat] = max(related_scores[related_cat], score)
+                     else:
+                         related_scores[related_cat] = score
+         else:
+             # Try partial matching for unknown categories
+             for known_category, relations in RELATED_CATEGORIES.items():
+                 if known_category in normalized_category or normalized_category in known_category:
+                     for related_cat, score in relations.items():
+                         if related_cat not in enrolled_categories:
+                             if related_cat in related_scores:
+                                 related_scores[related_cat] = max(related_scores[related_cat], score * 0.7)  # Lower confidence for partial matches
+                             else:
+                                 related_scores[related_cat] = score * 0.7
+
+     return related_scores
+
+ def calculate_category_relevance(course_category: str,
+                                  enrolled_categories: Set[str],
+                                  related_categories: Dict[str, float]) -> float:
+     """Calculate how relevant a course category is to enrolled categories"""
+     normalized_course_category = normalize_category_name(course_category)
+     normalized_enrolled_categories = {normalize_category_name(cat) for cat in enrolled_categories}
+
+     # Direct match with enrolled categories
+     for enrolled_cat in normalized_enrolled_categories:
+         if enrolled_cat in normalized_course_category or normalized_course_category in enrolled_cat:
+             return 1.0
+
+     # Check related categories
+     for related_cat, score in related_categories.items():
+         normalized_related_cat = normalize_category_name(related_cat)
+         if normalized_related_cat in normalized_course_category or normalized_course_category in normalized_related_cat:
+             return score
+
+     return 0.0  # No relevance
+
+ def recommend_courses(enrolled_courses, all_courses, top_n=5):
+     """
+     Recommend courses based on enrolled courses using multi-factor scoring
+
+     Args:
+         enrolled_courses: List of courses the student is enrolled in
+         all_courses: List of all available courses
+         top_n: Number of recommendations to return
+
+     Returns:
+         List of recommended course IDs
+     """
+     if not MODEL_LOADED:
+         raise Exception("AI model not loaded")
+
+     if not enrolled_courses:
+         # If no enrolled courses, return popular courses
+         sorted_courses = sorted(all_courses,
+                                 key=lambda x: x.get('enrollment_count', 0),
+                                 reverse=True)
+         return [course['id'] for course in sorted_courses[:top_n]]
+
+     try:
+         # Get enrolled categories and related categories with scores
+         enrolled_categories = set(course['category'] for course in enrolled_courses)
+         related_categories = get_related_categories_with_scores(enrolled_categories)
+         enrolled_ids = set(course['id'] for course in enrolled_courses)
+
+         logger.info(f"Enrolled categories: {enrolled_categories}")
+         logger.info(f"Related categories: {list(related_categories.keys())}")
+
+         # Filter out enrolled courses
+         available_courses = [course for course in all_courses
+                              if course['id'] not in enrolled_ids]
+
+         if not available_courses:
+             logger.warning("No available courses to recommend")
+             return []
+
+         # Get embeddings for all courses in batch
+         all_courses_for_embedding = enrolled_courses + available_courses
+         embeddings = get_course_embeddings_batch(all_courses_for_embedding)
+
+         # Calculate scores for each available course
+         scored_courses = []
+         enrolled_embeddings = [embeddings[course['id']] for course in enrolled_courses
+                                if course['id'] in embeddings]
+
+         # Calculate popularity scores more robustly
+         enrollment_counts = [course.get('enrollment_count', 0) for course in available_courses]
+         max_enrollment = max(enrollment_counts) if enrollment_counts else 1
+         min_enrollment = min(enrollment_counts) if enrollment_counts else 0
+
+         for course in available_courses:
+             if course['id'] not in embeddings:
+                 continue
+
+             course_embedding = embeddings[course['id']]
+
+             # Calculate semantic similarity
+             semantic_score = 0.0
+             if enrolled_embeddings:
+                 similarities = cosine_similarity([course_embedding], enrolled_embeddings)[0]
+                 semantic_score = float(np.mean(similarities))
+
+             # Calculate robust popularity score (normalized 0-1)
+             enrollment_count = course.get('enrollment_count', 0)
+             if max_enrollment > min_enrollment:
+                 popularity_score = (enrollment_count - min_enrollment) / (max_enrollment - min_enrollment)
+             else:
+                 popularity_score = 0.5  # Default if all courses have same enrollment
+
+             # Calculate category relevance
+             category_relevance = calculate_category_relevance(
+                 course['category'], enrolled_categories, related_categories
+             )
+
+             # Combined score with category relevance having more weight
+             combined_score = (
+                 semantic_score * SCORING_WEIGHTS['semantic_similarity'] +
+                 popularity_score * SCORING_WEIGHTS['popularity'] +
+                 category_relevance * SCORING_WEIGHTS['category_relevance']
+             )
+
+             scored_courses.append((course, combined_score, semantic_score, popularity_score, category_relevance))
+
+         # Sort by combined score
+         scored_courses.sort(key=lambda x: x[1], reverse=True)
+
+         # Apply diversity boost
+         final_recommendations = _apply_diversity_boost(scored_courses, top_n)
+
+         # Log recommendation details
+         logger.info("=== Recommendation Details ===")
+         for i, (course, combined_score, semantic_score, popularity_score, category_relevance) in enumerate(scored_courses[:top_n]):
+             logger.info(f"{i+1}. {course['title']} (Category: {course['category']})")
+             logger.info(f"   Score: {combined_score:.3f} (Semantic: {semantic_score:.3f}, Popularity: {popularity_score:.3f}, Category: {category_relevance:.3f})")
+
+         return [course['id'] for course in final_recommendations]
+
+     except Exception as e:
+         logger.error(f"Error generating recommendations: {e}")
+         return _fallback_recommendations(enrolled_courses, all_courses, top_n)
+
+ def _apply_diversity_boost(scored_courses: List[Tuple], top_n: int) -> List[Dict]:
+     """Ensure recommendations cover different categories"""
+     selected_courses = []
+     selected_categories = set()
+
+     for course, combined_score, semantic_score, popularity_score, category_relevance in scored_courses:
+         if len(selected_courses) >= top_n:
+             break
+
+         current_category = normalize_category_name(course['category'])
+
+         # If we already have this category, skip unless it's highly relevant
+         if current_category in selected_categories and category_relevance < 0.5:
+             continue
+
+         selected_courses.append(course)
+         selected_categories.add(current_category)
+
+     # If we don't have enough recommendations, add the highest scoring ones regardless of category
+     if len(selected_courses) < top_n:
+         remaining_slots = top_n - len(selected_courses)
+         for course, combined_score, semantic_score, popularity_score, category_relevance in scored_courses:
+             if course not in selected_courses:
+                 selected_courses.append(course)
+                 remaining_slots -= 1
+                 if remaining_slots <= 0:
+                     break
+
+     return selected_courses[:top_n]
+
+ def _fallback_recommendations(enrolled_courses: List[Dict],
+                               all_courses: List[Dict], top_n: int) -> List[str]:
+     """Fallback recommendation strategy when main algorithm fails"""
+     logger.info("Using fallback recommendation strategy")
+
+     enrolled_categories = set(course['category'] for course in enrolled_courses)
+     enrolled_ids = set(course['id'] for course in enrolled_courses)
+
+     # Priority 1: Same categories, sorted by popularity
+     category_matches = [
+         course for course in all_courses
+         if course['category'] in enrolled_categories and course['id'] not in enrolled_ids
+     ]
+
+     if len(category_matches) >= top_n:
+         category_matches.sort(key=lambda x: x.get('enrollment_count', 0), reverse=True)
+         return [course['id'] for course in category_matches[:top_n]]
+
+     # Priority 2: Include related categories
+     related_categories_map = get_related_categories_with_scores(enrolled_categories)
+     related_matches = [
+         course for course in all_courses
+         if any(related_cat in course['category'] for related_cat in related_categories_map) and course['id'] not in enrolled_ids
+     ]
+
+     all_matches = category_matches + related_matches
+     if all_matches:
+         all_matches.sort(key=lambda x: x.get('enrollment_count', 0), reverse=True)
+         return [course['id'] for course in all_matches[:top_n]]
+
+     # Priority 3: Most popular courses overall
+     available_courses = [course for course in all_courses if course['id'] not in enrolled_ids]
+     available_courses.sort(key=lambda x: x.get('enrollment_count', 0), reverse=True)
+     return [course['id'] for course in available_courses[:top_n]]
+
+ # Legacy functions for backward compatibility
+ def get_course_embeddings(courses):
+     """Legacy function for backward compatibility"""
+     return get_course_embeddings_batch(courses)
+
+ def get_related_categories(enrolled_categories):
+     """Legacy function for backward compatibility"""
+     related_scores = get_related_categories_with_scores(set(enrolled_categories))
+     return list(related_scores.keys())
+
+ def rank_within_category(category_courses, enrolled_courses, all_courses, top_n):
+     """Legacy function for backward compatibility - simplified version"""
+     if not category_courses:
+         return []
+
+     # Use the main recommendation function but filter for category courses
+     all_courses_filtered = [course for course in all_courses if course in category_courses]
+     recommendations = recommend_courses(enrolled_courses, all_courses_filtered, top_n)
+
+     # Convert back to course objects
+     course_map = {course['id']: course for course in category_courses}
+     return [course_map[course_id] for course_id in recommendations if course_id in course_map]
+
+ def rank_other_courses(other_courses, enrolled_courses, all_courses, top_n):
+     """Legacy function for backward compatibility - simplified version"""
+     if not other_courses or top_n <= 0:
+         return []
+
+     # Use the main recommendation function but filter for other courses
+     all_courses_filtered = [course for course in all_courses if course in other_courses]
+     recommendations = recommend_courses(enrolled_courses, all_courses_filtered, top_n)
+
+     # Convert back to course objects
+     course_map = {course['id']: course for course in other_courses}
+     return [course_map[course_id] for course_id in recommendations if course_id in course_map]
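A quick illustration of calling `recommend_courses` directly, with made-up course dicts (the module itself only relies on the `id`, `category`, `description`, and `enrollment_count` keys, plus `title` for logging). Importing `recommendation` downloads the `all-MiniLM-L6-v2` model on first run.

```python
# Illustrative usage -- sample data, not part of this commit.
from recommendation import recommend_courses

enrolled = [
    {"id": "c1", "title": "PHP for Beginners", "category": "PHP",
     "description": "Server-side scripting with PHP and MySQL.",
     "enrollment_count": 80},
]
catalog = enrolled + [
    {"id": "c2", "title": "Full Stack Web Development", "category": "Fullstack",
     "description": "Build complete web apps with JavaScript and Node.js.",
     "enrollment_count": 250},
    {"id": "c3", "title": "Intro to Cybersecurity", "category": "Cybersecurity",
     "description": "Networking, Linux, and ethical hacking fundamentals.",
     "enrollment_count": 140},
]

# Enrolled courses are filtered out; returns course IDs, best combined score first.
print(recommend_courses(enrolled, catalog, top_n=2))
```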
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fastapi==0.104.1
+ uvicorn==0.24.0
+ python-multipart==0.0.6
+ aiofiles==23.2.1
+ python-dotenv==1.0.0
+ httpx==0.25.2
+ sentence-transformers==2.2.2
+ scikit-learn==1.3.2
+ torch==2.1.0
+ transformers==4.35.2
+ openai-whisper==20231117
+ accelerate>=0.20.0
summarizer.py ADDED
@@ -0,0 +1,28 @@
+ from transformers import pipeline
+
+ def summarize_text(text: str, model_name: str = "facebook/bart-large-cnn", max_length: int = 300, min_length: int = 100) -> str:
+     try:
+         summarizer = pipeline("summarization", model=model_name)
+
+         # If text is too short, return as is
+         if len(text.split()) < 50:
+             return text
+
+         # Calculate appropriate max_length based on input
+         input_length = len(text.split())
+         adjusted_max_length = min(max_length, input_length // 2)
+         adjusted_min_length = min(min_length, adjusted_max_length // 3)
+
+         summary = summarizer(
+             text,
+             max_length=adjusted_max_length,
+             min_length=adjusted_min_length,
+             do_sample=False,
+             truncation=True
+         )
+         return summary[0]['summary_text']
+     except Exception as e:
+         print(f"Summarization error: {e}")
+         # Fallback: return the first part of the text
+         sentences = text.split('.')
+         return '. '.join(sentences[:3]) + '.'
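A minimal check of `summarize_text`, assuming the BART weights can be downloaded on first use. Note the function returns inputs under 50 words unchanged, so the demo repeats a sentence to cross that threshold.

```python
# Illustrative call -- sample text, not part of this commit.
from summarizer import summarize_text

# Repeat a ~22-word sentence six times (~132 words) so real summarization runs.
text = ("The lecture covers gradient descent, explaining how the learning rate "
        "controls step size and why too large a value causes divergence. ") * 6
print(summarize_text(text, max_length=60, min_length=15))
```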
transcriber.py ADDED
@@ -0,0 +1,17 @@
+ import subprocess
+ import whisper
+ import os
+
+ def extract_audio(video_path: str, audio_path: str = "temp_audio.wav") -> str:
+     if os.path.exists(audio_path):
+         os.remove(audio_path)
+
+     command = ["ffmpeg", "-i", video_path, "-q:a", "0", "-map", "a", audio_path, "-y"]
+     subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
+     return audio_path
+
+ def transcribe_audio(audio_path: str, model_size: str = "base") -> str:
+     model = whisper.load_model(model_size)
+     result = model.transcribe(audio_path)
+     transcript = result["text"]
+     return transcript
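An example run of the two-step pipeline, assuming `ffmpeg` is on the PATH (the Dockerfile installs it) and a local `lecture.mp4` exists; Whisper downloads the `base` model on first use.

```python
# Illustrative usage -- file name is hypothetical.
from transcriber import extract_audio, transcribe_audio

wav = extract_audio("lecture.mp4", "temp_audio.wav")  # ffmpeg pulls the audio track
text = transcribe_audio(wav, model_size="base")       # Whisper speech-to-text
print(text[:200])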
utils.py ADDED
@@ -0,0 +1,42 @@
+ def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> list:
+     chunks = []
+     start = 0
+     text_length = len(text)
+
+     # If text is shorter than chunk_size, return as single chunk
+     if text_length <= chunk_size:
+         return [text]
+
+     while start < text_length:
+         end = min(start + chunk_size, text_length)
+         chunk = text[start:end]
+         chunks.append(chunk)
+         start += chunk_size - overlap
+
+         # Prevent infinite loop
+         if start >= text_length:
+             break
+
+     return chunks
+
+ def chunked_summarize(text: str, summarize_func, max_chunk_size: int = 1500) -> str:
+     if len(text) <= max_chunk_size:
+         return summarize_func(text)
+
+     text_chunks = chunk_text(text, chunk_size=max_chunk_size, overlap=200)
+     print(f"Processing {len(text_chunks)} chunks...")
+
+     partial_summaries = []
+     for i, chunk in enumerate(text_chunks):
+         print(f"Summarizing chunk {i+1}/{len(text_chunks)}...")
+         summary = summarize_func(chunk)
+         partial_summaries.append(summary)
+
+     combined_summary_input = " ".join(partial_summaries)
+
+     # Final summarization if combined text is still long
+     if len(combined_summary_input) > max_chunk_size:
+         print("Final summarization of combined chunks...")
+         return summarize_func(combined_summary_input)
+
+     return combined_summary_input
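The chunking flow can be exercised without any model by passing a stub in place of `summarize_text`; the stub below is purely illustrative.

```python
# Demonstrates overlapping chunking and map-reduce summarization with a stub.
from utils import chunk_text, chunked_summarize

long_text = "word " * 2000  # ~10,000 characters, forces multiple chunks

def fake_summarize(text: str) -> str:
    return text[:100]  # stand-in for the real summarizer

print(len(chunk_text(long_text, chunk_size=1500, overlap=200)))  # chunk count
print(chunked_summarize(long_text, fake_summarize, max_chunk_size=1500)[:120])
```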