updated v2 api combined
main.py (CHANGED)
@@ -154,15 +154,72 @@ async def crawl_web(request: CrawlerRequest):
     except Exception as e:
         logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
         raise HTTPException(status_code=500, detail=str(e))
-
+
+# # Updated Pydantic models
+# class ScrapeAndCrawlRequest(BaseModel):
+#     url: str
+#     college_name: str
+#     topic_title: str
+#     model_name: str = "gemini-1.5-pro"  # Default to 'gemini-1.5-pro'
+#     num_results: int = 5  # Default number of results to fetch from Google, Quora, Reddit
+
+# # Combined API endpoint
+# @app.post("/scrape-and-crawl")
+# async def scrape_and_crawl(
+#     request: ScrapeAndCrawlRequest,
+#     x_api_key: Optional[str] = Header(None)  # API key to be passed in the request header
+# ):
+#     try:
+#         if not x_api_key:
+#             raise HTTPException(status_code=400, detail="API key is missing from the header")
+
+#         logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
+
+#         # Configure Google Generative AI API key from header
+#         genai.configure(api_key=x_api_key)
+
+#         # Scrape visible text from the provided URL asynchronously
+#         visible_text = await scrape_visible_text(request.url)
+
+#         # Structure the scraped data using the specified model from the request
+#         structured_data = structure_data(visible_text, request.college_name)
+
+#         # Perform web crawling to get related links with customizable result count
+#         google_links = google_search(request.topic_title, num_results=request.num_results)
+#         quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
+#         reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
+
+#         # Combine all links into one list
+#         all_links = google_links + quora_links + reddit_links
+
+#         # Use the specified model to filter and get the most relevant URLs
+#         prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
+#         model = genai.GenerativeModel(request.model_name)
+#         response = model.generate_content(prompt)
+#         filtered_links = response.text.strip().split('\n')
+
+#         # Return the combined structured data and filtered links
+#         logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
+#         return {
+#             "structured_data": structured_data,
+#             "all_links": all_links,
+#             "filtered_links": filtered_links
+#         }
+
+#     except Exception as e:
+#         logger.error(f"Error occurred while processing combined request: {e}")
+#         raise HTTPException(status_code=500, detail=str(e))
+
+class SiteSearch(BaseModel):
+    site_url: str  # Website to perform advanced search on
+    num_results: Optional[int] = 5  # Optional number of results to fetch, default is 5
+
 class ScrapeAndCrawlRequest(BaseModel):
-    url: str
     college_name: str
     topic_title: str
     model_name: str = "gemini-1.5-pro"  # Default to 'gemini-1.5-pro'
-    num_results: int = 5  # Default number of results to fetch from Google, Quora, Reddit
+    sites: list[SiteSearch]  # List of websites and the number of results for each site
 
-# Combined API endpoint
 @app.post("/scrape-and-crawl")
 async def scrape_and_crawl(
     request: ScrapeAndCrawlRequest,
@@ -172,39 +229,38 @@ async def scrape_and_crawl(
         if not x_api_key:
             raise HTTPException(status_code=400, detail="API key is missing from the header")
 
-        logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
+        logger.info(f"Received combined scrape and crawl request for College: {request.college_name}, Topic: {request.topic_title}")
 
         # Configure Google Generative AI API key from header
         genai.configure(api_key=x_api_key)
-
-        # Scrape visible text from the provided URL asynchronously
-        visible_text = await scrape_visible_text(request.url)
-
-        # Structure the scraped data using the specified model from the request
-        structured_data = structure_data(visible_text, request.college_name)
 
-        # Perform web crawling to get related links with customizable result count
-        google_links = google_search(request.topic_title, num_results=request.num_results)
-        quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
-        reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
+        # Initialize lists to hold all crawled links and structured data
+        all_links = []
+        structured_data_list = []
 
-        # Combine all links into one list
-        all_links = google_links + quora_links + reddit_links
-
-        # Use the specified model to filter and get the most relevant URLs
-        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
-        model = genai.GenerativeModel(request.model_name)
-        response = model.generate_content(prompt)
-        filtered_links = response.text.strip().split('\n')
+        # Perform advanced search on the provided sites with custom result counts
+        for site in request.sites:
+            logger.info(f"Performing advanced search on {site.site_url} for {site.num_results} results")
+            site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
+            all_links.extend(site_links)
+
+        # Scrape visible text from each fetched link and structure the data
+        for link in all_links:
+            logger.info(f"Scraping visible text from link: {link}")
+            try:
+                visible_text = await scrape_visible_text(link)  # Scrape the text
+                structured_data = structure_data(visible_text, request.college_name)  # Structure it
+                structured_data_list.append({"link": link, "structured_data": structured_data})
+            except Exception as scrape_error:
+                logger.error(f"Error scraping link {link}: {scrape_error}")
+                continue  # If scraping fails, continue with the next link
 
-        # Return the combined structured data and filtered links
-        logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
+        # Return the structured data for all successfully scraped links
+        logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
         return {
-            "structured_data": structured_data,
-            "all_links": all_links,
-            "filtered_links": filtered_links
+            "structured_data": structured_data_list
        }
-
+
     except Exception as e:
         logger.error(f"Error occurred while processing combined request: {e}")
         raise HTTPException(status_code=500, detail=str(e))
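With this change, /scrape-and-crawl no longer takes a single url; the client sends a sites list (one SiteSearch entry per website, each with its own num_results) and gets back the structured data for every link that was scraped successfully. The sketch below shows the new request and response shape from the client side; the base URL, API key value, and example field values are placeholders rather than anything from this commit, and it assumes FastAPI's default mapping of the x_api_key Header parameter to the x-api-key HTTP header.

import requests

# Placeholder values -- the real host and Gemini API key depend on the deployment.
BASE_URL = "http://localhost:8000"
GEMINI_API_KEY = "your-gemini-api-key"

payload = {
    "college_name": "Example College",            # passed to structure_data()
    "topic_title": "Example College admissions",  # search query used for every site
    "model_name": "gemini-1.5-pro",               # optional; this is the default
    "sites": [                                    # one SiteSearch entry per website
        {"site_url": "reddit.com", "num_results": 5},
        {"site_url": "quora.com", "num_results": 3},
    ],
}

response = requests.post(
    f"{BASE_URL}/scrape-and-crawl",
    json=payload,
    headers={"x-api-key": GEMINI_API_KEY},  # read by the x_api_key Header parameter
    timeout=300,
)
response.raise_for_status()

# Expected shape: {"structured_data": [{"link": ..., "structured_data": ...}, ...]}
for item in response.json()["structured_data"]:
    print(item["link"])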