Commit a88b526
Parent(s): 6db47e9
api v1.2
main.py
CHANGED
@@ -7,6 +7,8 @@ from pydantic import BaseModel
 import uvicorn
 import asyncio
 import json
+import requests
+from bs4 import BeautifulSoup
 
 # Load environment variables
 load_dotenv()
@@ -56,6 +58,29 @@ def structure_data(text, college_name):
 class URLRequest(BaseModel):
     url: str
     college_name: str
+# Pydantic model for Crawler request
+class CrawlerRequest(BaseModel):
+    topic_title: str
+
+# Function to perform Google search and return top N links
+def google_search(query, num_results=5):
+    search_url = f"https://www.google.com/search?q={query}&num={num_results}"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
+    }
+    response = requests.get(search_url, headers=headers)
+    soup = BeautifulSoup(response.text, "html.parser")
+    links = []
+    for a in soup.find_all('a', href=True, attrs={'jsname': True}):
+        link = a['href']
+        if link.startswith("https://"):
+            links.append(link)
+    return links[:num_results]
+
+# Function to perform advanced search on specific sites
+def advanced_search_on_site(site, topic, num_results=10):
+    query = f"site:{site} {topic}"
+    return google_search(query, num_results)
 
 # FastAPI endpoint to scrape and structure data
 @app.post("/scrape")
@@ -72,6 +97,36 @@ async def scrape_and_structure_data(request: URLRequest):
     except Exception as e:
         print(f"Error occurred while processing the request: {e}")
         raise HTTPException(status_code=500, detail=str(e))
+
+# FastAPI endpoint to perform web crawling
+@app.post("/crawl")
+async def crawl_web(request: CrawlerRequest):
+    try:
+        topic_title = request.topic_title
+
+        # Get top 5 links from Google search
+        google_links = google_search(topic_title, num_results=5)
+
+        # Get links from Quora
+        quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)
+
+        # Additional sites can be added similarly
+        other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)
 
+        # Combine all links
+        all_links = google_links + quora_links + other_links
+
+        # Use Gemini to filter and list relevant URLs
+        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}"
+        model = genai.GenerativeModel("gemini-1.5-pro")
+        response = model.generate_content(prompt)
+        filtered_links = response.text.strip().split('\n')
+
+        # Return the filtered links
+        return {"filtered_links": filtered_links}
+    except Exception as e:
+        print(f"Error occurred while processing the request: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
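For a quick check of the new endpoint, a minimal client sketch might look like the following. The topic string is illustrative, and the host/port assume a local run of main.py as in the __main__ block (a deployed Space URL would replace the host):

import requests

# Hypothetical smoke test for the /crawl endpoint added in this commit.
# "retrieval augmented generation" is a made-up example topic_title.
resp = requests.post(
    "http://localhost:7860/crawl",
    json={"topic_title": "retrieval augmented generation"},
)
resp.raise_for_status()
print(resp.json()["filtered_links"])

Since the endpoint splits Gemini's raw response text on newlines, the entries in filtered_links are whatever lines the model returns and are not guaranteed to be bare URLs.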