Spaces:
Running
on
Zero
Running
on
Zero
| import requests | |
| from bs4 import BeautifulSoup | |
| from html_to_markdown import convert_to_markdown | |
| # --- Static Helper Functions (Web Scraping) --- | |
def get_html(url: str, timeout: float = 10.0) -> str:
    """Fetch the HTML content of a URL.

    Args:
        url: Address to request with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout ``requests`` may block indefinitely on an
            unresponsive host.

    Returns:
        The response body as text, or an empty string when the request
        fails (connection error, timeout, or a 4xx/5xx status). Failures
        are reported on stdout rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx statuses into an HTTPError so they share the
        # failure path below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""
    return response.text
def find_wiki_links(html_content: str) -> list[str]:
    """Collect the link targets found inside the page's 'mw-pages' div.

    Args:
        html_content: Raw HTML of the page to scan.

    Returns:
        The ``href`` value of every ``<a>`` element carrying an ``href``
        attribute inside the ``<div id="mw-pages">`` element, in document
        order. An empty list when no such div exists.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    container = document.find('div', id='mw-pages')

    hrefs = []
    if container:
        # href=True restricts the search to anchors that have the attribute.
        for anchor in container.find_all('a', href=True):
            hrefs.append(anchor['href'])
    return hrefs
def get_markdown_from_html(html: str) -> str:
    """Convert an HTML string to Markdown.

    Args:
        html: HTML markup to convert.

    Returns:
        The result of ``convert_to_markdown`` applied to the parsed
        document, or an empty string when ``html`` is empty.
    """
    if html:
        parsed = BeautifulSoup(html, 'html.parser')
        return convert_to_markdown(parsed)
    # Nothing to convert.
    return ""
def get_markdown_from_url(url: str) -> str:
    """Fetch a page and return its content converted to Markdown.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``get_markdown_from_html`` produces for the HTML that
        ``get_html`` returned for ``url``.
    """
    page_html = get_html(url)
    markdown = get_markdown_from_html(page_html)
    return markdown