Spaces:
Sleeping
Sleeping
updated for reddit.com
Browse files
main.py
CHANGED
|
@@ -36,15 +36,17 @@ async def scrape_visible_text(url):
|
|
| 36 |
try:
|
| 37 |
logger.info(f"Starting to scrape visible text from URL: {url}")
|
| 38 |
async with async_playwright() as p:
|
| 39 |
-
browser
|
|
|
|
| 40 |
context = await browser.new_context(
|
| 41 |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
|
| 42 |
viewport={"width": 1280, "height": 800},
|
| 43 |
extra_http_headers={
|
| 44 |
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
| 45 |
"accept-encoding": "gzip, deflate, br, zstd",
|
| 46 |
-
"accept-language": "en-US,en;q=0.9
|
| 47 |
-
"cache-control": "
|
|
|
|
| 48 |
"sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
|
| 49 |
"sec-ch-ua-mobile": "?0",
|
| 50 |
"sec-ch-ua-platform": '"Windows"',
|
|
@@ -52,15 +54,20 @@ async def scrape_visible_text(url):
|
|
| 52 |
"sec-fetch-mode": "navigate",
|
| 53 |
"sec-fetch-site": "none",
|
| 54 |
"sec-fetch-user": "?1",
|
| 55 |
-
"upgrade-insecure-requests": "1"
|
| 56 |
}
|
| 57 |
)
|
|
|
|
| 58 |
page = await context.new_page()
|
| 59 |
-
await page.goto(url, wait_until="domcontentloaded")
|
|
|
|
|
|
|
| 60 |
visible_text = await page.evaluate("document.body.innerText")
|
|
|
|
| 61 |
await browser.close()
|
| 62 |
logger.info(f"Successfully scraped visible text from URL: {url}")
|
| 63 |
return visible_text
|
|
|
|
| 64 |
except Exception as e:
|
| 65 |
logger.error(f"Error while scraping visible text from URL {url}: {e}")
|
| 66 |
raise
|
|
|
|
| 36 |
try:
|
| 37 |
logger.info(f"Starting to scrape visible text from URL: {url}")
|
| 38 |
async with async_playwright() as p:
|
| 39 |
+
# Launch the browser in headless mode (can change to headless=False if you want to see it)
|
| 40 |
+
browser = await p.chromium.launch(headless=True)
|
| 41 |
context = await browser.new_context(
|
| 42 |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
|
| 43 |
viewport={"width": 1280, "height": 800},
|
| 44 |
extra_http_headers={
|
| 45 |
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
| 46 |
"accept-encoding": "gzip, deflate, br, zstd",
|
| 47 |
+
"accept-language": "en-US,en;q=0.9",
|
| 48 |
+
"cache-control": "no-cache",
|
| 49 |
+
"pragma": "no-cache",
|
| 50 |
"sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
|
| 51 |
"sec-ch-ua-mobile": "?0",
|
| 52 |
"sec-ch-ua-platform": '"Windows"',
|
|
|
|
| 54 |
"sec-fetch-mode": "navigate",
|
| 55 |
"sec-fetch-site": "none",
|
| 56 |
"sec-fetch-user": "?1",
|
| 57 |
+
"upgrade-insecure-requests": "1",
|
| 58 |
}
|
| 59 |
)
|
| 60 |
+
|
| 61 |
page = await context.new_page()
|
| 62 |
+
await page.goto(url, wait_until="domcontentloaded") # Wait until the DOM is fully loaded
|
| 63 |
+
|
| 64 |
+
# Extract visible text from the body of the page
|
| 65 |
visible_text = await page.evaluate("document.body.innerText")
|
| 66 |
+
|
| 67 |
await browser.close()
|
| 68 |
logger.info(f"Successfully scraped visible text from URL: {url}")
|
| 69 |
return visible_text
|
| 70 |
+
|
| 71 |
except Exception as e:
|
| 72 |
logger.error(f"Error while scraping visible text from URL {url}: {e}")
|
| 73 |
raise
|