Spaces:
Runtime error
Runtime error
| # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
| import os | |
| from typing import Any, Dict, Optional | |
| from pydantic import BaseModel | |
| class Firecrawl: | |
| r"""Firecrawl allows you to turn entire websites into LLM-ready markdown. | |
| Args: | |
| api_key (Optional[str]): API key for authenticating with the Firecrawl | |
| API. | |
| api_url (Optional[str]): Base URL for the Firecrawl API. | |
| References: | |
| https://docs.firecrawl.dev/introduction | |
| """ | |
| def __init__( | |
| self, | |
| api_key: Optional[str] = None, | |
| api_url: Optional[str] = None, | |
| ) -> None: | |
| from firecrawl import FirecrawlApp | |
| self._api_key = api_key or os.environ.get("FIRECRAWL_API_KEY") | |
| self._api_url = api_url or os.environ.get("FIRECRAWL_API_URL") | |
| self.app = FirecrawlApp(api_key=self._api_key, api_url=self._api_url) | |
| def crawl( | |
| self, | |
| url: str, | |
| params: Optional[Dict[str, Any]] = None, | |
| **kwargs: Any, | |
| ) -> Any: | |
| r"""Crawl a URL and all accessible subpages. Customize the crawl by | |
| setting different parameters, and receive the full response or a job | |
| ID based on the specified options. | |
| Args: | |
| url (str): The URL to crawl. | |
| params (Optional[Dict[str, Any]]): Additional parameters for the | |
| crawl request. Defaults to `None`. | |
| **kwargs (Any): Additional keyword arguments, such as | |
| `poll_interval`, `idempotency_key`. | |
| Returns: | |
| Any: The crawl job ID or the crawl results if waiting until | |
| completion. | |
| Raises: | |
| RuntimeError: If the crawling process fails. | |
| """ | |
| try: | |
| crawl_response = self.app.crawl_url( | |
| url=url, | |
| params=params, | |
| **kwargs, | |
| ) | |
| return crawl_response | |
| except Exception as e: | |
| raise RuntimeError(f"Failed to crawl the URL: {e}") | |
| def check_crawl_job(self, job_id: str) -> Dict: | |
| r"""Check the status of a crawl job. | |
| Args: | |
| job_id (str): The ID of the crawl job. | |
| Returns: | |
| Dict: The response including status of the crawl job. | |
| Raises: | |
| RuntimeError: If the check process fails. | |
| """ | |
| try: | |
| return self.app.check_crawl_status(job_id) | |
| except Exception as e: | |
| raise RuntimeError(f"Failed to check the crawl job status: {e}") | |
| def scrape( | |
| self, | |
| url: str, | |
| params: Optional[Dict[str, Any]] = None, | |
| ) -> Dict: | |
| r"""To scrape a single URL. This function supports advanced scraping | |
| by setting different parameters and returns the full scraped data as a | |
| dictionary. | |
| Reference: https://docs.firecrawl.dev/advanced-scraping-guide | |
| Args: | |
| url (str): The URL to read. | |
| params (Optional[Dict[str, Any]]): Additional parameters for the | |
| scrape request. | |
| Returns: | |
| Dict: The scraped data. | |
| Raises: | |
| RuntimeError: If the scrape process fails. | |
| """ | |
| try: | |
| return self.app.scrape_url(url=url, params=params) | |
| except Exception as e: | |
| raise RuntimeError(f"Failed to scrape the URL: {e}") | |
| def structured_scrape(self, url: str, response_format: BaseModel) -> Dict: | |
| r"""Use LLM to extract structured data from given URL. | |
| Args: | |
| url (str): The URL to read. | |
| response_format (BaseModel): A pydantic model | |
| that includes value types and field descriptions used to | |
| generate a structured response by LLM. This schema helps | |
| in defining the expected output format. | |
| Returns: | |
| Dict: The content of the URL. | |
| Raises: | |
| RuntimeError: If the scrape process fails. | |
| """ | |
| try: | |
| data = self.app.scrape_url( | |
| url, | |
| { | |
| 'formats': ['extract'], | |
| 'extract': {'schema': response_format.model_json_schema()}, | |
| }, | |
| ) | |
| return data.get("extract", {}) | |
| except Exception as e: | |
| raise RuntimeError(f"Failed to perform structured scrape: {e}") | |
| def map_site( | |
| self, url: str, params: Optional[Dict[str, Any]] = None | |
| ) -> list: | |
| r"""Map a website to retrieve all accessible URLs. | |
| Args: | |
| url (str): The URL of the site to map. | |
| params (Optional[Dict[str, Any]]): Additional parameters for the | |
| map request. Defaults to `None`. | |
| Returns: | |
| list: A list containing the URLs found on the site. | |
| Raises: | |
| RuntimeError: If the mapping process fails. | |
| """ | |
| try: | |
| return self.app.map_url(url=url, params=params) | |
| except Exception as e: | |
| raise RuntimeError(f"Failed to map the site: {e}") | |