# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import json
import os
import tempfile
from typing import Any, List, Optional

from camel.datahubs.base import BaseDatasetManager
from camel.datahubs.models import Record
from camel.logger import get_logger
from camel.types import HuggingFaceRepoType
from camel.utils import api_keys_required, dependencies_required

logger = get_logger(__name__)

class HuggingFaceDatasetManager(BaseDatasetManager):
    r"""A dataset manager for Hugging Face datasets. This class provides
    methods to create, add, update, delete, and list records in a dataset on
    the Hugging Face Hub.

    Args:
        token (str): The Hugging Face API token. If not provided, the token
            will be read from the environment variable `HF_TOKEN`.
    """

    def __init__(self, token: Optional[str] = None):
        from huggingface_hub import HfApi

        self._api_key = token or os.getenv("HF_TOKEN")
        self.api = HfApi(token=self._api_key)

    def create_dataset_card(
        self,
        dataset_name: str,
        description: str,
        license: Optional[str] = None,
        version: Optional[str] = None,
        tags: Optional[List[str]] = None,
        authors: Optional[List[str]] = None,
        size_category: Optional[List[str]] = None,
        language: Optional[List[str]] = None,
        task_categories: Optional[List[str]] = None,
        content: Optional[str] = None,
    ) -> None:
        r"""Creates and uploads a dataset card to the Hugging Face Hub in
        YAML format.

        Args:
            dataset_name (str): The name of the dataset.
            description (str): A description of the dataset.
            license (str): The license of the dataset. (default: :obj:`None`)
            version (str): The version of the dataset. (default: :obj:`None`)
            tags (list): A list of tags for the dataset. (default:
                :obj:`None`)
            authors (list): A list of authors of the dataset. (default:
                :obj:`None`)
            size_category (list): A list of size categories for the dataset.
                (default: :obj:`None`)
            language (list): A list of languages the dataset is in. (default:
                :obj:`None`)
            task_categories (list): A list of task categories. (default:
                :obj:`None`)
            content (str): Custom markdown content that the user wants to add
                to the dataset card. (default: :obj:`None`)
        """
        import yaml

        metadata = {
            "license": license,
            "authors": authors,
            "task_categories": task_categories,
            "language": language,
            "tags": tags,
            "pretty_name": dataset_name,
            "size_categories": size_category,
            "version": version,
            "description": description,
        }
        # Drop entries with empty values so they don't appear in the card
        metadata = {k: v for k, v in metadata.items() if v}
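        # A dataset card is a README.md whose YAML front matter (the block
        # between the two `---` markers) carries the dataset metadata.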
        card_content = (
            "---\n"
            + yaml.dump(metadata, default_flow_style=False, allow_unicode=True)
            + "\n---"
        )
        if content:
            card_content += f"\n\n# Additional Information\n{content}\n"

        self._upload_file(
            file_content=card_content,
            dataset_name=dataset_name,
            filepath="README.md",
            file_type="md",
        )

    def create_dataset(
        self, name: str, private: bool = False, **kwargs: Any
    ) -> str:
        r"""Creates a new dataset on the Hugging Face Hub.

        Args:
            name (str): The name of the dataset.
            private (bool): Whether the dataset should be private.
                (default: :obj:`False`)
            kwargs (Any): Additional keyword arguments.

        Returns:
            str: The URL of the created dataset.
        """
        from huggingface_hub.errors import RepositoryNotFoundError
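        # Look the repo up first so the call is idempotent: an existing
        # dataset is reused, and the repo is only created when the lookup
        # raises RepositoryNotFoundError.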
        try:
            self.api.repo_info(
                repo_id=name,
                repo_type=HuggingFaceRepoType.DATASET.value,
                **kwargs,
            )
        except RepositoryNotFoundError:
            self.api.create_repo(
                repo_id=name,
                repo_type=HuggingFaceRepoType.DATASET.value,
                private=private,
            )

        return f"https://huggingface.co/datasets/{name}"

    def list_datasets(
        self, username: str, limit: int = 100, **kwargs: Any
    ) -> List[str]:
        r"""Lists datasets owned by the given user.

        Args:
            username (str): The username of the user whose datasets to list.
            limit (int): The maximum number of datasets to list.
                (default: :obj:`100`)
            kwargs (Any): Additional keyword arguments.

        Returns:
            List[str]: A list of dataset ids.
        """
        try:
            return [
                dataset.id
                for dataset in self.api.list_datasets(
                    author=username, limit=limit, **kwargs
                )
            ]
        except Exception as e:
            logger.error(f"Error listing datasets: {e}")
            return []

    def delete_dataset(self, dataset_name: str, **kwargs: Any) -> None:
        r"""Deletes a dataset from the Hugging Face Hub.

        Args:
            dataset_name (str): The name of the dataset to delete.
            kwargs (Any): Additional keyword arguments.
        """
        try:
            self.api.delete_repo(
                repo_id=dataset_name,
                repo_type=HuggingFaceRepoType.DATASET.value,
                **kwargs,
            )
            logger.info(f"Dataset '{dataset_name}' deleted successfully.")
        except Exception as e:
            logger.error(f"Error deleting dataset '{dataset_name}': {e}")
            raise

    def add_records(
        self,
        dataset_name: str,
        records: List[Record],
        filepath: str = "records/records.json",
        **kwargs: Any,
    ) -> None:
        r"""Adds records to a dataset on the Hugging Face Hub.

        Args:
            dataset_name (str): The name of the dataset.
            records (List[Record]): A list of records to add to the dataset.
            filepath (str): The path to the file containing the records.
                (default: :obj:`"records/records.json"`)
            kwargs (Any): Additional keyword arguments.

        Raises:
            ValueError: If the dataset already has a records file.
        """
        existing_records = self._download_records(
            dataset_name=dataset_name, filepath=filepath, **kwargs
        )

        if existing_records:
            raise ValueError(
                f"Records file '{filepath}' already exists in dataset "
                f"'{dataset_name}'. Use `update_records` to modify it."
            )

        self._upload_records(
            records=records,
            dataset_name=dataset_name,
            filepath=filepath,
            **kwargs,
        )

    def update_records(
        self,
        dataset_name: str,
        records: List[Record],
        filepath: str = "records/records.json",
        **kwargs: Any,
    ) -> None:
        r"""Updates records in a dataset on the Hugging Face Hub. If the
        dataset has no existing records file, the given records are added
        as new ones instead.

        Args:
            dataset_name (str): The name of the dataset.
            records (List[Record]): A list of records to update in the
                dataset.
            filepath (str): The path to the file containing the records.
                (default: :obj:`"records/records.json"`)
            kwargs (Any): Additional keyword arguments.
        """
        existing_records = self._download_records(
            dataset_name=dataset_name, filepath=filepath, **kwargs
        )

        if not existing_records:
            logger.warning(
                f"Dataset '{dataset_name}' does not have existing "
                "records. Adding new records."
            )
            self._upload_records(
                records=records,
                dataset_name=dataset_name,
                filepath=filepath,
                **kwargs,
            )
            return
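
        # Merge by record ID: incoming records overwrite existing records
        # that share an ID, and records with new IDs are appended.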
        old_dict = {record.id: record for record in existing_records}
        new_dict = {record.id: record for record in records}
        merged_dict = old_dict.copy()
        merged_dict.update(new_dict)

        self._upload_records(
            records=list(merged_dict.values()),
            dataset_name=dataset_name,
            filepath=filepath,
            **kwargs,
        )

    def delete_record(
        self,
        dataset_name: str,
        record_id: str,
        filepath: str = "records/records.json",
        **kwargs: Any,
    ) -> None:
        r"""Deletes a record from the dataset.

        Args:
            dataset_name (str): The name of the dataset.
            record_id (str): The ID of the record to delete.
            filepath (str): The path to the file containing the records.
                (default: :obj:`"records/records.json"`)
            kwargs (Any): Additional keyword arguments.

        Raises:
            ValueError: If the dataset does not have an existing file to
                delete records from.
        """
        existing_records = self._download_records(
            dataset_name=dataset_name, filepath=filepath, **kwargs
        )

        if not existing_records:
            raise ValueError(
                f"Dataset '{dataset_name}' does not have an existing file "
                f"to delete records from."
            )

        filtered_records = [
            record for record in existing_records if record.id != record_id
        ]

        self._upload_records(
            records=filtered_records,
            dataset_name=dataset_name,
            filepath=filepath,
            **kwargs,
        )

    def list_records(
        self,
        dataset_name: str,
        filepath: str = "records/records.json",
        **kwargs: Any,
    ) -> List[Record]:
        r"""Lists all records in a dataset.

        Args:
            dataset_name (str): The name of the dataset.
            filepath (str): The path to the file containing the records.
                (default: :obj:`"records/records.json"`)
            kwargs (Any): Additional keyword arguments.

        Returns:
            List[Record]: A list of records in the dataset.
        """
        return self._download_records(
            dataset_name=dataset_name, filepath=filepath, **kwargs
        )

    def _download_records(
        self, dataset_name: str, filepath: str, **kwargs: Any
    ) -> List[Record]:
        from huggingface_hub import hf_hub_download
        from huggingface_hub.errors import EntryNotFoundError

        try:
            downloaded_file_path = hf_hub_download(
                repo_id=dataset_name,
                filename=filepath,
                repo_type=HuggingFaceRepoType.DATASET.value,
                token=self._api_key,
                **kwargs,
            )
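            # hf_hub_download returns a path into the local cache; the file
            # is expected to hold a JSON list of serialized Record objects.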
            with open(downloaded_file_path, "r") as f:
                records_data = json.load(f)

            return [Record(**record) for record in records_data]
        except EntryNotFoundError:
            logger.info(f"No records found for dataset '{dataset_name}'.")
            return []
        except Exception as e:
            logger.error(f"Error downloading or processing records: {e}")
            raise

    def _upload_records(
        self,
        records: List[Record],
        dataset_name: str,
        filepath: str,
        **kwargs: Any,
    ) -> None:
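        # Serialize the records to a temporary file first; delete=False keeps
        # the file on disk after the `with` block closes it, so upload_file
        # can reopen it by path before it is removed in the finally block.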
        with tempfile.NamedTemporaryFile(
            delete=False, mode="w", newline="", encoding="utf-8"
        ) as f:
            json.dump(
                [
                    record.model_dump(exclude_defaults=True)
                    for record in records
                ],
                f,
            )
            temp_file_path = f.name

        try:
            self.api.upload_file(
                path_or_fileobj=temp_file_path,
                path_in_repo=filepath,
                repo_id=dataset_name,
                repo_type=HuggingFaceRepoType.DATASET.value,
                **kwargs,
            )
        except Exception as e:
            logger.error(f"Error uploading records file: {e}")
            raise
        finally:
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

    def _upload_file(
        self,
        file_content: str,
        dataset_name: str,
        filepath: str,
        file_type: str = "json",
        **kwargs: Any,
    ) -> None:
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=f".{file_type}"
        ) as f:
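            # JSON content may arrive either as a pre-serialized string or
            # as a serializable object; validate it before writing.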
            if file_type == "json":
                if isinstance(file_content, str):
                    try:
                        json_content = json.loads(file_content)
                    except json.JSONDecodeError:
                        raise ValueError(
                            "Invalid JSON string provided for file_content."
                        )
                else:
                    try:
                        json.dumps(file_content)
                        json_content = file_content
                    except (TypeError, ValueError):
                        raise ValueError(
                            "file_content is not JSON serializable."
                        )
                json.dump(json_content, f)
            elif file_type in ("md", "txt"):
                f.write(file_content)
            else:
                raise ValueError(f"Unsupported file type: {file_type}")
            temp_file_path = f.name
        try:
            self.api.upload_file(
                path_or_fileobj=temp_file_path,
                path_in_repo=filepath,
                repo_id=dataset_name,
                repo_type=HuggingFaceRepoType.DATASET.value,
                **kwargs,
            )
            logger.info(f"File uploaded successfully: {filepath}")
        except Exception as e:
            logger.error(f"Error uploading file: {e}")
            raise
        finally:
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)
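
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the library). It assumes a
# valid token in the `HF_TOKEN` environment variable; the repo id
# "username/camel-demo" is a placeholder, and `content` is an assumed payload
# field on Record (only `id` is relied upon by the code above).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    manager = HuggingFaceDatasetManager()

    # Create the dataset repo (reused if it already exists) and add a card.
    url = manager.create_dataset(name="username/camel-demo", private=True)
    manager.create_dataset_card(
        dataset_name="username/camel-demo",
        description="A toy dataset managed via HuggingFaceDatasetManager.",
        license="apache-2.0",
        tags=["camel", "demo"],
    )

    # Seed the records file, then overwrite record "1" by reusing its ID.
    manager.add_records(
        "username/camel-demo", [Record(id="1", content={"text": "hello"})]
    )
    manager.update_records(
        "username/camel-demo",
        [Record(id="1", content={"text": "hello, world"})],
    )
    print(url, [r.id for r in manager.list_records("username/camel-demo")])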