Source code for mlflow.data.huggingface_dataset_source

from typing import Any, Union, Optional, Mapping, Sequence, Dict, TYPE_CHECKING

from mlflow.data.dataset_source import DatasetSource
from mlflow.utils.annotations import experimental


if TYPE_CHECKING:
    import datasets


[docs]@experimental class HuggingFaceDatasetSource(DatasetSource): """ Represents the source of a Hugging Face dataset used in MLflow Tracking. """ def __init__( self, path: str, config_name: Optional[str] = None, data_dir: Optional[str] = None, data_files: Optional[ Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]] ] = None, split: Optional[Union[str, "datasets.Split"]] = None, revision: Optional[Union[str, "datasets.Version"]] = None, task: Optional[Union[str, "datasets.TaskTemplate"]] = None, ): """ :param path: The path of the Hugging Face dataset. :param config_name: The name of of the Hugging Face dataset configuration. :param data_dir: The `data_dir` of the Hugging Face dataset configuration. :param data_files: Paths to source data file(s) for the Hugging Face dataset configuration. :param revision: Version of the dataset script to load. :param task: The task to prepare the Hugging Face dataset for during training and evaluation. """ self._path = path self._config_name = config_name self._data_dir = data_dir self._data_files = data_files self._split = split self._revision = revision self._task = task @staticmethod def _get_source_type() -> str: return "hugging_face"
[docs] def load(self, **kwargs): """ Loads the dataset source as a Hugging Face Dataset. :param kwargs: Additional keyword arguments used for loading the dataset with the Hugging Face ``datasets.load_dataset()`` method. The following keyword arguments are used automatically from the dataset source but may be overridden by values passed in ``**kwargs``: ``path``, ``name``, ``data_dir``, ``data_files``, ``split``, ``revision``, ``task``. :return: An instance of ``datasets.Dataset``. """ import datasets load_kwargs = { "path": self._path, "name": self._config_name, "data_dir": self._data_dir, "data_files": self._data_files, "split": self._split, "revision": self._revision, "task": self._task, } load_kwargs.update(kwargs) return datasets.load_dataset(**load_kwargs)
@staticmethod def _can_resolve(raw_source: Any): # NB: Initially, we expect that Hugging Face dataset sources will only be used with # Hugging Face datasets constructed by from_huggingface_dataset, which can create # an instance of HuggingFaceDatasetSource directly without the need for resolution return False @classmethod def _resolve(cls, raw_source: str) -> "HuggingFaceDatasetSource": raise NotImplementedError def _to_dict(self) -> Dict[Any, Any]: return { "path": self._path, "config_name": self._config_name, "data_dir": self._data_dir, "data_files": self._data_files, "split": str(self._split), "revision": self._revision, "task": self._task, } @classmethod def _from_dict(cls, source_dict: Dict[Any, Any]) -> "HuggingFaceDatasetSource": return cls( path=source_dict.get("path"), config_name=source_dict.get("config_name"), data_dir=source_dict.get("data_dir"), data_files=source_dict.get("data_files"), split=source_dict.get("split"), revision=source_dict.get("revision"), task=source_dict.get("task"), )