Source code for cleanlab.datalab.internal.data

# Copyright (C) 2017-2023  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.
"""Classes and methods for datasets that are loaded into Datalab."""

import os
from typing import Any, Callable, Dict, List, Mapping, Optional, Union, cast, TYPE_CHECKING, Tuple

from cleanlab.datalab.internal.task import Task

try:
    import datasets
except ImportError as error:
    raise ImportError(
        "Cannot import datasets package. "
        "Please install it and try again, or just install cleanlab with "
        "all optional dependencies via: `pip install 'cleanlab[all]'`"
    ) from error
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
from datasets.arrow_dataset import Dataset
from datasets import ClassLabel

from cleanlab.internal.validation import labels_to_array, labels_to_list_multilabel


if TYPE_CHECKING:  # pragma: no cover
    DatasetLike = Union[Dataset, pd.DataFrame, Dict[str, Any], List[Dict[str, Any]], str]


[docs]class DataFormatError(ValueError):
    """Exception raised when the data is not in a supported format."""

    def __init__(self, data: Any):
        self.data = data
        message = (
            f"Unsupported data type: {type(data)}\n"
            "Supported types: "
            "datasets.Dataset, pandas.DataFrame, dict, list, str"
        )
        super().__init__(message)


[docs]class DatasetDictError(ValueError):
    """Exception raised when a DatasetDict is passed to Datalab.

    Usually, this means that a dataset identifier was passed to Datalab, but
    the dataset is a DatasetDict, which contains multiple splits of the dataset.

    """

    def __init__(self):
        message = (
            "Please pass a single dataset, not a DatasetDict. "
            "Try specifying a split, e.g. `dataset = load_dataset('dataset', split='train')` "
            "then pass `dataset` to Datalab."
        )
        super().__init__(message)


[docs]class DatasetLoadError(ValueError):
    """Exception raised when a dataset cannot be loaded.

    Parameters
    ----------
    dataset_type: type
        The type of dataset that failed to load.
    """

    def __init__(self, dataset_type: type):
        message = f"Failed to load dataset from {dataset_type}.\n"
        super().__init__(message)


[docs]class Data:
    """
    Class that holds and validates datasets for Datalab.

    Internally, the data is stored as a datasets.Dataset object and the labels
    are integers (ranging from 0 to K-1, where K is the number of classes) stored
    in a numpy array.

    Parameters
    ----------
    data :
        Dataset to be audited by Datalab.
        Several formats are supported, which will internally be converted to a Dataset object.

        Supported formats:
            - datasets.Dataset
            - pandas.DataFrame
            - dict
                - keys are strings
                - values are arrays or lists of equal length
            - list
                - list of dictionaries with the same keys
            - str
                - path to a local file
                    - Text (.txt)
                    - CSV (.csv)
                    - JSON (.json)
                - or a dataset identifier on the Hugging Face Hub
            It checks if the string is a path to a file that exists locally, and if not,
            it assumes it is a dataset identifier on the Hugging Face Hub.

    label_name : Union[str, List[str]]
        Name of the label column in the dataset.

    task :
        The task associated with the dataset. This is used to determine how to
        to format the labels.

        Note:

          - If the task is a classification task, the labels
          will be mapped to integers, e.g. [0, 1, ..., K-1] where K is the number
          of classes. If the task is a regression task, the labels will not be
          mapped to integers.

          - If the task is a multilabel task, the labels will be formatted as a
            list of lists, e.g. [[0, 1], [1, 2], [0, 2]] where each sublist contains
            the labels for a single example. If the task is not a multilabel task,
            the labels will be formatted as a 1D numpy array.

    Warnings
    --------
    Optional dependencies:

    - datasets :
        Dataset, DatasetDict and load_dataset are imported from datasets.
        This is an optional dependency of cleanlab, but is required for
        :py:class:`Datalab <cleanlab.datalab.datalab.Datalab>` to work.
    """

    def __init__(
        self,
        data: "DatasetLike",
        task: Task,
        label_name: Optional[str] = None,
    ) -> None:
        self._validate_data(data)
        self._data = self._load_data(data)
        self._data_hash = hash(self._data)
        self.labels: Label
        label_class = MultiLabel if task.is_multilabel else MultiClass
        map_to_int = task.is_classification
        self.labels = label_class(data=self._data, label_name=label_name, map_to_int=map_to_int)

    def _load_data(self, data: "DatasetLike") -> Dataset:
        """Checks the type of dataset and uses the correct loader method and
        assigns the result to the data attribute."""
        dataset_factory_map: Dict[type, Callable[..., Dataset]] = {
            Dataset: lambda x: x,
            pd.DataFrame: Dataset.from_pandas,
            dict: self._load_dataset_from_dict,
            list: self._load_dataset_from_list,
            str: self._load_dataset_from_string,
        }
        if not isinstance(data, tuple(dataset_factory_map.keys())):
            raise DataFormatError(data)
        return dataset_factory_map[type(data)](data)

    def __len__(self) -> int:
        return len(self._data)

    def __eq__(self, other) -> bool:
        if isinstance(other, Data):
            # Equality checks
            hashes_are_equal = self._data_hash == other._data_hash
            labels_are_equal = self.labels == other.labels
            return all([hashes_are_equal, labels_are_equal])
        return False

    def __hash__(self) -> int:
        return self._data_hash

    @property
    def class_names(self) -> List[str]:
        return self.labels.class_names

    @property
    def has_labels(self) -> bool:
        """Check if labels are available."""
        return self.labels.is_available

    @staticmethod
    def _validate_data(data) -> None:
        if isinstance(data, datasets.DatasetDict):
            raise DatasetDictError()
        if not isinstance(data, (Dataset, pd.DataFrame, dict, list, str)):
            raise DataFormatError(data)

    @staticmethod
    def _load_dataset_from_dict(data_dict: Dict[str, Any]) -> Dataset:
        try:
            return Dataset.from_dict(data_dict)
        except Exception as error:
            raise DatasetLoadError(dict) from error

    @staticmethod
    def _load_dataset_from_list(data_list: List[Dict[str, Any]]) -> Dataset:
        try:
            return Dataset.from_list(data_list)
        except Exception as error:
            raise DatasetLoadError(list) from error

    @staticmethod
    def _load_dataset_from_string(data_string: str) -> Dataset:
        if not os.path.exists(data_string):
            try:
                dataset = datasets.load_dataset(data_string)
                return cast(Dataset, dataset)
            except Exception as error:
                raise DatasetLoadError(str) from error

        factory: Dict[str, Callable[[str], Any]] = {
            ".txt": Dataset.from_text,
            ".csv": Dataset.from_csv,
            ".json": Dataset.from_json,
        }

        extension = os.path.splitext(data_string)[1]
        if extension not in factory:
            raise DatasetLoadError(type(data_string))

        dataset = factory[extension](data_string)
        dataset_cast = cast(Dataset, dataset)
        return dataset_cast


[docs]class Label(ABC):
    """
    Class to represent labels in a dataset.

    It stores the labels as a numpy array and maps them to integers if necessary.
    If a mapping is not necessary, e.g. for regression tasks, the mapping will be an empty dictionary.

    Parameters
    ----------
    data :
        A Hugging Face Dataset object.

    label_name : str
        Name of the label column in the dataset.

    map_to_int : bool
        Whether to map the labels to integers, e.g. [0, 1, ..., K-1] where K is the number of classes.
        If False, the labels are not mapped to integers, e.g. for regression tasks.
    """

    def __init__(
        self, *, data: Dataset, label_name: Optional[str] = None, map_to_int: bool = True
    ) -> None:
        self._data = data
        self.label_name = label_name
        self.labels = labels_to_array([])
        self.label_map: Mapping[Union[str, int], Any] = {}
        if label_name is not None:
            self.labels, self.label_map = self._extract_labels(data, label_name, map_to_int)
            self._validate_labels()

    def __len__(self) -> int:
        if self.labels is None:
            return 0
        return len(self.labels)

    def __eq__(self, __value: object) -> bool:
        if isinstance(__value, Label):
            labels_are_equal = np.array_equal(self.labels, __value.labels)
            names_are_equal = self.label_name == __value.label_name
            maps_are_equal = self.label_map == __value.label_map
            return all([labels_are_equal, names_are_equal, maps_are_equal])
        return False

    def __getitem__(self, __index: Union[int, slice, np.ndarray]) -> np.ndarray:
        return self.labels[__index]

    def __bool__(self) -> bool:
        return self.is_available

    @property
    def class_names(self) -> List[str]:
        """A list of class names that are present in the dataset.

        Without labels, this will return an empty list.
        """
        return list(self.label_map.values())

    @property
    def is_available(self) -> bool:
        """Check if labels are available."""
        empty_labels = self.labels is None or len(self.labels) == 0
        empty_label_map = self.label_map is None or len(self.label_map) == 0
        return not (empty_labels or empty_label_map)

    def _validate_labels(self) -> None:
        if self.label_name not in self._data.column_names:
            raise ValueError(f"Label column '{self.label_name}' not found in dataset.")
        labels = self._data[self.label_name]
        assert isinstance(labels, (np.ndarray, list))
        assert len(labels) == len(self._data)

    @abstractmethod
    def _extract_labels(self, *args, **kwargs) -> Any:
        """Extract labels from the dataset and formats them"""
        raise NotImplementedError


[docs]class MultiLabel(Label):
    def __init__(self, data, label_name, map_to_int):
        super().__init__(data=data, label_name=label_name, map_to_int=map_to_int)

    def _extract_labels(
        self, data: Dataset, label_name: str, map_to_int: bool
    ) -> Tuple[List[List[int]], Dict[int, Any]]:
        labels: List[List[int]] = labels_to_list_multilabel(data[label_name])
        # label_map needs to be lexicographically sorted. np.unique should sort it
        unique_labels = np.unique([x for ele in labels for x in ele])
        label_map = {label: i for i, label in enumerate(unique_labels)}
        formatted_labels = [[label_map[item] for item in label] for label in labels]
        inverse_map = {i: label for label, i in label_map.items()}
        return formatted_labels, inverse_map


[docs]class MultiClass(Label):
    def __init__(self, data, label_name, map_to_int):
        super().__init__(data=data, label_name=label_name, map_to_int=map_to_int)

    def _extract_labels(self, data: Dataset, label_name: str, map_to_int: bool):
        """
        Picks out labels from the dataset and formats them to be [0, 1, ..., K-1]
        where K is the number of classes. Also returns a mapping from the formatted
        labels to the original labels in the dataset.

        Note: This function is not meant to be used directly. It is used by
        ``cleanlab.data.Data`` to extract the formatted labels from the dataset
        and stores them as attributes.

        Parameters
        ----------
        data : datasets.Dataset
            A Hugging Face Dataset object.

        label_name : str
            Name of the column in the dataset that contains the labels.

        map_to_int : bool
            Whether to map the labels to integers, e.g. [0, 1, ..., K-1] where K is the number of classes.
            If False, the labels are not mapped to integers, e.g. for regression tasks.
        Returns
        -------
        formatted_labels : np.ndarray
            Labels in the format [0, 1, ..., K-1] where K is the number of classes.

        inverse_map : dict
            Mapping from the formatted labels to the original labels in the dataset.
        """

        labels = labels_to_array(data[label_name])  # type: ignore[assignment]
        if labels.ndim != 1:
            raise ValueError("labels must be 1D numpy array.")

        if not map_to_int:
            # Don't map labels to integers, e.g. for regression tasks
            return labels, {}
        label_name_feature = data.features[label_name]
        if isinstance(label_name_feature, ClassLabel):
            label_map = {
                label: label_name_feature.str2int(label) for label in label_name_feature.names
            }
            formatted_labels = labels
        else:
            label_map = {label: i for i, label in enumerate(np.unique(labels))}
            formatted_labels = np.vectorize(label_map.get)(labels)
        inverse_map = {i: label for label, i in label_map.items()}

        return formatted_labels, inverse_map