Source code for cleanlab.internal.multiannotator_utils

# Copyright (C) 2017-2022  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.

"""
Helper methods used internally in cleanlab.multiannotator
"""

from cleanlab.typing import LabelLike
from typing import Optional, Tuple
import warnings
import numpy as np
import pandas as pd
from cleanlab.internal.validation import assert_valid_class_labels


def assert_valid_inputs_multiannotator(
    labels_multiannotator: pd.DataFrame,
    pred_probs: Optional[np.ndarray] = None,
) -> None:
    """Validate format of multi-annotator labels.

    Parameters
    ----------
    labels_multiannotator
        DataFrame of labels in which each row is an example and each column holds
        the labels given by one annotator. Missing annotations are NaN / ``pd.NA``.
    pred_probs
        Optional array of predicted class probabilities. Only its number of columns
        (``pred_probs.shape[1]``) is checked against the largest label value.

    Raises
    ------
    ValueError
        If any label is a string, if ``pred_probs`` has fewer columns than the
        largest class label requires, if a row or a column is entirely NaN, if there
        is at most one column, or if every example has exactly one label.
        ``assert_valid_class_labels`` is also called on the non-missing labels and
        may raise on its own.

    Warns
    -----
    UserWarning
        If no example has two annotators that gave the same label.
    """
    # Raise error if labels are not formatted properly
    if any(isinstance(label, str) for label in labels_multiannotator.values.ravel()):
        raise ValueError(
            "Labels cannot be strings, they must be zero-indexed integers corresponding to class indices."
        )

    # Convert once to a float array so missing entries become NaN.
    # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    labels_as_float = labels_multiannotator.replace({pd.NA: np.nan}).astype(float).values
    all_labels_flatten = labels_as_float.ravel()
    all_labels_flatten = all_labels_flatten[~np.isnan(all_labels_flatten)]
    assert_valid_class_labels(all_labels_flatten, allow_one_class=True)

    # Raise error if number of classes in labels_multiannotator does not match
    # number of classes in pred_probs
    if pred_probs is not None:
        num_classes = pred_probs.shape[1]
        highest_class = np.nanmax(labels_as_float) + 1
        # this allows for missing labels, but not missing columns in pred_probs
        if num_classes < highest_class:
            raise ValueError(
                f"pred_probs must have at least {int(highest_class)} columns based on the largest "
                "class label which appears in labels_multiannotator. "
                "Perhaps some rarely-annotated classes were lost while establishing consensus "
                "labels used to train your classifier."
            )

    # The missing-value mask is reused by several of the checks below.
    is_missing = labels_multiannotator.isna()

    # Raise error if labels_multiannotator has NaN rows
    if is_missing.all(axis=1).any():
        raise ValueError("labels_multiannotator cannot have rows with all NaN.")

    # Raise error if labels_multiannotator has NaN columns
    column_is_empty = is_missing.all()
    if column_is_empty.any():
        nan_columns = list(labels_multiannotator.columns[column_is_empty])
        raise ValueError(
            "labels_multiannotator cannot have columns with all NaN. "
            f"Annotators {nan_columns} did not label any examples."
        )

    # Raise error if labels_multiannotator has <= 1 column
    if len(labels_multiannotator.columns) <= 1:
        raise ValueError(
            "labels_multiannotator must have more than one column. \n"
            "If there is only one annotator, use cleanlab.rank.get_label_quality_scores instead"
        )

    # Raise error if labels_multiannotator only has 1 label per example
    labels_per_example = (~is_missing).sum(axis=1)
    if (labels_per_example == 1).all():
        raise ValueError(
            "Each example only has one label, collapse the labels into a 1-D array and use "
            "cleanlab.rank.get_label_quality_scores instead"
        )

    def _all_labels_distinct(row: pd.Series) -> bool:
        # True when the non-missing labels of this example contain no repeated value,
        # i.e. no two annotators agree on it.
        given = row.dropna()
        return np.array_equal(given.unique(), given)

    # Raise warning if no examples with 2 or more annotators agree
    # TODO: might shift this later in the code to avoid extra compute
    if labels_multiannotator.apply(_all_labels_distinct, axis=1).all():
        warnings.warn("Annotators do not agree on any example. Check input data.")
def format_multiannotator_labels(labels: LabelLike) -> Tuple[pd.DataFrame, dict]:
    """Takes an array of labels and formats it such that labels are in the set ``0, 1, ..., K-1``,
    where ``K`` is the number of classes. The labels are assigned based on lexicographic order.

    Parameters
    ----------
    labels
        2D ``np.ndarray`` or ``pd.DataFrame`` of labels. Missing values (NaN, ``pd.NA``)
        and the literal string ``"nan"`` are not treated as classes.

    Returns
    -------
    formatted_labels
        Returns pd.DataFrame of shape ``(N,M)``. The return labels will be properly formatted and can be passed to
        cleanlab.multiannotator functions.

    mapping
        A dictionary showing the mapping of new to old labels, such that ``mapping[k]`` returns the name of the k-th class.

    Raises
    ------
    TypeError
        If ``labels`` is neither a numpy array nor a pandas DataFrame.
    """
    if isinstance(labels, pd.DataFrame):
        np_labels = labels.values
    elif isinstance(labels, np.ndarray):
        np_labels = labels
    else:
        raise TypeError("labels must be 2D numpy array or pandas DataFrame")

    unique_labels = pd.unique(np_labels.ravel())

    try:
        unique_labels = unique_labels[~np.isnan(unique_labels)]
        unique_labels.sort()
    except TypeError:  # np.isnan / np.sort cannot handle string values or pd.NA types
        # pd.isna recognises every missing-value marker by value rather than identity,
        # so NaN objects other than the np.nan singleton are filtered out as well.
        # The explicit bool dtype keeps ``~nan_mask`` valid when there are no labels.
        nan_mask = np.array(
            [pd.isna(l) or l == "nan" for l in unique_labels],
            dtype=bool,
        )
        unique_labels = unique_labels[~nan_mask]
        unique_labels.sort()

    # convert float labels (that arose because np.nan is float type) to int
    if unique_labels.dtype == "float":
        unique_labels = unique_labels.astype("int")

    label_map = {label: i for i, label in enumerate(unique_labels)}
    inverse_map = {i: label for label, i in label_map.items()}

    if isinstance(labels, np.ndarray):
        labels = pd.DataFrame(labels)

    formatted_labels = labels.replace(label_map)

    return formatted_labels, inverse_map