# Source code for cleanlab.internal.multilabel_utils

# Copyright (C) 2017-2023  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.

"""
Helper functions used internally for multi-label classification tasks.
"""
from typing import Tuple, Optional, List

import numpy as np

from cleanlab.internal.util import get_num_classes


def _is_multilabel(y: np.ndarray) -> bool:
    """Checks whether `y` is in a multi-label indicator matrix format.

    Sparse matrices are not supported.
    """
    if not (isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1):
        return False
    return np.array_equal(np.unique(y), [0, 1])


def stack_complement(pred_prob_slice: np.ndarray) -> np.ndarray:
    """
    Extends predicted probabilities of a single class to two columns.

    Parameters
    ----------
    pred_prob_slice:
      A 1D array (or array-like, e.g. a list) with predicted probabilities
      for a single class.

    Returns
    -------
    pred_probs:
      A 2D array with one row per entry of `pred_prob_slice`. The first column
      holds ``1 - pred_prob_slice`` and the second column holds
      `pred_prob_slice` itself.

    Example
    -------
    >>> pred_prob_slice = np.array([0.1, 0.9, 0.3, 0.8])
    >>> stack_complement(pred_prob_slice)
    array([[0.9, 0.1],
           [0.1, 0.9],
           [0.7, 0.3],
           [0.2, 0.8]])
    """
    # Coerce first so that plain lists/tuples support the `1 - x` arithmetic below.
    pred_prob_slice = np.asarray(pred_prob_slice)
    return np.vstack((1 - pred_prob_slice, pred_prob_slice)).T
def get_onehot_num_classes(
    labels: list, pred_probs: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, int]:
    """Returns OneHot encoding of MultiLabel Data, and number of classes.

    Parameters
    ----------
    labels:
      Multi-label classification labels, expected as a list of lists of class
      indices (the format accepted by ``int2onehot``).

    pred_probs:
      Optional predicted probabilities. They are only forwarded to
      ``get_num_classes`` to determine the number of classes.

    Returns
    -------
    y_one:
      The labels converted by ``int2onehot`` into a binarized matrix.

    num_classes:
      The number of classes reported by ``get_num_classes``.

    Raises
    ------
    ValueError
      If ``int2onehot`` raises a ``TypeError`` for the given `labels`, which
      indicates they are not in the expected list-of-lists format.
    """
    num_classes = get_num_classes(labels=labels, pred_probs=pred_probs)
    try:
        y_one = int2onehot(labels, K=num_classes)
    except TypeError as err:
        # Chain explicitly so the underlying TypeError is reported as the direct cause.
        raise ValueError(
            "wrong format for labels, should be a list of list[indices], please check the documentation in find_label_issues for further information"
        ) from err
    return y_one, num_classes
def int2onehot(labels: list, K: int) -> np.ndarray:
    """Convert multi-label classification `labels` from a ``List[List[int]]`` format to a onehot matrix.

    The result is a binarized representation of the labels: one multi-hot
    vector per example, whose entries are 1 for every class that applies to
    that example and 0 otherwise.

    Parameters
    ----------
    labels: list of lists of integers
      e.g. [[0,1], [3], [1,2,3], [1], [2]]
      All integers from 0,1,...,K-1 must be represented.

    K: int
      The number of classes.

    Returns
    -------
    onehot_matrix:
      The output of ``MultiLabelBinarizer(classes=range(K)).fit_transform(labels)``.
    """
    # Imported inside the function, so scikit-learn is only loaded when this is called.
    from sklearn.preprocessing import MultiLabelBinarizer

    # Fixing the class set to 0..K-1 pins the binarizer's classes independent of
    # which labels happen to be present.
    binarizer = MultiLabelBinarizer(classes=range(K))
    onehot_matrix = binarizer.fit_transform(labels)
    return onehot_matrix
def onehot2int(onehot_matrix: np.ndarray) -> List[List[int]]:
    """Convert multi-label classification `labels` from a onehot matrix format to a
    ``List[List[int]]`` format that can be used with other cleanlab functions.

    Parameters
    ----------
    onehot_matrix: 2D np.ndarray of 0s and 1s
      A matrix representation of multi-label classification labels in a binarized
      format as a multi-hot vector for each example. The entries in this vector
      are 1 for each class that applies to this example and 0 otherwise.
      Array-like input (e.g. a list of lists) is converted with ``np.asarray``.

    Returns
    -------
    labels: list of lists of integers
      e.g. [[0,1], [3], [1,2,3], [1], [2]]
      One list per row of `onehot_matrix`, holding the column indices whose
      entry equals 1 (an empty list for a row without any 1). Indices are
      plain Python ``int`` objects.
    """
    # Coerce so that `row == 1` below is always an element-wise comparison.
    onehot_matrix = np.asarray(onehot_matrix)
    # `.tolist()` yields native Python ints, matching the declared return type.
    return [np.where(row == 1)[0].tolist() for row in onehot_matrix]