Source code for cleanlab.internal.multilabel_utils
# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.

"""Helper functions used internally for multi-label classification tasks."""

from typing import List, Optional, Tuple

import numpy as np

from cleanlab.internal.util import get_num_classes


def _is_multilabel(y: np.ndarray) -> bool:
    """Report whether `y` looks like a multi-label indicator matrix.

    `y` qualifies only if it is a 2D ``np.ndarray`` with more than one column
    whose set of distinct values is exactly ``{0, 1}`` (both values must occur).
    Anything that is not an ``np.ndarray`` is rejected, so sparse matrices
    are not supported.
    """
    # Guard clauses: reject anything that is not a dense 2D array with >1 column.
    if not isinstance(y, np.ndarray):
        return False
    if y.ndim != 2 or y.shape[1] <= 1:
        return False

    # np.unique returns the sorted distinct values, so an exact match against
    # [0, 1] means the matrix contains zeros and ones and nothing else.
    distinct_values = np.unique(y)
    return np.array_equal(distinct_values, [0, 1])
def stack_complement(pred_prob_slice: np.ndarray) -> np.ndarray:
    """Expand the predicted probabilities of one class into two columns.

    The first output column holds ``1 - pred_prob_slice`` and the second
    column holds ``pred_prob_slice`` itself.

    Parameters
    ----------
    pred_prob_slice:
      A 1D array with predicted probabilities for a single class.

    Example
    -------
    >>> pred_prob_slice = np.array([0.1, 0.9, 0.3, 0.8])
    >>> stack_complement(pred_prob_slice)
    array([[0.9, 0.1],
           [0.1, 0.9],
           [0.7, 0.3],
           [0.2, 0.8]])
    """
    complement = 1 - pred_prob_slice
    # Stack as two rows (complement first), then transpose so that each
    # example becomes one row with two columns.
    two_rows = np.vstack((complement, pred_prob_slice))
    return two_rows.T
def get_onehot_num_classes(
    labels: list, pred_probs: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, int]:
    """Return the one-hot encoding of multi-label data and the number of classes.

    Parameters
    ----------
    labels:
      Multi-label classification labels, expected to be a list of lists of
      class indices (the format accepted by ``int2onehot``).

    pred_probs:
      Optional array that is forwarded, together with `labels`, to
      ``get_num_classes`` to determine the number of classes.

    Returns
    -------
    y_one:
      The result of ``int2onehot(labels, K=num_classes)``.

    num_classes:
      The number of classes reported by ``get_num_classes``.

    Raises
    ------
    ValueError
      If ``int2onehot`` raises a ``TypeError`` for `labels`. The original
      ``TypeError`` is attached as the explicit cause of the ``ValueError``.
    """
    num_classes = get_num_classes(labels=labels, pred_probs=pred_probs)
    try:
        y_one = int2onehot(labels, K=num_classes)
    except TypeError as err:
        # Chain explicitly so the traceback presents the TypeError as the
        # direct cause rather than as an incidental error during handling.
        raise ValueError(
            "wrong format for labels, should be a list of list[indices], please check the documentation in find_label_issues for further information"
        ) from err
    return y_one, num_classes
def int2onehot(labels: list, K: int) -> np.ndarray:
    """Binarize multi-label `labels` given in ``List[List[int]]`` format.

    Each example is turned into a multi-hot vector whose entries are 1 for
    every class that applies to the example and 0 otherwise.

    Parameters
    ----------
    labels: list of lists of integers
      e.g. [[0,1], [3], [1,2,3], [1], [2]]
      All integers from 0,1,...,K-1 must be represented.

    K: int
      The number of classes.

    Returns
    -------
    The output of ``MultiLabelBinarizer(classes=range(K)).fit_transform(labels)``.
    """
    # Imported inside the function, as in the original implementation,
    # so scikit-learn is only loaded when this helper is actually called.
    from sklearn.preprocessing import MultiLabelBinarizer

    return MultiLabelBinarizer(classes=range(K)).fit_transform(labels)
def onehot2int(onehot_matrix: np.ndarray) -> List[List[int]]:
    """Turn a multi-hot label matrix back into ``List[List[int]]`` format.

    The returned format can be used with other cleanlab functions.

    Parameters
    ----------
    onehot_matrix: 2D np.ndarray of 0s and 1s
      A matrix representation of multi-label classification labels in a
      binarized format as a multi-hot vector for each example. The entries in
      this vector are 1 for each class that applies to this example and 0
      otherwise.

    Returns
    -------
    labels: list of lists of integers
      e.g. [[0,1], [3], [1,2,3], [1], [2]]
      One inner list per row of `onehot_matrix`, holding the indices of that
      row's nonzero entries. A row without nonzero entries yields ``[]``.
    """
    labels: List[List[int]] = []
    for example_row in onehot_matrix:
        # np.nonzero returns a tuple with one index array per dimension;
        # element 0 holds the positions of the nonzero entries in this row.
        active_classes = np.nonzero(example_row)[0]
        labels.append(active_classes.tolist())
    return labels