Source code for cleanlab.multilabel_classification.filter
# Copyright (C) 2017-2023  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.

"""
Methods to flag which examples have label issues in multi-label classification datasets.
Here each example can belong to one or more classes, or none of the classes at all.
Unlike in standard multi-class classification, model-predicted class probabilities need not sum to 1 for each row in multi-label classification.
"""

import warnings
import inspect
from typing import Optional, Union, Tuple, List, Any

import numpy as np
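# Illustrative note (not part of the library source): for a dataset with
# K = 3 classes, `labels` in this module's expected format might look like
#   labels = [[0, 2], [1], [], [0, 1, 2]]
# where the third example belongs to no class at all, while `pred_probs` is an
# (N, 3) array whose rows need not sum to 1, e.g. a row [0.9, 0.7, 0.1]
# predicts membership in classes 0 and 1 but not class 2.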
def find_label_issues(
    labels: list,
    pred_probs: np.ndarray,
    return_indices_ranked_by: Optional[str] = None,
    rank_by_kwargs={},
    filter_by: str = "prune_by_noise_rate",
    frac_noise: float = 1.0,
    num_to_remove_per_class: Optional[List[int]] = None,
    min_examples_per_class=1,
    confident_joint: Optional[np.ndarray] = None,
    n_jobs: Optional[int] = None,
    verbose: bool = False,
    low_memory: bool = False,
) -> np.ndarray:
    """
    Identifies potentially mislabeled examples in a multi-label classification dataset.
    An example is flagged with a label issue if *any* of the classes appear to be incorrectly annotated for this example.

    Parameters
    ----------
    labels : List[List[int]]
      List of noisy labels for multi-label classification where each example can belong to multiple classes.
      This is an iterable of iterables where the i-th element of `labels` corresponds to a list of classes that
      the i-th example belongs to, according to the original data annotation (e.g. ``labels = [[1,2],[1],[0],..]``).
      This method will return the indices i where the inner list ``labels[i]`` is estimated to have some error.
      For a dataset with K classes, each class must be represented as an integer in 0, 1, ..., K-1 within the labels.

    pred_probs : np.ndarray
      An array of shape ``(N, K)`` of model-predicted class probabilities.
      Each row of this matrix corresponds to an example `x` and contains the predicted probability that
      `x` belongs to each possible class, for each of the K classes (along its columns).
      The columns need not sum to 1 but must be ordered such that these probabilities correspond to class 0, 1, ..., K-1.

      Note
      ----
      Estimated label quality scores are most accurate when they are computed based on out-of-sample ``pred_probs`` from your model.
      To obtain out-of-sample predicted probabilities for every example in your dataset, you can use
      :ref:`cross-validation <pred_probs_cross_val>`. This is encouraged to get better results.

    return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default = None
      This function can return a boolean mask (if ``None``) or an array of the example-indices with issues,
      sorted based on the specified ranking method.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    rank_by_kwargs : dict, optional
      Optional keyword arguments to pass into scoring functions for ranking by label quality score
      (see :py:func:`rank.get_label_quality_scores <cleanlab.rank.get_label_quality_scores>`).

    filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', 'low_normalized_margin', 'low_self_confidence'}, default = 'prune_by_noise_rate'
      The specific Confident Learning method used to determine precisely which examples have label issues in a dataset.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    frac_noise : float, default = 1.0
      This will return the "top" frac_noise * num_label_issues estimated label errors, depending on the filtering method used.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    num_to_remove_per_class : array_like
      An iterable that specifies the number of mislabeled examples to return from each class.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    min_examples_per_class : int, default = 1
      The minimum number of examples required per class below which examples from this class will not be flagged as label issues.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    confident_joint : np.ndarray, optional
      An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint, as is appropriate for multi-label classification tasks.
      Entry ``(c, i, j)`` in this array is the number of examples confidently counted into a ``(class c, noisy label=i, true label=j)`` bin,
      where `i, j` are either 0 or 1 to denote whether this example belongs to class `c` or not
      (recall examples can belong to multiple classes in multi-label classification).
      The `confident_joint` can be computed using :py:func:`count.compute_confident_joint <cleanlab.count.compute_confident_joint>` with ``multi_label=True``.
      If not provided, it is computed from the given (noisy) `labels` and `pred_probs`.

    n_jobs : optional
      Number of processing threads used by multiprocessing.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    verbose : optional
      If ``True``, prints when multiprocessing happens.

    low_memory : bool, default = False
      Set as ``True`` if you have a big dataset with limited memory.
      Uses :py:func:`experimental.label_issues_batched.find_label_issues_batched <cleanlab.experimental.label_issues_batched.find_label_issues_batched>`.

    Returns
    -------
    label_issues : np.ndarray
      If `return_indices_ranked_by` is left unspecified, returns a boolean **mask** for the entire dataset
      where ``True`` represents an example suffering from some label issue and
      ``False`` represents an example that appears accurately labeled.

      If `return_indices_ranked_by` is specified, this method instead returns a list of **indices** of examples identified with
      label issues (i.e. those indices where the mask would be ``True``).
      Indices are sorted by the likelihood that *all* classes are correctly annotated for the corresponding example.

      Note
      ----
      Obtain the *indices* of examples with label issues in your dataset by setting `return_indices_ranked_by`.
    """
    from cleanlab.filter import _find_label_issues_multilabel

    if low_memory:
        if rank_by_kwargs:
            warnings.warn("`rank_by_kwargs` is not used when `low_memory=True`.")
        func_signature = inspect.signature(find_label_issues)
        default_args = {
            k: v.default
            for k, v in func_signature.parameters.items()
            if v.default is not inspect.Parameter.empty
        }
        arg_values = {
            "filter_by": filter_by,
            "num_to_remove_per_class": num_to_remove_per_class,
            "confident_joint": confident_joint,
            "n_jobs": n_jobs,
            "frac_noise": frac_noise,
            "min_examples_per_class": min_examples_per_class,
        }
        for arg_name, arg_val in arg_values.items():
            if arg_val != default_args[arg_name]:
                warnings.warn(f"`{arg_name}` is not used when `low_memory=True`.")

    return _find_label_issues_multilabel(
        labels=labels,
        pred_probs=pred_probs,
        return_indices_ranked_by=return_indices_ranked_by,
        rank_by_kwargs=rank_by_kwargs,
        filter_by=filter_by,
        frac_noise=frac_noise,
        num_to_remove_per_class=num_to_remove_per_class,
        min_examples_per_class=min_examples_per_class,
        confident_joint=confident_joint,
        n_jobs=n_jobs,
        verbose=verbose,
        low_memory=low_memory,
    )
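# --- Illustrative usage sketch (not part of the library source) ---
# A minimal demonstration of `find_label_issues` on a tiny made-up dataset.
# The labels and probabilities below are fabricated purely for illustration;
# in practice `pred_probs` should be out-of-sample predictions obtained via
# cross-validation on a much larger dataset.
if __name__ == "__main__":
    toy_labels = [[0], [0, 1], [1], [2], [0, 2]]  # N = 5 examples, K = 3 classes
    toy_pred_probs = np.array(
        [
            [0.9, 0.1, 0.1],
            [0.8, 0.8, 0.1],
            [0.1, 0.9, 0.2],
            [0.2, 0.1, 0.9],
            [0.9, 0.2, 0.1],  # class 2 is annotated here but predicted unlikely
        ]
    )

    # Boolean mask over the dataset: True marks examples where *some* class
    # appears mislabeled.
    issue_mask = find_label_issues(labels=toy_labels, pred_probs=toy_pred_probs)

    # Alternatively, get indices of flagged examples sorted by label quality.
    ranked_indices = find_label_issues(
        labels=toy_labels,
        pred_probs=toy_pred_probs,
        return_indices_ranked_by="self_confidence",
    )
    print(issue_mask, ranked_indices)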
def find_multilabel_issues_per_class(
    labels: list,
    pred_probs: np.ndarray,
    return_indices_ranked_by: Optional[str] = None,
    rank_by_kwargs={},
    filter_by: str = "prune_by_noise_rate",
    frac_noise: float = 1.0,
    num_to_remove_per_class: Optional[List[int]] = None,
    min_examples_per_class=1,
    confident_joint: Optional[np.ndarray] = None,
    n_jobs: Optional[int] = None,
    verbose: bool = False,
    low_memory: bool = False,
) -> Union[np.ndarray, Tuple[List[np.ndarray], List[Any], List[np.ndarray]]]:
    """
    Identifies potentially bad labels for each example and each class in a multi-label classification dataset.
    Whereas `~cleanlab.multilabel_classification.filter.find_label_issues` estimates which examples have an erroneous annotation for *any* class,
    this method estimates which specific classes are incorrectly annotated as well.
    The results are reported separately for each of the K classes in the dataset.

    Parameters
    ----------
    labels : List[List[int]]
      List of noisy labels for multi-label classification where each example can belong to multiple classes.
      Refer to documentation for this argument in `~cleanlab.multilabel_classification.filter.find_label_issues` for further details.
      This method will identify whether ``labels[i][k]`` appears correct, for every example ``i`` and class ``k``.

    pred_probs : np.ndarray
      An array of shape ``(N, K)`` of model-predicted class probabilities.
      Refer to documentation for this argument in `~cleanlab.multilabel_classification.filter.find_label_issues` for further details.

    return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default = None
      This function can return a boolean mask (if this argument is ``None``) or a sorted array of indices based on the specified ranking method (if not ``None``).
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    rank_by_kwargs : dict, optional
      Optional keyword arguments to pass into scoring functions for ranking by label quality score
      (see :py:func:`rank.get_label_quality_scores <cleanlab.rank.get_label_quality_scores>`).

    filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', 'low_normalized_margin', 'low_self_confidence'}, default = 'prune_by_noise_rate'
      The specific method used to filter or prune examples with label issues from a dataset.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    frac_noise : float, default = 1.0
      This will return the "top" frac_noise * num_label_issues estimated label errors, depending on the filtering method used.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    num_to_remove_per_class : array_like
      An iterable that specifies the number of mislabeled examples to return from each class.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    min_examples_per_class : int, default = 1
      The minimum number of examples required per class to avoid flagging as label issues.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    confident_joint : np.ndarray, optional
      An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint.
      Refer to documentation for this argument in `~cleanlab.multilabel_classification.filter.find_label_issues` for details.

    n_jobs : optional
      Number of processing threads used by multiprocessing.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    verbose : optional
      If ``True``, prints when multiprocessing happens.

    low_memory : bool, default = False
      Set as ``True`` if you have a big dataset with limited memory.
      Refer to documentation for this argument in `~cleanlab.multilabel_classification.filter.find_label_issues` for details.

    Returns
    -------
    per_class_label_issues : np.ndarray or tuple
      By default, this is a Boolean array of shape ``(N, K)`` indicating where each class appears incorrectly annotated:
      ``per_class_label_issues[:, k]`` is a Boolean mask over the dataset,
      where ``True`` values indicate examples where class ``k`` appears incorrectly annotated.
      For more details, refer to `~cleanlab.multilabel_classification.filter.find_label_issues`.

      Otherwise if `return_indices_ranked_by` is not ``None``, then this method returns 3 objects (each of length K, the number of classes):
      `label_issues_list`, `labels_list`, `pred_probs_list`.

      - *label_issues_list*: an ordered list of indices of examples where class k appears incorrectly annotated, sorted by the likelihood that class k is correctly annotated.
      - *labels_list*: a binary one-hot representation of the original labels, useful if you want to compute label quality scores.
      - *pred_probs_list*: a one-vs-rest representation of the original predicted probabilities of shape ``(N, 2)``, useful if you want to compute label quality scores.
        ``pred_probs_list[k][i][0]`` is the estimated probability that example ``i`` belongs to class ``k``, and is equal to: ``1 - pred_probs_list[k][i][1]``.
    """
    import cleanlab.filter
    from cleanlab.internal.multilabel_utils import get_onehot_num_classes, stack_complement
    from cleanlab.experimental.label_issues_batched import find_label_issues_batched

    y_one, num_classes = get_onehot_num_classes(labels, pred_probs)
    if return_indices_ranked_by is None:
        bissues = np.zeros(y_one.shape).astype(bool)
    else:
        label_issues_list = []
        labels_list = []
        pred_probs_list = []
    if confident_joint is not None and not low_memory:
        confident_joint_shape = confident_joint.shape
        if confident_joint_shape == (num_classes, num_classes):
            warnings.warn(
                "The new recommended format for `confident_joint` in multi_label settings "
                "is (num_classes, 2, 2) as output by compute_confident_joint(..., multi_label=True). "
                "Your K x K confident_joint in the old format is being ignored."
            )
            confident_joint = None
        elif confident_joint_shape != (num_classes, 2, 2):
            raise ValueError("confident_joint should be of shape (num_classes, 2, 2)")
    for class_num, (label, pred_prob_for_class) in enumerate(zip(y_one.T, pred_probs.T)):
        pred_probs_binary = stack_complement(pred_prob_for_class)
        if low_memory:
            quality_score_kwargs = (
                {"method": return_indices_ranked_by} if return_indices_ranked_by else None
            )
            binary_label_issues = find_label_issues_batched(
                labels=label,
                pred_probs=pred_probs_binary,
                verbose=verbose,
                quality_score_kwargs=quality_score_kwargs,
                return_mask=return_indices_ranked_by is None,
            )
        else:
            if confident_joint is None:
                conf = None
            else:
                conf = confident_joint[class_num]
            if num_to_remove_per_class is not None:
                ml_num_to_remove_per_class = [num_to_remove_per_class[class_num], 0]
            else:
                ml_num_to_remove_per_class = None
            binary_label_issues = cleanlab.filter.find_label_issues(
                labels=label,
                pred_probs=pred_probs_binary,
                return_indices_ranked_by=return_indices_ranked_by,
                frac_noise=frac_noise,
                rank_by_kwargs=rank_by_kwargs,
                filter_by=filter_by,
                num_to_remove_per_class=ml_num_to_remove_per_class,
                min_examples_per_class=min_examples_per_class,
                confident_joint=conf,
                n_jobs=n_jobs,
                verbose=verbose,
            )
        if return_indices_ranked_by is None:
            bissues[:, class_num] = binary_label_issues
        else:
            label_issues_list.append(binary_label_issues)
            labels_list.append(label)
            pred_probs_list.append(pred_probs_binary)
    if return_indices_ranked_by is None:
        return bissues
    else:
        return label_issues_list, labels_list, pred_probs_list
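# --- Illustrative usage sketch (not part of the library source) ---
# Per-class inspection with `find_multilabel_issues_per_class`, reusing the
# kind of toy data from the sketch above (fabricated for illustration only).
# With default arguments the result is an (N, K) Boolean array whose column k
# flags apparent annotation errors for class k; with `return_indices_ranked_by`
# set, it is instead the 3-tuple of per-class lists described in the docstring.
if __name__ == "__main__":
    toy_labels = [[0], [0, 1], [1], [2], [0, 2]]
    toy_pred_probs = np.array(
        [
            [0.9, 0.1, 0.1],
            [0.8, 0.8, 0.1],
            [0.1, 0.9, 0.2],
            [0.2, 0.1, 0.9],
            [0.9, 0.2, 0.1],
        ]
    )

    per_class_issues = find_multilabel_issues_per_class(
        labels=toy_labels, pred_probs=toy_pred_probs
    )
    for k in range(per_class_issues.shape[1]):
        n_flagged = per_class_issues[:, k].sum()
        print(f"class {k}: {n_flagged} example(s) with an apparently bad annotation")

    # Per-class ranked indices, plus one-hot labels and one-vs-rest pred_probs
    # (useful for computing per-class label quality scores afterwards):
    issues_list, labels_list, pred_probs_list = find_multilabel_issues_per_class(
        labels=toy_labels,
        pred_probs=toy_pred_probs,
        return_indices_ranked_by="self_confidence",
    )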