Source code for cleanlab.multilabel_classification.filter
# Copyright (C) 2017-2023  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.

"""
Methods to flag which examples have label issues in multi-label classification datasets.
Here each example can belong to one or more classes, or none of the classes at all.
Unlike in standard multi-class classification, model-predicted class probabilities need not sum to 1 for each row in multi-label classification.
"""

import warnings
import inspect
from typing import Optional, Union, Tuple, List, Any

import numpy as np
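# Illustrative note (not part of the library source): for a dataset with
# K = 3 classes, `labels` in this module's expected format might look like
#   labels = [[0, 2], [1], [], [0, 1, 2]]
# where the third example belongs to no class at all, while `pred_probs` is an
# (N, 3) array whose rows need not sum to 1, e.g. a row [0.9, 0.7, 0.1]
# predicts membership in classes 0 and 1 but not class 2.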
def find_label_issues(
    labels: list,
    pred_probs: np.ndarray,
    return_indices_ranked_by: Optional[str] = None,
    rank_by_kwargs={},
    filter_by: str = "prune_by_noise_rate",
    frac_noise: float = 1.0,
    num_to_remove_per_class: Optional[List[int]] = None,
    min_examples_per_class=1,
    confident_joint: Optional[np.ndarray] = None,
    n_jobs: Optional[int] = None,
    verbose: bool = False,
    low_memory: bool = False,
) -> np.ndarray:
    """
    Identifies potentially mislabeled examples in a multi-label classification dataset.
    An example is flagged with a label issue if *any* of the classes appear to be incorrectly annotated for this example.

    Parameters
    ----------
    labels : List[List[int]]
      List of noisy labels for multi-label classification where each example can belong to multiple classes.
      This is an iterable of iterables where the i-th element of `labels` corresponds to a list of classes that
      the i-th example belongs to, according to the original data annotation (e.g. ``labels = [[1,2],[1],[0],..]``).
      This method will return the indices i where the inner list ``labels[i]`` is estimated to have some error.
      For a dataset with K classes, each class must be represented as an integer in 0, 1, ..., K-1 within the labels.

    pred_probs : np.ndarray
      An array of shape ``(N, K)`` of model-predicted class probabilities.
      Each row of this matrix corresponds to an example `x` and contains the predicted probability that
      `x` belongs to each possible class, for each of the K classes (along its columns).
      The columns need not sum to 1 but must be ordered such that these probabilities correspond to class 0, 1, ..., K-1.

      Note
      ----
      Estimated label quality scores are most accurate when they are computed based on out-of-sample ``pred_probs`` from your model.
      To obtain out-of-sample predicted probabilities for every example in your dataset, you can use
      :ref:`cross-validation <pred_probs_cross_val>`. This is encouraged to get better results.

    return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default = None
      This function can return a boolean mask (if ``None``) or an array of the example-indices with issues,
      sorted based on the specified ranking method.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    rank_by_kwargs : dict, optional
      Optional keyword arguments to pass into scoring functions for ranking by label quality score
      (see :py:func:`rank.get_label_quality_scores <cleanlab.rank.get_label_quality_scores>`).

    filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', 'low_normalized_margin', 'low_self_confidence'}, default = 'prune_by_noise_rate'
      The specific Confident Learning method used to determine precisely which examples have label issues in a dataset.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    frac_noise : float, default = 1.0
      This will return the "top" frac_noise * num_label_issues estimated label errors, depending on the filtering method used.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    num_to_remove_per_class : array_like
      An iterable that specifies the number of mislabeled examples to return from each class.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    min_examples_per_class : int, default = 1
      The minimum number of examples required per class below which examples from this class will not be flagged as label issues.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    confident_joint : np.ndarray, optional
      An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint, as is appropriate for multi-label classification tasks.
      Entry ``(c, i, j)`` in this array is the number of examples confidently counted into a ``(class c, noisy label=i, true label=j)`` bin,
      where `i, j` are either 0 or 1 to denote whether this example belongs to class `c` or not
      (recall examples can belong to multiple classes in multi-label classification).
      The `confident_joint` can be computed using :py:func:`count.compute_confident_joint <cleanlab.count.compute_confident_joint>` with ``multi_label=True``.
      If not provided, it is computed from the given (noisy) `labels` and `pred_probs`.

    n_jobs : optional
      Number of processing threads used by multiprocessing.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    verbose : optional
      If ``True``, prints when multiprocessing happens.

    low_memory : bool, default = False
      Set as ``True`` if you have a big dataset with limited memory.
      Uses :py:func:`experimental.label_issues_batched.find_label_issues_batched <cleanlab.experimental.label_issues_batched.find_label_issues_batched>`.

    Returns
    -------
    label_issues : np.ndarray
      If `return_indices_ranked_by` is left unspecified, returns a boolean **mask** for the entire dataset
      where ``True`` represents an example suffering from some label issue and
      ``False`` represents an example that appears accurately labeled.

      If `return_indices_ranked_by` is specified, this method instead returns a list of **indices** of examples identified with
      label issues (i.e. those indices where the mask would be ``True``).
      Indices are sorted by the likelihood that *all* classes are correctly annotated for the corresponding example.

      Note
      ----
      Obtain the *indices* of examples with label issues in your dataset by setting `return_indices_ranked_by`.
    """
    from cleanlab.filter import _find_label_issues_multilabel

    if low_memory:
        if rank_by_kwargs:
            warnings.warn("`rank_by_kwargs` is not used when `low_memory=True`.")
        func_signature = inspect.signature(find_label_issues)
        default_args = {
            k: v.default
            for k, v in func_signature.parameters.items()
            if v.default is not inspect.Parameter.empty
        }
        arg_values = {
            "filter_by": filter_by,
            "num_to_remove_per_class": num_to_remove_per_class,
            "confident_joint": confident_joint,
            "n_jobs": n_jobs,
            "frac_noise": frac_noise,
            "min_examples_per_class": min_examples_per_class,
        }
        for arg_name, arg_val in arg_values.items():
            if arg_val != default_args[arg_name]:
                warnings.warn(f"`{arg_name}` is not used when `low_memory=True`.")

    return _find_label_issues_multilabel(
        labels=labels,
        pred_probs=pred_probs,
        return_indices_ranked_by=return_indices_ranked_by,
        rank_by_kwargs=rank_by_kwargs,
        filter_by=filter_by,
        frac_noise=frac_noise,
        num_to_remove_per_class=num_to_remove_per_class,
        min_examples_per_class=min_examples_per_class,
        confident_joint=confident_joint,
        n_jobs=n_jobs,
        verbose=verbose,
        low_memory=low_memory,
    )
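# --- Illustrative usage sketch (not part of the library source) ---
# A minimal demonstration of `find_label_issues` on a tiny made-up dataset.
# The labels and probabilities below are fabricated purely for illustration;
# in practice `pred_probs` should be out-of-sample predictions obtained via
# cross-validation on a much larger dataset.
if __name__ == "__main__":
    toy_labels = [[0], [0, 1], [1], [2], [0, 2]]  # N = 5 examples, K = 3 classes
    toy_pred_probs = np.array(
        [
            [0.9, 0.1, 0.1],
            [0.8, 0.8, 0.1],
            [0.1, 0.9, 0.2],
            [0.2, 0.1, 0.9],
            [0.9, 0.2, 0.1],  # class 2 is annotated here but predicted unlikely
        ]
    )

    # Boolean mask over the dataset: True marks examples where *some* class
    # appears mislabeled.
    issue_mask = find_label_issues(labels=toy_labels, pred_probs=toy_pred_probs)

    # Alternatively, get indices of flagged examples sorted by label quality.
    ranked_indices = find_label_issues(
        labels=toy_labels,
        pred_probs=toy_pred_probs,
        return_indices_ranked_by="self_confidence",
    )
    print(issue_mask, ranked_indices)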
def find_multilabel_issues_per_class(
    labels: list,
    pred_probs: np.ndarray,
    return_indices_ranked_by: Optional[str] = None,
    rank_by_kwargs={},
    filter_by: str = "prune_by_noise_rate",
    frac_noise: float = 1.0,
    num_to_remove_per_class: Optional[List[int]] = None,
    min_examples_per_class=1,
    confident_joint: Optional[np.ndarray] = None,
    n_jobs: Optional[int] = None,
    verbose: bool = False,
    low_memory: bool = False,
) -> Union[np.ndarray, Tuple[List[np.ndarray], List[Any], List[np.ndarray]]]:
    """
    Identifies potentially bad labels for each example and each class in a multi-label classification dataset.
    Whereas `~cleanlab.multilabel_classification.filter.find_label_issues` estimates which examples have an erroneous annotation for *any* class,
    this method estimates which specific classes are incorrectly annotated as well.
    The results are reported separately for each of the K classes in the dataset.

    Parameters
    ----------
    labels : List[List[int]]
      List of noisy labels for multi-label classification where each example can belong to multiple classes.
      Refer to documentation for this argument in `~cleanlab.multilabel_classification.filter.find_label_issues` for further details.
      This method will identify whether ``labels[i][k]`` appears correct, for every example ``i`` and class ``k``.

    pred_probs : np.ndarray
      An array of shape ``(N, K)`` of model-predicted class probabilities.
      Refer to documentation for this argument in `~cleanlab.multilabel_classification.filter.find_label_issues` for further details.

    return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default = None
      This function can return a boolean mask (if this argument is ``None``) or a sorted array of indices based on the specified ranking method (if not ``None``).
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    rank_by_kwargs : dict, optional
      Optional keyword arguments to pass into scoring functions for ranking by label quality score
      (see :py:func:`rank.get_label_quality_scores <cleanlab.rank.get_label_quality_scores>`).

    filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given', 'low_normalized_margin', 'low_self_confidence'}, default = 'prune_by_noise_rate'
      The specific method used to filter or prune examples with label issues from a dataset.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    frac_noise : float, default = 1.0
      This will return the "top" frac_noise * num_label_issues estimated label errors, depending on the filtering method used.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    num_to_remove_per_class : array_like
      An iterable that specifies the number of mislabeled examples to return from each class.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    min_examples_per_class : int, default = 1
      The minimum number of examples required per class to avoid flagging as label issues.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    confident_joint : np.ndarray, optional
      An array of shape ``(K, 2, 2)`` representing a one-vs-rest formatted confident joint.
      Refer to documentation for this argument in `~cleanlab.multilabel_classification.filter.find_label_issues` for details.

    n_jobs : optional
      Number of processing threads used by multiprocessing.
      Refer to documentation for this argument in :py:func:`filter.find_label_issues <cleanlab.filter.find_label_issues>` for details.

    verbose : optional
      If ``True``, prints when multiprocessing happens.

    low_memory : bool, default = False
      Set as ``True`` if you have a big dataset with limited memory.
      Refer to documentation for this argument in `~cleanlab.multilabel_classification.filter.find_label_issues` for details.

    Returns
    -------
    per_class_label_issues : np.ndarray or tuple
      By default, this is a Boolean array of shape ``(N, K)`` indicating where each class appears incorrectly annotated:
      ``per_class_label_issues[:, k]`` is a Boolean mask over the dataset,
      where ``True`` values indicate examples where class ``k`` appears incorrectly annotated.
      For more details, refer to `~cleanlab.multilabel_classification.filter.find_label_issues`.

      Otherwise if `return_indices_ranked_by` is not ``None``, then this method returns 3 objects (each of length K, the number of classes):
      `label_issues_list`, `labels_list`, `pred_probs_list`.

      - *label_issues_list*: an ordered list of indices of examples where class k appears incorrectly annotated, sorted by the likelihood that class k is correctly annotated.
      - *labels_list*: a binary one-hot representation of the original labels, useful if you want to compute label quality scores.
      - *pred_probs_list*: a one-vs-rest representation of the original predicted probabilities of shape ``(N, 2)``, useful if you want to compute label quality scores.
        ``pred_probs_list[k][i][0]`` is the estimated probability that example ``i`` belongs to class ``k``, and is equal to: ``1 - pred_probs_list[k][i][1]``.
    """
    import cleanlab.filter
    from cleanlab.internal.multilabel_utils import get_onehot_num_classes, stack_complement
    from cleanlab.experimental.label_issues_batched import find_label_issues_batched

    y_one, num_classes = get_onehot_num_classes(labels, pred_probs)
    if return_indices_ranked_by is None:
        bissues = np.zeros(y_one.shape).astype(bool)
    else:
        label_issues_list = []
        labels_list = []
        pred_probs_list = []
    if confident_joint is not None and not low_memory:
        confident_joint_shape = confident_joint.shape
        if confident_joint_shape == (num_classes, num_classes):
            warnings.warn(
                "The new recommended format for `confident_joint` in multi_label settings "
                "is (num_classes, 2, 2) as output by compute_confident_joint(..., multi_label=True). "
                "Your K x K confident_joint in the old format is being ignored."
            )
            confident_joint = None
        elif confident_joint_shape != (num_classes, 2, 2):
            raise ValueError("confident_joint should be of shape (num_classes, 2, 2)")
    for class_num, (label, pred_prob_for_class) in enumerate(zip(y_one.T, pred_probs.T)):
        pred_probs_binary = stack_complement(pred_prob_for_class)
        if low_memory:
            quality_score_kwargs = (
                {"method": return_indices_ranked_by} if return_indices_ranked_by else None
            )
            binary_label_issues = find_label_issues_batched(
                labels=label,
                pred_probs=pred_probs_binary,
                verbose=verbose,
                quality_score_kwargs=quality_score_kwargs,
                return_mask=return_indices_ranked_by is None,
            )
        else:
            if confident_joint is None:
                conf = None
            else:
                conf = confident_joint[class_num]
            if num_to_remove_per_class is not None:
                ml_num_to_remove_per_class = [num_to_remove_per_class[class_num], 0]
            else:
                ml_num_to_remove_per_class = None
            binary_label_issues = cleanlab.filter.find_label_issues(
                labels=label,
                pred_probs=pred_probs_binary,
                return_indices_ranked_by=return_indices_ranked_by,
                frac_noise=frac_noise,
                rank_by_kwargs=rank_by_kwargs,
                filter_by=filter_by,
                num_to_remove_per_class=ml_num_to_remove_per_class,
                min_examples_per_class=min_examples_per_class,
                confident_joint=conf,
                n_jobs=n_jobs,
                verbose=verbose,
            )
        if return_indices_ranked_by is None:
            bissues[:, class_num] = binary_label_issues
        else:
            label_issues_list.append(binary_label_issues)
            labels_list.append(label)
            pred_probs_list.append(pred_probs_binary)
    if return_indices_ranked_by is None:
        return bissues
    else:
        return label_issues_list, labels_list, pred_probs_list
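# --- Illustrative usage sketch (not part of the library source) ---
# Per-class inspection with `find_multilabel_issues_per_class`, reusing the
# kind of toy data from the sketch above (fabricated for illustration only).
# With default arguments the result is an (N, K) Boolean array whose column k
# flags apparent annotation errors for class k; with `return_indices_ranked_by`
# set, it is instead the 3-tuple of per-class lists described in the docstring.
if __name__ == "__main__":
    toy_labels = [[0], [0, 1], [1], [2], [0, 2]]
    toy_pred_probs = np.array(
        [
            [0.9, 0.1, 0.1],
            [0.8, 0.8, 0.1],
            [0.1, 0.9, 0.2],
            [0.2, 0.1, 0.9],
            [0.9, 0.2, 0.1],
        ]
    )

    per_class_issues = find_multilabel_issues_per_class(
        labels=toy_labels, pred_probs=toy_pred_probs
    )
    for k in range(per_class_issues.shape[1]):
        n_flagged = per_class_issues[:, k].sum()
        print(f"class {k}: {n_flagged} example(s) with an apparently bad annotation")

    # Per-class ranked indices, plus one-hot labels and one-vs-rest pred_probs
    # (useful for computing per-class label quality scores afterwards):
    issues_list, labels_list, pred_probs_list = find_multilabel_issues_per_class(
        labels=toy_labels,
        pred_probs=toy_pred_probs,
        return_indices_ranked_by="self_confidence",
    )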