# Source code for cleanlab.baseline_methods

# Copyright (C) 2017-2022  Cleanlab Inc.
# This file is part of cleanlab.
# 
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.

# Baseline methods
# 
# Contains baseline methods for estimating label errors.
#
# These methods only work for single-label (not multi-label) tasks.

from __future__ import (
    print_function, absolute_import, division, unicode_literals, with_statement
)
from sklearn.metrics import confusion_matrix
from cleanlab.pruning import get_noise_indices
from cleanlab.latent_estimation import calibrate_confident_joint
import numpy as np


def baseline_argmax(psx, s):
    '''Flag every example whose most probable class differs from its label.

    This is the simplest baseline approach: anywhere ``argmax(psx) != s``
    is considered a label error.

    Parameters
    ----------
    psx : np.array (shape (N, K))
        P(label=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k,
        i.e. P(s=k|x). psx should have been computed using 3 (or higher) fold
        cross-validation.

    s : np.array (shape (N,))
        A discrete vector of noisy labels, i.e. some labels may be erroneous.
        Labels are compared against column indices of ``psx``, so they must
        use the same 0..K-1 encoding.

    Returns
    -------
    np.array (shape (N,), dtype bool)
        A boolean mask that is True at every index whose example is
        considered a label error.

    Raises
    ------
    ValueError
        If ``s`` does not contain exactly one label per row of ``psx``.'''
    predicted = np.argmax(psx, axis=1)
    labels = np.asarray(s)
    # Without this check, a mis-shaped `s` is silently broadcast against the
    # predictions (e.g. an (N, 1) column yields an (N, N) mask) instead of
    # producing one flag per example.
    if predicted.shape != labels.shape:
        raise ValueError(
            's must contain exactly one label per row of psx, but '
            's has shape {} while psx has {} row(s).'.format(
                labels.shape, predicted.shape[0],
            )
        )
    return predicted != labels
def baseline_argmax_confusion_matrix(
    psx,
    s,
    calibrate=False,
    prune_method='prune_by_noise_rate',
):
    '''Find label errors using a confusion matrix as the confident joint.

    This is a baseline approach that uses the confusion matrix of
    argmax(psx) and s as the confident joint, and then uses cleanlab
    (confident learning) to find the label errors using this matrix.

    Parameters
    ----------
    psx : np.array (shape (N, K))
        P(label=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k,
        i.e. P(s=k|x). psx should have been computed using 3 (or higher) fold
        cross-validation.

    s : np.array
        A discrete vector of noisy labels, i.e. some labels may be erroneous.
        Labels are matched against column indices of ``psx``, so they must
        use the same 0..K-1 encoding.

    calibrate : bool (default: False)
        If True, the confusion matrix is passed through
        ``calibrate_confident_joint(confident_joint, s)`` before it is used.

    prune_method : str (default: 'prune_by_noise_rate')
        Forwarded unchanged to ``get_noise_indices``; see that function for
        the accepted values.

    Returns
    -------
    The result of ``get_noise_indices``: a boolean mask that is True at every
    index whose example is considered a label error.'''
    # Pin the label set to all K classes. Otherwise sklearn infers it from the
    # values that happen to occur, and a class missing from both argmax(psx)
    # and s would shrink the matrix below K x K, mismatching psx's columns.
    num_classes = np.shape(psx)[1]
    # sklearn puts its first argument on the rows; transposing gives
    # rows = given label s, columns = argmax prediction.
    confident_joint = confusion_matrix(
        np.argmax(psx, axis=1),
        s,
        labels=np.arange(num_classes),
    ).T
    if calibrate:
        confident_joint = calibrate_confident_joint(confident_joint, s)
    return get_noise_indices(
        s=s,
        psx=psx,
        confident_joint=confident_joint,
        prune_method=prune_method,
    )
def baseline_argmax_calibrated_confusion_matrix(
    psx,
    s,
    prune_method='prune_by_noise_rate',
):
    '''Find label errors using a calibrated confusion-matrix confident joint.

    Convenience wrapper around ``baseline_argmax_confusion_matrix`` that
    always passes ``calibrate=True``, so the confident joint built from the
    confusion matrix is calibrated before cleanlab uses it to find the label
    errors.

    Parameters
    ----------
    psx : np.array (shape (N, K))
        P(s=k|x): K (noisy) class probabilities for each of the N examples.
        psx should have been computed using 3 (or higher) fold
        cross-validation.

    s : np.array
        A discrete vector of noisy labels, i.e. some labels may be erroneous.

    prune_method : str (default: 'prune_by_noise_rate')
        Forwarded unchanged to ``baseline_argmax_confusion_matrix``.

    Returns
    -------
    Whatever ``baseline_argmax_confusion_matrix`` returns for these
    arguments with calibration enabled.'''
    return baseline_argmax_confusion_matrix(
        psx,
        s,
        calibrate=True,
        prune_method=prune_method,
    )