# Source code for cleanlab.datalab.internal.issue_manager.regression.label

# Copyright (C) 2017-2023  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations

from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional
import numpy as np
import pandas as pd

from cleanlab.regression.learn import CleanLearning
from cleanlab.datalab.internal.issue_manager import IssueManager
from cleanlab.regression.rank import get_label_quality_scores

if TYPE_CHECKING:  # pragma: no cover
    from cleanlab.datalab.datalab import Datalab


class RegressionLabelIssueManager(IssueManager):
    """Manages label issues in a Datalab for regression tasks.

    Parameters
    ----------
    datalab :
        A Datalab instance.

    clean_learning_kwargs :
        Keyword arguments to pass to the
        :py:meth:`regression.learn.CleanLearning <cleanlab.regression.learn.CleanLearning>`
        constructor.

    threshold :
        The threshold to use to determine if an example has a label issue. It is a multiplier
        of the median label quality score that sets the absolute threshold. Only used if
        predictions are provided to `~RegressionLabelIssueManager.find_issues`, not if
        features are provided. Default is 0.05.
    """

    description: ClassVar[
        str
    ] = """Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    """

    issue_name: ClassVar[str] = "label"
    # Per-verbosity-level extra columns to surface; all empty for now.
    verbosity_levels: ClassVar[Dict[int, list]] = {
        0: [],
        1: [],
        2: [],
        3: [],  # TODO
    }

    def __init__(
        self,
        datalab: Datalab,
        clean_learning_kwargs: Optional[Dict[str, Any]] = None,
        threshold: float = 0.05,
        health_summary_parameters: Optional[Dict[str, Any]] = None,
        **_,
    ) -> None:
        # NOTE(review): `health_summary_parameters` is accepted for signature
        # consistency with the classification manager but is not used here.
        super().__init__(datalab)
        self.cl = CleanLearning(**(clean_learning_kwargs or {}))
        # This is a field for prioritizing features only when using a custom model
        self._uses_custom_model = "model" in (clean_learning_kwargs or {})
        self.threshold = threshold

    def find_issues(
        self,
        features: Optional[np.ndarray] = None,
        predictions: Optional[np.ndarray] = None,
        **kwargs,
    ) -> None:
        """Find label issues in the datalab.

        Populates ``self.issues``, ``self.summary`` and ``self.info`` as a side effect.

        .. admonition:: Priority Order for finding issues:

            1. Custom Model: Requires `features` to be passed to this method.
               Used if a model is set up in the constructor.
            2. Predictions: Uses `predictions` if provided and no model is set up
               in the constructor.
            3. Default Model: Defaults to a standard model using `features` if no
               model or predictions are provided.
        """
        if features is None and predictions is None:
            raise ValueError(
                "Regression requires numerical `features` or `predictions` "
                "to be passed in as an argument to `find_issues`."
            )
        if features is None and self._uses_custom_model:
            raise ValueError(
                "Regression requires numerical `features` to be passed in as an argument to `find_issues` "
                "when using a custom model."
            )

        # If features are provided and either a custom model is used or no predictions are provided
        use_features = features is not None and (self._uses_custom_model or predictions is None)
        labels = self.datalab.labels
        if not isinstance(labels, np.ndarray):
            error_msg = (
                f"Expected labels to be a numpy array of shape (n_samples,) to use with RegressionLabelIssueManager, "
                f"but got {type(labels)} instead."
            )
            raise TypeError(error_msg)
        if use_features:
            assert features is not None  # mypy won't narrow the type for some reason
            self.issues = find_issues_with_features(
                features=features,
                y=labels,
                cl=self.cl,
                **kwargs,  # function sanitizes kwargs
            )
            # CleanLearning reports the score as "label_quality"; align it with
            # the issue-manager naming convention.
            self.issues.rename(columns={"label_quality": self.issue_score_key}, inplace=True)
        # Otherwise, if predictions are provided, process them
        else:
            assert predictions is not None  # mypy won't narrow the type for some reason
            self.issues = find_issues_with_predictions(
                predictions=predictions,
                y=labels,
                # Explicit threshold overrides any "threshold" key in kwargs.
                **{**kwargs, **{"threshold": self.threshold}},  # function sanitizes kwargs
            )

        # Get a summarized dataframe of the label issues
        self.summary = self.make_summary(score=self.issues[self.issue_score_key].mean())

        # Collect info about the label issues
        self.info = self.collect_info(issues=self.issues)

        # Drop columns from issues that are in the info
        self.issues = self.issues.drop(columns=["given_label", "predicted_label"])

    def collect_info(self, issues: pd.DataFrame) -> dict:
        """Aggregate summary statistics and per-example labels from the issues dataframe."""
        issues_info = {
            "num_label_issues": sum(issues[f"is_{self.issue_name}_issue"]),
            "average_label_quality": issues[self.issue_score_key].mean(),
            "given_label": issues["given_label"].tolist(),
            "predicted_label": issues["predicted_label"].tolist(),
        }

        # health_summary_info, cl_info kept just for consistency with classification,
        # but it could be just `return issues_info`
        health_summary_info: dict = {}
        cl_info: dict = {}

        info_dict = {
            **issues_info,
            **health_summary_info,
            **cl_info,
        }

        return info_dict
def find_issues_with_predictions(
    predictions: np.ndarray,
    y: np.ndarray,
    threshold: float,
    **kwargs,
) -> pd.DataFrame:
    """Find label issues in a regression dataset based on predictions.

    An example is flagged as a label issue when its label quality score falls
    below ``threshold`` times the median quality score.

    Parameters
    ----------
    predictions :
        The predictions from a regression model.

    y :
        The given labels.

    threshold :
        Multiplier of the median label quality score that sets the absolute
        cutoff below which an example is flagged.

    **kwargs :
        Various keyword arguments. Only ``method`` is forwarded to the scoring
        function; everything else is ignored.

    Returns
    -------
    issues :
        A dataframe with columns:

        - is_label_issue : bool
            True if the example has a label issue.
        - label_score : float
            The quality score of the label.
        - given_label : float
            The given label (same as ``y``).
        - predicted_label : float
            The predicted label (same as ``predictions``).
    """
    # Forward only the kwargs that get_label_quality_scores understands,
    # dropping any that are unset.
    score_kwargs = {}
    method = kwargs.get("method")
    if method is not None:
        score_kwargs["method"] = method

    scores = get_label_quality_scores(labels=y, predictions=predictions, **score_kwargs)

    # Absolute cutoff is a fraction of the median score across the dataset.
    cutoff = np.median(scores) * threshold
    flagged = scores < cutoff

    return pd.DataFrame(
        {
            "is_label_issue": flagged,
            "label_score": scores,
            "given_label": y,
            "predicted_label": predictions,
        }
    )
def find_issues_with_features(
    features: np.ndarray,
    y: np.ndarray,
    cl: CleanLearning,
    **kwargs,
) -> pd.DataFrame:
    """Find label issues in a regression dataset based on features.

    This delegates the work to the ``CleanLearning.find_label_issues`` method.

    Parameters
    ----------
    features :
        The numerical features from a regression dataset.

    y :
        The given labels.

    cl :
        The ``CleanLearning`` instance that performs the actual issue search.

    **kwargs :
        Various keyword arguments. Only a whitelisted subset is forwarded to
        ``find_label_issues``; everything else is ignored.

    Returns
    -------
    issues :
        A dataframe with columns:

        - is_label_issue : bool
            True if the example has a label issue.
        - label_score : float
            The quality score of the label.
        - given_label : float
            The given label (same as ``y``).
        - predicted_label : float
            The predicted label, as determined by
            ``CleanLearning.find_label_issues``.
    """
    # Only these kwargs are meaningful to CleanLearning.find_label_issues;
    # unset (None) values are dropped so its own defaults apply.
    allowed = (
        "uncertainty",
        "coarse_search_range",
        "fine_search_size",
        "save_space",
        "model_kwargs",
    )
    sanitized = {key: value for key, value in kwargs.items() if key in allowed and value is not None}
    return cl.find_label_issues(X=features, y=y, **sanitized)