Source code for cleanlab.datalab.internal.issue_manager.underperforming_group

# Copyright (C) 2017-2023  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, Optional, Union, Tuple
import warnings
import inspect

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.spatial.distance import euclidean
from sklearn.neighbors import NearestNeighbors
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import DBSCAN

from cleanlab.datalab.internal.issue_manager import IssueManager
from cleanlab.rank import get_self_confidence_for_each_label

if TYPE_CHECKING:  # pragma: no cover
    import numpy.typing as npt
    from cleanlab.datalab.datalab import Datalab


CLUSTERING_ALGO = "DBSCAN"
CLUSTERING_PARAMS_DEFAULT = {"metric": "precomputed"}


class UnderperformingGroupIssueManager(IssueManager):
    """Manages issues related to underperforming group examples.

    Note: The `min_cluster_samples` argument should not be confused with the
    `min_samples` argument of sklearn.cluster.DBSCAN.

    Examples
    --------
    >>> from cleanlab import Datalab
    >>> import numpy as np
    >>> X = np.random.normal(size=(50, 2))
    >>> y = np.random.randint(2, size=50)
    >>> pred_probs = X / X.sum(axis=1, keepdims=True)
    >>> data = {"X": X, "y": y}
    >>> lab = Datalab(data, label_name="y")
    >>> issue_types = {"underperforming_group": {"clustering_kwargs": {"eps": 0.5}}}
    >>> lab.find_issues(pred_probs=pred_probs, features=X, issue_types=issue_types)
    """

    description: ClassVar[
        str
    ] = """An underperforming group refers to a collection of “hard” examples
    for which the model predictions are poor. The quality of predictions is computed using the
    :py:func:`get_self_confidence_for_each_label <cleanlab.rank.get_self_confidence_for_each_label>` function.
    """
    issue_name: ClassVar[str] = "underperforming_group"
    verbosity_levels = {
        0: [],
        1: [],
        2: ["threshold"],
    }

    OUTLIER_CLUSTER_LABELS: ClassVar[Tuple[int]] = (-1,)
    """Specifies labels considered as outliers by the clustering algorithm."""

    NO_UNDERPERFORMING_CLUSTER_ID: ClassVar[int] = min(OUTLIER_CLUSTER_LABELS) - 1
    """Constant to signify absence of any underperforming cluster."""

    def __init__(
        self,
        datalab: Datalab,
        metric: Optional[Union[str, Callable]] = None,
        threshold: float = 0.1,
        k: int = 10,
        clustering_kwargs: Dict[str, Any] = {},
        min_cluster_samples: int = 5,
        **_: Any,
    ):
        super().__init__(datalab)
        self.metric = metric
        self.threshold = self._set_threshold(threshold)
        self.k = k
        self.clustering_kwargs = clustering_kwargs
        self.min_cluster_samples = min_cluster_samples
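    # Illustrative note (not part of the original module): with
    # OUTLIER_CLUSTER_LABELS == (-1,), NO_UNDERPERFORMING_CLUSTER_ID evaluates
    # to -2, an ID that DBSCAN never assigns. Comparing cluster IDs against it
    # therefore yields an all-False issue mask when no cluster is flagged:
    #
    #   cluster_ids = np.array([-1, 0, 0, 1])
    #   cluster_ids == -2  # -> array([False, False, False, False])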
    def find_issues(
        self,
        pred_probs: npt.NDArray,
        features: Optional[npt.NDArray] = None,
        cluster_ids: Optional[npt.NDArray[np.int_]] = None,
        **kwargs: Any,
    ) -> None:
        labels = self.datalab.labels
        if not isinstance(labels, np.ndarray):
            error_msg = (
                f"Labels must be a numpy array of shape (n_samples,) for UnderperformingGroupIssueManager. "
                f"Got {type(labels)} instead."
            )
            raise TypeError(error_msg)
        if cluster_ids is None:
            knn_graph = self.set_knn_graph(features, kwargs)
            cluster_ids = self.perform_clustering(knn_graph)
            performed_clustering = True
        else:
            if self.clustering_kwargs:
                warnings.warn(
                    "`clustering_kwargs` will not be used since `cluster_ids` have been passed."
                )
            performed_clustering = False
            knn_graph = None
        unique_cluster_ids = self.filter_cluster_ids(cluster_ids)
        if not unique_cluster_ids.size:
            raise ValueError(
                "No meaningful clusters were generated for determining underperforming group."
            )
        n_clusters = len(unique_cluster_ids)
        worst_cluster_id, worst_cluster_ratio = self.get_worst_cluster(
            cluster_ids, unique_cluster_ids, labels, pred_probs
        )
        is_issue_column = cluster_ids == worst_cluster_id
        scores = np.where(is_issue_column, worst_cluster_ratio, 1)
        self.issues = pd.DataFrame(
            {
                f"is_{self.issue_name}_issue": is_issue_column,
                self.issue_score_key: scores,
            },
        )
        self.summary = self.make_summary(score=worst_cluster_ratio)
        self.info = self.collect_info(
            knn_graph=knn_graph,
            n_clusters=n_clusters,
            cluster_ids=cluster_ids,
            performed_clustering=performed_clustering,
            worst_cluster_id=worst_cluster_id,
        )
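    # Illustrative sketch (not part of the original module): clustering can be
    # skipped entirely by supplying precomputed cluster IDs, e.g. from KMeans.
    # `X`, `pred_probs`, and `lab` are assumed to be defined as in the
    # class-level example above:
    #
    #   from sklearn.cluster import KMeans
    #   cluster_ids = KMeans(n_clusters=5, n_init=10).fit_predict(X)
    #   lab.find_issues(
    #       pred_probs=pred_probs,
    #       issue_types={"underperforming_group": {"cluster_ids": cluster_ids}},
    #   )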
    def set_knn_graph(
        self, features: Optional[npt.NDArray], find_issues_kwargs: Dict[str, Any]
    ) -> csr_matrix:
        knn_graph = self._process_knn_graph_from_inputs(find_issues_kwargs)
        old_knn_metric = self.datalab.get_info("statistics").get("knn_metric")
        metric_changes = self.metric and self.metric != old_knn_metric
        if knn_graph is None or metric_changes:
            if features is None:
                raise ValueError(
                    "If a knn_graph is not provided, features must be provided to fit a new knn."
                )
            if self.metric is None:
                self.metric = (
                    "cosine"
                    if features.shape[1] > 3
                    else "euclidean"
                    if features.shape[0] > 100
                    else euclidean
                )
            knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric)
            if self.metric and self.metric != knn.metric:
                warnings.warn(
                    f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. "
                    "Most likely an existing NearestNeighbors object was passed in, but a different "
                    "metric was specified."
                )
            self.metric = knn.metric
            try:
                check_is_fitted(knn)
            except NotFittedError:
                knn.fit(features)
            knn_graph = knn.kneighbors_graph(mode="distance")
        return knn_graph
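    # Illustrative sketch (not part of the original module): a precomputed
    # sparse knn graph may also be passed to `find_issues`, in which case the
    # NearestNeighbors fit above is skipped. `X`, `pred_probs`, and `lab` are
    # assumed to be defined as in the class-level example:
    #
    #   from sklearn.neighbors import NearestNeighbors
    #   knn = NearestNeighbors(n_neighbors=10, metric="euclidean").fit(X)
    #   knn_graph = knn.kneighbors_graph(mode="distance")
    #   lab.find_issues(pred_probs=pred_probs, knn_graph=knn_graph)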
    def perform_clustering(self, knn_graph: csr_matrix) -> npt.NDArray[np.int_]:
        """Perform clustering of datapoints using a knn graph as the distance matrix.

        Args:
            knn_graph (csr_matrix): Sparse distance matrix.

        Returns:
            cluster_ids (npt.NDArray[np.int_]): Cluster IDs for each datapoint.
        """
        # Forward only those entries of `clustering_kwargs` that are valid DBSCAN parameters.
        DBSCAN_VALID_KEYS = inspect.signature(DBSCAN).parameters.keys()
        dbscan_params = {
            key: value
            for key, value in ((k, self.clustering_kwargs.get(k, None)) for k in DBSCAN_VALID_KEYS)
            if value is not None
        }
        dbscan_params["metric"] = "precomputed"
        clusterer = DBSCAN(**dbscan_params)
        cluster_ids = clusterer.fit_predict(knn_graph.copy())  # Copy to avoid modification by DBSCAN
        return cluster_ids
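    # Illustrative sketch (not part of the original module): with
    # metric="precomputed", DBSCAN treats the sparse knn graph as a distance
    # matrix whose absent entries mean "not neighbors", so only a point's k
    # nearest neighbors can contribute to its density:
    #
    #   import numpy as np
    #   from sklearn.cluster import DBSCAN
    #   from sklearn.neighbors import NearestNeighbors
    #
    #   X = np.random.rand(100, 2)
    #   graph = NearestNeighbors(n_neighbors=10).fit(X).kneighbors_graph(mode="distance")
    #   ids = DBSCAN(eps=0.3, metric="precomputed").fit_predict(graph)  # -1 marks outliers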
    def filter_cluster_ids(self, cluster_ids: npt.NDArray[np.int_]) -> npt.NDArray[np.int_]:
        """Remove outlier clusters and return IDs of clusters with at least
        `self.min_cluster_samples` datapoints.

        Args:
            cluster_ids (npt.NDArray[np.int_]): Cluster IDs for each datapoint.

        Returns:
            unique_cluster_ids (npt.NDArray[np.int_]): Unique cluster IDs remaining after removing
            outlier clusters and clusters with fewer than `self.min_cluster_samples` datapoints.
        """
        unique_cluster_ids = np.array(
            [label for label in set(cluster_ids) if label not in self.OUTLIER_CLUSTER_LABELS]
        )
        # Count cluster sizes, ignoring datapoints labeled as outliers.
        frequencies = np.bincount(cluster_ids[~np.isin(cluster_ids, self.OUTLIER_CLUSTER_LABELS)])
        unique_cluster_ids = np.array(
            [
                cluster_id
                for cluster_id in unique_cluster_ids
                if frequencies[cluster_id] >= self.min_cluster_samples
            ]
        )
        return unique_cluster_ids
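    # Illustrative sketch (not part of the original module), assuming a
    # hypothetical `manager` instance with the default min_cluster_samples=5:
    # the outlier label -1 and the undersized cluster 0 are dropped, while
    # cluster 1 (5 members) survives:
    #
    #   cluster_ids = np.array([-1, -1, 0, 0, 1, 1, 1, 1, 1])
    #   manager.filter_cluster_ids(cluster_ids)  # -> array([1])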
    def get_worst_cluster(
        self,
        cluster_ids: npt.NDArray[np.int_],
        unique_cluster_ids: npt.NDArray[np.int_],
        labels: npt.NDArray,
        pred_probs: npt.NDArray,
    ) -> Tuple[int, float]:
        """Get ID and quality score of the underperforming cluster.

        Args:
            cluster_ids (npt.NDArray[np.int_]): Cluster IDs for each datapoint.
            unique_cluster_ids (npt.NDArray[np.int_]): Unique cluster IDs to evaluate.
            labels (npt.NDArray): Given label for each datapoint.
            pred_probs (npt.NDArray): Predicted class probabilities for each datapoint.

        Returns:
            Tuple[int, float]: (Underperforming Cluster ID, Cluster Quality Score)
        """
        worst_cluster_performance = 1  # Largest possible probability value
        worst_cluster_id = min(unique_cluster_ids) - 1
        for cluster_id in unique_cluster_ids:
            cluster_mask = cluster_ids == cluster_id
            cur_cluster_labels = labels[cluster_mask]
            cur_cluster_pred_probs = pred_probs[cluster_mask]
            cluster_performance = get_self_confidence_for_each_label(
                cur_cluster_labels, cur_cluster_pred_probs
            ).mean()
            if cluster_performance < worst_cluster_performance:
                worst_cluster_performance = cluster_performance
                worst_cluster_id = cluster_id
        mean_performance = get_self_confidence_for_each_label(labels, pred_probs).mean()
        worst_cluster_ratio = min(worst_cluster_performance / mean_performance, 1.0)
        worst_cluster_id = (
            worst_cluster_id
            if worst_cluster_ratio < self.threshold
            else self.NO_UNDERPERFORMING_CLUSTER_ID
        )
        return worst_cluster_id, worst_cluster_ratio
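    # Illustrative note (not part of the original module): the score compares
    # the worst cluster's mean self-confidence against the dataset-wide mean.
    # For instance, with a dataset mean of 0.8 and a worst-cluster mean of 0.06:
    #
    #   worst_cluster_ratio = 0.06 / 0.8  # = 0.075 < default threshold 0.1
    #
    # so that cluster would be reported as underperforming.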
    def _process_knn_graph_from_inputs(self, kwargs: Dict[str, Any]) -> Union[csr_matrix, None]:
        """Determine if a knn_graph is provided in the kwargs or if one is already stored in the
        associated Datalab instance."""
        knn_graph_kwargs: Optional[csr_matrix] = kwargs.get("knn_graph", None)
        knn_graph_stats = self.datalab.get_info("statistics").get("weighted_knn_graph", None)

        knn_graph: Optional[csr_matrix] = None
        if knn_graph_kwargs is not None:
            knn_graph = knn_graph_kwargs
        elif knn_graph_stats is not None:
            knn_graph = knn_graph_stats

        if isinstance(knn_graph, csr_matrix) and kwargs.get("k", 0) > (
            knn_graph.nnz // knn_graph.shape[0]
        ):
            # If the provided knn graph is insufficient, then we need to recompute the knn graph
            # with the provided features
            knn_graph = None
        return knn_graph
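    # Illustrative note (not part of the original module): for a knn graph in
    # which every row stores the same number of neighbors, nnz // shape[0] is
    # the neighbors-per-row count, so a cached graph built with fewer neighbors
    # than the requested `k` is discarded and recomputed:
    #
    #   graph = NearestNeighbors(n_neighbors=10).fit(X).kneighbors_graph(mode="distance")
    #   graph.nnz // graph.shape[0]  # -> 10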
    def collect_info(
        self,
        knn_graph: Optional[csr_matrix],
        n_clusters: int,
        cluster_ids: npt.NDArray[np.int_],
        performed_clustering: bool,
        worst_cluster_id: int,
    ) -> Dict[str, Any]:
        params_dict = {
            "k": self.k,
            "metric": self.metric,
            "threshold": self.threshold,
        }
        knn_info_dict = {}
        if knn_graph is not None:
            N = knn_graph.shape[0]
            dists = knn_graph.data.reshape(N, -1)[:, 0]
            nn_ids = knn_graph.indices.reshape(N, -1)[:, 0]
            knn_info_dict = {
                "nearest_neighbor": nn_ids.tolist(),
                "distance_to_nearest_neighbor": dists.tolist(),
            }
        statistics_dict = self._build_statistics_dictionary(knn_graph=knn_graph)
        cluster_stat_dict = self._get_cluster_statistics(
            n_clusters=n_clusters,
            cluster_ids=cluster_ids,
            performed_clustering=performed_clustering,
            worst_cluster_id=worst_cluster_id,
        )
        info_dict = {
            **params_dict,
            **knn_info_dict,
            **statistics_dict,
            **cluster_stat_dict,
        }
        return info_dict
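    # Illustrative note (not part of the original module): the reshape in
    # `collect_info` assumes each row of the knn graph stores the same number
    # of neighbors in ascending-distance order (as produced by
    # `kneighbors_graph(mode="distance")`), so column 0 of the reshaped data
    # holds each point's distance to its nearest neighbor.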
    def _build_statistics_dictionary(
        self, knn_graph: Optional[csr_matrix]
    ) -> Dict[str, Dict[str, Any]]:
        statistics_dict: Dict[str, Dict[str, Any]] = {"statistics": {}}

        # Add the knn graph as a statistic if necessary
        graph_key = "weighted_knn_graph"
        old_knn_graph = self.datalab.get_info("statistics").get(graph_key, None)
        old_graph_exists = old_knn_graph is not None
        prefer_new_graph = (
            not old_graph_exists
            or (isinstance(knn_graph, csr_matrix) and knn_graph.nnz > old_knn_graph.nnz)
            or self.metric != self.datalab.get_info("statistics").get("knn_metric", None)
        )
        if prefer_new_graph:
            if knn_graph is not None:
                statistics_dict["statistics"][graph_key] = knn_graph
                if self.metric is not None:
                    statistics_dict["statistics"]["knn_metric"] = self.metric
        return statistics_dict

    def _get_cluster_statistics(
        self,
        n_clusters: int,
        cluster_ids: npt.NDArray[np.int_],
        performed_clustering: bool,
        worst_cluster_id: int,
    ) -> Dict[str, Dict[str, Any]]:
        """Get relevant cluster statistics.

        Args:
            n_clusters (int): Number of clusters.
            cluster_ids (npt.NDArray[np.int_]): Cluster IDs for each datapoint.
            performed_clustering (bool): Set to True to indicate that clustering was performed on
                `features` passed to `find_issues`. Set to False to indicate that `cluster_ids`
                were explicitly passed to `find_issues`.
            worst_cluster_id (int): Underperforming cluster ID.

        Returns:
            cluster_stats (Dict[str, Dict[str, Any]]): Cluster statistics.
        """
        cluster_stats: Dict[str, Dict[str, Any]] = {
            "clustering": {
                "algorithm": None,
                "params": {},
                "stats": {
                    "n_clusters": n_clusters,
                    "cluster_ids": cluster_ids,
                    "underperforming_cluster_id": worst_cluster_id,
                },
            }
        }
        if performed_clustering:
            cluster_stats["clustering"].update(
                {"algorithm": CLUSTERING_ALGO, "params": CLUSTERING_PARAMS_DEFAULT}
            )
        return cluster_stats

    def _set_threshold(
        self,
        threshold: float,
    ) -> float:
        """Ensure the threshold is non-negative, clipping it to 0 if necessary."""
        if threshold < 0:
            warnings.warn(
                f"Provided threshold {threshold} is less than 0. Setting threshold to 0. "
                "This may indicate that either only a few examples are in the dataset, "
                "or the data is heavily skewed."
            )
            threshold = 0
        return threshold