Source code for cleanlab.data_valuation

# Copyright (C) 2017-2024  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.
"""
Methods for quantifying the value of each data point in a Machine Learning dataset.
Data Valuation helps us assess individual training data points' contributions to a ML model's predictive performance.
"""


from typing import Callable, Optional, Union

import numpy as np
from scipy.sparse import csr_matrix

from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index


def _knn_shapley_score(neighbor_indices: np.ndarray, y: np.ndarray, k: int) -> np.ndarray:
    """Compute the Data Shapley values of data points using neighbor indices in a K-Nearest Neighbors (KNN) graph.

    This function leverages equations (18) and (19) from the paper available at https://arxiv.org/abs/1908.08619
    for computational efficiency.

    Parameters
    ----------
    neighbor_indices :
        A 2D array where each row contains the indices of the k-nearest neighbors for each data point.
    y :
        A 1D array of target values corresponding to the data points.
    k :
        The number of nearest neighbors to consider for each data point.

    Notes
    -----
    - The training set is used as its own test set for the KNN-Shapley value computation, meaning y_test is the same as y_train.
    - `neighbor_indices` are assumed to be pre-sorted by distance, with the nearest neighbors appearing first, and with at least `k` neighbors.
    - Unlike the referenced paper, this implementation does not account for an upper error bound epsilon.
      Consequently, K* is treated as equal to K instead of K* = max(K, 1/epsilon).
        - This simplification implies that the term min(K, j + 1) will always be j + 1, which is offset by the
          corresponding denominator term in the inner loop.
        - Dividing by K in the end achieves the same result as dividing by K* in the paper.
    - The pre-allocated `scores` array incorporates equation (18) for j = k - 1, ensuring efficient computation.
    """
    N = y.shape[0]
    scores = np.zeros((N, N))

    for y_alpha, s_alpha, idx in zip(y, scores, neighbor_indices):
        y_neighbors = y[idx]
        ans_matches = (y_neighbors == y_alpha).flatten()
        for j in range(k - 2, -1, -1):
            s_alpha[idx[j]] = s_alpha[idx[j + 1]] + float(
                int(ans_matches[j]) - int(ans_matches[j + 1])
            )
    return np.mean(scores / k, axis=0)


[docs]def data_shapley_knn(
    labels: np.ndarray,
    *,
    features: Optional[np.ndarray] = None,
    knn_graph: Optional[csr_matrix] = None,
    metric: Optional[Union[str, Callable]] = None,
    k: int = 10,
) -> np.ndarray:
    """
    Compute the Data Shapley values of data points using a K-Nearest Neighbors (KNN) graph.

    This function calculates the contribution (Data Shapley value) of each data point in a dataset
    for model training, either directly from data features or using a precomputed KNN graph.

    The examples in the dataset with lowest data valuation scores contribute least
    to a trained ML model’s performance (those whose value falls below a threshold are flagged with this type of issue).
    The data valuation score is an approximate Data Shapley value, calculated based on the labels of the top k nearest neighbors of an example. Details on this KNN-Shapley value can be found in these papers:
    https://arxiv.org/abs/1908.08619 and https://arxiv.org/abs/1911.07128.

    Parameters
    ----------
    labels :
        An array of labels for the data points(only for multi-class classification datasets).
    features :
        Feature embeddings (vector representations) of every example in the dataset.

            Necessary if `knn_graph` is not supplied.

            If provided, this must be a 2D array with shape (num_examples, num_features).
    knn_graph :
        A precomputed sparse KNN graph. If not provided, it will be computed from the `features` using the specified `metric`.
    metric : Optional[str or Callable], default=None
        The distance metric for KNN graph construction.
        Supports metrics available in ``sklearn.neighbors.NearestNeighbors``
        Default metric is ``"cosine"`` for ``dim(features) > 3``, otherwise ``"euclidean"`` for lower-dimensional data.
        The euclidean is computed with an efficient implementation from scikit-learn when the number of examples is greater than 100.
        When the number of examples is 100 or fewer, a more numerically stable version of the euclidean distance from scipy is used.
    k :
        The number of neighbors to consider for the KNN graph and Data Shapley value computation.
        Must be less than the total number of data points.
        The value may not exceed the number of neighbors of each data point stored in the KNN graph.

    Returns
    -------
    scores :
        An array of transformed Data Shapley values for each data point, calibrated to indicate their relative importance.
        These scores have been adjusted to fall within 0 to 1.
        Values closer to 1 indicate data points that are highly influential and positively contribute to a trained ML model's performance.
        Conversely, scores below 0.5 indicate data points estimated to negatively impact model performance.

    Raises
    ------
    ValueError
        If neither `knn_graph` nor `features` are provided, or if `k` is larger than the number of examples in `features`.

    Examples
    --------
    >>> import numpy as np
    >>> from cleanlab.data_valuation import data_shapley_knn
    >>> labels = np.array([0, 1, 0, 1, 0])
    >>> features = np.array([[0, 1, 2, 3, 4]]).T
    >>> data_shapley_knn(labels=labels, features=features, k=4)
    array([0.55 , 0.525, 0.55 , 0.525, 0.55 ])
    """
    if knn_graph is None and features is None:
        raise ValueError("Either knn_graph or features must be provided.")

    # Use provided knn_graph or compute it from features
    if knn_graph is None:
        knn_graph, _ = create_knn_graph_and_index(features, n_neighbors=k, metric=metric)

    num_examples = labels.shape[0]
    distances = knn_graph.indices.reshape(num_examples, -1)
    scores = _knn_shapley_score(neighbor_indices=distances, y=labels, k=k)
    return 0.5 * (scores + 1)