# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
"""
Helper functions used internally for outlier detection tasks.
"""
from typing import Optional
import numpy as np
from cleanlab.internal.constants import EPSILON
def correct_precision_errors(
    scores: np.ndarray,
    avg_distances: np.ndarray,
    metric: str,
    C: int = 100,
    p: Optional[int] = None,
):
    """
    Ensure that scores where avg_distances are below the tolerance threshold get a score of one.

    The tolerance is derived from the machine epsilon of 64-bit floats, scaled by ``C``
    and adjusted according to ``metric``. Note that ``scores`` is modified in place
    and the same array object is returned.

    Parameters
    ----------
    scores :
        An array of scores of shape ``(N)``, where N is the number of examples.
        Each entry represents a score between 0 and 1.
        Entries whose average distance is below the tolerance are overwritten with 1.

    avg_distances :
        An array of distances of shape ``(N)``, where N is the number of examples.
        Each entry represents an example's average distance to its k nearest neighbors.

    metric :
        The metric used by the knn algorithm to calculate the distances.
        It must be 'cosine', 'euclidean' or 'minkowski', otherwise this function does nothing.

    C :
        Multiplier used to increase the tolerance of the acceptable precision differences.
        It is a multiplicative factor of the machine epsilon that is used to calculate the tolerance.
        For the type of values that are used in the distances, a value of 100 should be a sensible
        default value for small values of the distances, below the order of 1.

    p :
        This value is only used when metric is 'minkowski'.
        A ValueError will be raised if metric is 'minkowski' and 'p' was not provided.

    Returns
    -------
    fixed_scores :
        An array of scores of shape ``(N,)`` for N examples with scores between 0 and 1.
        This is the same object as the ``scores`` argument.

    Raises
    ------
    ValueError
        If ``metric`` is 'minkowski' and ``p`` is None.
    """
    # Use np.float64 explicitly: the `np.float_` alias was removed in NumPy 2.0,
    # and np.float64 is the same type on older NumPy releases.
    float_info = np.finfo(np.float64)

    if metric == "cosine":
        tolerance = C * float_info.epsneg
    elif metric == "euclidean":
        # Square root of the scaled epsilon (equivalent to the minkowski case with p=2).
        tolerance = np.sqrt(C * float_info.eps)
    elif metric == "minkowski":
        if p is None:
            raise ValueError("When metric is 'minkowski' you must specify the 'p' parameter")
        tolerance = (C * float_info.eps) ** (1 / p)
    else:
        # Unsupported metric: leave the scores untouched.
        return scores

    # Distances strictly below the tolerance are treated as zero, so the score becomes one.
    candidates_mask = avg_distances < tolerance
    scores[candidates_mask] = 1
    return scores