# Source code for cleanlab.token_classification.filter

# Copyright (C) 2017-2022  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.

"""
Methods to find label issues in token classification datasets (text data), where each token in a sentence receives its own class label.
"""

import numpy as np
from typing import List, Tuple

from cleanlab.filter import find_label_issues as find_label_issues_main

def find_label_issues(
    labels: list,
    pred_probs: list,
    *,
    return_indices_ranked_by: str = "self_confidence",
) -> List[Tuple[int, int]]:
    """Identifies tokens with label issues in a token classification dataset.

    Tokens identified with issues will be ranked by their individual label quality score.

    Instead use :py:func:`token_classification.rank.get_label_quality_scores <cleanlab.token_classification.rank.get_label_quality_scores>`
    if you prefer to rank the sentences based on their overall label quality.

    Parameters
    ----------
    labels:
        Nested list of given labels for all tokens, such that `labels[i]` is a list of labels,
        one for each token in the `i`-th sentence.

        For a dataset with K classes, each label must be in 0, 1, ..., K-1.
        All classes must be present in the dataset.

    pred_probs:
        List of np arrays, such that `pred_probs[i]` has shape ``(T, K)`` if the `i`-th sentence
        contains T tokens.

        Each row of `pred_probs[i]` corresponds to a token `t` in the `i`-th sentence,
        and contains model-predicted probabilities that `t` belongs to each of the K possible classes.

        Columns of each `pred_probs[i]` should be ordered such that the probabilities correspond
        to class 0, 1, ..., K-1.

    return_indices_ranked_by: {"self_confidence", "normalized_margin", "confidence_weighted_entropy"}, default="self_confidence"
        Returned token-indices are sorted by their label quality score.

        See :py:func:`cleanlab.filter.find_label_issues <cleanlab.filter.find_label_issues>`
        documentation for more details on each label quality scoring method.

    Returns
    -------
    issues:
        List of label issues identified by cleanlab, such that each element is a tuple ``(i, j)``, which
        indicates that the `j`-th token of the `i`-th sentence has a label issue.

        These tuples are ordered in `issues` list based on the likelihood that the corresponding
        token is mislabeled.

        Use :py:func:`token_classification.summary.display_issues <cleanlab.token_classification.summary.display_issues>`
        to view these issues within the original sentences.

    Raises
    ------
    ValueError
        If `labels` and `pred_probs` contain a different number of sentences, or if any sentence
        has a different number of labels than rows of predicted probabilities.

    Examples
    --------
    >>> import numpy as np
    >>> from cleanlab.token_classification.filter import find_label_issues
    >>> labels = [[0, 0, 1], [0, 1]]
    >>> pred_probs = [
    ...     np.array([[0.9, 0.1], [0.7, 0.3], [0.05, 0.95]]),
    ...     np.array([[0.8, 0.2], [0.8, 0.2]]),
    ... ]
    >>> find_label_issues(labels, pred_probs)
    [(1, 1)]
    """
    # Validate alignment up front: if per-sentence token counts disagree but the
    # totals happen to match, the flat-index -> (sentence, token) mapping below
    # would silently point at the wrong tokens.
    if len(labels) != len(pred_probs):
        raise ValueError(
            f"labels and pred_probs must contain the same number of sentences, "
            f"but got {len(labels)} and {len(pred_probs)}."
        )
    for i, (sentence_labels, sentence_probs) in enumerate(zip(labels, pred_probs)):
        if len(sentence_labels) != len(sentence_probs):
            raise ValueError(
                f"Sentence {i} has {len(sentence_labels)} labels "
                f"but {len(sentence_probs)} rows in pred_probs."
            )

    # Flatten the per-sentence structure into one token-level dataset so the
    # generic (non-token) label-issue finder can be applied to it.
    labels_flatten = [token_label for sentence in labels for token_label in sentence]
    pred_probs_flatten = np.array(
        [token_probs for sentence_probs in pred_probs for token_probs in sentence_probs]
    )

    issues_main = find_label_issues_main(
        labels_flatten, pred_probs_flatten, return_indices_ranked_by=return_indices_ranked_by
    )

    # Position k in this list is the (sentence, token) pair of the k-th flattened token.
    token_positions = [
        (i, j) for i, sentence in enumerate(labels) for j in range(len(sentence))
    ]

    # Translate flat indices back to (sentence, token) pairs, preserving the ranking order.
    return [token_positions[flat_index] for flat_index in issues_main]