Source code for cleanlab.experimental.span_classification

"""
Methods to find label issues in span classification datasets (text data), where each token in a sentence receives one or more class labels.

The underlying label error detection algorithms are in `cleanlab.token_classification`.
"""

import numpy as np
from typing import List, Tuple, Optional

from cleanlab.token_classification.filter import find_label_issues as find_label_issues_token
from cleanlab.token_classification.summary import display_issues as display_issues_token
from cleanlab.token_classification.rank import (
    get_label_quality_scores as get_label_quality_scores_token,
)


def find_label_issues(
    labels: list,
    pred_probs: list,
):
    """Identifies tokens with label issues in a span classification dataset.

    Tokens identified with issues will be ranked by their individual label quality score.

    To rank the sentences based on their overall label quality, use
    :py:func:`experimental.span_classification.get_label_quality_scores <cleanlab.experimental.span_classification.get_label_quality_scores>`.

    Parameters
    ----------
    labels:
        Nested list of given labels for all tokens.
        Refer to documentation for this argument in
        :py:func:`token_classification.filter.find_label_issues <cleanlab.token_classification.filter.find_label_issues>`
        for further details.

        Note: Currently, only a single span class is supported.

    pred_probs:
        List of arrays, one per sentence, where each array has shape ``(T,)`` and holds the
        model-predicted probability that each of the sentence's ``T`` tokens belongs to the span class.
        Refer to documentation for this argument in
        :py:func:`token_classification.filter.find_label_issues <cleanlab.token_classification.filter.find_label_issues>`
        for further details.

    Returns
    -------
    issues:
        List of label issues identified by cleanlab, such that each element is a tuple ``(i, j)``,
        which indicates that the `j`-th token of the `i`-th sentence has a label issue.

        These tuples are ordered in the `issues` list by the likelihood that the corresponding token is mislabeled.

        Use :py:func:`experimental.span_classification.display_issues <cleanlab.experimental.span_classification.display_issues>`
        to view these issues within the original sentences.

    Examples
    --------
    >>> import numpy as np
    >>> from cleanlab.experimental.span_classification import find_label_issues
    >>> labels = [[0, 0, 1, 1], [1, 1, 0]]
    >>> pred_probs = [
    ...     np.array([0.9, 0.9, 0.9, 0.1]),
    ...     np.array([0.1, 0.1, 0.9]),
    ... ]
    >>> find_label_issues(labels, pred_probs)
    """
    pred_probs_token = _get_pred_prob_token(pred_probs)
    return find_label_issues_token(labels, pred_probs_token)
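
# Hedged usage sketch (not part of the library): how the (i, j) tuples returned by
# `find_label_issues` might be inspected alongside the original tokens. The `tokens`
# list below is a made-up example; real token lists come from your own tokenizer.
#
# >>> import numpy as np
# >>> from cleanlab.experimental.span_classification import find_label_issues
# >>> tokens = [["I", "live", "in", "Berlin"], ["See", "you", "soon"]]
# >>> labels = [[0, 0, 1, 1], [1, 1, 0]]       # 1 = token is inside the span
# >>> pred_probs = [np.array([0.9, 0.9, 0.9, 0.1]), np.array([0.1, 0.1, 0.9])]
# >>> issues = find_label_issues(labels, pred_probs)
# >>> for i, j in issues:                      # each tuple flags token j of sentence i
# ...     print(tokens[i][j], labels[i][j])
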
def display_issues(
    issues: list,
    tokens: List[List[str]],
    *,
    labels: Optional[list] = None,
    pred_probs: Optional[list] = None,
    exclude: List[Tuple[int, int]] = [],
    class_names: Optional[List[str]] = None,
    top: int = 20,
) -> None:
    """
    See documentation of :py:meth:`token_classification.summary.display_issues <cleanlab.token_classification.summary.display_issues>` for description.
    """
    display_issues_token(
        issues,
        tokens,
        labels=labels,
        pred_probs=pred_probs,
        exclude=exclude,
        class_names=class_names,
        top=top,
    )
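
# Hedged usage sketch (not part of the library): printing the flagged tokens in the context
# of their sentences. `issues` and `tokens` are assumed to be the same objects as in the
# `find_label_issues` example above.
#
# >>> display_issues(issues, tokens)
# >>> # Optionally pass `labels`, `pred_probs`, `class_names`, or `top` to show more detail.
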
def get_label_quality_scores(
    labels: list,
    pred_probs: list,
    **kwargs,
) -> Tuple[np.ndarray, list]:
    """
    See documentation of :py:meth:`token_classification.rank.get_label_quality_scores <cleanlab.token_classification.rank.get_label_quality_scores>` for description.
    """
    pred_probs_token = _get_pred_prob_token(pred_probs)
    return get_label_quality_scores_token(labels, pred_probs_token, **kwargs)
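
# Hedged usage sketch (not part of the library): scoring label quality. The first return
# value holds one overall quality score per sentence (lower scores indicate sentences more
# likely to contain a label error); the second holds the per-token scores. `labels` and
# `pred_probs` are assumed to be the same objects as in the `find_label_issues` example above.
#
# >>> sentence_scores, token_scores = get_label_quality_scores(labels, pred_probs)
# >>> worst_sentence = np.argmin(sentence_scores)   # index of the lowest-quality sentence
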
def _get_pred_prob_token(pred_probs: list) -> list:
    """Converts span classification `pred_probs` (per-token span-membership probabilities)
    into the per-token class probabilities expected by `cleanlab.token_classification`."""
    pred_probs_token = []
    for probs in pred_probs:
        # Stack P(not in span) and P(in span) into an array of shape (T, 2).
        pred_probs_token.append(np.stack([1 - probs, probs], axis=1))
    return pred_probs_token
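
# Hedged sketch (not part of the library) of what `_get_pred_prob_token` does for a single
# sentence: a 1-D array of span-membership probabilities becomes a (T, 2) array whose columns
# are P(not in span) and P(in span), the format `cleanlab.token_classification` expects.
#
# >>> probs = np.array([0.9, 0.1])
# >>> _get_pred_prob_token([probs])[0]
# array([[0.1, 0.9],
#        [0.9, 0.1]])
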