# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
"""
Helper methods used internally in cleanlab.token_classification
"""
import re
import string
import numpy as np
from termcolor import colored
from typing import List, Optional, Callable, Tuple, TypeVar
import numpy.typing as npt
T = TypeVar("T", bound=npt.NBitBase)


def get_sentence(words: List[str]) -> str:
    """
    Get the sentence formed by a list of words, with minor processing for readability.

    Parameters
    ----------
    words:
        list of word-level tokens

    Returns
    ----------
    sentence:
        sentence formed by the list of word-level tokens

    Examples
    --------
    >>> from cleanlab.internal.token_classification_utils import get_sentence
    >>> words = ["This", "is", "a", "sentence", "."]
    >>> get_sentence(words)
    'This is a sentence.'
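
    Tokens that begin with an apostrophe are re-attached to the preceding word
    (an extra illustrative example):

    >>> get_sentence(["Hello", ",", "it", "'s", "me", "."])
    "Hello, it's me."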
"""
sentence = ""
for word in words:
if word not in string.punctuation or word in ["-", "("]:
word = " " + word
sentence += word
sentence = sentence.replace(" '", "'").replace("( ", "(").strip()
return sentence


def filter_sentence(
    sentences: List[str],
    condition: Optional[Callable[[str], bool]] = None,
) -> Tuple[List[str], List[bool]]:
    """
    Filter sentences based on a condition, and return the filter mask.

    Parameters
    ----------
    sentences:
        list of sentences
    condition:
        sentence filtering condition

    Returns
    ---------
    sentences:
        filtered list of sentences
    mask:
        boolean mask such that `mask[i] == True` if the i'th sentence is included in the
        filtered sentences, otherwise `mask[i] == False`

    Examples
    --------
    >>> from cleanlab.internal.token_classification_utils import filter_sentence
    >>> sentences = ["Short sentence.", "This is a longer sentence."]
    >>> condition = lambda x: len(x.split()) > 2
    >>> long_sentences, _ = filter_sentence(sentences, condition)
    >>> long_sentences
    ['This is a longer sentence.']
    >>> document = ["# Headline", "Sentence 1.", "&", "Sentence 2."]
    >>> sentences, mask = filter_sentence(document)
    >>> sentences, mask
    (['Sentence 1.', 'Sentence 2.'], [False, True, False, True])
    """
    if not condition:
        # Default condition: keep sentences longer than one character that contain no "#"
        condition = lambda sentence: len(sentence) > 1 and "#" not in sentence
    mask = list(map(condition, sentences))
    sentences = [sentence for m, sentence in zip(mask, sentences) if m]
    return sentences, mask


def process_token(token: str, replace: List[Tuple[str, str]] = [("#", "")]) -> str:
    """
    Replace special characters in the token.

    Parameters
    ----------
    token:
        token which potentially contains special characters
    replace:
        list of tuples `(s1, s2)`, where all occurrences of `s1` are replaced by `s2`

    Returns
    ---------
    processed_token:
        processed token whose special characters have been replaced

    Note
    ----
    Only applies to characters in the original input token.

    Examples
    --------
    >>> from cleanlab.internal.token_classification_utils import process_token
    >>> token = "#Comment"
    >>> process_token("#Comment")
    'Comment'

    Specify custom replacement rules

    >>> replace = [("C", "a"), ("a", "C")]
    >>> process_token("Cleanlab", replace)
    'aleCnlCb'
    """
    # Build a single regex matching any substring to replace, then substitute
    # every match with its mapped replacement in one pass over the token
    replace_dict = {re.escape(k): v for (k, v) in replace}
    pattern = "|".join(replace_dict.keys())
    compiled_pattern = re.compile(pattern)
    replacement = lambda match: replace_dict[re.escape(match.group(0))]
    processed_token = compiled_pattern.sub(replacement, token)
    return processed_token


def mapping(entities: List[int], maps: List[int]) -> List[int]:
    """
    Map a list of entities to their corresponding mapped entities.

    Parameters
    ----------
    entities:
        a list of given entities
    maps:
        a list of mapped entities, such that entity `i` is mapped to `maps[i]`

    Returns
    ---------
    mapped_entities:
        a list of mapped entities

    Examples
    --------
    >>> unique_identities = [0, 1, 2, 3, 4] # ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
    >>> maps = [0, 1, 1, 2, 2] # ["O", "PER", "PER", "LOC", "LOC"]
    >>> mapping(unique_identities, maps)
    [0, 1, 1, 2, 2] # ["O", "PER", "PER", "LOC", "LOC"]
    >>> mapping([0, 0, 4, 4, 3, 4, 0, 2], maps)
    [0, 0, 2, 2, 2, 2, 0, 1] # ["O", "O", "LOC", "LOC", "LOC", "LOC", "O", "PER"]
    """
    f = lambda x: maps[x]
    return list(map(f, entities))


def merge_probs(
    probs: npt.NDArray["np.floating[T]"], maps: List[int]
) -> npt.NDArray["np.floating[T]"]:
    """
    Merge model-predicted probabilities according to the desired mapping.

    Parameters
    ----------
    probs:
        A 2D np.array of shape `(N, K)`, where N is the number of tokens, and K is the number of classes for the model
    maps:
        a list of mapped indices, such that the probability of the token being in the i'th class is added to
        column `maps[i]`. If `maps[i] == -1`, the i'th column of `probs` is ignored. If any entry of `maps`
        is -1, the returned probabilities are re-normalized.

    Returns
    ---------
    probs_merged:
        A 2D np.array of shape ``(N, K')``, where `K'` is the number of new classes. Probabilities are merged and
        re-normalized if necessary.

    Examples
    --------
    >>> import numpy as np
    >>> from cleanlab.internal.token_classification_utils import merge_probs
    >>> probs = np.array([
    ...     [0.55, 0.0125, 0.0375, 0.1, 0.3],
    ...     [0.1, 0.8, 0, 0.075, 0.025],
    ... ])
    >>> maps = [0, 1, 1, 2, 2]
    >>> merge_probs(probs, maps)
    array([[0.55, 0.05, 0.4 ],
           [0.1 , 0.8 , 0.1 ]])
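
    If some classes are dropped by mapping them to -1, the remaining probabilities
    are re-normalized (illustrative example):

    >>> merge_probs(np.array([[0.5, 0.25, 0.25]]), [0, -1, 1])
    array([[0.66666667, 0.33333333]])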
"""
old_classes = probs.shape[1]
map_size = np.max(maps) + 1
probs_merged = np.zeros([len(probs), map_size], dtype=probs.dtype.type)
for i in range(old_classes):
if maps[i] >= 0:
probs_merged[:, maps[i]] += probs[:, i]
if -1 in maps:
row_sums = probs_merged.sum(axis=1)
probs_merged /= row_sums[:, np.newaxis]
return probs_merged


def color_sentence(sentence: str, word: str) -> str:
    """
    Searches for a given token in the sentence and returns the sentence where the given token is colored red

    Parameters
    ----------
    sentence:
        a sentence where the word is searched
    word:
        keyword to find in `sentence`. Assumes the word exists in the sentence.

    Returns
    ---------
    colored_sentence:
        `sentence` where every occurrence of the word is colored red, using ``termcolor.colored``

    Examples
    --------
    >>> from cleanlab.internal.token_classification_utils import color_sentence
    >>> sentence = "This is a sentence."
    >>> word = "sentence"
    >>> color_sentence(sentence, word)
    'This is a \x1b[31msentence\x1b[0m.'

    Also works for multiple occurrences of the word

    >>> document = "This is a sentence. This is another sentence."
    >>> word = "sentence"
    >>> color_sentence(document, word)
    'This is a \x1b[31msentence\x1b[0m. This is another \x1b[31msentence\x1b[0m.'
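
    When the word also appears inside other words, only whole-word matches are
    colored, since the word is matched at word boundaries (illustrative example):

    >>> color_sentence("This is a sentence.", "is")
    'This \x1b[31mis\x1b[0m a sentence.'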
"""
colored_word = colored(word, "red")
return _replace_sentence(sentence=sentence, word=word, new_word=colored_word)


def _replace_sentence(sentence: str, word: str, new_word: str) -> str:
    """
    Searches for a given token in the sentence and returns the sentence where the given token has been replaced by
    `new_word`.

    Parameters
    ----------
    sentence:
        a sentence where the word is searched
    word:
        keyword to find in `sentence`. Assumes the word exists in the sentence.
    new_word:
        the word to replace the keyword with

    Returns
    ---------
    new_sentence:
        `sentence` where every occurrence of the word is replaced by `new_word`
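
    Examples
    --------
    The second case below illustrates the plain-string fallback used when the
    word-boundary regex finds no match:

    >>> _replace_sentence("This is a sentence.", "sentence", "string")
    'This is a string.'
    >>> _replace_sentence("I code in C++ daily", "C++", "Python")
    'I code in Python daily'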
"""
new_sentence, number_of_substitions = re.subn(
r"\b{}\b".format(re.escape(word)), new_word, sentence
)
if number_of_substitions == 0:
# Use basic string manipulation if regex fails
new_sentence = sentence.replace(word, new_word)
return new_sentence