Source code for cleanlab.datalab.internal.issue_manager.issue_manager
# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
from __future__ import annotations

from abc import ABC, ABCMeta, abstractmethod
from itertools import chain
from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Set, Tuple, Type, TypeVar
import json

import numpy as np
import pandas as pd

if TYPE_CHECKING:  # pragma: no cover
    from cleanlab.datalab.datalab import Datalab

T = TypeVar("T", bound="IssueManager")
TM = TypeVar("TM", bound="IssueManagerMeta")


class IssueManagerMeta(ABCMeta):
    """Metaclass for IssueManager that adds issue_score_key to the class.

    :meta private:
    """

    issue_name: ClassVar[str]
    issue_score_key: ClassVar[str]
    verbosity_levels: ClassVar[Dict[int, List[str]]] = {
        0: [],
        1: [],
        2: [],
        3: [],
    }

    def __new__(
        meta: Type[TM],
        name: str,
        bases: Tuple[Type[Any], ...],
        class_dict: Dict[str, Any],
    ) -> TM:
        # Classes that inherit from ABC don't need to be modified
        if ABC in bases:
            return super().__new__(meta, name, bases, class_dict)

        # Ensure that each verbosity level maps to a list of string keys
        verbosity_levels = class_dict.get("verbosity_levels", meta.verbosity_levels)
        for level, level_list in verbosity_levels.items():
            if not isinstance(level_list, list):
                raise ValueError(
                    f"Verbosity levels must be lists. "
                    f"Got {level_list} in {name}.verbosity_levels"
                )
            prohibited_keys = [key for key in level_list if not isinstance(key, str)]
            if prohibited_keys:
                raise ValueError(
                    f"Verbosity levels must be lists of strings. "
                    f"Got {prohibited_keys} in {name}.verbosity_levels[{level}]"
                )

        # Concrete classes need to have an issue_name attribute
        if "issue_name" not in class_dict:
            raise TypeError("IssueManagers need an issue_name class variable")

        # Add issue_score_key to class
        class_dict["issue_score_key"] = f"{class_dict['issue_name']}_score"
        return super().__new__(meta, name, bases, class_dict)
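
# Example (sketch): for a concrete subclass, IssueManagerMeta derives `issue_score_key`
# from `issue_name` automatically. The subclass name and issue name below are hypothetical:
#
#     class OutlierIssueManager(IssueManager):
#         issue_name = "outlier"
#         def find_issues(self, **kwargs) -> None: ...
#
#     OutlierIssueManager.issue_score_key  # -> "outlier_score"
#
# Omitting `issue_name` raises a TypeError at class-creation time, and a `verbosity_levels`
# value that is not a list of strings raises a ValueError.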
class IssueManager(ABC, metaclass=IssueManagerMeta):
    """Base class for managing data issues of a particular type in a Datalab.

    For each example in a dataset, the IssueManager for a particular type of issue should compute:

    - A numeric severity score between 0 and 1,
      with values near 0 indicating severe instances of the issue.
    - A boolean `is_issue` value, which is True
      if we believe this example suffers from the issue in question.
      `is_issue` may be determined by thresholding the severity score
      (with an a priori determined reasonable threshold value),
      or via some other means (e.g. Confident Learning for flagging label issues).

    The IssueManager should also report:

    - A global value between 0 and 1 summarizing how severe this issue is in the dataset overall
      (e.g. the average severity across all examples in the dataset,
      or the count of examples where `is_issue=True`).
    - Other interesting `info` about the issue and examples in the dataset,
      and statistics estimated from the current dataset that may be reused
      to score this issue in future data.
      For example, `info` for label issues could contain:
      confident_thresholds, confident_joint, the predicted label for each example, etc.
      Another example is the (near)-duplicate detection issue, where `info` could contain
      which sets of examples in the dataset are all (nearly) identical.

    Implementing a new IssueManager (see the example sketch at the end of this module):

    - Define the `issue_name` class attribute, e.g. "label", "duplicate", "outlier", etc.
    - Implement the abstract methods `find_issues` and `collect_info`.
    - `find_issues` is responsible for computing the `issues` and `summary` dataframes.
    - `collect_info` is responsible for computing the `info` dict. It is called by `find_issues`,
      once the manager has set the `issues` and `summary` dataframes as instance attributes.
    """

    description: ClassVar[str] = ""
    """Short text that summarizes the type of issues handled by this IssueManager.

    :meta hide-value:
    """

    issue_name: ClassVar[str]
    """A key used to store issue summary results about the assigned Lab."""

    issue_score_key: ClassVar[str]
    """A key used to store issue score results about the assigned Lab."""

    verbosity_levels: ClassVar[Dict[int, List[str]]] = {
        0: [],
        1: [],
        2: [],
        3: [],
    }
    """A dictionary mapping verbosity levels to the lists of report items to print at each level.

    :meta hide-value:

    Example
    -------
    >>> verbosity_levels = {
    ...     0: [],
    ...     1: ["some_info_key"],
    ...     2: ["additional_info_key"],
    ... }
    """

    def __init__(self, datalab: Datalab, **_):
        self.datalab = datalab
        self.info: Dict[str, Any] = {}
        self.issues: pd.DataFrame = pd.DataFrame()
        self.summary: pd.DataFrame = pd.DataFrame()

    def __repr__(self):
        class_name = self.__class__.__name__
        return class_name

    @classmethod
    def __init_subclass__(cls):
        required_class_variables = [
            "issue_name",
        ]
        for var in required_class_variables:
            if not hasattr(cls, var):
                raise NotImplementedError(f"Class {cls.__name__} must define class variable {var}")
    @abstractmethod
    def find_issues(self, *args, **kwargs) -> None:
        """Finds occurrences of this particular issue in the dataset.

        Computes the `issues` and `summary` dataframes. Calls `collect_info` to compute the `info` dict.
        """
        raise NotImplementedError
    def collect_info(self, *args, **kwargs) -> dict:
        """Collects data for the info attribute of the Datalab.

        NOTE
        ----
        This method is called by :py:meth:`find_issues` after :py:meth:`find_issues` has set the
        `issues` and `summary` dataframes as instance attributes.
        """
        raise NotImplementedError
    @classmethod
    def make_summary(cls, score: float) -> pd.DataFrame:
        """Construct a summary dataframe.

        Parameters
        ----------
        score :
            The overall score for this issue.

        Returns
        -------
        summary :
            A summary dataframe.
        """
        if not 0 <= score <= 1:
            raise ValueError(f"Score must be between 0 and 1. Got {score}.")

        return pd.DataFrame(
            {
                "issue_type": [cls.issue_name],
                "score": [score],
            },
        )
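
    # For example (hypothetical issue_name), a subclass with issue_name = "label" calling
    # make_summary(score=0.83) returns a one-row dataframe:
    #
    #       issue_type  score
    #    0       label   0.83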
    @classmethod
    def report(
        cls,
        issues: pd.DataFrame,
        summary: pd.DataFrame,
        info: Dict[str, Any],
        num_examples: int = 5,
        verbosity: int = 0,
        include_description: bool = False,
        info_to_omit: Optional[List[str]] = None,
    ) -> str:
        """Compose a report of the issues found by this IssueManager.

        Parameters
        ----------
        issues :
            An issues dataframe.

            Example
            -------
            >>> import pandas as pd
            >>> issues = pd.DataFrame(
            ...     {
            ...         "is_X_issue": [True, False, True],
            ...         "X_score": [0.2, 0.9, 0.4],
            ...     },
            ... )

        summary :
            The summary dataframe.

            Example
            -------
            >>> summary = pd.DataFrame(
            ...     {
            ...         "issue_type": ["X"],
            ...         "score": [0.5],
            ...     },
            ... )

        info :
            The info dict.

            Example
            -------
            >>> info = {
            ...     "A": "val_A",
            ...     "B": ["val_B1", "val_B2"],
            ... }

        num_examples :
            The number of examples to print.

        verbosity :
            The verbosity level of the report.

        include_description :
            Whether to include a description of the issue in the report.

        info_to_omit :
            Keys in the info dict to leave out of the report.

        Returns
        -------
        report_str :
            A string containing the report.
        """
        max_verbosity = max(cls.verbosity_levels.keys())
        top_level = max_verbosity + 1
        if verbosity not in list(cls.verbosity_levels.keys()) + [top_level]:
            raise ValueError(
                f"Verbosity level {verbosity} not supported. "
                f"Supported levels: {cls.verbosity_levels.keys()}. "
                f"Use verbosity={top_level} to print all info."
            )
        if issues.empty:
            print("No issues found")

        topk_ids = issues.sort_values(by=cls.issue_score_key, ascending=True).index[:num_examples]

        score = summary["score"].loc[0]
        report_str = f"{' ' + cls.issue_name + ' issues ':-^60}\n\n"
        if include_description and cls.description:
            description = cls.description
            if verbosity == 0:
                description = description.split("\n\n", maxsplit=1)[0]
            report_str += "About this issue:\n\t" + description + "\n\n"
        report_str += (
            f"Number of examples with this issue: {issues[f'is_{cls.issue_name}_issue'].sum()}\n"
            f"Overall dataset quality in terms of this issue: {score:.4f}\n\n"
        )

        info_to_print: Set[str] = set()
        _info_to_omit = set(issues.columns).union(info_to_omit or [])
        verbosity_levels_values = chain.from_iterable(
            list(cls.verbosity_levels.values())[: verbosity + 1]
        )
        info_to_print.update(set(verbosity_levels_values) - _info_to_omit)
        if verbosity == top_level:
            info_to_print.update(set(info.keys()) - _info_to_omit)

        report_str += "Examples representing most severe instances of this issue:\n"
        report_str += issues.loc[topk_ids].to_string()

        def truncate(s, max_len=4) -> str:
            if hasattr(s, "shape") or hasattr(s, "ndim"):
                s = np.array(s)
                if s.ndim > 1:
                    description = f"array of shape {s.shape}\n"
                    with np.printoptions(threshold=max_len):
                        if s.ndim == 2:
                            description += f"{s}"
                        if s.ndim > 2:
                            description += f"{s}"
                    return description
                s = s.tolist()

            if isinstance(s, list):
                if all([isinstance(s_, list) for s_ in s]):
                    return truncate(np.array(s, dtype=object), max_len=max_len)
                if len(s) > max_len:
                    s = s[:max_len] + ["..."]
            return str(s)

        if info_to_print:
            info_to_print_dict = {key: info[key] for key in info_to_print}
            # Print the info dict, truncating arrays to 4 elements
            report_str += "\n\nAdditional Information: "
            for key, value in info_to_print_dict.items():
                if key == "statistics":
                    continue
                if isinstance(value, dict):
                    report_str += f"\n{key}:\n{json.dumps(value, indent=4)}"
                elif isinstance(value, pd.DataFrame):
                    max_rows = 5
                    df_str = value.head(max_rows).to_string()
                    if len(value) > max_rows:
                        df_str += f"\n... (total {len(value)} rows)"
                    report_str += f"\n{key}:\n{df_str}"
                else:
                    report_str += f"\n{key}: {truncate(value)}"

        return report_str
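
Example: a minimal sketch of a concrete IssueManager, illustrating the contract described in the class docstring (per-example scores in [0, 1], `is_<issue_name>_issue` flags, a summary built with `make_summary`, and extra details in `info`). The class name `ConstantFeatureIssueManager`, its scoring rule, and the 0.1 threshold are hypothetical choices for illustration, not one of cleanlab's built-in issue managers.

# Illustrative sketch of a custom IssueManager subclass (hypothetical names and thresholds).
class ConstantFeatureIssueManager(IssueManager):
    """Flags examples whose feature vectors are nearly constant across dimensions."""

    description: ClassVar[str] = "Examples whose features are nearly constant across dimensions."
    issue_name: ClassVar[str] = "constant_feature"
    verbosity_levels = {
        0: [],
        1: ["feature_variances"],
        2: [],
        3: [],
    }

    def find_issues(self, features: np.ndarray, **kwargs) -> None:
        # Per-example score in [0, 1]; lower means more severe (closer to constant features).
        variances = features.var(axis=1)
        scores = variances / (variances.max() + 1e-12)
        is_issue = scores < 0.1  # hypothetical threshold
        self.issues = pd.DataFrame(
            {
                f"is_{self.issue_name}_issue": is_issue,
                self.issue_score_key: scores,  # "constant_feature_score", set by the metaclass
            }
        )
        self.summary = self.make_summary(score=float(scores.mean()))
        self.info = self.collect_info(variances=variances)

    def collect_info(self, variances: np.ndarray) -> dict:
        return {"feature_variances": variances}


# Typical usage (assuming `lab` is an existing Datalab instance and `X` is a 2D feature array):
#
#     manager = ConstantFeatureIssueManager(datalab=lab)
#     manager.find_issues(features=X)
#     print(
#         ConstantFeatureIssueManager.report(
#             issues=manager.issues, summary=manager.summary, info=manager.info, verbosity=1
#         )
#     )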