# Source code for cleanlab.datalab.internal.report

# Copyright (C) 2017-2023  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.
"""
Module that handles reporting of all types of issues identified in the data.
"""

from typing import TYPE_CHECKING, List

import pandas as pd

from cleanlab.datalab.internal.adapter.constants import DEFAULT_CLEANVISION_ISSUES
from cleanlab.datalab.internal.issue_manager_factory import _IssueManagerFactory
from cleanlab.datalab.internal.task import Task

if TYPE_CHECKING:  # pragma: no cover
    from cleanlab.datalab.internal.data_issues import DataIssues


class Reporter:
    """Class that generates a report about the issues stored in a :py:class:`DataIssues` object.

    Parameters
    ----------
    data_issues :
        The :py:class:`DataIssues` object containing the issues to report on. This is usually
        generated by the :py:class:`Datalab` class, stored in the :py:attr:`data_issues` attribute,
        and then passed to the :py:class:`Reporter` class to generate a report.

    task :
        Specific machine learning task that the dataset is intended for.
        See details about supported tasks in :py:class:`Task <cleanlab.datalab.internal.task.Task>`.

    verbosity :
        The default verbosity of the report to generate. Each :py:class:`IssueManager`
        specifies the available verbosity levels and what additional information
        is included at each level.

    include_description :
        Whether to include the description of each issue type in the report. The description
        is included by default, but can be excluded by setting this parameter to ``False``.

    show_summary_score :
        Whether to keep the ``score`` column (and the accompanying note on how to read it)
        in the summary table. When ``False`` (the default) the column is dropped.

    show_all_issues :
        Whether to report on every issue type listed in the issue summary. When ``False``
        (the default) only issue types with ``num_issues > 0`` are reported.

    **kwargs :
        Additional keyword arguments are accepted but not used by this class.

    Note
    ----
    This class is not intended to be used directly. Reports are normally requested through
    ``Datalab.report()``, whose ``show_summary_score`` and ``show_all_issues`` options
    correspond to the parameters of the same name here.
    """

    def __init__(
        self,
        data_issues: "DataIssues",
        task: "Task",
        verbosity: int = 1,
        include_description: bool = True,
        show_summary_score: bool = False,
        show_all_issues: bool = False,
        **kwargs,
    ):
        self.data_issues = data_issues
        self.task = task
        self.verbosity = verbosity
        self.include_description = include_description
        self.show_summary_score = show_summary_score
        self.show_all_issues = show_all_issues

    def _get_empty_report(self) -> str:
        """Return the report used when no issues were found in the data.

        When ``show_summary_score`` is ``False``, a hint is appended that suggests
        re-running ``Datalab.report()`` with more detailed settings.

        Returns
        -------
        report_str :
            The "no issues" message, optionally followed by the hint.
        """
        report_str = "No issues found in the data. Good job!"
        if not self.show_summary_score:
            recommendation_msg = (
                "Try re-running Datalab.report() with "
                "`show_summary_score = True` and `show_all_issues = True`."
            )
            report_str += f"\n\n{recommendation_msg}"
        return report_str

    def report(self, num_examples: int) -> None:
        """Prints a report about identified issues in the data.

        Parameters
        ----------
        num_examples :
            The number of examples to include in the report for each issue type.
        """
        print(self.get_report(num_examples=num_examples))

    def get_report(self, num_examples: int) -> str:
        """Constructs a report about identified issues in the data.

        The report consists of a summary table followed by one section per reported
        issue type, ordered by descending ``num_issues``.

        Parameters
        ----------
        num_examples :
            The number of examples to include in the report for each issue type.

        Returns
        -------
        report_str :
            A string containing the report.

        Examples
        --------
        >>> from cleanlab.datalab.internal.report import Reporter
        >>> reporter = Reporter(data_issues=data_issues, task=task, include_description=False)
        >>> report_str = reporter.get_report(num_examples=5)
        >>> print(report_str)
        """
        issue_summary = self.data_issues.issue_summary

        # The short "no issues" message is only used when the summary is non-empty,
        # every listed issue type has zero detections, and the caller did not ask
        # to see all issue types.
        should_return_empty_report = not (
            self.show_all_issues
            or issue_summary.empty
            or issue_summary["num_issues"].sum() > 0
        )
        if should_return_empty_report:
            return self._get_empty_report()

        issue_summary_sorted = issue_summary.sort_values(by="num_issues", ascending=False)
        report_str = self._write_summary(summary=issue_summary_sorted)

        # _get_issue_types already applies the show_all_issues / num_issues > 0 filter,
        # so no further per-issue filtering is needed here.
        issue_types = self._get_issue_types(issue_summary_sorted)
        issue_reports = [
            _IssueManagerFactory.from_str(issue_type=key, task=self.task).report(
                issues=self.data_issues.get_issues(issue_name=key),
                summary=self.data_issues.get_issue_summary(issue_name=key),
                info=self.data_issues.get_info(issue_name=key),
                num_examples=num_examples,
                verbosity=self.verbosity,
                include_description=self.include_description,
            )
            for key in issue_types
        ]
        return report_str + "\n\n\n".join(issue_reports)

    def _write_summary(self, summary: pd.DataFrame) -> str:
        """Render the summary table and dataset information that open the report.

        Parameters
        ----------
        summary :
            Issue summary table. It is filtered with ``num_issues > 0`` unless
            ``show_all_issues`` is set, and its ``score`` column is dropped unless
            ``show_summary_score`` is set.

        Returns
        -------
        summary_str :
            The formatted summary section, ending with three newlines.
        """
        statistics = self.data_issues.get_info("statistics")
        num_examples = statistics["num_examples"]
        # This may not be required for all types of datasets in the future
        # (e.g. unlabeled/regression), hence the tolerant lookup.
        num_classes = statistics.get("num_classes")

        dataset_information = f"Dataset Information: num_examples: {num_examples}"
        if num_classes is not None:
            dataset_information += f", num_classes: {num_classes}"

        if not self.show_all_issues:
            # Drop any items in the issue_summary that have no issues
            # (any issue detected in data needs to have num_issues > 0).
            summary = summary.query("num_issues > 0")

        header = "Here is a summary of the different kinds of issues found in the data:\n\n"
        footer = f"{dataset_information}\n\n\n"

        if self.show_summary_score:
            score_note = (
                "(Note: A lower score indicates a more severe issue "
                "across all examples in the dataset.)\n\n"
            )
            return header + summary.to_string(index=False) + "\n\n" + score_note + footer

        table = summary.drop(columns=["score"]).to_string(index=False)
        return header + table + "\n\n" + footer

    def _get_issue_types(self, issue_summary: pd.DataFrame) -> List[str]:
        """Select the issue types that get their own section in the report.

        Issue types listed in ``DEFAULT_CLEANVISION_ISSUES`` are always excluded.
        The remaining ones are kept when ``show_all_issues`` is set or when their
        ``num_issues`` is greater than zero. The order of ``issue_summary`` is preserved.

        Parameters
        ----------
        issue_summary :
            Table with (at least) the columns ``issue_type`` and ``num_issues``.

        Returns
        -------
        issue_types :
            Names of the issue types to report on.
        """
        issue_types = [
            issue_type
            for issue_type, num_issues in zip(
                issue_summary["issue_type"].tolist(), issue_summary["num_issues"].tolist()
            )
            if issue_type not in DEFAULT_CLEANVISION_ISSUES
            and (self.show_all_issues or num_issues > 0)
        ]
        return issue_types