# Source code for cleanlab.datalab.internal.report

# Copyright (C) 2017-2023  Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab.  If not, see <https://www.gnu.org/licenses/>.
"""
Module that handles reporting of all types of issues identified in the data.
"""

from typing import TYPE_CHECKING, List

import pandas as pd

from cleanlab.datalab.internal.adapter.constants import DEFAULT_CLEANVISION_ISSUES
from cleanlab.datalab.internal.issue_manager_factory import _IssueManagerFactory

if TYPE_CHECKING:  # pragma: no cover
    from cleanlab.datalab.internal.data_issues import DataIssues


class Reporter:
    """Builds a textual report from the issues held by a :py:class:`DataIssues` object.

    Parameters
    ----------
    data_issues :
        The :py:class:`DataIssues` object containing the issues to report on.
        It is usually produced by the :py:class:`Datalab` class, kept in its
        :py:attr:`data_issues` attribute, and handed to this class when a
        report is requested.

    verbosity :
        The default verbosity of the generated report. It is forwarded
        unchanged to each issue manager's ``report`` call; every
        :py:class:`IssueManager` defines which verbosity levels exist and
        what extra information each level adds.

    include_description :
        Whether the description of each issue type is included in the
        report. Descriptions are included by default; pass ``False`` to
        leave them out.

    show_summary_score :
        Whether the summary table keeps its ``score`` column. When ``False``
        (the default) that column is dropped before the table is rendered.

    **kwargs :
        Accepted but not used by this class.

    Note
    ----
    This class is not intended to be used directly. Reports are normally
    requested through the :py:class:`Datalab` class.
    """

    def __init__(
        self,
        data_issues: "DataIssues",
        verbosity: int = 1,
        include_description: bool = True,
        show_summary_score: bool = False,
        **kwargs,
    ):
        self.data_issues = data_issues
        self.verbosity = verbosity
        self.include_description = include_description
        self.show_summary_score = show_summary_score

    def report(self, num_examples: int) -> None:
        """Prints the report about issues identified in the data.

        Parameters
        ----------
        num_examples :
            The number of examples to include in the report for each issue type.
        """
        rendered = self.get_report(num_examples=num_examples)
        print(rendered)

    def get_report(self, num_examples: int) -> str:
        """Assembles the report about issues identified in the data.

        The report starts with a summary table (issue types ordered by
        descending ``num_issues``) followed by one section per issue type,
        with sections separated by three newlines.

        Parameters
        ----------
        num_examples :
            The number of examples to include in the report for each issue type.

        Returns
        -------
        report_str :
            A string containing the report.

        Examples
        --------
        >>> from cleanlab.datalab.internal.report import Reporter
        >>> reporter = Reporter(data_issues=data_issues, include_description=False)
        >>> report_str = reporter.get_report(num_examples=5)
        >>> print(report_str)
        """
        ordered_summary = self.data_issues.issue_summary.sort_values(
            by="num_issues", ascending=False
        )
        header = self._write_summary(summary=ordered_summary)

        sections = []
        for name in self._get_issue_types(ordered_summary):
            # The factory resolves the issue-type name to its manager, whose
            # ``report`` renders the section for that issue type.
            manager = _IssueManagerFactory.from_str(issue_type=name)
            section = manager.report(
                issues=self.data_issues.get_issues(issue_name=name),
                summary=self.data_issues.get_issue_summary(issue_name=name),
                info=self.data_issues.get_info(issue_name=name),
                num_examples=num_examples,
                verbosity=self.verbosity,
                include_description=self.include_description,
            )
            sections.append(section)

        return header + "\n\n\n".join(sections)

    def _write_summary(self, summary: pd.DataFrame) -> str:
        """Renders the summary table together with basic dataset information.

        Parameters
        ----------
        summary :
            The issue summary table. It must contain a ``score`` column when
            ``show_summary_score`` is ``False``, because that column is dropped.

        Returns
        -------
        summary_str :
            The introductory line, the rendered table (without its index), an
            explanatory note about scores when they are shown, and the
            dataset information line.
        """
        stats = self.data_issues.get_info("statistics")
        example_count = stats["num_examples"]
        # This may not be required for all types of datasets in the future
        # (e.g. unlabeled/regression), hence the tolerant lookup.
        class_count = stats.get("num_classes")

        dataset_information = f"Dataset Information: num_examples: {example_count}"
        if class_count is not None:
            dataset_information += f", num_classes: {class_count}"

        if self.show_summary_score:
            table = summary.to_string(index=False)
            score_note = "(Note: A lower score indicates a more severe issue across all examples in the dataset.)\n\n"
        else:
            table = summary.drop(columns=["score"]).to_string(index=False)
            score_note = ""

        intro = "Here is a summary of the different kinds of issues found in the data:\n\n"
        return intro + table + "\n\n" + score_note + f"{dataset_information}\n\n\n"

    def _get_issue_types(self, issue_summary: pd.DataFrame) -> List[str]:
        """Lists the issue types that get their own report section.

        Parameters
        ----------
        issue_summary :
            The issue summary table; its ``issue_type`` column is read in order.

        Returns
        -------
        issue_types :
            The values of the ``issue_type`` column, in their original order,
            excluding any that appear in ``DEFAULT_CLEANVISION_ISSUES``.
        """
        selected: List[str] = []
        for candidate in issue_summary["issue_type"].tolist():
            if candidate in DEFAULT_CLEANVISION_ISSUES:
                continue
            selected.append(candidate)
        return selected