# Copyright (C) 2017-2023 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
"""
Module that handles reporting of all types of issues identified in the data.
"""
from typing import TYPE_CHECKING, List
import pandas as pd
from cleanlab.datalab.internal.adapter.constants import DEFAULT_CLEANVISION_ISSUES
from cleanlab.datalab.internal.issue_manager_factory import _IssueManagerFactory
from cleanlab.datalab.internal.task import Task
if TYPE_CHECKING: # pragma: no cover
from cleanlab.datalab.internal.data_issues import DataIssues
class Reporter:
    """Class that generates a report about the issues stored in a :py:class:`DataIssues` object.

    Parameters
    ----------
    data_issues :
        The :py:class:`DataIssues` object containing the issues to report on. This is usually
        generated by the :py:class:`Datalab` class, stored in the :py:attr:`data_issues` attribute,
        and then passed to the :py:class:`Reporter` class to generate a report.

    task :
        Specific machine learning task that the dataset is intended for.
        See details about supported tasks in :py:class:`Task <cleanlab.datalab.internal.task.Task>`.

    verbosity :
        The default verbosity of the report to generate. Each :py:class:`IssueManager`
        specifies the available verbosity levels and what additional information
        is included at each level.

    include_description :
        Whether to include the description of each issue type in the report. The description
        is included by default, but can be excluded by setting this parameter to ``False``.

    show_summary_score :
        Whether the summary table keeps its ``score`` column (together with a note on how to
        read the score). When ``False``, the ``score`` column is dropped from the summary.

    show_all_issues :
        Whether issue types with ``num_issues == 0`` are kept in the summary table and in the
        per-issue sections. When ``False``, only issue types with at least one detected issue
        are reported.

    **kwargs :
        Accepted for call-site compatibility; not used by this class.

    Note
    ----
    This class is not intended to be used directly. Instead, use the
    `Datalab.report` method which internally utilizes a Reporter instance.
    """

    def __init__(
        self,
        data_issues: "DataIssues",
        task: Task,
        verbosity: int = 1,
        include_description: bool = True,
        show_summary_score: bool = False,
        show_all_issues: bool = False,
        **kwargs,
    ):
        self.data_issues = data_issues
        self.task = task
        self.verbosity = verbosity
        self.include_description = include_description
        self.show_summary_score = show_summary_score
        self.show_all_issues = show_all_issues

    def _get_empty_report(self) -> str:
        """Return the message used when no issues were found in the data.

        When ``show_summary_score`` is ``False``, a hint is appended suggesting
        to re-run ``Datalab.report()`` with more permissive display options.
        """
        report_str = "No issues found in the data. Good job!"
        if not self.show_summary_score:
            recommendation_msg = (
                "Try re-running Datalab.report() with "
                "`show_summary_score = True` and `show_all_issues = True`."
            )
            report_str += f"\n\n{recommendation_msg}"
        return report_str

    def report(self, num_examples: int) -> None:
        """Prints a report about identified issues in the data.

        Parameters
        ----------
        num_examples :
            The number of examples to include in the report for each issue type.
        """
        print(self.get_report(num_examples=num_examples))

    def get_report(self, num_examples: int) -> str:
        """Constructs a report about identified issues in the data.

        The report consists of a summary table followed by one section per issue type.
        If ``show_all_issues`` is ``False`` and the (non-empty) issue summary contains
        no detected issues at all, a short "no issues found" message is returned instead.

        Parameters
        ----------
        num_examples :
            The number of examples to include in the report for each issue type.

        Returns
        -------
        report_str :
            A string containing the report.

        Examples
        --------
        >>> from cleanlab.datalab.internal.report import Reporter
        >>> reporter = Reporter(data_issues=data_issues, task=task, include_description=False)
        >>> report_str = reporter.get_report(num_examples=5)
        >>> print(report_str)
        """
        issue_summary = self.data_issues.issue_summary

        # An empty summary is deliberately NOT treated as "no issues found":
        # it falls through to the regular report path below.
        should_return_empty_report = not (
            self.show_all_issues or issue_summary.empty or issue_summary["num_issues"].sum() > 0
        )
        if should_return_empty_report:
            return self._get_empty_report()

        # Issue types with the most detected issues come first, both in the
        # summary table and in the order of the per-issue sections.
        issue_summary_sorted = issue_summary.sort_values(by="num_issues", ascending=False)
        report_str = self._write_summary(summary=issue_summary_sorted)

        # _get_issue_types already applies the show_all_issues / num_issues > 0 filter,
        # so no further per-issue filtering is needed here.
        issue_types = self._get_issue_types(issue_summary_sorted)
        issue_reports = [
            _IssueManagerFactory.from_str(issue_type=key, task=self.task).report(
                issues=self.data_issues.get_issues(issue_name=key),
                summary=self.data_issues.get_issue_summary(issue_name=key),
                info=self.data_issues.get_info(issue_name=key),
                num_examples=num_examples,
                verbosity=self.verbosity,
                include_description=self.include_description,
            )
            for key in issue_types
        ]

        report_str += "\n\n\n".join(issue_reports)
        return report_str

    def _write_summary(self, summary: pd.DataFrame) -> str:
        """Render the summary table and dataset information that open the report.

        Parameters
        ----------
        summary :
            Issue summary with (at least) the columns ``num_issues`` and ``score``.

        Returns
        -------
        summary_str :
            Heading, summary table, an optional note explaining the score
            (only when ``show_summary_score`` is ``True``), and the dataset information line.
        """
        statistics = self.data_issues.get_info("statistics")
        num_examples = statistics["num_examples"]
        # This may not be required for all types of datasets in the future (e.g. unlabeled/regression)
        num_classes = statistics.get("num_classes")

        dataset_information = f"Dataset Information: num_examples: {num_examples}"
        if num_classes is not None:
            dataset_information += f", num_classes: {num_classes}"

        if not self.show_all_issues:
            # Drop any items in the issue_summary that have no issues
            # (any issue detected in data needs to have num_issues > 0)
            summary = summary.query("num_issues > 0")

        if self.show_summary_score:
            table = summary.to_string(index=False)
            score_note = "(Note: A lower score indicates a more severe issue across all examples in the dataset.)\n\n"
        else:
            table = summary.drop(columns=["score"]).to_string(index=False)
            score_note = ""

        return (
            "Here is a summary of the different kinds of issues found in the data:\n\n"
            + table
            + "\n\n"
            + score_note
            + f"{dataset_information}\n\n\n"
        )

    def _get_issue_types(self, issue_summary: pd.DataFrame) -> List[str]:
        """Select the issue types that get their own section in the report.

        Issue types listed in ``DEFAULT_CLEANVISION_ISSUES`` are always skipped here.
        The remaining ones are kept if ``show_all_issues`` is ``True`` or if they have
        at least one detected issue. The row order of ``issue_summary`` is preserved.
        """
        issue_types = [
            issue_type
            for issue_type, num_issues in zip(
                issue_summary["issue_type"].tolist(), issue_summary["num_issues"].tolist()
            )
            if issue_type not in DEFAULT_CLEANVISION_ISSUES
            and (self.show_all_issues or num_issues > 0)
        ]
        return issue_types