pythonbook/实例学习spotlight/2.py

81 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import datasets

# Load only the "test" split of the "cifar100-enriched" dataset.
ds = datasets.load_dataset(
    path="cifar100-enriched",
    split="test",
)

# Switch to a pandas DataFrame for the analysis steps that follow.
df = ds.to_pandas()
import pandas as pd
import numpy as np
from cleanlab import Datalab


def _column_as_matrix(frame, column):
    """Return the per-row sequences stored in ``frame[column]`` as one array.

    Every cell is converted with ``.tolist()`` first, so the array is built
    from plain Python lists rather than from the original cell objects.
    """
    return np.array([cell.tolist() for cell in frame[column]])


# Set up a Datalab over the DataFrame, with "fine_label" as the label column.
lab = Datalab(data=df, label_name="fine_label")

# Inputs for the issue search: the "embedding" column as features and the
# "probabilities" column as predicted probabilities.
features = _column_as_matrix(df, "embedding")
pred_probs = _column_as_matrix(df, "probabilities")

# Run the issue detection on the prepared arrays.
lab.find_issues(features=features, pred_probs=pred_probs)

# Append the reported issue columns next to the original columns.
# NOTE(review): presumably get_issues() is indexed like `df` (one row per
# example) - confirm, since concat(axis=1) aligns on the index.
issue_columns = lab.get_issues()
df = pd.concat([df, issue_columns], axis=1)
from renumics.spotlight.analysis import DataIssue


def _flagged_rows(frame, mask, score_column):
    """Return the index labels of the rows selected by ``mask``.

    The selected rows are ordered by ``score_column`` using the default
    (ascending) ``sort_values`` order.
    """
    selected = frame[mask]
    ordered = selected.sort_values(score_column)
    return ordered.index.tolist()


# Row selections, one per issue type.
# Label issues: rows where "is_label_issue" is set, ordered by "label_score".
label_issue_rows = _flagged_rows(df, df["is_label_issue"], "label_score")
# Outliers: rows whose "outlier_score" is below 0.6, ordered by that score.
outlier_issue_row = _flagged_rows(df, df["outlier_score"] < 0.6, "outlier_score")
# Near duplicates: rows where "is_near_duplicate_issue" is set, ordered by
# "near_duplicate_score".
near_duplicate_issue_row = _flagged_rows(
    df, df["is_near_duplicate_issue"], "near_duplicate_score"
)

# Wrap each selection in a DataIssue so it can be passed to Spotlight.
label_issue = DataIssue(
    severity="medium",
    title="label-issue",
    rows=label_issue_rows,
    description="Label issue found by Cleanlab - Review and correct if necessary",
)
outlier_issue = DataIssue(
    severity="medium",
    title="outlier-issue",
    rows=outlier_issue_row,
    description="Outlier score < 0.6 - Review and remove or collect more data",
)
near_duplicate_issue = DataIssue(
    severity="medium",
    title="near-duplicate-issue",
    rows=near_duplicate_issue_row,
    description="Near duplicate issue found by Cleanlab - Review and remove if necessary",
)
from renumics import spotlight

# Columns grouped by the Spotlight data type they are displayed with.
# Note that "probabilities" is deliberately mapped to Embedding as well.
_image_columns = ("image", "full_image")
_embedding_columns = ("embedding", "embedding_reduced", "probabilities")

# Column name -> Spotlight dtype, in the order: images first, then embeddings.
dtypes = {
    **dict.fromkeys(_image_columns, spotlight.Image),
    **dict.fromkeys(_embedding_columns, spotlight.Embedding),
}

# Use shorter column names for display; `df` itself is left unchanged
# because rename() returns a new DataFrame.
_display_df = df.rename(
    columns={"fine_label_str": "label", "fine_label_prediction_str": "pred"}
)

# Open Spotlight on the renamed frame with the data-issues layout and the
# three issues collected earlier in the script.
spotlight.show(
    _display_df,
    dtype=dtypes,
    layout="https://spotlight.renumics.com/resources/layout_data_issues.json",
    issues=[label_issue, outlier_issue, near_duplicate_issue],
)