# File listing metadata: 81 lines, 3.6 KiB, Python
import datasets

# Load the test split of the enriched CIFAR-100 dataset from the Hugging Face
# Hub. The repository id carries its "renumics/" namespace; the bare name
# "cifar100-enriched" does not identify a Hub dataset repository.
ds = datasets.load_dataset("renumics/cifar100-enriched", split="test")

# Convert the dataset to a pandas DataFrame for the analysis steps that follow.
df = ds.to_pandas()

import pandas as pd
import numpy as np
from cleanlab import Datalab

# Build a Datalab over the DataFrame, using "fine_label" as the label column.
lab = Datalab(data=df, label_name="fine_label")

# Collect the per-row "embedding" values into a single numpy array
# (one entry per DataFrame row).
features = np.array([x.tolist() for x in df["embedding"]])

# Collect the per-row "probabilities" values the same way.
pred_probs = np.array([x.tolist() for x in df["probabilities"]])

# Run Datalab's issue detection on the embeddings and the probabilities.
lab.find_issues(features=features, pred_probs=pred_probs)

# Append the columns of the issue table to the original frame
# (column-wise concat, aligned on the index).
df = pd.concat([df, lab.get_issues()], axis=1)

from renumics.spotlight.analysis import DataIssue

# Index values of the rows flagged in "is_label_issue", ordered by ascending
# "label_score".
label_issue_rows = df[df["is_label_issue"]].sort_values("label_score").index.tolist()
# Wrap those rows in a DataIssue so they can be passed to Spotlight.
label_issue = DataIssue(
    severity="medium",
    title="label-issue",
    rows=label_issue_rows,
    description="Label issue found by Cleanlab - Review and correct if necessary",
)

# Index values of the rows whose "outlier_score" is below 0.6, ordered by
# ascending score. The selection uses a fixed score threshold rather than a
# boolean issue column.
outlier_issue_row = (
    df[df["outlier_score"] < 0.6].sort_values("outlier_score").index.tolist()
)
# Wrap those rows in a DataIssue so they can be passed to Spotlight.
outlier_issue = DataIssue(
    severity="medium",
    title="outlier-issue",
    rows=outlier_issue_row,
    description="Outlier score < 0.6 - Review and remove or collect more data",
)

# Index values of the rows flagged in "is_near_duplicate_issue", ordered by
# ascending "near_duplicate_score".
near_duplicate_issue_row = (
    df[df["is_near_duplicate_issue"]].sort_values("near_duplicate_score").index.tolist()
)
# Wrap those rows in a DataIssue so they can be passed to Spotlight.
near_duplicate_issue = DataIssue(
    severity="medium",
    title="near-duplicate-issue",
    rows=near_duplicate_issue_row,
    description="Near duplicate issue found by Cleanlab - Review and remove if necessary",
)

from renumics import spotlight

# Map DataFrame column names to the Spotlight data type used for each column.
dtypes = {
    "image": spotlight.Image,  # image data
    "full_image": spotlight.Image,  # full image data
    "embedding": spotlight.Embedding,  # embedding vectors
    "embedding_reduced": spotlight.Embedding,  # reduced embedding vectors
    "probabilities": spotlight.Embedding,  # probability vectors, handled as an embedding
}

# Open Spotlight on the data together with the detected issues.
spotlight.show(
    # Rename the label and prediction string columns to shorter names.
    df.rename(columns={"fine_label_str": "label", "fine_label_prediction_str": "pred"}),
    dtype=dtypes,  # per-column Spotlight data types
    layout="https://spotlight.renumics.com/resources/layout_data_issues.json",  # layout file referenced by URL
    issues=[label_issue, outlier_issue, near_duplicate_issue],  # the three DataIssue objects built earlier in this script
)