-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplot_truerank.py
More file actions
101 lines (78 loc) · 2.86 KB
/
plot_truerank.py
File metadata and controls
101 lines (78 loc) · 2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python3
"""
Generate a scatter plot for true rank
"""
import pandas as pd
import matplotlib.pyplot as plt
PROFILE_COLORS = {
"Alice": "steelblue",
"Bob": "darkorange",
"Charlie": "seagreen",
"Diane": "mediumpurple",
}
PROFILE_LABELS = {
"Alice": "Deliberative",
"Bob": "Low Engagement",
"Charlie": "High Frequency",
"Diane": "Intermittent",
}
PROFILE_SORT = ["Alice", "Bob", "Charlie", "Diane"]
def trim_min_max(df: pd.DataFrame) -> pd.DataFrame:
kept = []
for user, g in df.groupby("held_out_user", sort=False):
g = g.sort_values("posterior_true", kind="mergesort").reset_index(drop=True)
if len(g) > 2:
g = g.iloc[1:-1] # drop min and max
kept.append(g)
return pd.concat(kept, ignore_index=True)
def main():
path = "data/truerank.csv"
df = pd.read_csv(path)
# Extract profile from held_out_user (e.g., Alice_1 -> Alice)
df["profile"] = df["held_out_user"].apply(lambda x: x.split("_")[0])
# Trim min and max
df_trim = trim_min_max(df)
# plot
plt.figure(figsize=(9, 5))
# order profiles
profiles_present = [p for p in PROFILE_SORT if p in set(df_trim["profile"])]
profiles_present += [p for p in sorted(df_trim["profile"].unique()) if p not in PROFILE_SORT]
for profile in profiles_present:
subset = df_trim[df_trim["profile"] == profile]
if subset.empty:
continue
plt.scatter(
subset["true_rank"],
subset["posterior_true"],
color=PROFILE_COLORS.get(profile, "gray"),
label=PROFILE_LABELS.get(profile, profile),
s=50,
alpha=0.8,
)
# calculate baseline based on number of users
mode_N = int(df_trim["N_pool"].mode().iloc[0])
baseline = 1.0 / mode_N
plt.axhline(y=baseline,color="gray",linestyle="--",linewidth=1,label=f"Uniform prior (1/{mode_N})")
plt.xlabel("True User Rank in Posterior")
plt.ylabel("Posterior Probability for True Identity")
plt.title("True Rank Pool Re-identification")
plt.legend(fontsize=9)
plt.grid(True, linestyle="--", alpha=0.5)
plt.tight_layout()
out = "out/true_rank.pdf"
plt.savefig(out)
plt.show()
# Summary stats (trimmed)
total = len(df_trim)
print("\n==== True Rank ====")
for k in [1, 3, 5]:
count = (df_trim["true_rank"] <= k).sum()
print(f"Top-{k} Accuracy: {count} / {total} ({100.0 * count / total:.1f}%)")
print(f"Mean Rank: {df_trim['true_rank'].mean():.2f}")
print(f"Median Rank: {df_trim['true_rank'].median()}")
print(f"Mean Posterior(True): {df_trim['posterior_true'].mean():.3f}")
print(f"Min Posterior(True): {df_trim['posterior_true'].min():.3f}")
print(f"Max Posterior(True): {df_trim['posterior_true'].max():.3f}")
print(f"\nCreated {out}")
if __name__ == "__main__":
main()