I made a Python notebook that takes any public dataset, analyzes it, and calculates its k-anonymity, l-diversity and t-closeness using my own implementations built on Pandas and NumPy functions. This allows the anonymity of the given data to be assessed.
Written & Tested by Chelle Davies, July 2024
# Import Libraries
import pandas as pd
import numpy as np
# Define functions
## Function to calculate k-anonymity
def calculate_k_anonymity(df, quasi_identifiers):
    """Return the size of each quasi-identifier equivalence class.

    The result is a Series indexed by the quasi-identifier value
    combinations; the dataset's k-anonymity is the minimum of these
    group sizes.
    """
    equivalence_classes = df.groupby(quasi_identifiers)
    return equivalence_classes.size()
## Function to calculate l-diversity
def calculate_l_diversity(df, quasi_identifiers, sensitive_attribute):
    """Return, per quasi-identifier group, the number of distinct
    sensitive values (the group's l-diversity).

    Selecting the sensitive column before aggregating is equivalent to the
    previous ``groupby(...).apply(nunique)`` but avoids pandas'
    DeprecationWarning about apply operating on the grouping columns.
    """
    return df.groupby(quasi_identifiers)[sensitive_attribute].nunique()
## Function to calculate t-closeness
def calculate_t_closeness(df, quasi_identifiers, sensitive_attribute):
    """Return, per quasi-identifier group, the L1 distance between the
    group's sensitive-value distribution and the global distribution.

    Fixes two issues with the original:
    - ``local_dist - global_dist`` produced NaN for sensitive values absent
      from a group, and ``sum`` silently skipped them, under-counting the
      distance. ``sub(..., fill_value=0)`` treats missing categories as
      probability 0.
    - Applying over the selected column avoids pandas' DeprecationWarning
      about ``apply`` operating on the grouping columns.
    """
    global_dist = df[sensitive_attribute].value_counts(normalize=True)

    def closeness(values):
        local_dist = values.value_counts(normalize=True)
        # Align on the full category set; categories missing on either side
        # contribute their full probability mass to the distance.
        return local_dist.sub(global_dist, fill_value=0).abs().sum()

    return df.groupby(quasi_identifiers)[sensitive_attribute].apply(closeness)
def flag_re_identifiability(df, quasi_identifiers, sensitive_attribute):
    """Annotate ``df`` in place with per-row k-anonymity, l-diversity and
    t-closeness columns, and return it.

    Fixes:
    - The per-row group key was recomputed three times; it is now built once.
    - With a single quasi-identifier, ``groupby`` produces a scalar index,
      so mapping 1-tuples found no matches; that case is now handled.
    """
    k_anonymity = calculate_k_anonymity(df, quasi_identifiers)
    l_diversity = calculate_l_diversity(df, quasi_identifiers, sensitive_attribute)
    t_closeness = calculate_t_closeness(df, quasi_identifiers, sensitive_attribute)
    if len(quasi_identifiers) == 1:
        # Single grouping column -> scalar group labels, not tuples.
        group_keys = df[quasi_identifiers[0]]
    else:
        group_keys = df[quasi_identifiers].apply(tuple, axis=1)
    df['k_anonymity'] = group_keys.map(k_anonymity)
    df['l_diversity'] = group_keys.map(l_diversity)
    df['t_closeness'] = group_keys.map(t_closeness)
    return df
## Infinite loop to get inputs until a blank entry
def get_user_inputs():
    """Prompt repeatedly for quasi-identifier columns until a blank entry.

    Each entry that parses as an integer is kept as a positional column
    index; anything else is kept as a column name.
    """
    quasi_identifiers = []
    while True:
        entry = input("Enter a column name or index that is a quasi_identifier (leave blank to finish): ")
        if entry == "":
            break
        try:
            quasi_identifiers.append(int(entry))
        except ValueError:
            # Not an integer index: treat the entry as a column name.
            quasi_identifiers.append(entry)
    return quasi_identifiers
# Pass in the file path of the public data from user input
file_path = input("Enter the file path for the data, as a CSV or Parquet file: ")
# Decide by extension, not by substring: the old `'parquet' in file_path`
# check misread any CSV whose name merely contained "parquet".
if file_path.lower().endswith('.parquet'):
    df = pd.read_parquet(file_path)
else:
    # header=None keeps the first file row as data and gives positional
    # (integer) column labels, which the index-based quasi-identifier
    # prompts below rely on. NOTE(review): a real header row will be
    # treated as a data record — confirm that is intended.
    df = pd.read_csv(file_path, header=None)
df.head(10)
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | id | name | type | state | oricodes | total_shootings |
1 | 3145 | Abbeville County Sheriff's Office | sheriff | SC | SC00100 | 1 |
2 | 2576 | Aberdeen Police Department | local_police | WA | WA01401 | 1 |
3 | 2114 | Abilene Police Department | local_police | TX | TX22101 | 6 |
4 | 2088 | Abington Township Police Department | local_police | PA | PA04601 | 1 |
5 | 3187 | Acadia Parish Sheriff's Office | sheriff | LA | LA00100 | 1 |
6 | 3375 | Acworth Police Department | local_police | GA | GA03305 | 1 |
7 | 1241 | Ada County Sheriff's Office | sheriff | ID | ID00100 | 5 |
8 | 1615 | Adair County Sheriff's Office | sheriff | OK | OK00100 | 1 |
9 | 1978 | Adams County Sheriff's Department | sheriff | CO | CO00100 | 14 |
# Define the quasi-identifiers and sensitive attribute of data
quasi_identifiers = get_user_inputs()
print("quasi_identifiers=", quasi_identifiers)
user_input = input("Enter the column name or index that is the sensitive attribute: ")
try:
    sensitive_attribute = int(user_input)
except ValueError:
    # Bare `except:` also swallowed KeyboardInterrupt/SystemExit; only a
    # failed int() parse should fall back to treating this as a column name.
    sensitive_attribute = user_input
print("sensitive_attribute =", sensitive_attribute)
quasi_identifiers= [0, 2, 3, 4, 5] sensitive_attribute = 1
df = flag_re_identifiability(df, quasi_identifiers, sensitive_attribute)
# Flagging rows with >= 80% likelihood of re-identification.
# A row is high-risk when its equivalence class is small, shows no
# sensitive-value diversity, and sits far from the global distribution.
small_group = df['k_anonymity'] <= 0.2 * len(df)
no_diversity = df['l_diversity'] <= 1
far_from_global = df['t_closeness'] >= 0.8 * df['t_closeness'].max()
df['re_identifiability_likelihood'] = np.where(
    small_group & no_diversity & far_from_global, 'High', 'Low'
)
high_risk_df = df[df['re_identifiability_likelihood'] == 'High']
/var/folders/gr/6ytyl_l54r34xk4zb10_hq680000gn/T/ipykernel_87705/4221014718.py:11: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. return df.groupby(quasi_identifiers).apply(diversity) /var/folders/gr/6ytyl_l54r34xk4zb10_hq680000gn/T/ipykernel_87705/4221014718.py:19: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. return df.groupby(quasi_identifiers).apply(closeness)
# Overall statistics summary: describe() for each anonymity metric plus the
# count of high-risk rows.
stats_summary = {
    metric: df[metric].describe()
    for metric in ('k_anonymity', 'l_diversity', 't_closeness')
}
stats_summary['high_risk_rows_count'] = high_risk_df.shape[0]
# Display the high risk dataframe and the overall statistics summary
high_risk_df_copy = high_risk_df.copy()
summary_df = pd.DataFrame(stats_summary)
# Listing the indexes of the rows in high_risk_df_copy
high_risk_indexes = high_risk_df_copy.index.tolist()
n_high = len(high_risk_indexes)
n_total = len(df)
print("\nNumber of Indexes for High Risk Rows:")
print(n_high, " out of ", n_total, " rows ({}%)".format(round((n_high / n_total) * 100, 2)))
Number of Indexes for High Risk Rows: 3453 out of 3583 rows (96.37%)
# Displaying the dataframes - Interpretation
print("\nInterpretation of Summary Statistics:")
# One rule per metric: label, summary column, and the predicate that marks
# the value as low risk (high k/l is good; low t-closeness is good).
metric_rules = [
    ('K-Anonymity', 'k_anonymity', lambda v: v > 20),
    ('L-Diversity', 'l_diversity', lambda v: v > 5),
    ('T-Closeness', 't_closeness', lambda v: v <= 0.05),
]
for label, column, is_low_risk in metric_rules:
    mean_value = summary_df[column]['mean']
    median_value = summary_df[column]['50%']
    mean_tag = "Low Risk" if is_low_risk(mean_value) else "Moderate to High Risk"
    median_tag = "Low Risk" if is_low_risk(median_value) else "Moderate to High Risk"
    print(f"\n{label}: Mean: {mean_value} ({mean_tag})")
    print(f"Median: {median_value} ({median_tag})")
Interpretation of Summary Statistics: K-Anonymity: Mean: 1.0 (Moderate to High Risk) Median: 1.0 (Moderate to High Risk) L-Diversity: Mean: 1.0 (Moderate to High Risk) Median: 1.0 (Moderate to High Risk) T-Closeness: Mean: 0.999431139372551 (Moderate to High Risk) Median: 0.9997209042701647 (Moderate to High Risk)
# Print mean/median/std for each anonymity metric, then the overall
# high-risk row count (the scalar is broadcast across describe()'s index,
# so any row — here 'count' — holds the same value).
print("\nStatistics Summary:")
for column in summary_df.columns:
    if column != 'high_risk_rows_count':
        print(f"\nSummary statistics for {column}:")
        print(f"Mean: {summary_df[column]['mean']}")
        print(f"Median: {summary_df[column]['50%']}")
        print(f"Standard Deviation: {summary_df[column]['std']}")
# Fixed user-facing typo: "Hish Risk" -> "High Risk".
print(f"\nTotal High Risk Count: {summary_df['high_risk_rows_count']['count']} out of {len(df)}, ({round((summary_df['high_risk_rows_count']['count']/len(df))*100, 2)}%)")
Statistics Summary: Summary statistics for k_anonymity: Mean: 1.0 Median: 1.0 Standard Deviation: 0.0 Summary statistics for l_diversity: Mean: 1.0 Median: 1.0 Standard Deviation: 0.0 Summary statistics for t_closeness: Mean: 0.999431139372551 Median: 0.9997209042701647 Standard Deviation: 0.0012740892681340204 Total Hish Risk Count: 3453 out of 3583, (96.37%)