Re-identification Project¶

I made a Python notebook that takes any public dataset and calculates its k-anonymity, l-diversity, and t-closeness using my own implementations built on Pandas and NumPy functions. This makes it possible to assess how anonymous the given data actually is.

Written & Tested by Chelle Davies, July 2024
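
For reference: k-anonymity is the size of each group of rows sharing the same quasi-identifier values (an equivalence class), l-diversity is the number of distinct sensitive values within each such group, and t-closeness measures how far a group's sensitive-value distribution is from the overall distribution. A minimal illustration on a hypothetical table (the column names and values below are made up for illustration):

import pandas as pd

# Hypothetical table: two rows share quasi-identifiers, one row is unique
toy = pd.DataFrame({
    "age_band":  ["30-39", "30-39", "40-49"],
    "zip":       ["021*",  "021*",  "021*"],
    "diagnosis": ["flu",   "cold",  "flu"],   # sensitive attribute
})
print(toy.groupby(["age_band", "zip"]).size().min())                  # k = 1: the 40-49 row is unique
print(toy.groupby(["age_band", "zip"])["diagnosis"].nunique().min())  # l = 1 for that singleton group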

Program Definitions¶

In [4]:
# Import Libraries
import pandas as pd
import numpy as np
In [5]:
# Define functions

## Function to calculate k-anonymity: the size of each equivalence class
## (the group of rows sharing the same quasi-identifier values)
def calculate_k_anonymity(df, quasi_identifiers):
    return df.groupby(quasi_identifiers).size()

## Function to calculate l-diversity: the number of distinct sensitive
## values within each equivalence class
def calculate_l_diversity(df, quasi_identifiers, sensitive_attribute):
    return df.groupby(quasi_identifiers)[sensitive_attribute].nunique()

## Function to calculate t-closeness: the total variation distance between
## each equivalence class's sensitive-value distribution and the global one
def calculate_t_closeness(df, quasi_identifiers, sensitive_attribute):
    global_dist = df[sensitive_attribute].value_counts(normalize=True)
    def closeness(group):
        local_dist = group.value_counts(normalize=True)
        # Align the two distributions so categories absent from the group
        # count as probability 0 instead of being silently dropped
        return np.abs(local_dist.sub(global_dist, fill_value=0)).sum() / 2
    return df.groupby(quasi_identifiers)[sensitive_attribute].apply(closeness)

## Attach per-row metric columns by mapping each row's quasi-identifier
## tuple back onto the grouped results
def flag_re_identifiability(df, quasi_identifiers, sensitive_attribute):
    k_anonymity = calculate_k_anonymity(df, quasi_identifiers)
    l_diversity = calculate_l_diversity(df, quasi_identifiers, sensitive_attribute)
    t_closeness = calculate_t_closeness(df, quasi_identifiers, sensitive_attribute)
    keys = df[quasi_identifiers].apply(tuple, axis=1)
    df['k_anonymity'] = keys.map(k_anonymity)
    df['l_diversity'] = keys.map(l_diversity)
    df['t_closeness'] = keys.map(t_closeness)
    return df

## Prompt for quasi-identifier columns until the user enters a blank line;
## numeric entries are converted to integer column indices
def get_user_inputs():
    quasi_identifiers = []
    while True:
        user_input = input("Enter a column name or index that is a quasi_identifier (leave blank to finish): ")
        if user_input == "":
            break
        try:
            quasi_identifiers.append(int(user_input))
        except ValueError:
            quasi_identifiers.append(user_input)
    return quasi_identifiers
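
As a quick sanity check, the functions above can be exercised on a small hypothetical DataFrame (the columns and values are illustrative, not drawn from the dataset used later):

# Hypothetical sanity check: two rows share (state, type), one row is unique
toy = pd.DataFrame({
    "state": ["SC", "SC", "WA"],
    "type":  ["sheriff", "sheriff", "local_police"],
    "name":  ["A", "B", "C"],
})
toy = flag_re_identifiability(toy, ["state", "type"], "name")
print(toy[["k_anonymity", "l_diversity", "t_closeness"]])
# Expected: k_anonymity 2/2/1 and l_diversity 2/2/1, with a smaller
# t_closeness for the more diverse (SC, sheriff) group than for the unique row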

Main Program¶

In [6]:
# Ask the user for the path to the public dataset (CSV or Parquet)
file_path = input("Enter the file path for the data, as a CSV or Parquet file: ")
if file_path.lower().endswith('.parquet'):
    df = pd.read_parquet(file_path)
else:
    # Read without a header row so columns are integer-indexed (0, 1, 2, ...);
    # any original header row will appear as data row 0
    df = pd.read_csv(file_path, header=None)
df.head(10)
Out[6]:
   0     1                                    2             3      4         5
0  id    name                                 type          state  oricodes  total_shootings
1  3145  Abbeville County Sheriff's Office    sheriff       SC     SC00100   1
2  2576  Aberdeen Police Department           local_police  WA     WA01401   1
3  2114  Abilene Police Department            local_police  TX     TX22101   6
4  2088  Abington Township Police Department  local_police  PA     PA04601   1
5  3187  Acadia Parish Sheriff's Office       sheriff       LA     LA00100   1
6  3375  Acworth Police Department            local_police  GA     GA03305   1
7  1241  Ada County Sheriff's Office          sheriff       ID     ID00100   5
8  1615  Adair County Sheriff's Office        sheriff       OK     OK00100   1
9  1978  Adams County Sheriff's Department    sheriff       CO     CO00100   14
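
Because the CSV is read with header=None, the dataset's own header row appears as data row 0 and every column is integer-indexed, which is why the prompts below accept integer column positions. If named columns were preferred, the file could be read with its header row instead (a variant shown for illustration, not what this notebook does):

# Variant (illustrative, not the notebook's behavior): keep the CSV header row
# so columns can be referenced by name rather than by integer position
df_named = pd.read_csv(file_path)
print(df_named.columns.tolist())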
In [7]:
# Define the quasi-identifiers and sensitive attribute of data
quasi_identifiers = get_user_inputs()
print("quasi_identifiers=", quasi_identifiers)
user_input = input("Enter the column name or index that is the sensitive attribute: ") 
try:
    sensitive_attribute = int(user_input)
except:
    sensitive_attribute = user_input 
print("sensitive_attribute =", sensitive_attribute)
quasi_identifiers= [0, 2, 3, 4, 5]
sensitive_attribute = 1
In [8]:
df = flag_re_identifiability(df, quasi_identifiers, sensitive_attribute)

# Flag rows as high risk using heuristic thresholds: a small equivalence
# class, no diversity in the sensitive attribute, and a local distribution
# close to the maximum observed distance from the global one
df['re_identifiability_likelihood'] = np.where(
    (df['k_anonymity'] <= (0.2 * len(df))) &
    (df['l_diversity'] <= 1) &
    (df['t_closeness'] >= (0.8 * df['t_closeness'].max())),
    'High', 'Low'
)
high_risk_df = df[df['re_identifiability_likelihood'] == 'High']
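
To make the heuristic concrete, here is a worked example with illustrative numbers (hypothetical values, not computed from the dataset): a row in a singleton group, with a single sensitive value and a near-maximal distribution distance, trips all three conditions.

# Illustrative values only: k = 1 (singleton group), l = 1 (one sensitive
# value), t near the assumed maximum observed distance
k, l, t = 1, 1, 0.99
n_rows, t_max = 3583, 1.0  # dataset size and max t-closeness (assumed)
is_high = (k <= 0.2 * n_rows) and (l <= 1) and (t >= 0.8 * t_max)
print(is_high)  # True -> this row would be flagged 'High'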
In [9]:
# Overall statistics summary; the scalar high_risk_rows_count will be
# broadcast across the describe() index when converted to a DataFrame below
stats_summary = {
    'k_anonymity': df['k_anonymity'].describe(),
    'l_diversity': df['l_diversity'].describe(),
    't_closeness': df['t_closeness'].describe(),
    'high_risk_rows_count': high_risk_df.shape[0]
}
In [10]:
# Display the high risk dataframe and the overall statistics summary
high_risk_df_copy = high_risk_df.copy()
summary_df = pd.DataFrame(stats_summary)
In [11]:
# Listing the indexes of the rows in high_risk_df_copy
high_risk_indexes = high_risk_df_copy.index.tolist()
print("\nNumber of Indexes for High Risk Rows:")
print(len(high_risk_indexes), " out of ", len(df), " rows ({}%)".format(round((len(high_risk_indexes)/len(df))*100, 2)))
Number of Indexes for High Risk Rows:
3453  out of  3583  rows (96.37%)
In [12]:
# Interpret the summary statistics against heuristic risk thresholds
print("\nInterpretation of Summary Statistics:")

# Conditional interpretation for k-anonymity (heuristic: > 20 is low risk)
k_anonymity_mean = summary_df['k_anonymity']['mean']
k_anonymity_median = summary_df['k_anonymity']['50%']
if k_anonymity_mean > 20:
    print(f"\nK-Anonymity: Mean: {k_anonymity_mean} (Low Risk)")
else:
    print(f"\nK-Anonymity: Mean: {k_anonymity_mean} (Moderate to High Risk)")
if k_anonymity_median > 20:
    print(f"Median: {k_anonymity_median} (Low Risk)")
else:
    print(f"Median: {k_anonymity_median} (Moderate to High Risk)")

# Conditional interpretation for l-diversity (heuristic: > 5 is low risk)
l_diversity_mean = summary_df['l_diversity']['mean']
l_diversity_median = summary_df['l_diversity']['50%']
if l_diversity_mean > 5:
    print(f"\nL-Diversity: Mean: {l_diversity_mean} (Low Risk)")
else:
    print(f"\nL-Diversity: Mean: {l_diversity_mean} (Moderate to High Risk)")
if l_diversity_median > 5:
    print(f"Median: {l_diversity_median} (Low Risk)")
else:
    print(f"Median: {l_diversity_median} (Moderate to High Risk)")

# Conditional interpretation for t-closeness (heuristic: <= 0.05 is low risk)
t_closeness_mean = summary_df['t_closeness']['mean']
t_closeness_median = summary_df['t_closeness']['50%']
if t_closeness_mean <= 0.05:
    print(f"\nT-Closeness: Mean: {t_closeness_mean} (Low Risk)")
else:
    print(f"\nT-Closeness: Mean: {t_closeness_mean} (Moderate to High Risk)")
if t_closeness_median <= 0.05:
    print(f"Median: {t_closeness_median} (Low Risk)")
else:
    print(f"Median: {t_closeness_median} (Moderate to High Risk)")
Interpretation of Summary Statistics:

K-Anonymity: Mean: 1.0 (Moderate to High Risk)
Median: 1.0 (Moderate to High Risk)

L-Diversity: Mean: 1.0 (Moderate to High Risk)
Median: 1.0 (Moderate to High Risk)

T-Closeness: Mean: 0.999431139372551 (Moderate to High Risk)
Median: 0.9997209042701647 (Moderate to High Risk)
In [13]:
print("\nStatistics Summary:")
for column in summary_df.columns:
    if column != 'high_risk_rows_count':
        print(f"\nSummary statistics for {column}:")
        print(f"Mean: {summary_df[column]['mean']}")
        print(f"Median: {summary_df[column]['50%']}")
        print(f"Standard Deviation: {summary_df[column]['std']}")
print(f"\nTotal Hish Risk Count: {summary_df['high_risk_rows_count']['count']} out of {len(df)}, ({round((summary_df['high_risk_rows_count']['count']/len(df))*100, 2)}%)")
Statistics Summary:

Summary statistics for k_anonymity:
Mean: 1.0
Median: 1.0
Standard Deviation: 0.0

Summary statistics for l_diversity:
Mean: 1.0
Median: 1.0
Standard Deviation: 0.0

Summary statistics for t_closeness:
Mean: 0.999431139372551
Median: 0.9997209042701647
Standard Deviation: 0.0012740892681340204

Total High Risk Count: 3453 out of 3583, (96.37%)