-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstring-similarity-edges.py
66 lines (51 loc) · 2.74 KB
/
string-similarity-edges.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
from fuzzywuzzy import fuzz
import string
# File paths
edgelist_file = '/home/user/HF-Analysis/Data/precleaning/hf-edgelist-contributors-similarity.csv'
output_file = '/home/user/HF-Analysis/Data/username-matches-90.csv'
# Constants
THRESHOLD = 90
# Function to preprocess usernames
def preprocess_username(username):
username = username.lower()
username = ''.join(char for char in username if char not in string.punctuation)
username = username.replace(" ", "")
return username
# Load edgelist data
df = pd.read_csv(edgelist_file, dtype={'source': str, 'target': str, 'freq': int})
df['source'] = df['source'].astype(str)
df['target'] = df['target'].astype(str)
print('1/4: Imported data')
# Remove self-loops
df = df[df['source'] != df['target']]
# Create a dictionary to map preprocessed usernames to their original values
username_to_original = {}
print('2/4: Finding potential matches')
for index, row in df.iterrows():
original_source = row['source']
original_target = row['target']
preprocessed_source = preprocess_username(original_source)
preprocessed_target = preprocess_username(original_target)
username_to_original[preprocessed_source] = original_source
username_to_original[preprocessed_target] = original_target
# Create a DataFrame to store the potential matches
potential_matches_df = pd.DataFrame(columns=['Username-1', 'Username-2', 'Similarity'])
# Iterate through the usernames and run similarity analysis
for preprocessed_source, original_source in username_to_original.items():
for preprocessed_target, original_target in username_to_original.items():
similarity = fuzz.token_set_ratio(preprocessed_source, preprocessed_target)
if similarity >= THRESHOLD:
potential_matches_df = potential_matches_df.append({'Username-1': original_source, 'Username-2': original_target, 'Similarity': similarity}, ignore_index=True)
# Drop the matches where the values are identical
potential_matches_df = potential_matches_df[potential_matches_df['Username-1'] != potential_matches_df['Username-2']]
# Drop duplicates based on the sorted combinations, keeping the first occurrence
potential_matches_df['Sorted_Username'] = potential_matches_df.apply(lambda row: tuple(sorted([row['Username-1'], row['Username-2']])), axis=1)
potential_matches_df.drop_duplicates(subset='Sorted_Username', keep='first', inplace=True)
potential_matches_df.drop(columns='Sorted_Username', inplace=True)
potential_matches_df.reset_index(drop=True, inplace=True)
# Print match count
print('3/4: Found', len(potential_matches_df), 'potential matches.')
# Save the potential_matches_df DataFrame to a CSV file
potential_matches_df.to_csv(output_file, index=False)
print('4/4: Results saved to', output_file)