# matrixcsvmaker.py
from mrjob.job import MRJob
from mrjob.step import MRStep

import ast
import re

import numpy as np


class MRUserbyUserMatrix(MRJob):
    '''
    Takes the csv of sentence pairs and creates a csv that gives each
    entry's location in the user-by-user matrix and the similarity score
    that should be put there.

    Input:
        csv file with each line like --> user of sentence 1, sentence 1,
        300-dimensional sentence 1 vector, user of sentence 2, sentence 2,
        300-dimensional sentence 2 vector
    Output:
        csv file with each line like --> (user 1, x coordinate of matrix,
        user 2, y coordinate of matrix), normalized similarity score
    '''
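
    # Hypothetical examples of the formats described above (values invented
    # for illustration; real vectors are 300-dimensional, so a full input
    # record spans 604 comma-separated fields):
    #   input  --> 'alice', 'i like cats', 0.12, -0.05, ..., 'bob', 'cats are great', 0.08, 0.31, ...
    #   output (mrjob's default tab-separated JSON) --> ["alice", 0, "bob", 1]    0.7311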

    def mapper_init(self):
        # Each user gets a running index the first time it is seen; this
        # becomes the user's row/column position in the matrix.
        self.current_dict_location = 0
        self.user_dict = {}

    def mapper(self, _, pair):
        pair_info = [x.strip() for x in pair.split(',')]
        # Fields 0-1 describe the first sentence: quoted user, quoted sentence
        # text (the text itself is not used downstream).
        sentence1 = re.findall(r'\'(.*?)\'', pair_info[1])
        sentence1_user = re.findall(r'\'(.*?)\'', pair_info[0])[0]
        # Fields 2-301 are the 300 components of the first sentence vector.
        sentence1_vector = ",".join(pair_info[2:302])
        sentence1_vector = '[' + sentence1_vector + ']'
        sentence1_vector = np.asarray(ast.literal_eval(sentence1_vector))
        # Fields 302-303 describe the second sentence, and fields 304 onward
        # its vector; the last field appears to end with ']' plus a trailing
        # quotation mark from the csv dump, so only the opening bracket is
        # added here.
        sentence2 = re.findall(r'\'(.*?)\'', pair_info[303])
        sentence2_user = re.findall(r'\'(.*?)\'', pair_info[302])[0]
        sentence2_vector = ",".join(pair_info[304:])
        sentence2_vector = '[' + sentence2_vector.strip('"')
        sentence2_vector = np.asarray(ast.literal_eval(sentence2_vector))
        # Assign matrix positions to any users not yet seen by this mapper.
        if sentence1_user not in self.user_dict:
            self.user_dict[sentence1_user] = self.current_dict_location
            self.current_dict_location += 1
        if sentence2_user not in self.user_dict:
            self.user_dict[sentence2_user] = self.current_dict_location
            self.current_dict_location += 1
        # Similarity is the dot product of the two sentence vectors; emit it
        # for both (user1, user2) and (user2, user1) so the matrix is symmetric.
        similarity_score = np.dot(sentence1_vector, sentence2_vector)
        yield (sentence1_user, self.user_dict[sentence1_user], sentence2_user, self.user_dict[sentence2_user]), similarity_score
        yield (sentence2_user, self.user_dict[sentence2_user], sentence1_user, self.user_dict[sentence1_user]), similarity_score

    def combiner(self, users, scores):
        # Pre-sum similarity scores for the same user pair before the reducer.
        yield users, sum(scores)

    def reducer_init(self):
        # active.csv maps each user to their number of comments; each line
        # holds a quoted username followed by a tab-separated count.
        self.num_comments_dict = {}
        with open('active.csv') as active:
            for line in active:
                user = re.findall(r'"(.*)"', line)[0]
                num_comments = int(re.findall(r'\t(\d+)', line)[0])
                self.num_comments_dict[user] = num_comments

    def reducer(self, users, scores):
        # Normalize the summed similarity by the product of the two users'
        # comment counts.
        normal_factor = self.num_comments_dict[users[0]] * self.num_comments_dict[users[2]]
        yield users, sum(scores) / normal_factor

    def steps(self):
        return [
            MRStep(mapper_init=self.mapper_init,
                   mapper=self.mapper,
                   combiner=self.combiner,
                   reducer_init=self.reducer_init,
                   reducer=self.reducer)]


if __name__ == '__main__':
    MRUserbyUserMatrix.run()
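

# Usage sketch (an assumption, not part of the original job): run locally with
# mrjob's inline runner. 'sentence_pairs.csv' is a hypothetical input file
# name; active.csv must be present in the working directory because
# reducer_init() opens it by relative path.
#
#   python matrixcsvmaker.py sentence_pairs.csv > user_matrix_entries.csv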