forked from aszhang95/123proj
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIsPolitical.py
152 lines (114 loc) · 4.54 KB
/
IsPolitical.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
import spacy
import json
from entity import *
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import dok_matrix
from random import randint
# Number of histogram buckets used when binning sentiment scores. The
# histogram arrays in this file are allocated with NUM_BUCKETS + 1 slots.
NUM_BUCKETS = 10
# This file collects named entities and the sentiment associated with them.
class IsPolitical(MRJob):
    """Two-step MapReduce job over user comments.

    Step 1 extracts (entity, sentiment score) pairs from each comment and
    aggregates them under two kinds of keys:

    * a ``bool`` key (``is_political``) carrying the running statistics
      ``(sum of scores, sum of squared scores, count, histogram heights)``
      used for the polarization analysis (Hypothesis 1);
    * a ``(user, user_index, entity_id, entity_index)`` key carrying the
      summed score, used as matrix data (Hypothesis 2).

    Step 2 plots a sentiment histogram for each ``bool`` key, emits the
    variance and the count for it, and passes every other key through.
    """

    def mapper_init(self):
        """Create the per-mapper bookkeeping state."""
        # user name -> index assigned in order of first appearance
        self.users = dict()
        self.user_index = 0
        # entity id -> index assigned in order of first appearance
        self.entities = dict()
        self.entity_index = 0
        # entity text -> (id, is_political); handed to sentiment() so that
        # it can avoid redundant searches
        self.searches = dict()
        # number of input lines seen by this mapper
        self.comments = 0

    def mapper(self, _, line):
        """Parse one input line and emit the sentiment of its entities.

        The line is stripped of its first and last character and split on
        commas; the first field is the user and the second field (minus its
        first and last character) is the comment text.
        NOTE(review): a comment containing commas is cut at its first comma
        by this parsing -- confirm against the input format.

        Yields:
            ``((user, user_index, iden, entity_index), score)`` for every
            political entity with a truthy score, and
            ``(is_political, score)`` for every entity with a truthy score.
        """
        self.comments += 1
        # parse line
        parts = line[1:-1].split(',')
        if len(parts) < 2:
            # Malformed line with no comment field: skip it rather than
            # fail the whole task with an IndexError.
            return
        user = parts[0]
        if user == "[deleted]":
            return
        comment = parts[1][1:-1]
        # get entities and sentiments associated with them
        sentiments = sentiment(comment, self.searches, False)
        if not sentiments:
            return
        for text, iden, is_political, score in sentiments:
            # recording searches so as not to do redundant searches
            self.searches[text] = (iden, is_political)
            # NOTE(review): a score of exactly 0 is falsy and therefore
            # skipped as well -- confirm that this is intended.
            if not score:
                continue
            if is_political:
                # keeping track of users and entities
                if user not in self.users:
                    self.users[user] = self.user_index
                    self.user_index += 1
                if iden not in self.entities:
                    self.entities[iden] = self.entity_index
                    self.entity_index += 1
                # yield for matrix data (Hypothesis 2)
                yield (user, self.users[user],
                       iden, self.entities[iden]), score
            # yield for polarization data (Hypothesis 1)
            yield is_political, score

    @staticmethod
    def _merge_stats(values):
        """Fold raw scores and/or partial statistics into one statistic.

        Each item of ``values`` is either a raw score (a number, as emitted
        by the mapper) or a partial ``(sum_ex, sum_ex2, n, heights)``
        sequence (as emitted by the combiner or the reducer). Accepting
        both keeps the result correct however many times the framework
        chooses to run the combiner, including not at all.

        Returns:
            ``(sum_ex, sum_ex2, n, heights)`` where ``heights`` is a plain
            list of ``NUM_BUCKETS + 1`` floats so that the value can be
            JSON-serialized.
        """
        sum_ex = 0
        sum_ex2 = 0
        count = 0
        heights = np.zeros(NUM_BUCKETS + 1)
        for item in values:
            if isinstance(item, (list, tuple)):
                part_ex, part_ex2, part_n, part_heights = item
                sum_ex += part_ex
                sum_ex2 += part_ex2
                count += part_n
                heights += np.asarray(part_heights, dtype=float)
            else:
                sum_ex += item
                sum_ex2 += item ** 2
                count += 1
                bucket = int((item + 1) * NUM_BUCKETS / 2)
                # Clamp so that an out-of-range score can neither wrap
                # around to the wrong bucket nor raise an IndexError.
                bucket = min(max(bucket, 0), NUM_BUCKETS)
                heights[bucket] += 1
        return sum_ex, sum_ex2, count, heights.tolist()

    def combiner(self, key, value):
        """Pre-aggregate mapper output.

        For a ``bool`` key, yields the partial statistics of the scores;
        for any other key, yields the sum of the scores.
        """
        if isinstance(key, bool):
            yield key, self._merge_stats(value)
        else:
            yield key, sum(value)

    def reducer(self, key, value):
        """Produce the final step-1 aggregate for ``key``.

        For a ``bool`` key, yields the merged
        ``(sum_ex, sum_ex2, n, heights)``; for any other key, yields the
        sum of the values.
        """
        if isinstance(key, bool):
            # algorithm for parallel calculation of variance is here:
            # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Na.C3.AFve_algorithm
            yield key, self._merge_stats(value)
        else:
            yield key, sum(value)

    def reducer_stddev(self, key, value):
        """Plot the histogram and emit the variance for each ``bool`` key.

        For a ``bool`` key, reads the single aggregate produced by step 1,
        shows a bar chart of the histogram heights and yields
        ``(is_political, (variance, n))``, where ``variance`` is
        ``(sum_ex2 - sum_ex ** 2 / n) / n``. Note that this is the
        variance, not its square root, despite the method name.
        Every other key is passed through with each of its values.
        """
        if isinstance(key, bool):
            is_political = key
            sum_ex, sum_ex2, n, sum_heights = next(iter(value))
            # making a histogram of sentiment
            x = np.arange(NUM_BUCKETS + 1)
            plt.bar(x, height=sum_heights)
            locs = np.arange(0, NUM_BUCKETS, NUM_BUCKETS/10)
            off = NUM_BUCKETS / 2
            ticks = ['{} to {}'.format((boundary - off)/off,
                                       ((boundary - off)/off) + 2/NUM_BUCKETS)
                     for boundary in locs]
            plt.xticks(locs, ticks)
            if is_political:
                title = 'Histogram of Sentiment for Political entities'
            else:
                title = 'Histogram of Sentiment for non-political entities'
            plt.title(title)
            plt.show()
            yield is_political, ((sum_ex2 - (sum_ex ** 2) / n) / n, n)
        else:
            # Yield each value rather than the values iterator itself,
            # which is not a serializable value.
            for total in value:
                yield key, total

    def steps(self):
        """Return the two steps of the job: aggregation, then statistics."""
        return [
            MRStep(
                mapper_init=self.mapper_init,
                mapper=self.mapper,
                combiner=self.combiner,
                reducer=self.reducer),
            MRStep(reducer=self.reducer_stddev)]
if __name__ == '__main__':
    # Run the job when this file is executed as a script.
    IsPolitical.run()