-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathapp.py
217 lines (184 loc) · 5.88 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import os, sys
import math
import numpy as np
from rouge import Rouge
# CONSTANTS
# Minimum cosine similarity for two sentences to be linked in the adjacency matrix.
COSINE_SIMILARITY_THRESHOLD = 0.10
# Weight of the uniform "jump anywhere" term mixed into every row of the transition matrix.
TELEPORTATION_RATE = 0.15
# Power iteration compares successive estimates component-wise against this value.
ERROR_TOLERANCE = 0.00001
def read_news(directory):
    """
    Reads news from the ``.txt`` files of a directory.

    Each file is split on blank lines: the first section is the news body
    (one sentence per line) and the second section is the reference summary.

    Args:
        directory: Path of the directory that holds the ``.txt`` files.

    Returns:
        A ``(news, summaries)`` tuple of dicts keyed by file name, in sorted
        file-name order. ``news[name]`` is the list of body lines and
        ``summaries[name]`` is the stripped summary text, or an empty string
        when the file has no summary section.
    """
    # Only .txt files are part of the data set; sort for a stable order.
    filenames = sorted(
        filename for filename in os.listdir(directory) if filename.endswith(".txt")
    )

    news = {}
    summaries = {}
    for filename in filenames:
        # Use a context manager so the file handle is always closed.
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as handle:
            raw_data = handle.read()

        sections = raw_data.split('\n\n')
        news[filename] = sections[0].strip().split('\n')
        # A file without a blank-line separator has no summary section.
        summaries[filename] = sections[1].strip() if len(sections) > 1 else ""

    return news, summaries
def calculate_idf(news):
    """
    Builds the inverse document frequency table of a news collection.

    ``news`` maps a document name to its list of sentences. Terms are the
    whitespace-separated tokens of the sentences. The result maps each term
    to ``log10(number_of_documents / documents_containing_term)``.
    """
    document_count = len(news)
    document_frequency = {}

    # A term counts once per document, no matter how often it occurs in it.
    for sentences in news.values():
        for term in set(" ".join(sentences).split()):
            document_frequency[term] = document_frequency.get(term, 0) + 1

    return {
        term: math.log10(document_count / count)
        for term, count in document_frequency.items()
    }
def calculate_tf_idf(sentence, idf, terms):
    """
    Builds the tf-idf vector of a sentence over the given term list.

    The vector has one entry per element of ``terms``, in the same order.
    Term frequency is ``1 + log10(count)``; a term that does not occur in the
    sentence is given a count of 0.1, which makes its term frequency
    ``1 + log10(0.1)``, i.e. zero. Every term is looked up in ``idf`` with
    ``idf.get``, so a term missing from ``idf`` yields ``None`` and the
    multiplication fails.
    """
    occurrences = {}
    for word in sentence.split():
        occurrences[word] = occurrences.get(word, 0) + 1

    return [
        (1 + math.log10(occurrences.get(term, 0.1))) * idf.get(term)
        for term in terms
    ]
def unit_vector(vec):
    """
    Returns the unit vector that points in the same direction as ``vec``.

    A vector of length zero has no direction; it is returned as an all-zero
    array instead of dividing by zero (which would produce NaNs and a
    runtime warning).
    """
    arr = np.asarray(vec)
    length = np.linalg.norm(arr)
    if length == 0:
        return np.zeros(arr.shape)
    return arr / length
def cosine_similarity(vec1, vec2):
    """
    Calculates the cosine similarity between two vectors.

    Both vectors are normalised with ``unit_vector`` and the dot product of
    the normalised vectors is returned.
    """
    first_direction = unit_vector(vec1)
    second_direction = unit_vector(vec2)
    return np.dot(first_direction, second_direction)
def print_matrix(m):
    """
    Prints a matrix one row per line.

    Every value is formatted with five decimals and centred in a
    ten-character wide column.
    """
    cell_format = "{:^10.5f}"
    for row in m:
        print("".join(cell_format.format(value) for value in row))
def power_iteration(m, tolerance=None):
    """
    Returns the stationary (left eigen) vector of a square matrix.

    Starting from the uniform vector, ``x`` is repeatedly replaced by
    ``x @ m`` until no component changes by ``tolerance`` or more between
    two successive iterations.

    Args:
        m: Square matrix as a list of rows or a 2-D array.
        tolerance: Convergence threshold per component. Defaults to the
            module constant ``ERROR_TOLERANCE``.

    Returns:
        A NumPy array with one entry per row of ``m``; an empty array when
        ``m`` is empty.
    """
    if tolerance is None:
        tolerance = ERROR_TOLERANCE

    matrix = np.asarray(m, dtype=float)
    size = len(matrix)
    if size == 0:
        # Nothing to rank; avoids dividing by zero for the start vector.
        return np.array([])

    x = np.full(size, 1.0 / size)
    while True:
        x_new = np.matmul(x, matrix)
        # Converged only when *every* component moved less than the tolerance.
        if not np.any(np.abs(x_new - x) >= tolerance):
            return x_new
        x = x_new
def calculate_lex_rank(sentences, idf):
    """
    Calculates the lex rank of the given sentences.

    Sentences are turned into tf-idf vectors, linked whenever their cosine
    similarity reaches ``COSINE_SIMILARITY_THRESHOLD``, and the resulting
    graph is converted into a transition matrix smoothed with
    ``TELEPORTATION_RATE``. The ranks are the result of ``power_iteration``
    on that matrix.

    Args:
        sentences: List of sentence strings of one document.
        idf: Mapping of term to inverse document frequency.

    Returns:
        A list with one rank per sentence, in sentence order; an empty list
        when ``sentences`` is empty.
    """
    dim = len(sentences)
    if dim == 0:
        return []

    terms = list(set(" ".join(sentences).split()))

    # Calculate tf idfs of sentences
    tf_idf = [calculate_tf_idf(sentence, idf, terms) for sentence in sentences]

    # Build adjacency matrix with 0 and 1
    adj_mat = []
    for x in range(dim):
        row = []
        for y in range(dim):
            cos_sim = cosine_similarity(tf_idf[x], tf_idf[y])
            row.append(1 if cos_sim >= COSINE_SIMILARITY_THRESHOLD else 0)
        adj_mat.append(row)

    # Convert adjacency matrix to probability matrix with teleportation rate
    for row in adj_mat:
        degree = sum(row)
        for y in range(dim):
            if degree:
                follow = row[y] / degree
            else:
                # A sentence whose vector has zero length is not even similar
                # to itself; spread its weight uniformly instead of dividing
                # by zero.
                follow = 1 / dim
            row[y] = (follow * (1 - TELEPORTATION_RATE)) + (TELEPORTATION_RATE / dim)

    return list(power_iteration(adj_mat))
def summarize(sentences, idf, count=3):
    """
    Generates a summary for the given sentences.

    The ``count`` sentences with the highest lex rank are selected and joined
    with newlines in their original order, so the topic flow of the text is
    kept. When fewer than ``count`` sentences exist, every sentence is used
    once.

    Args:
        sentences: List of sentence strings of one document.
        idf: Mapping of term to inverse document frequency.
        count: Maximum number of sentences in the summary (default 3).

    Returns:
        The selected sentences joined by ``"\\n"``.
    """
    lex_rank = calculate_lex_rank(sentences, idf)

    # Highest rank first; the sort is stable, so ties keep the earlier sentence.
    by_rank = sorted(
        range(len(lex_rank)), key=lambda index: lex_rank[index], reverse=True
    )

    # Slicing never yields duplicates, unlike re-scanning a zeroed-out list.
    selected = sorted(by_rank[:max(count, 0)])

    return "\n".join(sentences[index] for index in selected)
###############################
####### APP ENTRY POINT #######
###############################
# Usage: python3 app.py [COMMAND] [DATA_DIRECTORY] [FILE_NAME]
#   COMMAND is one of: lex, summary, gold, rouge
#   FILE_NAME is optional; without it every file of the data set is processed.

# Check arguments. sys.exit is used because the bare exit() helper is only
# guaranteed to exist in interactive sessions.
if len(sys.argv) < 2:
    print("You have to give command name")
    print("python3 app.py [COMMAND] [DATA_DIRECTORY] [FILE_NAME]")
    sys.exit(1)
elif len(sys.argv) < 3:
    print("You have to give directory name")
    print("python3 app.py [COMMAND] [DATA_DIRECTORY] [FILE_NAME]")
    sys.exit(1)

command = sys.argv[1]
directory = sys.argv[2]

# Read data set
news, summaries = read_news(directory)

# if no file is given process all of them
files = [sys.argv[3]] if len(sys.argv) == 4 else list(news.keys())

# Fail with a readable message instead of a KeyError traceback.
unknown_files = [name for name in files if name not in news]
if unknown_files:
    print("File not found in data directory: " + ", ".join(unknown_files))
    sys.exit(1)

# Calculate IDFs
idf = calculate_idf(news)

# Run command
if command == "lex":
    for file in files:
        lex_rank = calculate_lex_rank(news[file], idf)
        print(" ".join(["{:.3f}".format(rank) for rank in lex_rank]))
elif command == "summary":
    for file in files:
        print(summarize(news[file], idf))
elif command == "gold":
    for file in files:
        print(summaries[file])
elif command == "rouge":
    if not files:
        # Averaging over zero files would divide by zero.
        print("No files to evaluate")
        sys.exit(1)

    rouge = Rouge()
    metrics = ["rouge-1", "rouge-2", "rouge-l"]
    stats = ["p", "r", "f"]
    total = {metric: {stat: 0 for stat in stats} for metric in metrics}

    for file in files:
        generated_summary = summarize(news[file], idf)
        gold_summary = summaries[file]
        # get_scores expects (hypothesis, reference); passing them the other
        # way round swaps precision and recall.
        scores = rouge.get_scores(generated_summary, gold_summary)
        for metric in metrics:
            for stat in stats:
                total[metric][stat] += scores[0][metric][stat]

    # Divide sums by the number of files to get averages
    for metric in metrics:
        for stat in stats:
            total[metric][stat] /= len(files)

    print("Average Rouge Scores")
    for metric in metrics:
        print(metric)
        for stat in stats:
            print("\t" + stat + ": " + str(total[metric][stat]))
        print("\n")
else:
    print("Unknown command: " + command)
    print("python3 app.py [COMMAND] [DATA_DIRECTORY] [FILE_NAME]")
    sys.exit(1)