Commit d26e9b2

fetch hyperpartisan values from sql
1 parent 98633f1 commit d26e9b2

File tree

3 files changed (+342 lines, -0 lines)


conv_prediction.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
#------------------------------------------------------------------------------------
# Name:        Prediction
# Purpose:     This module is used to predict the hyperpartisan values for a test or
#              validation set, and write the predictions to a file.
#
# Execution:
#
# Author:      Ashwath Sampath
#
# Created:     12-12-2018 (V1.0): partly based on validation.py
#------------------------------------------------------------------------------------

import pandas as pd
import os
import argparse
from gensim.models.doc2vec import Doc2Vec
from sklearn.linear_model import SGDClassifier
from sklearn.externals import joblib
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score, confusion_matrix, classification_report
import clean_shuffle
from para2vec import ParagraphVectorModel, get_vector_label_mapping, get_vector_tag_mapping
from datetime import datetime
import create_unified_tsv
from keras.models import load_model

import getopt
import sys
from time import sleep

runOutputFileName = "prediction.txt"
sem_eval_path = '/home/peter-brinkmann'


def parse_options():
    """Parses the command line options."""
    try:
        long_options = ["inputDataset=", "outputDir="]
        opts, _ = getopt.getopt(sys.argv[1:], "d:o:", long_options)
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    inputDataset = "undefined"
    outputDir = "undefined"

    for opt, arg in opts:
        if opt in ("-d", "--inputDataset"):
            inputDataset = arg
        elif opt in ("-o", "--outputDir"):
            outputDir = arg
        else:
            assert False, "Unknown option."
    if inputDataset == "undefined":
        sys.exit("Input dataset, the directory that contains the articles XML file, is undefined. Use option -d or --inputDataset.")
    elif not os.path.exists(inputDataset):
        sys.exit("The input dataset folder does not exist (%s)." % inputDataset)

    if outputDir == "undefined":
        sys.exit("Output path, the directory into which the predictions should be written, is undefined. Use option -o or --outputDir.")
    elif not os.path.exists(outputDir):
        os.mkdir(outputDir)

    return (inputDataset, outputDir)


def loadmodels_global():
    """ Load the models in the global scope. sem_eval_path is global. """
    global model_content_dbow
    model_content_dbow = Doc2Vec.load(os.path.join(sem_eval_path, 'embeddings', 'doc2vec_dbow_model_content_idtags'))
    global model_title_dbow
    model_title_dbow = Doc2Vec.load(os.path.join(sem_eval_path, 'embeddings', 'doc2vec_dbow_model_title_idtags'))
    global conv
    conv = load_model(os.path.join(sem_eval_path, 'models', 'conv_embeddings.joblib'))


def predict_vals(model, X_val):
    """ Predicts the labels for the validation set using the given model
    ARGUMENTS: model: an sklearn model
               X_val: the validation matrix for which labels have to be predicted
    RETURNS: y_pred: predicted labels Pandas series"""
    return pd.Series(model.predict(X_val))


def test(test_file, outfile):
    """ Performs validation on the file supplied in the first argument.
    ARGUMENTS: test_file: the path to the test file, string
               outfile: path to the output file
    RETURNS: None
    """
    test_df = clean_shuffle.read_prepare_test_df(test_file)
    # Load the model, and tag the docs (obviously, no training step, so set
    # init_models to False)
    pv = ParagraphVectorModel(test_df, init_models=False)
    # Remove the df to save memory
    del test_df
    # Tag the documents (title + content separately)
    pv.get_tagged_docs()
    pv.model_content_dbow = model_content_dbow
    pv.model_title_dbow = model_title_dbow
    # y_test_df is a DataFrame with id as the only column
    X_val, y_test_df = get_vector_tag_mapping(pv)
    # Get the predictions
    y_pred = predict_vals(conv, X_val)
    # Convert 0 and 1 back to true and false (as it was in the xml file)
    # ATTENTION: we don't need to convert it to 0 and 1 in the previous step any more.
    truefalsedict = {0: 'false', 1: 'true'}
    y_pred_df = pd.DataFrame(y_pred, columns=['predicted_hyperpartisan'])
    y_pred_df['predicted_hyperpartisan'] = y_pred_df['predicted_hyperpartisan'].map(truefalsedict, na_action=None)
    # The order of ids will be the same, also add leading zeros (to make it like the input dataset)
    y_pred_df['id'] = y_test_df['id'].astype(str).str.zfill(7)
    # Reorder the columns
    y_pred_df = y_pred_df[['id', 'predicted_hyperpartisan']]
    # Write to file
    y_pred_df.to_csv(outfile, sep=' ', index=False, header=False)


########## MAIN ##########


def main(inputDataset, outputDir):
    """Main method of this module."""
    # Load the models in the global scope
    loadmodels_global()
    outfile = outputDir + "/" + runOutputFileName

    for file in os.listdir(inputDataset):
        if file.endswith(".xml"):
            xml_file = inputDataset + "/" + file
            if 'test' in xml_file:
                if 'article' in xml_file:
                    intermediate_tsv = '{}/data/crowdsourced_test_withid'.format(sem_eval_path)
                else:
                    intermediate_tsv = '{}/data/buzzfeed_test_withid'.format(sem_eval_path)
            if 'validation' in xml_file:
                if 'article' in xml_file:
                    intermediate_tsv = '{}/data/crowdsourced_validation_withid'.format(sem_eval_path)
                else:
                    intermediate_tsv = '{}/data/buzzfeed_validation_withid'.format(sem_eval_path)
            if 'train' in xml_file:
                if 'article' in xml_file:
                    intermediate_tsv = '{}/data/crowdsourced_validation_withid'.format(sem_eval_path)
                else:
                    intermediate_tsv = '{}/data/buzzfeed_validation_withid'.format(sem_eval_path)
            create_unified_tsv.write_to_tsv(intermediate_tsv, xml_file)
            print("Written to TSV intermediate file")
            sleep(2)

            # Do the testing/validation: intermediate_tsv is the input file, outfile is the output file for the predictions.
            test(intermediate_tsv, outfile)

    print("The predictions have been written to the output folder.")


if __name__ == '__main__':
    main(*parse_options())
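
A minimal, hypothetical invocation sketch for this script; the two directory paths are placeholders, and the doc2vec embeddings and saved convolutional model are assumed to already exist under sem_eval_path:

# Equivalent to running:  python conv_prediction.py -d <inputDataset> -o <outputDir>
# Both directory paths below are placeholders, not part of the commit.
import conv_prediction

conv_prediction.main('/path/to/articles-xml-dir', '/path/to/output-dir')
# Predictions are written to /path/to/output-dir/prediction.txt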

ground_truth_sqlite.py

Lines changed: 18 additions & 0 deletions
@@ -159,6 +159,24 @@ def select_from_ground_truth(conn, identifier, table_name):
    # Returns a tuple of form (id, bias, hyperpartisan, url)
    return cur.fetchone()

def select_id_hyperpartisan_mappings(sem_eval_dir_path, table_name):
    """ Queries the sqlite3 table specified in the arguments for a mapping of article
    ids to hyperpartisan labels, and returns the results in a dictionary whose keys
    are the article ids and whose values are the hyperpartisan values. """
    db_path = os.path.join(sem_eval_dir_path, 'data', 'Databases', 'ground_truth.sqlite3')
    conn = db_connect(db_path)
    cur = conn.cursor()
    query = """
        SELECT id, hyperpartisan
        FROM {} """.format(table_name)
    # df = pd.read_sql_query(query, conn)
    cur.execute(query)

    mappings = {}
    for row in cur:
        mappings[int(row[0])] = row[1]

    return mappings

def main():
    """ Main function which creates the appropriate ground truth sqlite table based
    on command-line args, and inserts data from the appropriate ground truth xml file
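
A short usage sketch of the new helper, assuming the ground_truth.sqlite3 database and the ground_truth_training table already exist under the SemEval directory (the path and the article id below are placeholders):

import ground_truth_sqlite

# Fetch the id -> 'true'/'false' hyperpartisan mapping straight from SQLite.
ids_to_labels = ground_truth_sqlite.select_id_hyperpartisan_mappings('/home/peter-brinkmann', 'ground_truth_training')

# Convert a single article's label to 0/1, e.g. for model training (1234567 is a placeholder id).
label = 1 if ids_to_labels[1234567] == 'true' else 0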

train_words_conv_model.py

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Activation, Embedding, Flatten, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.engine.input_layer import Input
import argparse
from numpy import zeros
import clean_shuffle
from para2vec import ParagraphVectorModel, get_vector_label_mapping
from gensim.models.doc2vec import Doc2Vec
import numpy
import pickle
import os
import pandas as pd
import ground_truth_sqlite
import tensorflow as tf
from gensim.models import KeyedVectors


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

sem_eval_path = ''
most_common_count = 100000
seq_len = 5000  # 2500 # Inferred from checking the sequences length distributions
embedding_dims = 300


def load_word_vectors():
    print('Loading word vectors...')
    filename = '{}/GoogleNews-vectors-negative300.bin'.format(sem_eval_path)
    model = KeyedVectors.load_word2vec_format(filename, binary=True)
    return model.wv

def load_texts():
    filename = os.path.join(sem_eval_path, 'data', 'IntegratedFiles', 'buzzfeed_training_withid.tsv')  # 'crowdsourced_train_withid.tsv'
    df_location = os.path.join(sem_eval_path, 'data', 'Pickles', 'training_df.pickle')

    df = clean_shuffle.read_prepare_df(filename, file_path=df_location)

    ids_to_labels = ground_truth_sqlite.select_id_hyperpartisan_mappings(sem_eval_path, 'ground_truth_training')  # 'ground_truth_crowdsourced_train'
    df['hyperpartisan'] = df.apply(lambda row: 1 if ids_to_labels[row['id']] == 'true' else 0, axis=1)

    df["text"] = df["title"] + ' ' + df["content"]

    return df['text'], df['hyperpartisan']

def load_tokenizer(texts):
    num_words = most_common_count + 1
    file_path = os.path.join(sem_eval_path, 'data', 'Tokenizers', 'buzzfeed_trained_{}_tokenizer.pickle'.format(num_words))

    if os.path.isfile(file_path):
        with open(file_path, 'rb') as tokenizer_file:
            tokenizer = pickle.load(tokenizer_file)
        print('Tokenizer loaded from disk')
    else:
        tokenizer = Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(texts)

        with open(file_path, 'wb') as tokenizer_file:
            pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

        print('Tokenizer fit on texts and stored on disk')
    return tokenizer

def get_embedding_weights(word_vectors, word_index):
    weights_matrix = zeros((len(word_index) + 1, embedding_dims))

    count = 0
    for word, idx in word_index.items():
        if word in word_vectors:
            weights_matrix[idx] = word_vectors[word]
            count += 1
    print('Words found on word2vec: {}'.format(count))

    return weights_matrix

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", '-p', default="/home/agon/Files/SemEval",
                        help="Use this argument to change the SemEval directory path (the default path is: '/home/agon/Files/SemEval')")
    parser.add_argument("--epochs", '-e', default="10",
                        help="Use this argument to set the number of epochs. Default: 10")
    parser.add_argument("--filters", '-f', default="64",
                        help="Use this argument to set the number of filters. Default: 64")
    parser.add_argument("--kernel", '-k', default="4",
                        help="Use this argument to set the size of kernels. Default: 4")
    args = parser.parse_args()

    global sem_eval_path
    sem_eval_path = args.path

    # Hyperparameters
    filters = int(args.filters)
    kernel_size = int(args.kernel)
    epochs = int(args.epochs)
    hidden_dims = 250
    batch_size = 32  # default

    # Get data
    texts, y_train = load_texts()

    tokenizer = load_tokenizer(texts)

    train_sequences = tokenizer.texts_to_sequences(texts)
    del texts

    with tf.device('/cpu:0'):
        X_train = pad_sequences(train_sequences, maxlen=seq_len, padding='post')
    del train_sequences

    vocab_size = len(tokenizer.word_index) + 1
    print('Vocab size: {}'.format(vocab_size))

    # 5. Load word vectors
    word_vectors = load_word_vectors()
    weights_matrix = get_embedding_weights(word_vectors, tokenizer.word_index)

    # Remove word_vectors to free up memory
    del word_vectors

    # 7. Create Embeddings layer
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dims,
                                weights=[weights_matrix],
                                input_length=seq_len,
                                trainable=False
                                )

    # Model definition
    model = Sequential()

    model.add(embedding_layer)

    model.add(Conv1D(filters,
                     kernel_size,
                     activation='relu'))
    model.add(Dropout(0.2))
    model.add(MaxPooling1D(pool_size=4))

    model.add(Conv1D(filters,
                     kernel_size,
                     activation='relu'))
    model.add(Dropout(0.2))
    model.add(MaxPooling1D(pool_size=4))

    model.add(GlobalMaxPooling1D())
    # model.add(Flatten())

    model.add(Dense(hidden_dims, activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(1, activation='sigmoid'))

    print(model.summary())

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=2)

    conv_model_location = os.path.join(sem_eval_path, 'models', 'words_conv_model.h5')
    model.save(conv_model_location)

if __name__ == "__main__":
    main()
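
For reference, a minimal sketch of how the saved model could later be loaded and applied to new texts. This is an assumption, not part of the commit: the sem_eval_path value and the example text are placeholders, while the tokenizer pickle name (num_words = 100001), seq_len, and the words_conv_model.h5 path mirror the training script above.

import os
import pickle
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

sem_eval_path = '/home/peter-brinkmann'  # placeholder SemEval directory
seq_len = 5000                           # must match the training script

# Load the tokenizer that was fit (and pickled) during training.
tokenizer_path = os.path.join(sem_eval_path, 'data', 'Tokenizers', 'buzzfeed_trained_100001_tokenizer.pickle')
with open(tokenizer_path, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Load the convolutional model saved by train_words_conv_model.py.
model = load_model(os.path.join(sem_eval_path, 'models', 'words_conv_model.h5'))

# Preprocess new texts exactly as during training, then predict.
texts = ["Example article title and content ..."]  # placeholder input
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=seq_len, padding='post')
probabilities = model.predict(X)             # sigmoid outputs in [0, 1]
labels = (probabilities > 0.5).astype(int)   # 1 = hyperpartisan, 0 = not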
