-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathvalidation.py
147 lines (137 loc) · 8.05 KB
/
validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#------------------------------------------------------------------------------------
# Name: Validation
# Purpose: This module is used to perform validation using the provided
# validation set, and to calculate a number of metrics. Further, the
# hand-prepared training file with 645 records is used as a second
# validation file, mimicking the 2 test files.
#
# Execution: python validation.py
#
# Author: Ashwath Sampath
#
# Created: 25-11-2018 (V1.0): Validation performed on 2 validation sets, a number
# of metrics are written to logs/validation_log.log
#
# Revisions: 04-12-2018 (V1.1): Cleaned up paths, read/write dfs from pickle,
# Combine dataframes and write predictions to file
# with id, add globals.
#------------------------------------------------------------------------------------
import pandas as pd
import os
import argparse
from gensim.models.doc2vec import Doc2Vec
from sklearn.linear_model import SGDClassifier
from sklearn.externals import joblib
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score, confusion_matrix, classification_report
import clean_shuffle
from para2vec import ParagraphVectorModel, get_vector_label_mapping
from datetime import datetime
# Each run writes its metrics to its own timestamped log file. The handle is
# opened in append mode at import time, written to by calculate_metrics() and
# closed at the end of main().
_run_timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
log_name = '/home/ashwath/Files/SemEval/logs/validation_log_{}.log'.format(_run_timestamp)
results_log = open(log_name, 'a')
def loadmodels_global():
    """ Load the two Doc2Vec models and the classifier into module globals.
    ARGUMENTS: None (reads the global sem_eval_path, which must already be set)
    RETURNS: None (binds the globals model_content_dbow, model_title_dbow and svc)
    """
    global model_content_dbow, model_title_dbow, svc

    embeddings_dir = os.path.join(sem_eval_path, 'embeddings')
    models_dir = os.path.join(sem_eval_path, 'models')

    # Doc2Vec models for the article content and the article title.
    model_content_dbow = Doc2Vec.load(
        os.path.join(embeddings_dir, 'doc2vec_dbow_model_content_idtags'))
    model_title_dbow = Doc2Vec.load(
        os.path.join(embeddings_dir, 'doc2vec_dbow_model_title_idtags'))

    # Persisted sklearn classifier used for the predictions.
    svc = joblib.load(os.path.join(models_dir, 'svc_embeddings.joblib'))
def predict_vals(model, X_val):
    """ Run the model on the validation matrix and wrap the result in a Series.
    ARGUMENTS: model: any object exposing a predict(X) method (e.g. an sklearn model)
               X_val: the validation matrix handed to model.predict
    RETURNS: Pandas Series holding the predicted labels, with a default index
    """
    predicted_labels = model.predict(X_val)
    return pd.Series(predicted_labels)
def calculate_metrics(y_test_df, y_pred_df, ml_model, val_filetype):
    """ Calculates a number of metrics from the predicted and the true labels and
    appends them to the module-level results_log file.
    ARGUMENTS: y_test_df: test (validation) set labels and ids, Pandas DataFrame
                          (only its 'hyperpartisan' column is used)
               y_pred_df: predicted labels and ids, Pandas DataFrame
                          (only its 'hyperpartisan' column is used)
               ml_model: sklearn model (its repr, i.e. hyperparams, is written to the log)
               val_filetype: string 'Buzzfeed Validation File' or
                             'Crowdsourced File used as a validation file'
    RETURNS: None
    """
    y_pred = y_pred_df.hyperpartisan
    y_test = y_test_df.hyperpartisan

    results_log.write("{}: \n".format(val_filetype))
    results_log.write("ML Model for classification: {}\n".format(ml_model))
    results_log.write("Predicted value counts per class (predictions):\n{}\n ".format(y_pred.value_counts()))
    # These are the ground-truth counts, so they are labelled "Actual".
    results_log.write("Actual value counts per class (val set):\n{}\n ".format(y_test.value_counts()))

    # average=None yields one score per class rather than a single aggregate.
    f1 = f1_score(y_test, y_pred, average=None)
    precision = precision_score(y_test, y_pred, average=None)
    recall = recall_score(y_test, y_pred, average=None)
    accuracy = accuracy_score(y_test, y_pred)
    # Terminate the summary line so the classification report that follows
    # starts on its own line instead of being appended to this one.
    results_log.write("F1={}, Precision={}, Recall={}, Accuracy={}\n".format(f1, precision, recall, accuracy))

    # NOTE(review): target_names assumes exactly two labels, sorted so that the
    # first is 'fair' and the second 'biased' (presumably 0 and 1) -- confirm.
    results_log.write(classification_report(y_test, y_pred, target_names=['fair', 'biased']))
    results_log.write("Confusion matrix: \n{}\n".format(confusion_matrix(y_test, y_pred)))
    results_log.write('****************************************************************************************************************\n')
def write_to_tsv(y_pred_df, y_val_df, outfile):
    """ Combine the contents of the predicted y and original y dataframes, and write the results to outfile.
    The input dataframes are left unmodified.
    ARGUMENTS: y_pred_df: predicted labels and ids, Pandas DataFrame with
                          'id' and 'hyperpartisan' columns
               y_val_df: validation set labels and ids, Pandas DataFrame with
                         'id' and 'hyperpartisan' columns
               outfile: output file to write to (whole path), string
    RETURNS: None
    """
    # Convert 0 and 1 back to true and false (as it was in the xml file)
    truefalsedict = {0: 'false', 1: 'true'}

    # rename() returns new frames, so the label conversion below never touches
    # the caller's dataframes (which still hold the numeric labels).
    predicted = y_pred_df.rename(columns={'hyperpartisan': 'predicted_hyperpartisan'})
    actual = y_val_df.rename(columns={'hyperpartisan': 'actual_hyperpartisan'})
    predicted['predicted_hyperpartisan'] = predicted['predicted_hyperpartisan'].map(truefalsedict)
    actual['actual_hyperpartisan'] = actual['actual_hyperpartisan'].map(truefalsedict)

    df = pd.merge(actual, predicted, how='inner', on='id')
    # Reorder columns
    df = df[['id', 'actual_hyperpartisan', 'predicted_hyperpartisan']]
    df.to_csv(outfile, sep='\t', index=False)
def validate(val_file, val_filetype, df_location, outfile):
    """ Performs validation on the file supplied in the first argument: predicts
    its labels with the globally loaded models, logs the metrics and writes the
    predictions next to the true labels.
    ARGUMENTS: val_file: the path to the validation file, string
               val_filetype: string 'Buzzfeed Validation File' or
                             'Crowdsourced File used as a validation file'
               df_location: location to load/save validation df from
               outfile: path to output file
    RETURNS: None
    """
    val_df = clean_shuffle.read_prepare_df(val_file, file_path=df_location)
    # Load the model, and tag the docs (obviously, no training step, so set
    # init_models to False)
    pv = ParagraphVectorModel(val_df, init_models=False)
    # Tag the documents (title + content separately)
    pv.get_tagged_docs()
    # Reuse the models loaded by loadmodels_global() instead of training new ones.
    pv.model_content_dbow = model_content_dbow
    pv.model_title_dbow = model_title_dbow
    # y_val_df is a DataFrame with id and hyperpartisan
    X_val, y_val_df = get_vector_label_mapping(pv)
    # Get the predictions
    y_pred = predict_vals(svc, X_val)
    y_pred_df = pd.DataFrame(y_pred, columns=['hyperpartisan'])
    # The order of ids will be the same, so attach them by position. A plain
    # Series assignment would align on the index instead, which mismatches ids
    # and predictions whenever y_val_df does not carry a default 0..n-1 index.
    # NOTE(review): assumes get_vector_label_mapping returns the rows of X_val
    # and y_val_df in the same order -- confirm.
    y_pred_df['id'] = y_val_df['id'].values
    calculate_metrics(y_val_df, y_pred_df, svc, val_filetype)
    write_to_tsv(y_pred_df, y_val_df, outfile)
def main():
    """ Main function which performs validation on 2 validation files.
    Command-line arguments:
        --inputDataset / -c: root directory holding the data/, embeddings/ and
                             models/ sub-directories that are read.
        --outputDir / -o: root directory; predictions are written to its
                          predictions/ sub-directory (created if missing).
    The metrics log handle (results_log) is closed on exit, even on failure.
    """
    global sem_eval_path
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputDataset", '-c', default="/home/ashwath/Files/SemEval",
                        help="Use this argument to change the SemEval directory path (the default path is: '/home/ashwath/Files/SemEval')")
    parser.add_argument("--outputDir", '-o', default="/home/ashwath/Files/SemEval",
                        help="Use this argument to change the directory under which the predictions are written (the default path is: '/home/ashwath/Files/SemEval')")
    args = parser.parse_args()

    # Honour the command-line arguments; previously they were parsed and then
    # overridden by a hard-coded, machine-specific directory.
    sem_eval_path = args.inputDataset
    predictions_dir = os.path.join(args.outputDir, 'predictions')

    integrated_dir = os.path.join(sem_eval_path, 'data', 'IntegratedFiles')
    pickles_dir = os.path.join(sem_eval_path, 'data', 'Pickles')
    val_file = os.path.join(integrated_dir, 'buzzfeed_validation_withid.tsv')
    crowdsourced_file = os.path.join(integrated_dir, 'crowdsourced_train_withid.tsv')
    val_df = os.path.join(pickles_dir, 'validation_df.pickle')
    crowdsourced_df = os.path.join(pickles_dir, 'crowdsourced_df.pickle')
    outfile_buzzfeedval = os.path.join(predictions_dir, 'buzzfeedval_predictions.tsv')
    outfile_crowdsourced = os.path.join(predictions_dir, 'crowdsourced_predictions.tsv')

    try:
        # A user-supplied output directory may not have a predictions/ folder yet.
        os.makedirs(predictions_dir, exist_ok=True)
        # Load the models in the global scope
        loadmodels_global()
        validate(val_file, 'Buzzfeed Validation File', val_df, outfile=outfile_buzzfeedval)
        validate(crowdsourced_file, 'Crowdsourced File used as a validation file', crowdsourced_df, outfile=outfile_crowdsourced)
        print("DONE! Metrics and results stored in {}".format(log_name))
        print("View Buzzfeed validation predictions at: {}".format(outfile_buzzfeedval))
        print("View Crowdsourced validation predictions at: {}".format(outfile_crowdsourced))
    finally:
        # Flush and release the log even when loading or validation raises.
        results_log.close()


# Run the validations only when this file is executed as a script.
if __name__ == '__main__':
    main()