#------------------------------------------------------------------------------------
# Name:        Prediction
# Purpose:     This module is used to predict the hyperpartisan values for a test or
#              validation set, and write the predictions to a file.
#
# Execution:
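#              e.g. python3 prediction.py -d <inputDatasetDir> -o <outputDir>
#              (the script name is assumed here; the -d/--inputDataset and
#              -o/--outputDir options are the ones defined in parse_options below)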
#
# Author:      Ashwath Sampath
#
# Created:     12-12-2018 (V1.0): partly based on validation.py
#------------------------------------------------------------------------------------

import os
import sys
import getopt
import argparse
from time import sleep
from datetime import datetime

import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from sklearn.linear_model import SGDClassifier
from sklearn.externals import joblib
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score, confusion_matrix, classification_report
from keras.models import load_model

import clean_shuffle
import create_unified_tsv
from para2vec import ParagraphVectorModel, get_vector_label_mapping, get_vector_tag_mapping

runOutputFileName = "prediction.txt"
sem_eval_path = '/home/peter-brinkmann'


def parse_options():
    """Parses the command line options."""
    try:
        long_options = ["inputDataset=", "outputDir="]
        opts, _ = getopt.getopt(sys.argv[1:], "d:o:", long_options)
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    inputDataset = "undefined"
    outputDir = "undefined"

    for opt, arg in opts:
        if opt in ("-d", "--inputDataset"):
            inputDataset = arg
        elif opt in ("-o", "--outputDir"):
            outputDir = arg
        else:
            assert False, "Unknown option."
    if inputDataset == "undefined":
        sys.exit("Input dataset, the directory that contains the articles XML file, is undefined. Use option -d or --inputDataset.")
    elif not os.path.exists(inputDataset):
        sys.exit("The input dataset folder does not exist (%s)." % inputDataset)

    if outputDir == "undefined":
        sys.exit("Output path, the directory into which the predictions should be written, is undefined. Use option -o or --outputDir.")
    elif not os.path.exists(outputDir):
        os.mkdir(outputDir)

    return (inputDataset, outputDir)


def loadmodels_global():
    """ Load the models in the global scope. sem_eval_path is global. """
    global model_content_dbow
    model_content_dbow = Doc2Vec.load(os.path.join(sem_eval_path, 'embeddings', 'doc2vec_dbow_model_content_idtags'))
    global model_title_dbow
    model_title_dbow = Doc2Vec.load(os.path.join(sem_eval_path, 'embeddings', 'doc2vec_dbow_model_title_idtags'))
    global conv
    conv = load_model(os.path.join(sem_eval_path, 'models', 'conv_embeddings.joblib'))

def predict_vals(model, X_val):
    """ Predicts the labels for the validation set using the given model.
    ARGUMENTS: model: a trained sklearn or Keras model
               X_val: the validation matrix for which labels have to be predicted
    RETURNS: y_pred: predicted labels as a Pandas Series """
    # ravel() flattens the (n_samples, 1) array a Keras model returns into 1-D, which
    # pd.Series requires; it is a no-op for the 1-D output of an sklearn classifier.
    return pd.Series(model.predict(X_val).ravel())

def test(test_file, outfile):
    """ Performs prediction on the file supplied in the first argument.
    ARGUMENTS: test_file: the path to the test file, string
               outfile: path to the output file, string
    RETURNS: None
    """
    test_df = clean_shuffle.read_prepare_test_df(test_file)
    # Load the model and tag the docs (there is no training step, so set init_models to False)
    pv = ParagraphVectorModel(test_df, init_models=False)
    # Remove the df to save memory
    del test_df
    # Tag the documents (title and content separately)
    pv.get_tagged_docs()
    pv.model_content_dbow = model_content_dbow
    pv.model_title_dbow = model_title_dbow
    # y_test_df is a DataFrame with id as its only column
    X_val, y_test_df = get_vector_tag_mapping(pv)
    # Get the predictions
    y_pred = predict_vals(conv, X_val)
    # The Keras model outputs probabilities, so round them to 0/1 before mapping back
    # to labels (this assumes a single sigmoid output unit).
    y_pred = y_pred.round().astype(int)
    # Convert 0 and 1 back to 'false' and 'true' (as they appear in the XML file).
    # NOTE: converting the labels to 0 and 1 in a previous step is no longer required.
    truefalsedict = {0: 'false', 1: 'true'}
    y_pred_df = pd.DataFrame(y_pred, columns=['predicted_hyperpartisan'])
    y_pred_df['predicted_hyperpartisan'] = y_pred_df['predicted_hyperpartisan'].map(truefalsedict, na_action=None)
    # The order of ids is unchanged; add leading zeros to match the ids in the input dataset
    y_pred_df['id'] = y_test_df['id'].astype(str).str.zfill(7)
    # Reorder the columns
    y_pred_df = y_pred_df[['id', 'predicted_hyperpartisan']]
    # Write to file
    y_pred_df.to_csv(outfile, sep=' ', index=False, header=False)

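# A minimal sketch of how the functions above fit together outside of main()
# (the output path here is hypothetical; main() builds the real one from
# outputDir and runOutputFileName):
#
#     loadmodels_global()
#     test('{}/data/buzzfeed_test_withid'.format(sem_eval_path),
#          '{}/prediction.txt'.format(sem_eval_path))
#
# Each line of the resulting file holds one article id (zero-padded to 7 digits)
# and the predicted label, separated by a space, e.g. "0000001 true".
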
########## MAIN ##########


def main(inputDataset, outputDir):
    """Main method of this module."""
    # Load the models in the global scope
    loadmodels_global()
    outfile = os.path.join(outputDir, runOutputFileName)

    for file in os.listdir(inputDataset):
        if not file.endswith(".xml"):
            continue
        xml_file = os.path.join(inputDataset, file)
        # Choose the intermediate TSV path based on keywords in the XML file name.
        if 'test' in xml_file:
            if 'article' in xml_file:
                intermediate_tsv = '{}/data/crowdsourced_test_withid'.format(sem_eval_path)
            else:
                intermediate_tsv = '{}/data/buzzfeed_test_withid'.format(sem_eval_path)
        elif 'validation' in xml_file:
            if 'article' in xml_file:
                intermediate_tsv = '{}/data/crowdsourced_validation_withid'.format(sem_eval_path)
            else:
                intermediate_tsv = '{}/data/buzzfeed_validation_withid'.format(sem_eval_path)
        elif 'train' in xml_file:
            if 'article' in xml_file:
                intermediate_tsv = '{}/data/crowdsourced_train_withid'.format(sem_eval_path)
            else:
                intermediate_tsv = '{}/data/buzzfeed_train_withid'.format(sem_eval_path)
        else:
            # Skip files whose names identify no split; otherwise intermediate_tsv
            # would be undefined below.
            continue
        create_unified_tsv.write_to_tsv(intermediate_tsv, xml_file)
        print("Written to TSV intermediate file")
        sleep(2)

        # Do the testing/validation: intermediate_tsv is the input file, outfile is the output file for the predictions.
        test(intermediate_tsv, outfile)

    print("The predictions have been written to the output folder.")


if __name__ == '__main__':
    main(*parse_options())