Commit d26e9b2

fetch hyperpartisan values from sql
1 parent 98633f1 commit d26e9b2

File tree

3 files changed (+342 lines, -0 lines)


conv_prediction.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
#------------------------------------------------------------------------------------
# Name:        Prediction
# Purpose:     This module is used to predict the hyperpartisan values for a test or
#              validation set, and write the predictions to a file.
#
# Execution:
#
# Author:      Ashwath Sampath
#
# Created:     12-12-2018 (V1.0): partly based on validation.py
#------------------------------------------------------------------------------------

import pandas as pd
import os
import argparse
from gensim.models.doc2vec import Doc2Vec
from sklearn.linear_model import SGDClassifier
from sklearn.externals import joblib
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score, confusion_matrix, classification_report
import clean_shuffle
from para2vec import ParagraphVectorModel, get_vector_label_mapping, get_vector_tag_mapping
from datetime import datetime
import create_unified_tsv
from keras.models import load_model

import getopt
import sys
from time import sleep

runOutputFileName = "prediction.txt"
sem_eval_path = '/home/peter-brinkmann'


def parse_options():
    """Parses the command line options."""
    try:
        long_options = ["inputDataset=", "outputDir="]
        opts, _ = getopt.getopt(sys.argv[1:], "d:o:", long_options)
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    inputDataset = "undefined"
    outputDir = "undefined"

    for opt, arg in opts:
        if opt in ("-d", "--inputDataset"):
            inputDataset = arg
        elif opt in ("-o", "--outputDir"):
            outputDir = arg
        else:
            assert False, "Unknown option."
    if inputDataset == "undefined":
        sys.exit("Input dataset, the directory that contains the articles XML file, is undefined. Use option -d or --inputDataset.")
    elif not os.path.exists(inputDataset):
        sys.exit("The input dataset folder does not exist (%s)." % inputDataset)

    if outputDir == "undefined":
        sys.exit("Output path, the directory into which the predictions should be written, is undefined. Use option -o or --outputDir.")
    elif not os.path.exists(outputDir):
        os.mkdir(outputDir)

    return (inputDataset, outputDir)


def loadmodels_global():
    """ Load the models in the global scope. sem_eval_path is global. """
    global model_content_dbow
    model_content_dbow = Doc2Vec.load(os.path.join(sem_eval_path, 'embeddings', 'doc2vec_dbow_model_content_idtags'))
    global model_title_dbow
    model_title_dbow = Doc2Vec.load(os.path.join(sem_eval_path, 'embeddings', 'doc2vec_dbow_model_title_idtags'))
    global conv
    conv = load_model(os.path.join(sem_eval_path, 'models', 'conv_embeddings.joblib'))


def predict_vals(model, X_val):
    """ Predicts the labels for the validation set using the given model
    ARGUMENTS: model: an sklearn model
               X_val: the validation matrix for which labels have to be predicted
    RETURNS: y_pred: predicted labels Pandas series"""
    return pd.Series(model.predict(X_val))


def test(test_file, outfile):
    """ Performs validation on the file supplied in the first argument.
    ARGUMENTS: test_file: the path to the test file, string
               outfile: path to the output file
    RETURNS: None
    """
    test_df = clean_shuffle.read_prepare_test_df(test_file)
    # Load the model, and tag the docs (obviously, no training step, so set
    # init_models to False)
    pv = ParagraphVectorModel(test_df, init_models=False)
    # Remove the df to save memory
    del test_df
    # Tag the documents (title + content separately)
    pv.get_tagged_docs()
    pv.model_content_dbow = model_content_dbow
    pv.model_title_dbow = model_title_dbow
    # y_test_df is a DataFrame with id as the only column
    X_val, y_test_df = get_vector_tag_mapping(pv)
    # Get the predictions
    y_pred = predict_vals(conv, X_val)
    # Convert 0 and 1 back to true and false (as it was in the xml file)
    # ATTENTION: we don't need to convert it to 0 and 1 in the previous step any more.
    truefalsedict = {0: 'false', 1: 'true'}
    y_pred_df = pd.DataFrame(y_pred, columns=['predicted_hyperpartisan'])
    y_pred_df['predicted_hyperpartisan'] = y_pred_df['predicted_hyperpartisan'].map(truefalsedict, na_action=None)
    # The order of ids will be the same, also add leading zeros (to make it like the input dataset)
    y_pred_df['id'] = y_test_df['id'].astype(str).str.zfill(7)
    # Reorder the columns
    y_pred_df = y_pred_df[['id', 'predicted_hyperpartisan']]
    # Write to file
    y_pred_df.to_csv(outfile, sep=' ', index=False, header=False)


########## MAIN ##########


def main(inputDataset, outputDir):
    """Main method of this module."""
    # Load the models in the global scope
    loadmodels_global()
    outfile = outputDir + "/" + runOutputFileName

    for file in os.listdir(inputDataset):
        if file.endswith(".xml"):
            xml_file = inputDataset + "/" + file
            if 'test' in xml_file:
                if 'article' in xml_file:
                    intermediate_tsv = '{}/data/crowdsourced_test_withid'.format(sem_eval_path)
                else:
                    intermediate_tsv = '{}/data/buzzfeed_test_withid'.format(sem_eval_path)
            if 'validation' in xml_file:
                if 'article' in xml_file:
                    intermediate_tsv = '{}/data/crowdsourced_validation_withid'.format(sem_eval_path)
                else:
                    intermediate_tsv = '{}/data/buzzfeed_validation_withid'.format(sem_eval_path)
            if 'train' in xml_file:
                if 'article' in xml_file:
                    intermediate_tsv = '{}/data/crowdsourced_validation_withid'.format(sem_eval_path)
                else:
                    intermediate_tsv = '{}/data/buzzfeed_validation_withid'.format(sem_eval_path)
            create_unified_tsv.write_to_tsv(intermediate_tsv, xml_file)
            print("Written to TSV intermediate file")
            sleep(2)

            # Do the testing/validation: intermediate_tsv is the input file, outfile is the output file for the predictions.
            test(intermediate_tsv, outfile)

    print("The predictions have been written to the output folder.")


if __name__ == '__main__':
    main(*parse_options())
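
A minimal, hypothetical invocation sketch for this script; the two directory paths are placeholders, and the doc2vec embeddings and saved convolutional model are assumed to already exist under sem_eval_path:

# Equivalent to running:  python conv_prediction.py -d <inputDataset> -o <outputDir>
# Both directory paths below are placeholders, not part of the commit.
import conv_prediction

conv_prediction.main('/path/to/articles-xml-dir', '/path/to/output-dir')
# Predictions are written to /path/to/output-dir/prediction.txt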

ground_truth_sqlite.py

Lines changed: 18 additions & 0 deletions
@@ -159,6 +159,24 @@ def select_from_ground_truth(conn, identifier, table_name):
    # Returns a tuple of form (id, bias, hyperpartisan, url)
    return cur.fetchone()

def select_id_hyperpartisan_mappings(sem_eval_dir_path, table_name):
    """ Queries the sqlite3 table specified in the arguments for a mapping of article
    ids to hyperpartisan labels, and returns the results in a dictionary whose keys
    are the article ids and whose values are the hyperpartisan values. """
    db_path = os.path.join(sem_eval_dir_path, 'data', 'Databases', 'ground_truth.sqlite3')
    conn = db_connect(db_path)
    cur = conn.cursor()
    query = """
        SELECT id, hyperpartisan
        FROM {} """.format(table_name)
    # df = pd.read_sql_query(query, conn)
    cur.execute(query)

    mappings = {}
    for row in cur:
        mappings[int(row[0])] = row[1]

    return mappings

def main():
    """ Main function which creates the appropriate ground truth sqlite table based
    on command-line args, and inserts data from the appropriate ground truth xml file
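
A short usage sketch of the new helper, assuming the ground_truth.sqlite3 database and the ground_truth_training table already exist under the SemEval directory (the path and the article id below are placeholders):

import ground_truth_sqlite

# Fetch the id -> 'true'/'false' hyperpartisan mapping straight from SQLite.
ids_to_labels = ground_truth_sqlite.select_id_hyperpartisan_mappings('/home/peter-brinkmann', 'ground_truth_training')

# Convert a single article's label to 0/1, e.g. for model training (1234567 is a placeholder id).
label = 1 if ids_to_labels[1234567] == 'true' else 0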

train_words_conv_model.py

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Activation, Embedding, Flatten, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.engine.input_layer import Input
import argparse
from numpy import zeros
import clean_shuffle
from para2vec import ParagraphVectorModel, get_vector_label_mapping
from gensim.models.doc2vec import Doc2Vec
import numpy
import pickle
import os
import pandas as pd
import ground_truth_sqlite
import tensorflow as tf
from gensim.models import KeyedVectors


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

sem_eval_path = ''
most_common_count = 100000
seq_len = 5000  # 2500 # Inferred from checking the sequences length distributions
embedding_dims = 300


def load_word_vectors():
    print('Loading word vectors...')
    filename = '{}/GoogleNews-vectors-negative300.bin'.format(sem_eval_path)
    model = KeyedVectors.load_word2vec_format(filename, binary=True)
    return model.wv

def load_texts():
    filename = os.path.join(sem_eval_path, 'data', 'IntegratedFiles', 'buzzfeed_training_withid.tsv')  # 'crowdsourced_train_withid.tsv'
    df_location = os.path.join(sem_eval_path, 'data', 'Pickles', 'training_df.pickle')

    df = clean_shuffle.read_prepare_df(filename, file_path=df_location)

    ids_to_labels = ground_truth_sqlite.select_id_hyperpartisan_mappings(sem_eval_path, 'ground_truth_training')  # 'ground_truth_crowdsourced_train'
    df['hyperpartisan'] = df.apply(lambda row: 1 if ids_to_labels[row['id']] == 'true' else 0, axis=1)

    df["text"] = df["title"] + ' ' + df["content"]

    return df['text'], df['hyperpartisan']

def load_tokenizer(texts):
    num_words = most_common_count + 1
    file_path = os.path.join(sem_eval_path, 'data', 'Tokenizers', 'buzzfeed_trained_{}_tokenizer.pickle'.format(num_words))

    if os.path.isfile(file_path):
        with open(file_path, 'rb') as tokenizer_file:
            tokenizer = pickle.load(tokenizer_file)
        print('Tokenizer loaded from disk')
    else:
        tokenizer = Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(texts)

        with open(file_path, 'wb') as tokenizer_file:
            pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

        print('Tokenizer fit on texts and stored on disk')
    return tokenizer

def get_embedding_weights(word_vectors, word_index):
    weights_matrix = zeros((len(word_index) + 1, embedding_dims))

    count = 0
    for word, idx in word_index.items():
        if word in word_vectors:
            weights_matrix[idx] = word_vectors[word]
            count += 1
    print('Words found on word2vec: {}'.format(count))

    return weights_matrix

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", '-p', default="/home/agon/Files/SemEval",
                        help="Use this argument to change the SemEval directory path (the default path is: '/home/agon/Files/SemEval')")
    parser.add_argument("--epochs", '-e', default="10",
                        help="Use this argument to set the number of epochs. Default: 10")
    parser.add_argument("--filters", '-f', default="64",
                        help="Use this argument to set the number of filters. Default: 64")
    parser.add_argument("--kernel", '-k', default="4",
                        help="Use this argument to set the size of kernels. Default: 4")
    args = parser.parse_args()

    global sem_eval_path
    sem_eval_path = args.path

    # Hyperparameters
    filters = int(args.filters)
    kernel_size = int(args.kernel)
    epochs = int(args.epochs)
    hidden_dims = 250
    batch_size = 32  # default

    # Get data
    texts, y_train = load_texts()

    tokenizer = load_tokenizer(texts)

    train_sequences = tokenizer.texts_to_sequences(texts)
    del texts

    with tf.device('/cpu:0'):
        X_train = pad_sequences(train_sequences, maxlen=seq_len, padding='post')
    del train_sequences

    vocab_size = len(tokenizer.word_index) + 1
    print('Vocab size: {}'.format(vocab_size))

    # 5. Load word vectors
    word_vectors = load_word_vectors()
    weights_matrix = get_embedding_weights(word_vectors, tokenizer.word_index)

    # Remove word_vectors to free up memory
    del word_vectors

    # 7. Create Embeddings layer
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dims,
                                weights=[weights_matrix],
                                input_length=seq_len,
                                trainable=False
                                )

    # Model definition
    model = Sequential()

    model.add(embedding_layer)

    model.add(Conv1D(filters,
                     kernel_size,
                     activation='relu'))
    model.add(Dropout(0.2))
    model.add(MaxPooling1D(pool_size=4))

    model.add(Conv1D(filters,
                     kernel_size,
                     activation='relu'))
    model.add(Dropout(0.2))
    model.add(MaxPooling1D(pool_size=4))

    model.add(GlobalMaxPooling1D())
    # model.add(Flatten())

    model.add(Dense(hidden_dims, activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(1, activation='sigmoid'))

    print(model.summary())

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=2)

    conv_model_location = os.path.join(sem_eval_path, 'models', 'words_conv_model.h5')
    model.save(conv_model_location)

if __name__ == "__main__":
    main()
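
For reference, a minimal sketch of how the saved model could later be loaded and applied to new texts. This is an assumption, not part of the commit: the sem_eval_path value and the example text are placeholders, while the tokenizer pickle name (num_words = 100001), seq_len, and the words_conv_model.h5 path mirror the training script above.

import os
import pickle
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

sem_eval_path = '/home/peter-brinkmann'  # placeholder SemEval directory
seq_len = 5000                           # must match the training script

# Load the tokenizer that was fit (and pickled) during training.
tokenizer_path = os.path.join(sem_eval_path, 'data', 'Tokenizers', 'buzzfeed_trained_100001_tokenizer.pickle')
with open(tokenizer_path, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Load the convolutional model saved by train_words_conv_model.py.
model = load_model(os.path.join(sem_eval_path, 'models', 'words_conv_model.h5'))

# Preprocess new texts exactly as during training, then predict.
texts = ["Example article title and content ..."]  # placeholder input
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=seq_len, padding='post')
probabilities = model.predict(X)             # sigmoid outputs in [0, 1]
labels = (probabilities > 0.5).astype(int)   # 1 = hyperpartisan, 0 = not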
