 from stanfordcorenlp import StanfordCoreNLP
 
 nlp = StanfordCoreNLP(r'../stanford-corenlp-4.3.2')#, lang='de')
 
-with open('train_val_test2016.en','r') as f:
-    sentences = f.readlines()
-sentence_list = []
-for i in sentences:
-    sentence_list.append(i.strip())
+def get_sentence_list():
+    sentence_list = []
+    with open('train_val_test2016.en','r') as f:
+        for l in f:
+            sentence_list.append(l.strip())
+    return sentence_list
 
-with open('train_val_test2016.txt','r') as f:
-    image_names = f.readlines()
-name_list = []
-for i in image_names:
-    name_list.append(i.strip())
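+# the .en sentences keep HTML escape tokens (&apos;, &amp;, &quot;); map them back to plain characters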
+def filter_EscapeString(l):
+    l = l.replace('&apos;', '\'')
+    l = l.replace("&amp;", "&")
+    l = l.replace("& amp ;", '&')
+    l = l.replace("&quot;", '"')
+    return l
 
-noun = defaultdict(int)
-nouns = defaultdict(int)
+def get_name_list():
+    name_list = []
+    with open('train_val_test2016.txt','r') as f:
+        for i in f:
+            name_list.append(i.split('.')[0])
+    return name_list
+
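+# CoreNLP's tokenizer can split one whitespace-delimited word into several
+# tokens (e.g. "man's" -> "man", "'s"), so pos_tag may return more pairs than
+# phrase.split() has words; merge such sub-tokens back into one word, tagged 'UNK'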
+def fix_post_tag(phrase_pos_tag, phrase):
+    tmp = []
+    tmp_idx = 0
+    words = phrase.split()
+    for idx, i in enumerate(words):
+        if i == phrase_pos_tag[tmp_idx][0]:
+            tmp.append(phrase_pos_tag[tmp_idx])
+            tmp_idx += 1
+        else:
+            str1 = phrase_pos_tag[tmp_idx][0]
+            tmp_idx += 1
+            while str1 != i:
+                str1 += phrase_pos_tag[tmp_idx][0]
+                tmp_idx += 1
+            tmp.append((i, 'UNK'))
+    return tmp
+
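+# dump a frequency dict as "<word> <count>" per line, least frequent first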
+def write_dict(filename, dic):
+    out = open(filename, 'w', encoding='utf-8')
+    t = sorted(dic.items(), key=lambda item: item[1])
+    for i in t:
+        out.write(i[0] + ' ' + str(i[1]) + '\n')
+    out.close()
 
 if __name__ == "__main__":
+    noun = defaultdict(int)
+    nouns = defaultdict(int)
+    #people = defaultdict(int)
+    name_list = get_name_list()
+    sentence_list = get_sentence_list()
+
     for index in range(len(name_list)):
         image = name_list[index]
-        sentence = sentence_list[index]
-        x = get_sentence_data('../flickr30k_entities/Sentences/'+image.split('.')[0]+'.txt')
-        flag = True
-        for j in x: # all matched
-            if j['sentence'].replace(' ','').replace('”','"').replace('`', '\'').replace('"', '') == sentence.replace('"', '').replace(' ', ''):
+        origin_sentence = sentence_list[index]
+        sentence = filter_EscapeString(origin_sentence)
+
+        # a list of sentence annotations for this image
+        x = get_sentence_data('flickr30k_entities/Sentences/'+image.split('.')[0]+'.txt')
+
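+        # find the annotation whose sentence matches this caption: compare with
+        # spaces/quotes stripped and lowercased so tokenization and quoting
+        # differences don't prevent a match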
+        for j in x:
+            entity_sentence = j['sentence'].replace(' ','').replace('”','"').replace('`','\'').replace('"','').lower()
+            if entity_sentence == sentence.replace('"','').replace(' ',''):
                 for t in j['phrases']:
-                    phrase = t['phrase']#.lower()
+                    phrase = t['phrase'].lower()
+                    # if 'people' in t['phrase_type']:
                     try:
-                        phrase_pos = nlp.pos_tag(phrase)
+                        phrase_pos_tag = nlp.pos_tag(phrase)
+                        if len(phrase_pos_tag) > len(phrase.split()):
+                            phrase_pos_tag = fix_post_tag(phrase_pos_tag, phrase)
+                        assert len(phrase_pos_tag) == len(phrase.split())
                     except:
                         print(phrase)
-                    for pos in phrase_pos:
-                        if pos[1] == 'NN':
-                            noun[pos[0]] += 1
-                        elif pos[1] == 'NNS':
-                            nouns[pos[0]] += 1
-                flag = False
+
+                    #for pos_tag in phrase_pos_tag:
+                    #    if pos_tag[1] == 'NN':
+                    #        noun[pos_tag[0]] += 1
+                    #    elif pos_tag[1] == 'NNS':
+                    #        nouns[pos_tag[0]] += 1
                 break
-        if flag:
-            print(sentence)
-            for j in x:
-                print(j['sentence'].lower())
-            print()
-
-    print(len(noun))
-    print(len(nouns))
+
+    write_dict('data/masking/noun.en', noun)
+    write_dict('data/masking/nouns.en', nouns)