
Commit 6bb6a3e

update

1 parent 3fed529

8 files changed: +756 -873 lines

README.md

Lines changed: 6 additions & 0 deletions
@@ -20,6 +20,11 @@ Multi30k data from [here](https://github.com/multi30k/dataset) and [here](https:
 flickr30k entities data from [here](https://github.com/BryanPlummer/flickr30k_entities)
 We get multi30k text data from [Revisit-MMT](https://github.com/LividWo/Revisit-MMT)
 ```bash
+cd fairseq_mmt
+git clone https://github.com/BryanPlummer/flickr30k_entities.git
+cd flickr30k_entities
+unzip annotations.zip
+
 # create a directory
 flickr30k
 ├─ flickr30k-images

@@ -52,6 +57,7 @@ pip3 install stanfordcorenlp
 wget https://nlp.stanford.edu/software/stanford-corenlp-latest.zip
 unzip stanford-corenlp-latest.zip
 cd fairseq_mmt
+cat data/multi30k/train.en data/multi30k/valid.en data/multi30k/test.2016.en > train_val_test2016.en
 python3 record_masking_position.py

 cd data/masking
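
The added `cat` line simply concatenates the three Multi30k English splits into one file for `record_masking_position.py`. A minimal Python sketch of the same step, assuming the `data/multi30k` paths from this README and that it runs from the fairseq_mmt root:

```python
# Minimal sketch, equivalent to the `cat ... > train_val_test2016.en` step above.
# Assumes the data/multi30k paths from this README, run from the fairseq_mmt root.
splits = ['data/multi30k/train.en',
          'data/multi30k/valid.en',
          'data/multi30k/test.2016.en']
with open('train_val_test2016.en', 'w', encoding='utf-8') as out:
    for path in splits:
        with open(path, encoding='utf-8') as f:
            out.writelines(f)
```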

data/get_origin_en.py

Lines changed: 0 additions & 8 deletions
This file was deleted.

data/masking/create_masking1234_multi30k.py

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,8 @@
 import os
 import shutil

+src_tgt = 'en-de'
+
 data_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))

 train_lines = 29000
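
The new `src_tgt` constant names the language pair once instead of hard-coding `en-de` into every file name. An illustration only (the `multi30k_dir` path is a placeholder) of the path patterns this commit derives from the pair name, matching the file names in match_origin2bpe_position.py below:

```python
import os

# Illustration only: how the src_tgt pair name composes the file names
# used elsewhere in this commit; multi30k_dir is a placeholder path.
src_tgt = 'en-de'
multi30k_dir = '/path/to/multi30k'
bpe_file = os.path.join(multi30k_dir, 'multi30k-' + src_tgt + '.bpe.en')
match_file = os.path.join('data', src_tgt, 'origin2bpe.' + src_tgt + '.match')
print(bpe_file)    # /path/to/multi30k/multi30k-en-de.bpe.en
print(match_file)  # data/en-de/origin2bpe.en-de.match
```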

data/masking/match_origin2bpe_position.py

Lines changed: 7 additions & 2 deletions
@@ -1,7 +1,12 @@
 import os
+
+src_tgt = 'en-de'
+
 now_path = os.getcwd()
 if not os.path.exists(os.path.join(now_path, 'data')):
     os.mkdir(os.path.join(now_path, 'data'))
+if not os.path.exists(os.path.join(now_path, 'data', src_tgt)):
+    os.mkdir(os.path.join(now_path, 'data', src_tgt))

 data_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
 multi30k_dir = os.path.join(data_path, 'multi30k')

@@ -10,7 +15,7 @@
 _list = []

 _f = open(os.path.join(multi30k_dir, 'multi30k.en'), 'r', encoding='utf-8')
-with open(os.path.join(multi30k_dir, 'multi30k-en-de.bpe.en'), 'r', encoding='utf-8') as f:
+with open(os.path.join(multi30k_dir, 'multi30k-'+src_tgt+'.bpe.en'), 'r', encoding='utf-8') as f:
     for sentence_bpe, sentence in zip(f, _f):
         count += 1
         bpe = sentence_bpe.strip().split()

@@ -37,7 +42,7 @@
     _list.append(dic)

-with open(os.path.join(now_path, 'data', 'origin2bpe.en-de.match'), 'w', encoding='utf-8') as f:
+with open(os.path.join(now_path, 'data', src_tgt, 'origin2bpe.'+src_tgt+'.match'), 'w', encoding='utf-8') as f:
     for i in _list:
         if isinstance(i, int):
             f.write(str(-1)+'\n')
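
A side note on the directory setup in the first hunk: the paired existence check and `os.mkdir` call can be collapsed into one standard-library call. A sketch:

```python
import os

# Equivalent to the paired exists/mkdir checks above: creates data/<src_tgt>
# (and any missing parents) and is a no-op if the directory already exists.
src_tgt = 'en-de'
os.makedirs(os.path.join(os.getcwd(), 'data', src_tgt), exist_ok=True)
```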

get_noun_from_f30k_entities.py

Lines changed: 72 additions & 33 deletions
@@ -3,47 +3,86 @@
 from stanfordcorenlp import StanfordCoreNLP
 nlp = StanfordCoreNLP(r'../stanford-corenlp-4.3.2')#, lang='de')

-with open('train_val_test2016.en','r') as f:
-    sentences = f.readlines()
-    sentence_list = []
-    for i in sentences:
-        sentence_list.append(i.strip())
+def get_sentence_list():
+    sentence_list = []
+    with open('train_val_test2016.en','r') as f:
+        for l in f:
+            sentence_list.append(l.strip())
+    return sentence_list

-with open('train_val_test2016.txt','r') as f:
-    image_names = f.readlines()
-    name_list = []
-    for i in image_names:
-        name_list.append(i.strip())
+def filter_EscapeString(l):
+    l = l.replace('&apos;', '\'')
+    l = l.replace('&amp;', '&')
+    l = l.replace('& amp ;', '&')
+    l = l.replace('&quot;', '"')
+    return l

-noun = defaultdict(int)
-nouns = defaultdict(int)
+def get_name_list():
+    name_list = []
+    with open('train_val_test2016.txt','r') as f:
+        for i in f:
+            name_list.append(i.split('.')[0])
+    return name_list
+
+def fix_post_tag(phrase_pos_tag, phrase):
+    tmp = []
+    tmp_idx = 0
+    words = phrase.split()
+    for idx, i in enumerate(words):
+        if i == phrase_pos_tag[tmp_idx][0]:
+            tmp.append(phrase_pos_tag[tmp_idx])
+            tmp_idx += 1
+        else:
+            str1 = phrase_pos_tag[tmp_idx][0]
+            tmp_idx += 1
+            while str1 != i:
+                str1 += phrase_pos_tag[tmp_idx][0]
+                tmp_idx += 1
+            tmp.append((i, 'UNK'))
+    return tmp
+
+def write_dict(filename, dic):
+    out = open(filename, 'w', encoding='utf-8')
+    t = sorted(dic.items(), key=lambda item:item[1])
+    for i in t:
+        out.write(i[0] + ' ' + str(i[1]) + '\n')
+    out.close()

 if __name__ == "__main__":
+    noun = defaultdict(int)
+    nouns = defaultdict(int)
+    #people = defaultdict(int)
+    name_list = get_name_list()
+    sentence_list = get_sentence_list()
+
     for index in range(len(name_list)):
         image = name_list[index]
-        sentence = sentence_list[index]
-        x = get_sentence_data('../flickr30k_entities/Sentences/'+image.split('.')[0]+'.txt')
-        flag = True
-        for j in x: # all matched
-            if j['sentence'].replace(' ','').replace('”','"').replace('`', '\'').replace('"', '') == sentence.replace('"', '').replace(' ', ''):
+        origin_sentence = sentence_list[index]
+        sentence = filter_EscapeString(origin_sentence)
+
+        # list of annotated captions for this image
+        x = get_sentence_data('flickr30k_entities/Sentences/'+image.split('.')[0]+'.txt')
+
+        for j in x:
+            entity_sentence = j['sentence'].replace(' ','').replace('”','"').replace('`','\'').replace('"','').lower()
+            if entity_sentence == sentence.replace('"','').replace(' ',''):
                 for t in j['phrases']:
-                    phrase = t['phrase']#.lower()
+                    phrase = t['phrase'].lower()
+                    # if 'people' in t['phrase_type']:
                     try:
-                        phrase_pos = nlp.pos_tag(phrase)
+                        phrase_pos_tag = nlp.pos_tag(phrase)
+                        if len(phrase_pos_tag) > len(phrase.split()):
+                            phrase_pos_tag = fix_post_tag(phrase_pos_tag, phrase)
+                        assert len(phrase_pos_tag) == len(phrase.split())
                     except:
                         print(phrase)
-                    for pos in phrase_pos:
-                        if pos[1] == 'NN':
-                            noun[pos[0]] += 1
-                        elif pos[1] == 'NNS':
-                            nouns[pos[0]] += 1
-                flag = False
+
+                    #for pos_tag in phrase_pos_tag:
+                    #    if pos_tag[1] == 'NN':
+                    #        noun[pos_tag[0]] += 1
+                    #    elif pos_tag[1] == 'NNS':
+                    #        nouns[pos_tag[0]] += 1
                 break
-        if flag:
-            print(sentence)
-            for j in x:
-                print(j['sentence'].lower())
-            print()
-
-print(len(noun))
-print(len(nouns))
+
+    write_dict('data/masking/noun.en', noun)
+    write_dict('data/masking/nouns.en', nouns)
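
The new `fix_post_tag` helper exists because CoreNLP's tokenizer may split one whitespace-delimited word into several tagged pieces, so `nlp.pos_tag` can return more pairs than `phrase.split()` has words. A self-contained worked example of the realignment (the phrase and tags are hypothetical), with the function restated from the diff above:

```python
def fix_post_tag(phrase_pos_tag, phrase):
    # Re-align CoreNLP (token, tag) pairs with whitespace tokenization:
    # sub-tokens are merged back together and the merged word is tagged 'UNK'.
    tmp = []
    tmp_idx = 0
    for word in phrase.split():
        if word == phrase_pos_tag[tmp_idx][0]:
            tmp.append(phrase_pos_tag[tmp_idx])
            tmp_idx += 1
        else:
            merged = phrase_pos_tag[tmp_idx][0]
            tmp_idx += 1
            while merged != word:
                merged += phrase_pos_tag[tmp_idx][0]
                tmp_idx += 1
            tmp.append((word, 'UNK'))
    return tmp

# Hypothetical CoreNLP output: "man's" was split into "man" + "'s".
phrase = "a man's bike"
tags = [('a', 'DT'), ('man', 'NN'), ("'s", 'POS'), ('bike', 'NN')]
print(fix_post_tag(tags, phrase))
# [('a', 'DT'), ("man's", 'UNK'), ('bike', 'NN')]
assert len(fix_post_tag(tags, phrase)) == len(phrase.split())
```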

record_masking_position.py

Lines changed: 0 additions & 161 deletions
This file was deleted.
