Skip to content

Commit 8ca1417

Browse files
authored
Merge pull request #125 from alexhorn/master
Improve memory usage of Python150kExtractor
2 parents af04b4c + b629b7b commit 8ca1417

File tree

1 file changed, with 5 additions and 9 deletions.

Python150kExtractor/extract.py

+5 -9
Original file line number | Diff line number | Diff line change
@@ -25,13 +25,9 @@
2525

2626

2727
def __collect_asts(json_file):
28-
asts = []
2928
with open(json_file, 'r', encoding='utf-8') as f:
30-
for line in f:
31-
ast = json.loads(line.strip())
32-
asts.append(ast)
33-
34-
return asts
29+
for line in tqdm.tqdm(f):
30+
yield line
3531

3632

3733
def __terminals(ast, node_index, args):
@@ -170,8 +166,8 @@ def main():
170166
np.random.seed(args.seed)
171167

172168
data_dir = Path(args.data_dir)
173-
trains = __collect_asts(data_dir / 'python100k_train.json')
174-
evals = __collect_asts(data_dir / 'python50k_eval.json')
169+
trains = list(__collect_asts(data_dir / 'python100k_train.json'))
170+
evals = list(__collect_asts(data_dir / 'python50k_eval.json'))
175171

176172
train, valid = sklearn_model_selection.train_test_split(
177173
trains,
@@ -186,7 +182,7 @@ def main():
186182
(train, valid, test),
187183
):
188184
output_file = output_dir / f'{split_name}_output_file.txt'
189-
__collect_all_and_save(split, args, output_file)
185+
__collect_all_and_save((json.loads(line) for line in split), args, output_file)
190186

191187

192188
if __name__ == '__main__':

0 commit comments

Comments
 (0)