utils.py
import os
from pypdf import PdfReader


def list_files(path, extensions=''):
    """Return a sorted list of paths to the files in the given directory,
    optionally filtered by extension(s)."""
    files_list = [
        os.path.join(path, f)
        for f in os.listdir(path)
        if os.path.isfile(os.path.join(path, f)) and f.endswith(extensions)
    ]
    return sorted(files_list)


def read_file(doc):
    """Return the text content of a .txt or .pdf file."""
    text = ''
    if doc.endswith('.txt'):
        with open(doc, 'r') as f:
            text = f.read()
    elif doc.endswith('.pdf'):
        # Concatenate the text extracted from every page of the PDF
        pdf_reader = PdfReader(doc)
        text = ''.join([page.extract_text() for page in pdf_reader.pages])
    return text


def get_chunks_basic(text, max_words=256):
    """Split text into chunks of at most max_words words."""
    # Keep non-empty lines, preserving their line endings
    lines = [l for l in text.splitlines(True) if l.strip()]
    chunks = []
    chunk = ''
    for l in lines:
        if len(chunk.split()) + len(l.split()) <= max_words:
            chunk += l  # with splitlines(False) do += '\n' + l instead
            continue
        # The line does not fit: close the current chunk and start a new one
        chunks.append(chunk)
        chunk = l
    if chunk:
        chunks.append(chunk)
    return chunks
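
# Illustrative example (hypothetical input, chosen so that a small max_words forces a split):
# >>> get_chunks_basic("one two three\nfour five\nsix seven eight\n", max_words=5)
# ['one two three\nfour five\n', 'six seven eight\n']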


def get_chunks(text, max_words=256, max_title_words=4):
    """Split text into simple context-aware chunks of at most max_words words.

    Short lines (at most max_title_words words) are treated as titles and kept
    together with the text that follows them.
    """
    # Keep non-empty lines, preserving their line endings
    lines = [l for l in text.splitlines(True) if l.strip()]
    chunks = []
    chunk = ''
    for l in lines:
        nwords = len(l.split())
        # Add the line if it fits and it is either a body line or the chunk so far
        # contains only title lines; a title line arriving after body text starts
        # a new chunk, so titles stay attached to the text that follows them.
        if len(chunk.split()) + nwords <= max_words and (
            nwords > max_title_words
            or all(len(s.split()) <= max_title_words for s in chunk.splitlines())
        ):
            chunk += l  # with splitlines(False) do += '\n' + l instead
            continue
        chunks.append(chunk)
        chunk = l
    if chunk:
        chunks.append(chunk)
    return chunks
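
# Illustrative example (hypothetical text): a one-word "title" line is grouped with
# the paragraph that follows it, even when it would still fit in the previous chunk.
# >>> get_chunks("Intro\nalpha beta gamma delta epsilon\nMethods\nzeta eta theta iota kappa\n", max_words=8)
# ['Intro\nalpha beta gamma delta epsilon\n', 'Methods\nzeta eta theta iota kappa\n']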


def get_chunks_fast(text, max_words=256, max_title_words=4):
    """Split text into simple context-aware chunks of at most max_words words.

    Same logic as get_chunks, but keeps the chunk as a list of lines and tracks
    its word count incrementally instead of re-splitting the growing string.
    """
    # Keep non-empty lines, preserving their line endings
    lines = [l for l in text.splitlines(True) if l.strip()]
    chunks = []
    chunk = []
    chunk_length = 0
    for l in lines:
        line_length = len(l.split())
        if chunk_length + line_length <= max_words and (
            line_length > max_title_words
            or all(len(s.split()) <= max_title_words for s in chunk)
        ):
            chunk.append(l)
            chunk_length += line_length
            continue
        chunks.append(''.join(chunk))  # with splitlines(False) do '\n'.join() instead
        chunk = [l]
        chunk_length = line_length
    if chunk:
        chunks.append(''.join(chunk))
    return chunks
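
# Sanity check (hypothetical input): get_chunks_fast is meant to return exactly the
# same chunks as get_chunks, just computed more efficiently.
# >>> text = "Intro\nlorem ipsum dolor sit amet\n"
# >>> get_chunks_fast(text) == get_chunks(text)
# True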


if __name__ == '__main__':
    # Quick manual check of the chunk splitting on the sample data
    docs_path = 'sample_data'
    files = list_files(docs_path, extensions=('.txt', '.pdf'))
    print("Found files:")
    print("\n".join(files))
    chosen_file = files[1]  # assumes sample_data contains at least two files
    print(f"\nReading and splitting {chosen_file}...")
    text = read_file(chosen_file)
    chunks = get_chunks(text)
    print("\nChunks:")
    for c in chunks:
        print(c)
        print("Words Length:", len(c.split()))
        print(80 * '-')