Skip to content

Commit 3ca1ce0

Browse files
committed
Introduce new trie representation
Now each link to the next node also stores the letter (an edge label); thanks to that, traversing the trie is faster, as there are fewer dereferences. My tests show an improvement of up to 2 times, so it seems worth doing.
1 parent 612e530 commit 3ca1ce0

12 files changed

+166
-125
lines changed

Automaton.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,7 @@ automaton_make_automaton(PyObject* self, PyObject* args) {
568568
TrieNode* node;
569569
TrieNode* child;
570570
TrieNode* state;
571+
TRIE_LETTER_TYPE letter;
571572

572573

573574
if (automaton->kind != TRIE)
@@ -605,7 +606,8 @@ automaton_make_automaton(PyObject* self, PyObject* args) {
605606
}
606607

607608
for (i=0; i < node->n; i++) {
608-
child = trienode_get_ith_unsafe(node, i);
609+
child = trienode_get_ith_unsafe(node, i);
610+
letter = trieletter_get_ith_unsafe(node, i);
609611
ASSERT(child);
610612

611613
item = (AutomatonQueueItem*)list_item_new(sizeof(AutomatonQueueItem));
@@ -620,13 +622,13 @@ automaton_make_automaton(PyObject* self, PyObject* args) {
620622
ASSERT(state);
621623
ASSERT(child);
622624
while (state != automaton->root and\
623-
not trienode_get_next(state, child->letter)) {
625+
not trienode_get_next(state, letter)) {
624626

625627
state = state->fail;
626628
ASSERT(state);
627629
}
628630

629-
child->fail = trienode_get_next(state, child->letter);
631+
child->fail = trienode_get_next(state, letter);
630632
if (child->fail == NULL)
631633
child->fail = automaton->root;
632634

@@ -1052,7 +1054,7 @@ dump_aux(TrieNode* node, const int depth, void* extra) {
10521054
// 2.
10531055
for (i=0; i < node->n; i++) {
10541056
child = trienode_get_ith_unsafe(node, i);
1055-
tuple = F(Py_BuildValue)("ici", node, child->letter, child);
1057+
tuple = F(Py_BuildValue)("ici", node, trieletter_get_ith_unsafe(node, i), child);
10561058
append_tuple(Dump->edges)
10571059
}
10581060

AutomatonItemsIter.c

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ typedef struct AutomatonItemsStackItem {
1616
LISTITEM_data;
1717

1818
struct TrieNode* node;
19+
TRIE_LETTER_TYPE letter;
1920
size_t depth;
2021
} AutomatonItemsStackItem;
2122

@@ -126,6 +127,7 @@ automaton_items_iter_next(PyObject* self) {
126127

127128
bool output;
128129
TrieNode* node;
130+
TRIE_LETTER_TYPE letter;
129131
size_t depth;
130132

131133
if (UNLIKELY(iter->version != iter->automaton->version)) {
@@ -143,8 +145,9 @@ automaton_items_iter_next(PyObject* self) {
143145
return NULL; /* Stop iteration */
144146
}
145147

146-
node = top->node;
147-
depth = top->depth;
148+
node = top->node;
149+
letter = top->letter;
150+
depth = top->depth;
148151
memory_free(top);
149152

150153
if (iter->matchtype != MATCH_AT_LEAST_PREFIX and depth > iter->pattern_length)
@@ -166,7 +169,8 @@ automaton_items_iter_next(PyObject* self) {
166169

167170
}
168171

169-
iter->state = node;
172+
iter->state = node;
173+
iter->letter = letter;
170174
if ((depth >= iter->pattern_length) or
171175
(iter->use_wildcard and iter->pattern[depth] == iter->wildcard)) {
172176

@@ -180,8 +184,9 @@ automaton_items_iter_next(PyObject* self) {
180184
return NULL;
181185
}
182186

183-
new_item->node = trienode_get_ith_unsafe(iter->state, i);
184-
new_item->depth = depth + 1;
187+
new_item->node = trienode_get_ith_unsafe(iter->state, i);
188+
new_item->letter = trieletter_get_ith_unsafe(iter->state, i);
189+
new_item->depth = depth + 1;
185190
list_push_front(&iter->stack, (ListItem*)new_item);
186191
}
187192
}
@@ -196,18 +201,21 @@ automaton_items_iter_next(PyObject* self) {
196201
return NULL;
197202
}
198203

199-
new_item->node = node;
200-
new_item->depth = depth + 1;
204+
new_item->node = node;
205+
new_item->letter = iter->pattern[depth];
206+
new_item->depth = depth + 1;
201207
list_push_front(&iter->stack, (ListItem*)new_item);
202208
}
203209
}
204210

205-
if (iter->type != ITER_VALUES)
211+
if (iter->type != ITER_VALUES) {
206212
// update keys when needed
207-
iter->buffer[depth] = iter->state->letter;
213+
iter->buffer[depth] = iter->letter;
208214
#ifndef AHOCORASICK_UNICODE
209-
iter->char_buffer[depth] = (char)iter->state->letter;
215+
iter->char_buffer[depth] = (char)iter->letter;
210216
#endif
217+
}
218+
211219
if (output and iter->state->eow) {
212220
PyObject* val;
213221

AutomatonItemsIter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ typedef struct AutomatonItemsIter {
3535
Automaton* automaton;
3636
int version; ///< automaton version
3737
TrieNode* state; ///< current automaton node
38+
TRIE_LETTER_TYPE letter; ///< current letter
3839
List stack; ///< stack
3940
ItemsType type; ///< type of iterator (KEYS/VALUES/ITEMS)
4041
TRIE_LETTER_TYPE* buffer; ///< buffer to construct key representation

Automaton_pickle.c

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ typedef struct DumpState {
6464
static size_t
6565
get_pickled_size(TrieNode* node) {
6666
ASSERT(node != NULL);
67-
return PICKLE_TRIENODE_SIZE + node->n * PICKLE_POINTER_SIZE;
67+
return PICKLE_TRIENODE_SIZE + node->n * sizeof(Pair);
6868
}
6969

7070
// replace fail with pairs (fail, id)
@@ -132,7 +132,7 @@ pickle_dump_save(TrieNode* node, const int depth, void* extra) {
132132

133133
TrieNode* dump;
134134
TrieNode* tmp;
135-
TrieNode** arr;
135+
Pair* arr;
136136
unsigned i;
137137
size_t size;
138138

@@ -147,7 +147,7 @@ pickle_dump_save(TrieNode* node, const int depth, void* extra) {
147147
dump = (TrieNode*)(self->data + self->top);
148148

149149
// we do not save the last pointer in array
150-
arr = (TrieNode**)(self->data + self->top + PICKLE_TRIENODE_SIZE);
150+
arr = (Pair*)(self->data + self->top + PICKLE_TRIENODE_SIZE);
151151

152152
// append the python object to the list
153153
if (node->eow and self->values) {
@@ -165,7 +165,6 @@ pickle_dump_save(TrieNode* node, const int depth, void* extra) {
165165

166166
dump->n = node->n;
167167
dump->eow = node->eow;
168-
dump->letter = node->letter;
169168

170169
tmp = NODEID(node)->fail;
171170
if (tmp)
@@ -177,12 +176,12 @@ pickle_dump_save(TrieNode* node, const int depth, void* extra) {
177176
for (i=0; i < node->n; i++) {
178177
TrieNode* child = trienode_get_ith_unsafe(node, i);
179178
ASSERT(child);
180-
arr[i] = (TrieNode*)(NODEID(child)->id); // save id of child node
179+
arr[i].child = (TrieNode*)(NODEID(child)->id); // save the id of child node
180+
arr[i].letter = trieletter_get_ith_unsafe(node, i);
181181
}
182182

183183
self->top += size;
184184
(*self->count) += 1;
185-
186185
return 1;
187186
#undef NODEID
188187
#undef self
@@ -334,7 +333,7 @@ automaton_unpickle(
334333

335334
TrieNode* node;
336335
TrieNode* dump;
337-
TrieNode** next;
336+
Pair* next;
338337
PyObject* bytes;
339338
PyObject* value;
340339
Py_ssize_t nodes_count;
@@ -383,7 +382,6 @@ automaton_unpickle(
383382
if (LIKELY(node != NULL)) {
384383
node->output = dump->output;
385384
node->fail = dump->fail;
386-
node->letter = dump->letter;
387385
node->n = dump->n;
388386
node->eow = dump->eow;
389387
node->next = NULL;
@@ -396,26 +394,26 @@ automaton_unpickle(
396394
id2node[id++] = node;
397395

398396
if (node->n > 0) {
399-
if (UNLIKELY(ptr + node->n * PICKLE_POINTER_SIZE > end)) {
397+
if (UNLIKELY(ptr + node->n * sizeof(Pair) > end)) {
400398
PyErr_Format(PyExc_ValueError,
401399
"Data truncated [parsing children of node #%d]: "
402400
"chunk #%d @ offset %lu, expected at least %ld bytes",
403-
i, k, ptr - data + i, node->n * PICKLE_POINTER_SIZE);
401+
i, k, ptr - data + i, node->n * sizeof(Pair));
404402

405403
goto exception;
406404
}
407405

408-
node->next = (TrieNode**)memory_alloc(node->n * sizeof(TrieNode*));
406+
node->next = (Pair*)memory_alloc(node->n * sizeof(Pair));
409407
if (UNLIKELY(node->next == NULL)) {
410408
goto no_mem;
411409
}
412410

413-
next = (TrieNode**)(ptr);
411+
next = (Pair*)(ptr);
414412
for (j=0; j < node->n; j++) {
415413
node->next[j] = next[j];
416414
}
417415

418-
ptr += node->n * PICKLE_POINTER_SIZE;
416+
ptr += node->n * sizeof(Pair);
419417
}
420418
}
421419
}
@@ -450,9 +448,9 @@ automaton_unpickle(
450448
}
451449

452450
for (j=0; j < node->n; j++) {
453-
index = (size_t)(node->next[j]);
451+
index = (size_t)(node->next[j].child);
454452
if (LIKELY(index < count + 1)) {
455-
node->next[j] = id2node[index];
453+
node->next[j].child = id2node[index];
456454
} else {
457455
PyErr_Format(PyExc_ValueError,
458456
"Node #%lu malformed: next link #%lu points to node #%lu, while there are %lu nodes",

src/custompickle/load/module_automaton_load.c

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,8 @@ automaton_load_node(LoadBuffer* input) {
132132

133133
// 3. load next pointers
134134
if (node->n > 0) {
135-
size = sizeof(TrieNode*) * node->n;
136-
node->next = (TrieNode**)memory_alloc(size);
135+
size = sizeof(Pair) * node->n;
136+
node->next = (Pair*)memory_alloc(size);
137137
if (UNLIKELY(node->next == NULL)) {
138138
PyErr_NoMemory();
139139
goto exception;
@@ -239,14 +239,13 @@ automaton_load_fixup_node(LoadBuffer* input, TrieNode* node) {
239239

240240
if (node->n > 0) {
241241
for (i=0; i < node->n; i++) {
242-
node->next[i] = lookup_address(input, node->next[i]);
243-
if (UNLIKELY(node->next[i] == NULL)) {
242+
node->next[i].child = lookup_address(input, node->next[i].child);
243+
if (UNLIKELY(node->next[i].child == NULL)) {
244244
return false;
245245
}
246246
}
247247
}
248248

249-
250249
return true;
251250
}
252251

src/custompickle/save/automaton_save.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,6 @@ automaton_save_node(TrieNode* node, const int depth, void* extra) {
101101

102102
dump->n = node->n;
103103
dump->eow = node->eow;
104-
dump->letter = node->letter;
105104
dump->fail = node->fail;
106105

107106
// 3. pickle python value associated with word
@@ -116,15 +115,15 @@ automaton_save_node(TrieNode* node, const int depth, void* extra) {
116115
return 0;
117116
}
118117

119-
// store the size of buffer in trie node [which is not saved yet in a file]
118+
// store the size of buffer in trie node [which is not saved yet in the file]
120119
*(size_t*)(&dump->output.integer) = PyBytes_GET_SIZE(bytes);
121120
} else {
122121
bytes = NULL;
123122
}
124123

125124
// 4. save array of pointers
126125
if (node->n > 0) {
127-
savebuffer_store(output, (const char*)node->next, node->n * sizeof(PICKLE_POINTER_SIZE));
126+
savebuffer_store(output, (const char*)node->next, node->n * sizeof(Pair));
128127
}
129128

130129
// 5. save pickled data, if any

src/custompickle/save/savebuffer.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ savebuffer_init(SaveBuffer* output, PyObject* serializer, KeysStore store, const
1212
output->nodes_count = 0;
1313

1414
if (PICKLE_SIZE_T_SIZE < sizeof(PyObject*)) {
15-
// XXX: this must be reworked, likely to module level
15+
// XXX: this must be reworked, likely moved to module level
1616
PyErr_SetString(PyExc_SystemError, "unable to save data due to technical reasons");
1717
return false;
1818
}

src/pickle/pickle.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
// We save all TrieNode's fields except the last one, which is a pointer to array,
66
// as we store that array just after the node
7-
#define PICKLE_TRIENODE_SIZE (sizeof(TrieNode) - sizeof(TrieNode**))
8-
#define PICKLE_POINTER_SIZE (sizeof(TrieNode*))
7+
#define PICKLE_TRIENODE_SIZE (sizeof(TrieNode) - sizeof(Pair*))
98
#define PICKLE_SIZE_T_SIZE (sizeof(size_t))
109
#define PICKLE_CHUNK_COUNTER_SIZE (sizeof(Py_ssize_t))

trie.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ trie_add_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t w
2020

2121
if (automaton->kind == EMPTY) {
2222
ASSERT(automaton->root == NULL);
23-
automaton->root = trienode_new('\0', false);
23+
automaton->root = trienode_new(false);
2424
if (automaton->root == NULL)
2525
return NULL;
2626
}
@@ -32,7 +32,7 @@ trie_add_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t w
3232

3333
child = trienode_get_next(node, letter);
3434
if (child == NULL) {
35-
child = trienode_new(letter, false);
35+
child = trienode_new(false);
3636
if (LIKELY(child != NULL)) {
3737
if (UNLIKELY(trienode_set_next(node, letter, child) == NULL)) {
3838
memory_free(child);
@@ -201,7 +201,6 @@ trie_traverse_aux(
201201
void *extra
202202
) {
203203
unsigned i;
204-
205204
if (callback(node, depth, extra) == 0)
206205
return 0;
207206

0 commit comments

Comments
 (0)