Skip to content

Commit ecf7fc6

Browse files
committed
Merge branch 'tb/path-filter-fix'
The Bloom filter used for path limited history traversal was broken on systems whose "char" is unsigned; update the implementation and bump the format version to 2. * tb/path-filter-fix: bloom: introduce `deinit_bloom_filters()` commit-graph: reuse existing Bloom filters where possible object.h: fix mis-aligned flag bits table commit-graph: new Bloom filter version that fixes murmur3 commit-graph: unconditionally load Bloom filters bloom: prepare to discard incompatible Bloom filters bloom: annotate filters with hash version repo-settings: introduce commitgraph.changedPathsVersion t4216: test changed path filters with high bit paths t/helper/test-read-graph: implement `bloom-filters` mode bloom.h: make `load_bloom_filter_from_graph()` public t/helper/test-read-graph.c: extract `dump_graph_info()` gitformat-commit-graph: describe version 2 of BDAT commit-graph: ensure Bloom filters are read with consistent settings revision.c: consult Bloom filters for root commits t/t4216-log-bloom.sh: harden `test_bloom_filters_not_used()`
2 parents 6f75d23 + 9c8a9ec commit ecf7fc6

14 files changed

+736
-58
lines changed

Documentation/config/commitgraph.txt

+26-3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,29 @@ commitGraph.maxNewFilters::
99
commit-graph write` (c.f., linkgit:git-commit-graph[1]).
1010

1111
commitGraph.readChangedPaths::
12-
If true, then git will use the changed-path Bloom filters in the
13-
commit-graph file (if it exists, and they are present). Defaults to
14-
true. See linkgit:git-commit-graph[1] for more information.
12+
Deprecated. Equivalent to commitGraph.changedPathsVersion=-1 if true, and
13+
commitGraph.changedPathsVersion=0 if false. (If commitGraph.changedPathVersion
14+
is also set, commitGraph.changedPathsVersion takes precedence.)
15+
16+
commitGraph.changedPathsVersion::
17+
Specifies the version of the changed-path Bloom filters that Git will read and
18+
write. May be -1, 0, 1, or 2. Note that values greater than 1 may be
19+
incompatible with older versions of Git which do not yet understand
20+
those versions. Use caution when operating in a mixed-version
21+
environment.
22+
+
23+
Defaults to -1.
24+
+
25+
If -1, Git will use the version of the changed-path Bloom filters in the
26+
repository, defaulting to 1 if there are none.
27+
+
28+
If 0, Git will not read any Bloom filters, and will write version 1 Bloom
29+
filters when instructed to write.
30+
+
31+
If 1, Git will only read version 1 Bloom filters, and will write version 1
32+
Bloom filters.
33+
+
34+
If 2, Git will only read version 2 Bloom filters, and will write version 2
35+
Bloom filters.
36+
+
37+
See linkgit:git-commit-graph[1] for more information.

Documentation/gitformat-commit-graph.txt

+6-3
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,16 @@ All multi-byte numbers are in network byte order.
142142

143143
==== Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
144144
* It starts with header consisting of three unsigned 32-bit integers:
145-
- Version of the hash algorithm being used. We currently only support
146-
value 1 which corresponds to the 32-bit version of the murmur3 hash
145+
- Version of the hash algorithm being used. We currently support
146+
value 2 which corresponds to the 32-bit version of the murmur3 hash
147147
implemented exactly as described in
148148
https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double
149149
hashing technique using seed values 0x293ae76f and 0x7e646e2 as
150150
described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters
151-
in Probabilistic Verification"
151+
in Probabilistic Verification". Version 1 Bloom filters have a bug that appears
152+
when char is signed and the repository has path names that have characters >=
153+
0x80; Git supports reading and writing them, but this ability will be removed
154+
in a future version of Git.
152155
- The number of times a path is hashed and hence the number of bit positions
153156
that cumulatively determine whether a file is present in the commit.
154157
- The minimum number of bits 'b' per entry in the Bloom filter. If the filter

bloom.c

+196-12
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
#include "commit-graph.h"
77
#include "commit.h"
88
#include "commit-slab.h"
9+
#include "tree.h"
10+
#include "tree-walk.h"
11+
#include "config.h"
912
#include "repository.h"
1013

1114
define_commit_slab(bloom_filter_slab, struct bloom_filter);
@@ -49,9 +52,9 @@ static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
4952
return -1;
5053
}
5154

52-
static int load_bloom_filter_from_graph(struct commit_graph *g,
53-
struct bloom_filter *filter,
54-
uint32_t graph_pos)
55+
int load_bloom_filter_from_graph(struct commit_graph *g,
56+
struct bloom_filter *filter,
57+
uint32_t graph_pos)
5558
{
5659
uint32_t lex_pos, start_index, end_index;
5760

@@ -89,6 +92,8 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
8992
filter->data = (unsigned char *)(g->chunk_bloom_data +
9093
sizeof(unsigned char) * start_index +
9194
BLOOMDATA_CHUNK_HEADER_SIZE);
95+
filter->version = g->bloom_filter_settings->hash_version;
96+
filter->to_free = NULL;
9297

9398
return 1;
9499
}
@@ -100,7 +105,64 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
100105
* Not considered to be cryptographically secure.
101106
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
102107
*/
103-
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
108+
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
109+
{
110+
const uint32_t c1 = 0xcc9e2d51;
111+
const uint32_t c2 = 0x1b873593;
112+
const uint32_t r1 = 15;
113+
const uint32_t r2 = 13;
114+
const uint32_t m = 5;
115+
const uint32_t n = 0xe6546b64;
116+
int i;
117+
uint32_t k1 = 0;
118+
const char *tail;
119+
120+
int len4 = len / sizeof(uint32_t);
121+
122+
uint32_t k;
123+
for (i = 0; i < len4; i++) {
124+
uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
125+
uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
126+
uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
127+
uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
128+
k = byte1 | byte2 | byte3 | byte4;
129+
k *= c1;
130+
k = rotate_left(k, r1);
131+
k *= c2;
132+
133+
seed ^= k;
134+
seed = rotate_left(seed, r2) * m + n;
135+
}
136+
137+
tail = (data + len4 * sizeof(uint32_t));
138+
139+
switch (len & (sizeof(uint32_t) - 1)) {
140+
case 3:
141+
k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
142+
/*-fallthrough*/
143+
case 2:
144+
k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
145+
/*-fallthrough*/
146+
case 1:
147+
k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
148+
k1 *= c1;
149+
k1 = rotate_left(k1, r1);
150+
k1 *= c2;
151+
seed ^= k1;
152+
break;
153+
}
154+
155+
seed ^= (uint32_t)len;
156+
seed ^= (seed >> 16);
157+
seed *= 0x85ebca6b;
158+
seed ^= (seed >> 13);
159+
seed *= 0xc2b2ae35;
160+
seed ^= (seed >> 16);
161+
162+
return seed;
163+
}
164+
165+
static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
104166
{
105167
const uint32_t c1 = 0xcc9e2d51;
106168
const uint32_t c2 = 0x1b873593;
@@ -165,8 +227,14 @@ void fill_bloom_key(const char *data,
165227
int i;
166228
const uint32_t seed0 = 0x293ae76f;
167229
const uint32_t seed1 = 0x7e646e2c;
168-
const uint32_t hash0 = murmur3_seeded(seed0, data, len);
169-
const uint32_t hash1 = murmur3_seeded(seed1, data, len);
230+
uint32_t hash0, hash1;
231+
if (settings->hash_version == 2) {
232+
hash0 = murmur3_seeded_v2(seed0, data, len);
233+
hash1 = murmur3_seeded_v2(seed1, data, len);
234+
} else {
235+
hash0 = murmur3_seeded_v1(seed0, data, len);
236+
hash1 = murmur3_seeded_v1(seed1, data, len);
237+
}
170238

171239
key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
172240
for (i = 0; i < settings->num_hashes; i++)
@@ -198,6 +266,18 @@ void init_bloom_filters(void)
198266
init_bloom_filter_slab(&bloom_filters);
199267
}
200268

269+
static void free_one_bloom_filter(struct bloom_filter *filter)
270+
{
271+
if (!filter)
272+
return;
273+
free(filter->to_free);
274+
}
275+
276+
void deinit_bloom_filters(void)
277+
{
278+
deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
279+
}
280+
201281
static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
202282
const struct hashmap_entry *eptr,
203283
const struct hashmap_entry *entry_or_key,
@@ -211,11 +291,97 @@ static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
211291
return strcmp(e1->path, e2->path);
212292
}
213293

214-
static void init_truncated_large_filter(struct bloom_filter *filter)
294+
static void init_truncated_large_filter(struct bloom_filter *filter,
295+
int version)
215296
{
216-
filter->data = xmalloc(1);
297+
filter->data = filter->to_free = xmalloc(1);
217298
filter->data[0] = 0xFF;
218299
filter->len = 1;
300+
filter->version = version;
301+
}
302+
303+
#define VISITED (1u<<21)
304+
#define HIGH_BITS (1u<<22)
305+
306+
static int has_entries_with_high_bit(struct repository *r, struct tree *t)
307+
{
308+
if (parse_tree(t))
309+
return 1;
310+
311+
if (!(t->object.flags & VISITED)) {
312+
struct tree_desc desc;
313+
struct name_entry entry;
314+
315+
init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
316+
while (tree_entry(&desc, &entry)) {
317+
size_t i;
318+
for (i = 0; i < entry.pathlen; i++) {
319+
if (entry.path[i] & 0x80) {
320+
t->object.flags |= HIGH_BITS;
321+
goto done;
322+
}
323+
}
324+
325+
if (S_ISDIR(entry.mode)) {
326+
struct tree *sub = lookup_tree(r, &entry.oid);
327+
if (sub && has_entries_with_high_bit(r, sub)) {
328+
t->object.flags |= HIGH_BITS;
329+
goto done;
330+
}
331+
}
332+
333+
}
334+
335+
done:
336+
t->object.flags |= VISITED;
337+
}
338+
339+
return !!(t->object.flags & HIGH_BITS);
340+
}
341+
342+
static int commit_tree_has_high_bit_paths(struct repository *r,
343+
struct commit *c)
344+
{
345+
struct tree *t;
346+
if (repo_parse_commit(r, c))
347+
return 1;
348+
t = repo_get_commit_tree(r, c);
349+
if (!t)
350+
return 1;
351+
return has_entries_with_high_bit(r, t);
352+
}
353+
354+
static struct bloom_filter *upgrade_filter(struct repository *r, struct commit *c,
355+
struct bloom_filter *filter,
356+
int hash_version)
357+
{
358+
struct commit_list *p = c->parents;
359+
if (commit_tree_has_high_bit_paths(r, c))
360+
return NULL;
361+
362+
if (p && commit_tree_has_high_bit_paths(r, p->item))
363+
return NULL;
364+
365+
filter->version = hash_version;
366+
367+
return filter;
368+
}
369+
370+
struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c)
371+
{
372+
struct bloom_filter *filter;
373+
int hash_version;
374+
375+
filter = get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
376+
if (!filter)
377+
return NULL;
378+
379+
prepare_repo_settings(r);
380+
hash_version = r->settings.commit_graph_changed_paths_version;
381+
382+
if (!(hash_version == -1 || hash_version == filter->version))
383+
return NULL; /* unusable filter */
384+
return filter;
219385
}
220386

221387
struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
@@ -243,8 +409,23 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
243409
filter, graph_pos);
244410
}
245411

246-
if (filter->data && filter->len)
247-
return filter;
412+
if (filter->data && filter->len) {
413+
struct bloom_filter *upgrade;
414+
if (!settings || settings->hash_version == filter->version)
415+
return filter;
416+
417+
/* version mismatch, see if we can upgrade */
418+
if (compute_if_not_present &&
419+
git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
420+
upgrade = upgrade_filter(r, c, filter,
421+
settings->hash_version);
422+
if (upgrade) {
423+
if (computed)
424+
*computed |= BLOOM_UPGRADED;
425+
return upgrade;
426+
}
427+
}
428+
}
248429
if (!compute_if_not_present)
249430
return NULL;
250431

@@ -300,19 +481,22 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
300481
}
301482

302483
if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
303-
init_truncated_large_filter(filter);
484+
init_truncated_large_filter(filter,
485+
settings->hash_version);
304486
if (computed)
305487
*computed |= BLOOM_TRUNC_LARGE;
306488
goto cleanup;
307489
}
308490

309491
filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
492+
filter->version = settings->hash_version;
310493
if (!filter->len) {
311494
if (computed)
312495
*computed |= BLOOM_TRUNC_EMPTY;
313496
filter->len = 1;
314497
}
315498
CALLOC_ARRAY(filter->data, filter->len);
499+
filter->to_free = filter->data;
316500

317501
hashmap_for_each_entry(&pathmap, &iter, e, entry) {
318502
struct bloom_key key;
@@ -326,7 +510,7 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
326510
} else {
327511
for (i = 0; i < diff_queued_diff.nr; i++)
328512
diff_free_filepair(diff_queued_diff.queue[i]);
329-
init_truncated_large_filter(filter);
513+
init_truncated_large_filter(filter, settings->hash_version);
330514

331515
if (computed)
332516
*computed |= BLOOM_TRUNC_LARGE;

0 commit comments

Comments
 (0)