Skip to content

Commit 9ca91df

Browse files
committed
survey: add report of "largest" paths
Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about

* Number of versions.
* Total size on disk.
* Total inflated size (no delta or zlib compression).

This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'.

Since the on-disk size is likely to be fragile, stop testing the exact output of 'git survey' and check that the correct set of headers is output.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
1 parent ad71878 commit 9ca91df

File tree

2 files changed

+82
-9
lines changed

2 files changed

+82
-9
lines changed

builtin/survey.c

+71-8
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@ struct survey_report_object_size_summary {
8080

8181
typedef int (*survey_top_cmp)(void *v1, void *v2);
8282

83-
MAYBE_UNUSED
8483
static int cmp_by_nr(void *v1, void *v2)
8584
{
8685
struct survey_report_object_size_summary *s1 = v1;
@@ -93,7 +92,6 @@ static int cmp_by_nr(void *v1, void *v2)
9392
return 0;
9493
}
9594

96-
MAYBE_UNUSED
9795
static int cmp_by_disk_size(void *v1, void *v2)
9896
{
9997
struct survey_report_object_size_summary *s1 = v1;
@@ -106,7 +104,6 @@ static int cmp_by_disk_size(void *v1, void *v2)
106104
return 0;
107105
}
108106

109-
MAYBE_UNUSED
110107
static int cmp_by_inflated_size(void *v1, void *v2)
111108
{
112109
struct survey_report_object_size_summary *s1 = v1;
@@ -137,7 +134,6 @@ struct survey_report_top_table {
137134
void *data;
138135
};
139136

140-
MAYBE_UNUSED
141137
static void init_top_sizes(struct survey_report_top_table *top,
142138
size_t limit, const char *name,
143139
survey_top_cmp cmp)
@@ -163,7 +159,6 @@ static void clear_top_sizes(struct survey_report_top_table *top)
163159
free(sz_array);
164160
}
165161

166-
MAYBE_UNUSED
167162
static void maybe_insert_into_top_size(struct survey_report_top_table *top,
168163
struct survey_report_object_size_summary *summary)
169164
{
@@ -200,6 +195,10 @@ struct survey_report {
200195
struct survey_report_object_summary reachable_objects;
201196

202197
struct survey_report_object_size_summary *by_type;
198+
199+
struct survey_report_top_table *top_paths_by_count;
200+
struct survey_report_top_table *top_paths_by_disk;
201+
struct survey_report_top_table *top_paths_by_inflate;
203202
};
204203

205204
#define REPORT_TYPE_COMMIT 0
@@ -451,6 +450,13 @@ static void survey_report_object_sizes(const char *title,
451450
clear_table(&table);
452451
}
453452

/*
 * Report a single "top paths" table in the plaintext output.
 *
 * Forwards the table's name, its data array and its current entry
 * count (top->nr) to survey_report_object_sizes(). The second argument
 * is the translated string "Path"; the by-type caller passes
 * "Object Type" in that same position, so it is presumably the header
 * for the label column (confirm against survey_report_object_sizes()).
 */
453+
static void survey_report_plaintext_sorted_size(
454+
struct survey_report_top_table *top)
455+
{
456+
survey_report_object_sizes(top->name, _("Path"),
457+
top->data, top->nr);
458+
}
459+
454460
static void survey_report_plaintext(struct survey_context *ctx)
455461
{
456462
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@@ -461,6 +467,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
461467
_("Object Type"),
462468
ctx->report.by_type,
463469
REPORT_TYPE_COUNT);
470+
471+
survey_report_plaintext_sorted_size(
472+
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
473+
survey_report_plaintext_sorted_size(
474+
&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
475+
476+
survey_report_plaintext_sorted_size(
477+
&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
478+
survey_report_plaintext_sorted_size(
479+
&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
480+
481+
survey_report_plaintext_sorted_size(
482+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
483+
survey_report_plaintext_sorted_size(
484+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
464485
}
465486

466487
/*
@@ -701,7 +722,8 @@ static void increment_totals(struct survey_context *ctx,
701722

702723
static void increment_object_totals(struct survey_context *ctx,
703724
struct oid_array *oids,
704-
enum object_type type)
725+
enum object_type type,
726+
const char *path)
705727
{
706728
struct survey_report_object_size_summary *total;
707729
struct survey_report_object_size_summary summary = { 0 };
@@ -733,9 +755,30 @@ static void increment_object_totals(struct survey_context *ctx,
733755
total->disk_size += summary.disk_size;
734756
total->inflated_size += summary.inflated_size;
735757
total->num_missing += summary.num_missing;
758+
759+
if (type == OBJ_TREE || type == OBJ_BLOB) {
760+
int index = type == OBJ_TREE ?
761+
REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
762+
struct survey_report_top_table *top;
763+
764+
/*
765+
* Temporarily store (const char *) here, but it will
766+
* be duped if inserted and will not be freed.
767+
*/
768+
summary.label = (char *)path;
769+
770+
top = ctx->report.top_paths_by_count;
771+
maybe_insert_into_top_size(&top[index], &summary);
772+
773+
top = ctx->report.top_paths_by_disk;
774+
maybe_insert_into_top_size(&top[index], &summary);
775+
776+
top = ctx->report.top_paths_by_inflate;
777+
maybe_insert_into_top_size(&top[index], &summary);
778+
}
736779
}
737780

738-
static int survey_objects_path_walk_fn(const char *path UNUSED,
781+
static int survey_objects_path_walk_fn(const char *path,
739782
struct oid_array *oids,
740783
enum object_type type,
741784
void *data)
@@ -744,7 +787,7 @@ static int survey_objects_path_walk_fn(const char *path UNUSED,
744787

745788
increment_object_counts(&ctx->report.reachable_objects,
746789
type, oids->nr);
747-
increment_object_totals(ctx, oids, type);
790+
increment_object_totals(ctx, oids, type, path);
748791

749792
ctx->progress_nr += oids->nr;
750793
display_progress(ctx->progress, ctx->progress_nr);
@@ -754,11 +797,31 @@ static int survey_objects_path_walk_fn(const char *path UNUSED,
754797

/*
 * Set up the storage that the object walk fills in.
 *
 * Allocates the per-type summary array and gives each of its four
 * entries (commits, trees, blobs, tags) a translated, duplicated label.
 *
 * Then allocates three arrays of "top paths" tables, one per metric
 * (count, disk size, inflated size). Each array has REPORT_TYPE_COUNT
 * slots so it can be indexed by REPORT_TYPE_*, but only the
 * REPORT_TYPE_TREE and REPORT_TYPE_BLOB slots are initialized here,
 * each with a limit of top_limit (100) entries and the comparison
 * function matching its metric. The remaining slots are left as
 * CALLOC_ARRAY() produced them (presumably zero-filled; confirm
 * against the macro definition).
 */
755798
static void initialize_report(struct survey_context *ctx)
756799
{
800+
const int top_limit = 100;
801+
757802
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
758803
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
759804
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
760805
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
761806
ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));
807+
808+
CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
809+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
810+
top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
811+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
812+
top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
813+
814+
CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
815+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
816+
top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
817+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
818+
top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
819+
820+
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
821+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
822+
top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
823+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
824+
top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
762825
}
763826

764827
static void survey_phase_objects(struct survey_context *ctx)

t/t8100-git-survey.sh

+11-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,17 @@ test_expect_success 'git survey (default)' '
8181
Tags | 4 | $(test_oid tags_size_on_disk) | $(test_oid tags_size)
8282
EOF
8383
84-
test_cmp expect out
84+
lines=$(wc -l <expect) &&
85+
head -n $lines out >out-trimmed &&
86+
test_cmp expect out-trimmed &&
87+
88+
for type in "DIRECTORIES" "FILES"
89+
do
90+
for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
91+
do
92+
grep "TOP $type BY $metric" out || return 1
93+
done || return 1
94+
done
8595
'
8696

8797
test_done

0 commit comments

Comments
 (0)