Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Documentation/git-backfill.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,12 @@ OPTIONS
current sparse-checkout. If the sparse-checkout feature is enabled,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Junio C Hamano wrote on the Git mailing list (how to reply to this email):

"Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:

> -	repo_init_revisions(ctx->repo, &revs, "");
> -	handle_revision_arg("HEAD", &revs, 0, 0);

So we used to "cheat" and did an initialization without even knowing
in which directory we were started ...

> +	/* Walk from HEAD if otherwise unspecified. */
> +	if (!ctx->revs.pending.nr)
> +		handle_revision_arg("HEAD", &ctx->revs, 0, 0);

... but by initializing the revs correctly in the caller, we would
be correcting it.  Looking good.

> @@ -134,7 +135,12 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
>  					 builtin_backfill_usage, options);
>  
>  	argc = parse_options(argc, argv, prefix, options, builtin_backfill_usage,
> -			     0);
> +			     PARSE_OPT_KEEP_UNKNOWN_OPT |
> +			     PARSE_OPT_KEEP_ARGV0 |
> +			     PARSE_OPT_KEEP_DASHDASH);
> +
> +	repo_init_revisions(repo, &ctx.revs, prefix);
> +	argc = setup_revisions(argc, argv, &ctx.revs, NULL);

OK.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Kristoffer Haugsbakk" wrote on the Git mailing list (how to reply to this email):

On Tue, Mar 17, 2026, at 01:29, Derrick Stolee via GitGitGadget wrote:
>[snip]
> diff --git a/Documentation/git-backfill.adoc b/Documentation/git-backfill.adoc
> index b8394dcf22..fdfe22d623 100644
> --- a/Documentation/git-backfill.adoc
> +++ b/Documentation/git-backfill.adoc
> @@ -63,9 +63,12 @@ OPTIONS
>  	current sparse-checkout. If the sparse-checkout feature is enabled,
>  	then `--sparse` is assumed and can be disabled with `--no-sparse`.
>
> +You may also specify the commit limiting options from linkgit:git-rev-list[1].
> +
>  SEE ALSO
>  --------
>  linkgit:git-clone[1].
> +linkgit:git-rev-list[1].

Should there be a comma between these two?

>[snip]

then `--sparse` is assumed and can be disabled with `--no-sparse`.

You may also specify the commit limiting options from linkgit:git-rev-list[1].

SEE ALSO
--------
linkgit:git-clone[1].
linkgit:git-clone[1],
linkgit:git-rev-list[1]

GIT
---
Expand Down
19 changes: 13 additions & 6 deletions builtin/backfill.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ struct backfill_context {
struct oid_array current_batch;
size_t min_batch_size;
int sparse;
struct rev_info revs;
};

static void backfill_context_clear(struct backfill_context *ctx)
Expand Down Expand Up @@ -80,7 +81,6 @@ static int fill_missing_blobs(const char *path UNUSED,

static int do_backfill(struct backfill_context *ctx)
{
struct rev_info revs;
struct path_walk_info info = PATH_WALK_INFO_INIT;
int ret;

Expand All @@ -92,13 +92,14 @@ static int do_backfill(struct backfill_context *ctx)
}
}

repo_init_revisions(ctx->repo, &revs, "");
handle_revision_arg("HEAD", &revs, 0, 0);
/* Walk from HEAD if otherwise unspecified. */
if (!ctx->revs.pending.nr)
add_head_to_pending(&ctx->revs);

info.blobs = 1;
info.tags = info.commits = info.trees = 0;

info.revs = &revs;
info.revs = &ctx->revs;
info.path_fn = fill_missing_blobs;
info.path_fn_data = ctx;

Expand All @@ -109,7 +110,6 @@ static int do_backfill(struct backfill_context *ctx)
download_batch(ctx);

path_walk_info_clear(&info);
release_revisions(&revs);
return ret;
}

Expand All @@ -121,6 +121,7 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
.current_batch = OID_ARRAY_INIT,
.min_batch_size = 50000,
.sparse = 0,
.revs = REV_INFO_INIT,
};
struct option options[] = {
OPT_UNSIGNED(0, "min-batch-size", &ctx.min_batch_size,
Expand All @@ -134,7 +135,12 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
builtin_backfill_usage, options);

argc = parse_options(argc, argv, prefix, options, builtin_backfill_usage,
0);
PARSE_OPT_KEEP_UNKNOWN_OPT |
PARSE_OPT_KEEP_ARGV0 |
PARSE_OPT_KEEP_DASHDASH);

repo_init_revisions(repo, &ctx.revs, prefix);
argc = setup_revisions(argc, argv, &ctx.revs, NULL);

repo_config(repo, git_default_config, NULL);

Expand All @@ -143,5 +149,6 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit

result = do_backfill(&ctx);
backfill_context_clear(&ctx);
release_revisions(&ctx.revs);
return result;
}
44 changes: 44 additions & 0 deletions path-walk.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "list-objects.h"
#include "object.h"
#include "oid-array.h"
#include "path.h"
#include "prio-queue.h"
#include "repository.h"
#include "revision.h"
Expand Down Expand Up @@ -62,6 +63,8 @@ struct path_walk_context {
*/
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Junio C Hamano wrote on the Git mailing list (how to reply to this email):

"Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Derrick Stolee <stolee@gmail.com>
>
> Previously, walk_objects_by_path() silently ignored pathspecs containing
> wildcards or magic by clearing them. This caused all blobs to be
> downloaded regardless of the given pathspec. Wildcard pathspecs like
> "d/file.*.txt" are useful for narrowing which blobs to process (e.g.,
> during 'git backfill').
>
> Support wildcard pathspecs by making three changes:
>
>  1. Add an 'exact_pathspecs' flag to path_walk_context. When the
>     pathspec has no wildcards or magic, set this flag and use the
>     existing fast-path prefix matching in add_tree_entries(). When
>     wildcards are present, skip that block since prefix matching
>     cannot handle glob patterns.
>
>  2. Disable revision-level commit pruning (revs->prune = 0) for
>     wildcard pathspecs. The revision walk uses the pathspec to filter
>     commits via TREESAME detection. For exact prefix pathspecs this
>     works well, but wildcard pathspecs may fail to match through
>     TREESAME because fnmatch with WM_PATHNAME does not cross directory
>     boundaries. Disabling pruning ensures all commits are visited and
>     their trees are available for the path-walk to filter.

Hmph, I wonder how significant an impact does it have on the
performance that we have to disable pruning here.  With the bog
standard tree traversal, wouldn't tree_entry_interesting() already
be capable of doing this, even with fnmatch / WM_PATHNAME ?

>  3. Add a match_pathspec() check in walk_path() to filter out blobs
>     whose full path does not match the pathspec. This provides the
>     actual blob-level filtering for wildcard pathspecs.
>
> Signed-off-by: Derrick Stolee <stolee@gmail.com>
> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

The latter person cannot sign DCO or vouch for the origin of what
they have written in this patch, can they?

> ---
>  path-walk.c         | 22 ++++++++++++++--------
>  t/t5620-backfill.sh |  7 +++----
>  2 files changed, 17 insertions(+), 12 deletions(-)

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Derrick Stolee wrote on the Git mailing list (how to reply to this email):

On 3/17/2026 6:19 PM, Junio C Hamano wrote:
> "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:
> 
>> From: Derrick Stolee <stolee@gmail.com>
>>
>> Previously, walk_objects_by_path() silently ignored pathspecs containing
>> wildcards or magic by clearing them. This caused all blobs to be
>> downloaded regardless of the given pathspec. Wildcard pathspecs like
>> "d/file.*.txt" are useful for narrowing which blobs to process (e.g.,
>> during 'git backfill').
>>
>> Support wildcard pathspecs by making three changes:
>>
>>  1. Add an 'exact_pathspecs' flag to path_walk_context. When the
>>     pathspec has no wildcards or magic, set this flag and use the
>>     existing fast-path prefix matching in add_tree_entries(). When
>>     wildcards are present, skip that block since prefix matching
>>     cannot handle glob patterns.
>>
>>  2. Disable revision-level commit pruning (revs->prune = 0) for
>>     wildcard pathspecs. The revision walk uses the pathspec to filter
>>     commits via TREESAME detection. For exact prefix pathspecs this
>>     works well, but wildcard pathspecs may fail to match through
>>     TREESAME because fnmatch with WM_PATHNAME does not cross directory
>>     boundaries. Disabling pruning ensures all commits are visited and
>>     their trees are available for the path-walk to filter.
> 
> Hmph, I wonder how significant an impact does it have on the
> performance that we have to disable pruning here.  With the bog
> standard tree traversal, wouldn't tree_entry_interesting() already
> be capable of doing this, even with fnmatch / WM_PATHNAME ?

I will explore what's possible here and see what I can do.

>>  3. Add a match_pathspec() check in walk_path() to filter out blobs
>>     whose full path does not match the pathspec. This provides the
>>     actual blob-level filtering for wildcard pathspecs.
>>
>> Signed-off-by: Derrick Stolee <stolee@gmail.com>
>> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
> 
> The latter person cannot sign DCO or vouch for the origin of what
> they have written in this patch, can they?
No they cannot. Sorry for this error.

Thanks,
-Stolee

struct prio_queue path_stack;
struct strset path_stack_pushed;

unsigned exact_pathspecs:1;
};

static int compare_by_type(const void *one, const void *two, void *cb_data)
Expand Down Expand Up @@ -206,6 +209,34 @@ static int add_tree_entries(struct path_walk_context *ctx,
match != MATCHED)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Junio C Hamano wrote on the Git mailing list (how to reply to this email):

"Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Derrick Stolee <stolee@gmail.com>
>
> The previous change allowed specifying revision arguments over the 'git
> backfill' command-line. This created the opportunity for pathspecs that
> specify a smaller set of starting commits, but otherwise did not restrict
> the blob paths that were downloaded.

"pathspecs that specify a smaller set of starting commits" is
puzzling, as starting commits would be coming from the revision
arguments.  "opportunity for pathspec to further filter commits
to those that touch only the matching paths...", or something?

> Update the path-walk API to accept certain kinds of pathspecs and to
> silently ignore anything too complex.

Hmph, "silently ignore", instead of "no, you cannot use that! and
die", or at least "sorry, I cannot do that, so the result may not be
what you wanted, you've been warned"?

> The current behavior focuses on
> pathspecs that match paths exactly. This includes exact filenames,
> including directory names as prefixes. Pathspecs containing wildcards
> or magic are cleared so the path walk downloads all blobs, as before.

Ah, "we punt and lift the limitation to grab everything, so at least
everything you wanted to have will become available to you, even
though we may download more than what you asked"?  OK, users would
survive that, and as we improve the pathspec support, the user
experience would only improve.  OK.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Derrick Stolee wrote on the Git mailing list (how to reply to this email):

On 3/17/2026 6:10 PM, Junio C Hamano wrote:
> "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:
> 
>> From: Derrick Stolee <stolee@gmail.com>
>>
>> The previous change allowed specifying revision arguments over the 'git
>> backfill' command-line. This created the opportunity for pathspecs that
>> specify a smaller set of starting commits, but otherwise did not restrict
>> the blob paths that were downloaded.
> 
> "pathspecs that specify a smaller set of starting commits" is
> puzzling, as starting commits would be coming from the revision
> arguments.  "opportunity for pathspec to further filter commits
> to those that touch only the matching paths...", or something?

You're right. I'm using "starting commits" incorrectly. My view was too
focused on how the path-walk API starts from the commits output by the
revision walk to get a list of root trees and then walks by path from
that point.

I'll reword to make this more clear.

>> Update the path-walk API to accept certain kinds of pathspecs and to
>> silently ignore anything too complex.
> 
> Hmph, "silently ignore", instead of "no, you cannot use that! and
> die", or at least "sorry, I cannot do that, so the result may not be
> what you wanted, you've been warned"?

The behavior when silently ignoring is to over-download. The revision
walk still filters commits, but the path-walk then walks paths beyond
that pathspec. This will be fixed in the next commit, so adding an
error case didn't seem worth it. I'll do a better job foreshadowing.

>> The current behavior focuses on
>> pathspecs that match paths exactly. This includes exact filenames,
>> including directory names as prefixes. Pathspecs containing wildcards
>> or magic are cleared so the path walk downloads all blobs, as before.
> 
> Ah, "we punt and lift the limitation to grab everything, so at least
> everything you wanted to have will become available to you, even
> though we may download more than what you asked"?  OK, users would
> survive that, and as we improve the pathspec support, the user
> experience would only improve.  OK.

Exactly. I can word things better.

Thanks,
-Stolee

continue;
}
if (ctx->revs->prune_data.nr && ctx->exact_pathspecs) {
struct pathspec *pd = &ctx->revs->prune_data;
bool found = false;

/* remove '/' for these checks. */
path.buf[path.len - 1] = 0;

for (int i = 0; i < pd->nr; i++) {
struct pathspec_item *item = &pd->items[i];

/*
* Continue if either is a directory prefix
* of the other.
*/
if (dir_prefix(path.buf, item->match) ||
dir_prefix(item->match, path.buf)) {
found = true;
break;
}
}

/* return '/' after these checks. */
path.buf[path.len - 1] = '/';

/* Skip paths that do not match the prefix. */
if (!found)
continue;
}

add_path_to_list(ctx, path.buf, type, &entry.oid,
!(o->flags & UNINTERESTING));
Expand Down Expand Up @@ -274,6 +305,13 @@ static int walk_path(struct path_walk_context *ctx,
return 0;
}

if (list->type == OBJ_BLOB &&
ctx->revs->prune_data.nr &&
!match_pathspec(ctx->repo->index, &ctx->revs->prune_data,
path, strlen(path), 0,
NULL, 0))
return 0;

/* Evaluate function pointer on this data, if requested. */
if ((list->type == OBJ_TREE && ctx->info->trees) ||
(list->type == OBJ_BLOB && ctx->info->blobs) ||
Expand Down Expand Up @@ -481,6 +519,12 @@ int walk_objects_by_path(struct path_walk_info *info)
if (info->tags)
info->revs->tag_objects = 1;

if (ctx.revs->prune_data.nr) {
if (!ctx.revs->prune_data.has_wildcard &&
!ctx.revs->prune_data.magic)
ctx.exact_pathspecs = 1;
}

/* Insert a single list for the root tree into the paths. */
CALLOC_ARRAY(root_tree_list, 1);
root_tree_list->type = OBJ_TREE;
Expand Down
2 changes: 1 addition & 1 deletion path.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ static void strbuf_cleanup_path(struct strbuf *sb)
strbuf_remove(sb, 0, path - sb->buf);
}

static int dir_prefix(const char *buf, const char *dir)
int dir_prefix(const char *buf, const char *dir)
{
int len = strlen(dir);
return !strncmp(buf, dir, len) &&
Expand Down
6 changes: 6 additions & 0 deletions path.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ const char *repo_submodule_path_replace(struct repository *repo,
const char *fmt, ...)
__attribute__((format (printf, 4, 5)));

/*
* Given a directory name 'dir' (not ending with a trailing '/'),
* determine if 'buf' is equal to 'dir' or has prefix 'dir'+'/'.
*/
int dir_prefix(const char *buf, const char *dir);

void report_linked_checkout_garbage(struct repository *r);

/*
Expand Down
1 change: 1 addition & 0 deletions revision.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "commit.h"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Junio C Hamano wrote on the Git mailing list (how to reply to this email):

"Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Derrick Stolee <stolee@gmail.com>
>
> The REV_INFO_INIT macro includes a use of the DEFAULT_ABBREV macro, which is
> defined in object-name.h. Include it in revision.h so consumers of
> REV_INFO_INIT do not need to include this hidden dependency.
>
> Signed-off-by: Derrick Stolee <stolee@gmail.com>
> ---
>  revision.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/revision.h b/revision.h
> index b36acfc2d9..18c9bbd822 100644
> --- a/revision.h
> +++ b/revision.h
> @@ -4,6 +4,7 @@
>  #include "commit.h"
>  #include "grep.h"
>  #include "notes.h"
> +#include "object-name.h"
>  #include "oidset.h"
>  #include "pretty.h"
>  #include "diff.h"

OK.  Other symbols REV_INFO_INIT needs are REV_SORT_IN_GRAPH_ORDER
(in <commit.h>), CMIT_FMT_DEFAULT (in <pretty.h>), and STRVEC_INIT
(in <strvec.h>), and all three are already included there.

Makes sense.

#include "grep.h"
#include "notes.h"
#include "object-name.h"
#include "oidset.h"
#include "pretty.h"
#include "diff.h"
Expand Down
Loading
Loading