From 27b46856653e7ca77f602284a13e366ad9d9c6d6 Mon Sep 17 00:00:00 2001
From: Johannes Schindelin
Date: Tue, 27 Jan 2026 15:56:44 +0200
Subject: [PATCH 1/6] index-pack, unpack-objects: use size_t for object size

When unpacking objects from a packfile, the object size is decoded from
a variable-length encoding. On platforms where unsigned long is 32-bit
(such as Windows, even in 64-bit builds), the shift operation overflows
when decoding sizes larger than 4GB. The result is a truncated size
value, causing the unpacked object to be corrupted or rejected.

Fix this by changing the size variable to size_t, which is 64-bit on
64-bit platforms, and by ensuring that the shift arithmetic occurs in
64-bit space.

This was originally authored by LordKiRon, who preferred not to reveal
their real name and therefore agreed that I take over authorship.

Signed-off-by: Johannes Schindelin
---
 builtin/index-pack.c     | 9 +++++----
 builtin/unpack-objects.c | 5 +++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index b67fb0256cc831..1ea64089055887 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -37,7 +37,7 @@ static const char index_pack_usage[] =
 
 struct object_entry {
 	struct pack_idx_entry idx;
-	unsigned long size;
+	size_t size;
 	unsigned char hdr_size;
 	signed char type;
 	signed char real_type;
@@ -469,7 +469,7 @@ static int is_delta_type(enum object_type type)
 	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
 }
 
-static void *unpack_entry_data(off_t offset, unsigned long size,
+static void *unpack_entry_data(off_t offset, size_t size,
 			       enum object_type type, struct object_id *oid)
 {
 	static char fixed_buf[8192];
@@ -524,7 +524,8 @@ static void *unpack_raw_entry(struct object_entry *obj,
 			      struct object_id *oid)
 {
 	unsigned char *p;
-	unsigned long size, c;
+	size_t size;
+	unsigned long c;
 	off_t base_offset;
 	unsigned shift;
 	void *data;
@@ -542,7 +543,7 @@ static void *unpack_raw_entry(struct object_entry *obj,
 		p = fill(1);
 		c = *p;
 		use(1);
-		size += (c & 0x7f) << shift;
+		size += ((size_t)c & 0x7f) << shift;
 		shift += 7;
 	}
 	obj->size = size;

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 6fc64e9e4b8d5a..883440ccafef19 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -533,7 +533,8 @@ static void unpack_one(unsigned nr)
 {
 	unsigned shift;
 	unsigned char *pack;
-	unsigned long size, c;
+	size_t size;
+	unsigned long c;
 	enum object_type type;
 
 	obj_list[nr].offset = consumed_bytes;
@@ -548,7 +549,7 @@ static void unpack_one(unsigned nr)
 		pack = fill(1);
 		c = *pack;
 		use(1);
-		size += (c & 0x7f) << shift;
+		size += ((size_t)c & 0x7f) << shift;
 		shift += 7;
 	}
 

From 9a5fa4c04d02f8fb67231619792a0e2638fc1b6a Mon Sep 17 00:00:00 2001
From: Johannes Schindelin
Date: Thu, 29 Jan 2026 14:07:48 +0100
Subject: [PATCH 2/6] git-zlib: handle data streams larger than 4GB

On Windows, zlib's `uLong` type is 32-bit even on 64-bit systems. When
processing data streams larger than 4GB, the `total_in` and `total_out`
fields in zlib's `z_stream` structure wrap around, which causes the
sanity checks in `zlib_post_call()` to trigger `BUG()` assertions.

The git_zstream wrapper now tracks its own 64-bit totals rather than
copying them from zlib. The sanity checks compare only the low bits,
using `maximum_unsigned_value_of_type(uLong)` to mask appropriately for
the platform's `uLong` size.

This is based on work by LordKiRon in git-for-windows#6076.
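To make the masked comparison concrete, here is a standalone
illustration; it is not part of the change itself, and ULONG32_MASK
merely stands in for maximum_unsigned_value_of_type(uLong) on a
platform whose uLong is 32-bit:

    #include <stdint.h>
    #include <stdio.h>

    #define ULONG32_MASK 0xffffffffUL

    int main(void)
    {
        uint64_t total = 0x100000400ULL;    /* 4GB + 1KB, tracked in 64 bits */
        uint32_t wrapped = (uint32_t)total; /* what a 32-bit uLong would hold */

        /* A direct comparison misfires: 0x100000400 != 0x400 */
        printf("direct: %s\n", (uint64_t)wrapped == total ? "match" : "mismatch");

        /* The masked comparison still validates the low bits */
        printf("masked: %s\n", wrapped == (total & ULONG32_MASK) ? "match" : "mismatch");
        return 0;
    }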
Signed-off-by: Johannes Schindelin
---
 git-zlib.c    | 25 +++++++++++++++++--------
 git-zlib.h    |  4 ++--
 object-file.c |  2 +-
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/git-zlib.c b/git-zlib.c
index df9604910e3fdf..b91cb323aee916 100644
--- a/git-zlib.c
+++ b/git-zlib.c
@@ -30,6 +30,9 @@ static const char *zerr_to_string(int status)
  */
 /* #define ZLIB_BUF_MAX ((uInt)-1) */
 #define ZLIB_BUF_MAX ((uInt) 1024 * 1024 * 1024) /* 1GB */
+
+/* uLong is 32-bit on Windows, even on 64-bit systems */
+#define ULONG_MAX_VALUE maximum_unsigned_value_of_type(uLong)
 static inline uInt zlib_buf_cap(unsigned long len)
 {
 	return (ZLIB_BUF_MAX < len) ? ZLIB_BUF_MAX : len;
@@ -39,31 +42,37 @@ static void zlib_pre_call(git_zstream *s)
 {
 	s->z.next_in = s->next_in;
 	s->z.next_out = s->next_out;
-	s->z.total_in = s->total_in;
-	s->z.total_out = s->total_out;
+	s->z.total_in = (uLong)(s->total_in & ULONG_MAX_VALUE);
+	s->z.total_out = (uLong)(s->total_out & ULONG_MAX_VALUE);
 	s->z.avail_in = zlib_buf_cap(s->avail_in);
 	s->z.avail_out = zlib_buf_cap(s->avail_out);
 }
 
 static void zlib_post_call(git_zstream *s, int status)
 {
-	unsigned long bytes_consumed;
-	unsigned long bytes_produced;
+	size_t bytes_consumed;
+	size_t bytes_produced;
 
 	bytes_consumed = s->z.next_in - s->next_in;
 	bytes_produced = s->z.next_out - s->next_out;
-	if (s->z.total_out != s->total_out + bytes_produced)
+	/*
+	 * zlib's total_out/total_in are uLong, which may wrap for >4GB.
+	 * We track our own totals and verify only the low bits match.
+	 */
+	if ((s->z.total_out & ULONG_MAX_VALUE) !=
+	    ((s->total_out + bytes_produced) & ULONG_MAX_VALUE))
 		BUG("total_out mismatch");
 	/*
 	 * zlib does not update total_in when it returns Z_NEED_DICT,
 	 * causing a mismatch here. Skip the sanity check in that case.
	 */
 	if (status != Z_NEED_DICT &&
-	    s->z.total_in != s->total_in + bytes_consumed)
+	    (s->z.total_in & ULONG_MAX_VALUE) !=
+	    ((s->total_in + bytes_consumed) & ULONG_MAX_VALUE))
 		BUG("total_in mismatch");
-	s->total_out = s->z.total_out;
-	s->total_in = s->z.total_in;
+	s->total_out += bytes_produced;
+	s->total_in += bytes_consumed;
 	/* zlib-ng marks `next_in` as `const`, so we have to cast it away. */
 	s->next_in = (unsigned char *) s->z.next_in;
 	s->next_out = s->z.next_out;

diff --git a/git-zlib.h b/git-zlib.h
index 0e66fefa8c9f05..44380e8ad38305 100644
--- a/git-zlib.h
+++ b/git-zlib.h
@@ -7,8 +7,8 @@ typedef struct git_zstream {
 	struct z_stream_s z;
 	unsigned long avail_in;
 	unsigned long avail_out;
-	unsigned long total_in;
-	unsigned long total_out;
+	size_t total_in;
+	size_t total_out;
 	unsigned char *next_in;
 	unsigned char *next_out;
 } git_zstream;

diff --git a/object-file.c b/object-file.c
index e55bf1bfff670f..88f86ad04e54ca 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1086,7 +1086,7 @@ int odb_source_loose_write_stream(struct odb_source *source,
 	} while (ret == Z_OK || ret == Z_BUF_ERROR);
 
 	if (stream.total_in != len + hdrlen)
-		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		die(_("write stream object %"PRIuMAX" != %"PRIuMAX), (uintmax_t)stream.total_in,
 		    (uintmax_t)len + hdrlen);
 
 	/*

From 9fb7d673eceb80ac76183eee75a95d62550eb57b Mon Sep 17 00:00:00 2001
From: Johannes Schindelin
Date: Wed, 28 Jan 2026 01:01:23 +0200
Subject: [PATCH 3/6] odb, packfile: use size_t for streaming object sizes

The odb_read_stream structure uses unsigned long for the size field,
which is 32-bit on Windows even in 64-bit builds. When streaming
objects larger than 4GB, the size would be truncated to zero or an
incorrect value, resulting in empty files being written to disk.

Change the size field in odb_read_stream to size_t and introduce
unpack_object_header_sz() to return sizes via a size_t pointer. Since
object_info.sizep remains unsigned long for API compatibility, use
temporary variables where the types differ, with comments noting the
truncation limitation for code paths that still use unsigned long.

This was originally authored by LordKiRon, who preferred not to reveal
their real name and therefore agreed that I take over authorship.

Signed-off-by: Johannes Schindelin
---
 object-file.c   | 10 +++++++++-
 odb/streaming.c | 13 ++++++++++++-
 odb/streaming.h |  2 +-
 packfile.c      | 71 ++++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 81 insertions(+), 15 deletions(-)

diff --git a/object-file.c b/object-file.c
index 88f86ad04e54ca..6baa36526ef9f6 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2108,6 +2108,7 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out,
 	struct object_info oi = OBJECT_INFO_INIT;
 	struct odb_loose_read_stream *st;
 	unsigned long mapsize;
+	unsigned long size_ul;
 	void *mapped;
 
 	mapped = odb_source_loose_map_object(source, oid, &mapsize);
@@ -2131,11 +2132,18 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out,
 		goto error;
 	}
 
-	oi.sizep = &st->base.size;
+	/*
+	 * object_info.sizep is unsigned long * (32-bit on Windows), but
+	 * st->base.size is size_t (64-bit), so use a temporary variable.
+	 * Note: loose objects >4GB would still truncate here, but such
+	 * large loose objects are uncommon (they would normally be packed).
+	 */
+	oi.sizep = &size_ul;
 	oi.typep = &st->base.type;
 	if (parse_loose_header(st->hdr, &oi) < 0 || st->base.type < 0)
 		goto error;
 
+	st->base.size = size_ul;
 	st->mapped = mapped;
 	st->mapsize = mapsize;

diff --git a/odb/streaming.c b/odb/streaming.c
index 4a4474f891a07f..bd460f9adc2cb2 100644
--- a/odb/streaming.c
+++ b/odb/streaming.c
@@ -158,15 +158,26 @@ static int open_istream_incore(struct odb_read_stream **out,
 		.base.read = read_istream_incore,
 	};
 	struct odb_incore_read_stream *st;
+	unsigned long size_ul;
 	int ret;
 
 	oi.typep = &stream.base.type;
-	oi.sizep = &stream.base.size;
+	/*
+	 * object_info.sizep is unsigned long * (32-bit on Windows), but
+	 * stream.base.size is size_t (64-bit), so we go through a temporary
+	 * variable. Note: this path still truncates for >4GB objects, but
+	 * large objects should use pack streaming
+	 * (packfile_store_read_object_stream()), which handles size_t
+	 * properly. This incore fallback is only used for small objects or
+	 * when pack streaming is unavailable.
+	 */
+	oi.sizep = &size_ul;
 	oi.contentp = (void **)&stream.buf;
 	ret = odb_read_object_info_extended(odb, oid, &oi,
 					    OBJECT_INFO_DIE_IF_CORRUPT);
 	if (ret)
 		return ret;
+	stream.base.size = size_ul;
 
 	CALLOC_ARRAY(st, 1);
 	*st = stream;

diff --git a/odb/streaming.h b/odb/streaming.h
index c7861f7e13c606..517e2ea2d3f5c3 100644
--- a/odb/streaming.h
+++ b/odb/streaming.h
@@ -21,7 +21,7 @@ struct odb_read_stream {
 	odb_read_stream_close_fn close;
 	odb_read_stream_read_fn read;
 	enum object_type type;
-	unsigned long size; /* inflated size of full object */
+	size_t size; /* inflated size of full object */
 };
 
 /*

diff --git a/packfile.c b/packfile.c
index 402c3b5dc73131..3a15a7ae8afd49 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1130,8 +1130,8 @@ unsigned long repo_approximate_object_count(struct repository *r)
 	return r->objects->approximate_object_count;
 }
 
-unsigned long unpack_object_header_buffer(const unsigned char *buf,
-		unsigned long len, enum object_type *type, unsigned long *sizep)
+static unsigned long unpack_object_header_buffer_internal(const unsigned char *buf,
+		unsigned long len, enum object_type *type, size_t *sizep)
 {
 	unsigned shift;
 	size_t size, c;
@@ -1142,7 +1142,11 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf,
 	size = c & 15;
 	shift = 4;
 	while (c & 0x80) {
-		if (len <= used || (bitsizeof(long) - 7) < shift) {
+		/*
+		 * Each continuation byte adds 7 bits. Ensure the shift won't
+		 * overflow size_t (which, unlike long, is 64-bit on Windows).
+		 */
+		if (len <= used || (bitsizeof(size_t) - 7) < shift) {
 			error("bad object header");
 			size = used = 0;
 			break;
@@ -1151,6 +1155,15 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf,
 		size = st_add(size, st_left_shift(c & 0x7f, shift));
 		shift += 7;
 	}
+	*sizep = size;
+	return used;
+}
+
+unsigned long unpack_object_header_buffer(const unsigned char *buf,
+		unsigned long len, enum object_type *type, unsigned long *sizep)
+{
+	size_t size;
+	unsigned long used = unpack_object_header_buffer_internal(buf, len, type, &size);
 	*sizep = cast_size_t_to_ulong(size);
 	return used;
 }
@@ -1210,6 +1223,32 @@ unsigned long get_size_from_delta(struct packed_git *p,
 	return get_delta_hdr_size(&data, delta_head+sizeof(delta_head));
 }
 
+/*
+ * Like unpack_object_header(), but returns the size via a size_t *
+ * instead of an unsigned long *. This is needed for >4GB objects on
+ * Windows, where unsigned long is 32-bit but size_t is 64-bit. Used by
+ * the streaming code to get the correct, untruncated object size.
+ */
+static int unpack_object_header_sz(struct packed_git *p,
+				   struct pack_window **w_curs,
+				   off_t *curpos,
+				   size_t *sizep)
+{
+	unsigned char *base;
+	unsigned long left;
+	unsigned long used;
+	enum object_type type;
+
+	base = use_pack(p, w_curs, *curpos, &left);
+	used = unpack_object_header_buffer_internal(base, left, &type, sizep);
+	if (!used)
+		type = OBJ_BAD;
+	else
+		*curpos += used;
+
+	return type;
+}
+
 int unpack_object_header(struct packed_git *p,
 			 struct pack_window **w_curs,
 			 off_t *curpos,
@@ -2561,21 +2600,29 @@ int packfile_store_read_object_stream(struct odb_read_stream **out,
 	struct pack_window *window = NULL;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type in_pack_type;
-	unsigned long size;
-
-	oi.sizep = &size;
+	size_t size;
+	/*
+	 * We need to check if this is a delta or if the object is smaller
+	 * than the big file threshold. For the initial check, we don't need
+	 * the exact size, just whether it qualifies for streaming.
+	 */
 	if (packfile_store_read_object_info(store, oid, &oi, 0) ||
 	    oi.u.packed.type == PACKED_OBJECT_TYPE_REF_DELTA ||
-	    oi.u.packed.type == PACKED_OBJECT_TYPE_OFS_DELTA ||
-	    repo_settings_get_big_file_threshold(store->source->odb->repo) >= size)
+	    oi.u.packed.type == PACKED_OBJECT_TYPE_OFS_DELTA)
 		return -1;
 
-	in_pack_type = unpack_object_header(oi.u.packed.pack,
-					    &window,
-					    &oi.u.packed.offset,
-					    &size);
+	/* Read the actual size using size_t to handle >4GB objects on Windows */
+	in_pack_type = unpack_object_header_sz(oi.u.packed.pack,
+					       &window,
+					       &oi.u.packed.offset,
+					       &size);
 	unuse_pack(&window);
+
+	/* Now check the big file threshold with the correct size */
+	if (repo_settings_get_big_file_threshold(store->source->odb->repo) >= size)
+		return -1;
+
 	switch (in_pack_type) {
 	default:
 		return -1; /* we do not do deltas for now */

From e89b0b6b75ea88610b6c8527929888e33131d04a Mon Sep 17 00:00:00 2001
From: Johannes Schindelin
Date: Thu, 29 Jan 2026 10:18:01 +0200
Subject: [PATCH 4/6] delta, packfile: use size_t for delta header sizes

The delta header decoding functions return unsigned long, which
truncates on Windows for objects larger than 4GB.

Introduce the size_t variants get_delta_hdr_size_sz() and
get_size_from_delta_sz(), which preserve the full 64-bit size, and use
them in packed_object_info(), where the size is needed for streaming
decisions.

This was originally authored by LordKiRon, who preferred not to reveal
their real name and therefore agreed that I take over authorship.

Signed-off-by: Johannes Schindelin
---
 delta.h    | 14 ++++++++++++--
 packfile.c | 38 +++++++++++++++++++++++++++-----------
 2 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/delta.h b/delta.h
index 8a56ec07992c75..fad68cfc45f6f4 100644
--- a/delta.h
+++ b/delta.h
@@ -86,8 +86,11 @@ void *patch_delta(const void *src_buf, unsigned long src_size,
  * This must be called twice on the delta data buffer, first to get the
  * expected source buffer size, and again to get the target buffer size.
  */
-static inline unsigned long get_delta_hdr_size(const unsigned char **datap,
-					       const unsigned char *top)
+/*
+ * size_t variant that doesn't truncate; use it for >4GB objects on Windows.
+ */
+static inline size_t get_delta_hdr_size_sz(const unsigned char **datap,
+					   const unsigned char *top)
 {
 	const unsigned char *data = *datap;
 	size_t cmd, size = 0;
@@ -98,6 +101,13 @@ static inline unsigned long get_delta_hdr_size(const unsigned char **datap,
 		i += 7;
 	} while (cmd & 0x80 && data < top);
 	*datap = data;
+	return size;
+}
+
+static inline unsigned long get_delta_hdr_size(const unsigned char **datap,
+					       const unsigned char *top)
+{
+	size_t size = get_delta_hdr_size_sz(datap, top);
 	return cast_size_t_to_ulong(size);
 }

diff --git a/packfile.c b/packfile.c
index 3a15a7ae8afd49..e198c9793f089f 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1168,9 +1168,12 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf,
 	return used;
 }
 
-unsigned long get_size_from_delta(struct packed_git *p,
-				  struct pack_window **w_curs,
-				  off_t curpos)
+/*
+ * size_t variant of get_size_from_delta() for >4GB delta results on Windows.
+ */
+static size_t get_size_from_delta_sz(struct packed_git *p,
+				     struct pack_window **w_curs,
+				     off_t curpos)
 {
 	const unsigned char *data;
 	unsigned char delta_head[20], *in;
@@ -1217,10 +1220,18 @@ unsigned long get_size_from_delta(struct packed_git *p,
 	data = delta_head;
 
 	/* ignore base size */
-	get_delta_hdr_size(&data, delta_head+sizeof(delta_head));
+	get_delta_hdr_size_sz(&data, delta_head+sizeof(delta_head));
 
 	/* Read the result size */
-	return get_delta_hdr_size(&data, delta_head+sizeof(delta_head));
+	return get_delta_hdr_size_sz(&data, delta_head+sizeof(delta_head));
+}
+
+unsigned long get_size_from_delta(struct packed_git *p,
+				  struct pack_window **w_curs,
+				  off_t curpos)
+{
+	size_t size = get_size_from_delta_sz(p, w_curs, curpos);
+
+	return cast_size_t_to_ulong(size);
 }
 
 /*
@@ -1621,7 +1632,7 @@ int packed_object_info(struct packed_git *p,
 		       off_t obj_offset, struct object_info *oi)
 {
 	struct pack_window *w_curs = NULL;
-	unsigned long size;
+	size_t size;
 	off_t curpos = obj_offset;
 	enum object_type type = OBJ_NONE;
 	int ret;
@@ -1636,7 +1647,8 @@ int packed_object_info(struct packed_git *p,
 		if (!*oi->contentp)
 			type = OBJ_BAD;
 	} else if (oi->sizep || oi->typep || oi->delta_base_oid) {
-		type = unpack_object_header(p, &w_curs, &curpos, &size);
+		/* Use the size_t variant to handle >4GB objects on Windows */
+		type = unpack_object_header_sz(p, &w_curs, &curpos, &size);
 	}
 
 	if (!oi->contentp && oi->sizep) {
@@ -1648,14 +1660,18 @@ int packed_object_info(struct packed_git *p,
 				ret = -1;
 				goto out;
 			}
-			*oi->sizep = get_size_from_delta(p, &w_curs, tmp_pos);
-			if (*oi->sizep == 0) {
+			/*
+			 * Use the size_t variant to avoid dying on >4GB
+			 * deltas. oi->sizep is unsigned long, so truncation
+			 * may occur, but streaming code does its own size_t tracking.
+			 */
+			size = get_size_from_delta_sz(p, &w_curs, tmp_pos);
+			if (size == 0) {
 				ret = -1;
 				goto out;
 			}
-		} else {
-			*oi->sizep = size;
 		}
+		*oi->sizep = (unsigned long)size;
 	}
 
 	if (oi->disk_sizep) {

From 3de968e8b0041130530098fcadce433adedffd13 Mon Sep 17 00:00:00 2001
From: Johannes Schindelin
Date: Thu, 6 Jul 2023 13:37:46 +0200
Subject: [PATCH 5/6] test-tool: add a helper to synthesize large packfiles

To test Git's behavior with very large pack files, we need a way to
generate such files quickly. A naive approach using only
readily-available Git commands would take over 10 hours for a 4GB pack
file, which is prohibitive.

Side-stepping Git's machinery and actual zlib compression by writing
uncompressed content with the appropriate zlib header makes things much
faster.

The fastest method using this approach generates many small,
unreachable blob objects and takes about 1.5 minutes for 4GB. However,
this cannot be used because we need to test git clone, which requires a
reachable commit history.

Generating many reachable commits with small, uncompressed blobs takes
about 4 minutes for 4GB. But this approach 1) does not reproduce the
issues we want to fix (which require individual objects larger than
4GB) and 2) is comparatively slow because of the many SHA-1
calculations.

The approach taken here generates a single large blob (filled with NUL
bytes), along with the trees and commits needed to make it reachable.
This takes about 2.5 minutes for 4.5GB, which is the fastest option
that produces a valid, clonable repository with an object large enough
to trigger the bugs we want to test.
Signed-off-by: Johannes Schindelin
---
 Makefile                   |   1 +
 compat/zlib-compat.h       |   2 +
 t/helper/meson.build       |   1 +
 t/helper/test-synthesize.c | 250 +++++++++++++++++++++++++++++++++++++
 t/helper/test-tool.c       |   1 +
 t/helper/test-tool.h       |   1 +
 6 files changed, 256 insertions(+)
 create mode 100644 t/helper/test-synthesize.c

diff --git a/Makefile b/Makefile
index a9fa8f8208fbdb..58b870dc2c6693 100644
--- a/Makefile
+++ b/Makefile
@@ -875,6 +875,7 @@ TEST_BUILTINS_OBJS += test-submodule-config.o
 TEST_BUILTINS_OBJS += test-submodule-nested-repo-config.o
 TEST_BUILTINS_OBJS += test-submodule.o
 TEST_BUILTINS_OBJS += test-subprocess.o
+TEST_BUILTINS_OBJS += test-synthesize.o
 TEST_BUILTINS_OBJS += test-trace2.o
 TEST_BUILTINS_OBJS += test-truncate.o
 TEST_BUILTINS_OBJS += test-userdiff.o

diff --git a/compat/zlib-compat.h b/compat/zlib-compat.h
index ac0827662298af..5078c5ef6ce0e8 100644
--- a/compat/zlib-compat.h
+++ b/compat/zlib-compat.h
@@ -7,6 +7,8 @@
 # define z_stream_s zng_stream_s
 # define gz_header_s zng_gz_header_s
 
+# define adler32(adler, buf, len) zng_adler32(adler, buf, len)
+
 # define crc32(crc, buf, len) zng_crc32(crc, buf, len)
 
 # define inflate(strm, bits) zng_inflate(strm, bits)

diff --git a/t/helper/meson.build b/t/helper/meson.build
index cba4a9bf4f1434..d4499d26a9af1f 100644
--- a/t/helper/meson.build
+++ b/t/helper/meson.build
@@ -70,6 +70,7 @@ test_tool_sources = [
   'test-submodule-nested-repo-config.c',
   'test-submodule.c',
   'test-subprocess.c',
+  'test-synthesize.c',
   'test-tool.c',
   'test-trace2.c',
   'test-truncate.c',

diff --git a/t/helper/test-synthesize.c b/t/helper/test-synthesize.c
new file mode 100644
index 00000000000000..d4135b115bc9a2
--- /dev/null
+++ b/t/helper/test-synthesize.c
@@ -0,0 +1,250 @@
+#define USE_THE_REPOSITORY_VARIABLE
+
+#include "test-tool.h"
+#include "git-compat-util.h"
+#include "git-zlib.h"
+#include "hash.h"
+#include "hex.h"
+#include "object.h"
+#include "parse-options.h"
+#include "repository.h"
+#include "setup.h"
+#include "strbuf.h"
+
+#define BLOCK_SIZE 0xffff
+static const unsigned char zeros[BLOCK_SIZE];
+
+/*
+ * Write data as an uncompressed zlib stream.
+ * For data larger than 64KB, writes multiple uncompressed blocks.
+ * If data is NULL, writes zeros.
+ * Updates the pack checksum context.
+ */
+static void write_uncompressed_zlib(FILE *f, struct git_hash_ctx *pack_ctx,
+				    const void *data, size_t len,
+				    const struct git_hash_algo *algo)
+{
+	unsigned char zlib_header[2] = { 0x78, 0x01 }; /* CMF, FLG */
+	unsigned char block_header[5];
+	const unsigned char *p = data;
+	size_t remaining = len;
+	uint32_t adler = 1L; /* adler32 initial value */
+	unsigned char adler_buf[4];
+
+	/* Write the zlib header */
+	fwrite(zlib_header, 1, 2, f);
+	algo->update_fn(pack_ctx, zlib_header, 2);
+
+	/* Write uncompressed blocks (max 64KB each) */
+	do {
+		size_t block_len = remaining > BLOCK_SIZE ? BLOCK_SIZE : remaining;
+		int is_final = (block_len == remaining);
+		const unsigned char *block_data = data ? p : zeros;
+
+		block_header[0] = is_final ? 0x01 : 0x00;
+		block_header[1] = block_len & 0xff;
+		block_header[2] = (block_len >> 8) & 0xff;
+		block_header[3] = block_header[1] ^ 0xff;
+		block_header[4] = block_header[2] ^ 0xff;
+
+		fwrite(block_header, 1, 5, f);
+		algo->update_fn(pack_ctx, block_header, 5);
+
+		if (block_len) {
+			fwrite(block_data, 1, block_len, f);
+			algo->update_fn(pack_ctx, block_data, block_len);
+			adler = adler32(adler, block_data, block_len);
+		}
+
+		if (data)
+			p += block_len;
+		remaining -= block_len;
+	} while (remaining > 0);
+
+	/* Write the adler32 checksum */
+	put_be32(adler_buf, adler);
+	fwrite(adler_buf, 1, 4, f);
+	algo->update_fn(pack_ctx, adler_buf, 4);
+}
+
+/*
+ * Write a pack object header for the given type and size.
+ * Returns the number of bytes written to the buffer.
+ */
+static size_t object_header(char *buf, enum object_type type, size_t size)
+{
+	unsigned char *p = (unsigned char *)buf;
+	*p = (type << 4) | (size & 0xf);
+	size >>= 4;
+	while (size) {
+		*p++ |= 0x80;
+		*p = size & 0x7f;
+		size >>= 7;
+	}
+	p++;
+	return p - (unsigned char *)buf;
+}
+
+/*
+ * Write an uncompressed object to the pack file.
+ * If `data == NULL`, it is treated like a buffer of NUL bytes.
+ * Updates the pack checksum context.
+ */
+static void write_pack_object(FILE *f, struct git_hash_ctx *pack_ctx,
+			      enum object_type type,
+			      const void *data, size_t len,
+			      struct object_id *oid,
+			      const struct git_hash_algo *algo)
+{
+	char header[32];
+	size_t header_len;
+	struct git_hash_ctx ctx;
+
+	/* Write the pack object header */
+	header_len = object_header(header, type, len);
+	fwrite(header, 1, header_len, f);
+	algo->update_fn(pack_ctx, header, header_len);
+
+	/* Write the data as an uncompressed zlib stream */
+	write_uncompressed_zlib(f, pack_ctx, data, len, algo);
+
+	algo->init_fn(&ctx);
+	header_len = xsnprintf(header, sizeof(header), "%s %"PRIuMAX,
+			       type_name(type), (uintmax_t)len) + 1;
+	algo->update_fn(&ctx, header, header_len);
+	if (data)
+		algo->update_fn(&ctx, data, len);
+	else {
+		for (size_t i = len / BLOCK_SIZE; i; i--)
+			algo->update_fn(&ctx, zeros, BLOCK_SIZE);
+		algo->update_fn(&ctx, zeros, len % BLOCK_SIZE);
+	}
+	algo->final_oid_fn(oid, &ctx);
+}
+
+/*
+ * Generate a pack file with a single large (>4GB) reachable object.
+ *
+ * Creates:
+ * 1. A large blob (all NUL bytes)
+ * 2. A tree containing that blob as "file"
+ * 3. A commit using that tree
+ * 4. The empty tree
+ * 5. A child commit using the empty tree
+ *
+ * This is useful for testing that Git can handle objects larger than 4GB.
+ */
+static int generate_pack_with_large_object(const char *path, size_t blob_size,
+					   const struct git_hash_algo *algo)
+{
+	FILE *f = fopen_for_writing(path);
+	struct git_hash_ctx pack_ctx;
+	char header[1024];
+	struct object_id blob_oid, tree_oid, commit_oid, empty_tree_oid, final_commit_oid;
+	struct strbuf buf = STRBUF_INIT;
+	size_t object_count = 5; /* large blob, tree, commit, empty tree, final commit */
+
+	algo->init_fn(&pack_ctx);
+
+	/* Write the pack header */
+	memcpy(header, "PACK", 4);
+	put_be32(header + 4, 2);
+	put_be32(header + 8, object_count);
+	fwrite(header, 1, 12, f);
+	algo->update_fn(&pack_ctx, header, 12);
+
+	/* 1. Write the large blob */
+	write_pack_object(f, &pack_ctx, OBJ_BLOB, NULL, blob_size, &blob_oid, algo);
+
+	/* 2. Write the tree containing the blob as "file" */
+	strbuf_addf(&buf, "100644 file%c", '\0');
+	strbuf_add(&buf, blob_oid.hash, algo->rawsz);
+	write_pack_object(f, &pack_ctx, OBJ_TREE, buf.buf, buf.len, &tree_oid, algo);
+
+	/* 3. Write the commit using that tree */
+	strbuf_reset(&buf);
+	strbuf_addf(&buf,
+		    "tree %s\n"
+		    "author A U Thor <author@example.com> 1234567890 +0000\n"
+		    "committer C O Mitter <committer@example.com> 1234567890 +0000\n"
+		    "\n"
+		    "Large blob commit\n",
+		    oid_to_hex(&tree_oid));
+	write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &commit_oid, algo);
+
+	/* 4. Write the empty tree */
+	write_pack_object(f, &pack_ctx, OBJ_TREE, "", 0, &empty_tree_oid, algo);
+
+	/* 5. Write the final commit using the empty tree, with the
+	 *    previous commit as its parent */
+	strbuf_reset(&buf);
+	strbuf_addf(&buf,
+		    "tree %s\n"
+		    "parent %s\n"
+		    "author A U Thor <author@example.com> 1234567890 +0000\n"
+		    "committer C O Mitter <committer@example.com> 1234567890 +0000\n"
+		    "\n"
+		    "Empty tree commit\n",
+		    oid_to_hex(&empty_tree_oid),
+		    oid_to_hex(&commit_oid));
+	write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &final_commit_oid, algo);
+
+	/* Write the pack trailer (checksum) */
+	algo->final_fn((unsigned char *)header, &pack_ctx);
+	fwrite(header, 1, algo->rawsz, f);
+
+	fclose(f);
+
+	strbuf_release(&buf);
+
+	/* Print the final commit OID so the caller can set up refs */
+	printf("%s\n", oid_to_hex(&final_commit_oid));
+
+	return 0;
+}
+
+static int cmd__synthesize__pack(int argc, const char **argv,
+				 const char *prefix UNUSED,
+				 struct repository *repo)
+{
+	int non_git;
+	const struct git_hash_algo *algo;
+	size_t size;
+	const char *path;
+	const char * const usage[] = {
+		"test-tool synthesize pack <size> <path>",
+		NULL
+	};
+	struct option options[] = {
+		OPT_END()
+	};
+
+	setup_git_directory_gently(&non_git);
+	repo = the_repository;
+	algo = repo->hash_algo;
+
+	argc = parse_options(argc, argv, NULL, options, usage,
+			     PARSE_OPT_KEEP_ARGV0);
+	if (argc != 3)
+		usage_with_options(usage, options);
+
+	size = strtoumax(argv[1], NULL, 10);
+	path = argv[2];
+
+	return !!generate_pack_with_large_object(path, size, algo);
+}
+
+int cmd__synthesize(int argc, const char **argv)
+{
+	const char *prefix = NULL;
+	char const * const synthesize_usage[] = {
+		"test-tool synthesize pack <size> <path>",
+		NULL,
+	};
+	parse_opt_subcommand_fn *fn = NULL;
+	struct option options[] = {
+		OPT_SUBCOMMAND("pack", &fn, cmd__synthesize__pack),
+		OPT_END()
+	};
+	argc = parse_options(argc, argv, prefix, options, synthesize_usage, 0);
+	return !!fn(argc, argv, prefix, NULL);
+}

diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index 9d1b41c8e39b89..ee16b2cb23719e 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -83,6 +83,7 @@ static struct test_cmd cmds[] = {
 	{ "submodule-config", cmd__submodule_config },
 	{ "submodule-nested-repo-config", cmd__submodule_nested_repo_config },
 	{ "subprocess", cmd__subprocess },
+	{ "synthesize", cmd__synthesize },
 	{ "trace2", cmd__trace2 },
 	{ "truncate", cmd__truncate },
 	{ "userdiff", cmd__userdiff },

diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index e18e5a9ed9de81..4cf9f935a4cdfa 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -76,6 +76,7 @@ int cmd__submodule(int argc, const char **argv);
 int cmd__submodule_config(int argc, const char **argv);
 int cmd__submodule_nested_repo_config(int argc, const char **argv);
 int cmd__subprocess(int argc, const char **argv);
+int cmd__synthesize(int argc, const char **argv);
 int cmd__trace2(int argc, const char **argv);
 int cmd__truncate(int argc, const char **argv);
 int cmd__userdiff(int argc, const char **argv);

From b03bc86b9facc6278e772a65e5d6c5f8769d1b16 Mon Sep 17 00:00:00 2001
From: Johannes Schindelin
Date: Thu, 29 Jan 2026 14:07:14 +0100
Subject: [PATCH 6/6] t5608: add regression test for >4GB object clone

The shift overflow bug in index-pack and unpack-objects caused
incorrect object size calculations when the encoded size required more
than 32 bits of shift. This would result in corrupted or failed
unpacking of objects larger than 4GB.

Add a test that creates a pack file containing a 4GB+ blob using the
new 'test-tool synthesize pack' helper, then clones the repository to
verify that the fix works correctly.

Signed-off-by: Johannes Schindelin
---
 t/t5608-clone-2gb.sh | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/t/t5608-clone-2gb.sh b/t/t5608-clone-2gb.sh
index 87a8cd9f98381a..67b611353de8c4 100755
--- a/t/t5608-clone-2gb.sh
+++ b/t/t5608-clone-2gb.sh
@@ -49,4 +49,15 @@ test_expect_success 'clone - with worktree, file:// protocol' '
 
 '
 
+test_expect_success 'clone with >4GB object' '
+	# Generate a pack with a single >4GB blob to test the shift overflow
+	# fix: the size calculation overflowed once the shift reached 32 bits.
+	git init --bare 4gb-repo &&
+	head_oid=$(test-tool synthesize pack $((4*1024*1024*1024+1)) 4gb-repo/objects/pack/test.pack) &&
+	git -C 4gb-repo index-pack objects/pack/test.pack &&
+	git -C 4gb-repo update-ref refs/heads/main $head_oid &&
+
+	git clone --no-checkout --bare 4gb-repo 4gb-clone
+'
+
 test_done
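
For reference, the size decoding that this series fixes can be sketched
as a standalone function; decode_object_size() is an illustrative name,
while the in-tree logic lives in unpack_raw_entry() and
unpack_object_header_buffer():

    #include <stdint.h>

    /*
     * Decode a pack object header: the low 4 bits of the first byte seed
     * the size, bits 4-6 carry the type, and each continuation byte
     * contributes 7 more size bits. With a 32-bit size type, the shift
     * reaches 32 for sizes of 4GB and beyond, which is undefined behavior
     * and in practice truncates the result.
     */
    static uint64_t decode_object_size(const uint8_t *p, unsigned *type)
    {
        uint64_t size;
        unsigned shift;
        uint8_t c = *p++;

        *type = (c >> 4) & 7;
        size = c & 15;
        shift = 4;
        while (c & 0x80) { /* high bit set: another byte follows */
            c = *p++;
            size += ((uint64_t)c & 0x7f) << shift;
            shift += 7;
        }
        return size;
    }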