Skip to content
Snippets Groups Projects
Commit 6977e67e authored by Christophe Benz's avatar Christophe Benz
Browse files

Implement "append" mode for pack files, fix SHA-1 checksum.

parent 4b08c648
No related branches found
No related tags found
No related merge requests found
Pipeline #1169 failed with stage
in 1 minute and 24 seconds
......@@ -26,6 +26,8 @@
import argparse
from collections import OrderedDict
import hashlib
import io
import itertools
import logging
import os
......@@ -37,7 +39,7 @@ import time
from dulwich.file import GitFile
from dulwich.objects import Blob, Tree, Commit
from dulwich.pack import deltify_pack_objects, OFS_DELTA, REF_DELTA, SHA1Writer, write_pack_index_v2, \
from dulwich.pack import deltify_pack_objects, load_pack_index, OFS_DELTA, REF_DELTA, SHA1Writer, write_pack_index_v2, \
write_pack_object, write_pack_header
from dulwich.repo import Repo
from dulwich.server import generate_objects_info_packs
......@@ -336,70 +338,119 @@ def main():
repo.object_store.add_object(provider_tree)
# Create commit
log.info("Creating commit...")
committer = "{} <eurostat@db.nomics.world>".format(provider_code).encode('utf-8')
commit_id = repo.do_commit(
commit_timestamp=int(time.time()),
commit_timezone=0,
committer=committer,
encoding=b"UTF-8",
message=b"New conversion",
tree=provider_tree.id,
)
log.info("Commit ID: %s", commit_id)
generate_objects_info_packs(repo)
if b'HEAD' in repo.get_refs() and repo[repo.head()].tree == provider_tree.id:
log.info("No changes => not committing")
else:
log.info("Creating commit...")
committer = "{} <eurostat@db.nomics.world>".format(provider_code).encode('utf-8')
commit_id = repo.do_commit(
commit_timestamp=int(time.time()),
commit_timezone=0,
committer=committer,
encoding=b"UTF-8",
message=b"New conversion",
tree=provider_tree.id,
)
log.info("Commit ID: %s", commit_id)
generate_objects_info_packs(repo)
return 0
def write_dataset_pack(sdmx_file_path, git_objects, target_dir):
def iter_deduped(git_objects):
yielded_ids = set()
for git_object in git_objects:
git_object_id = git_object.id
if git_object_id not in yielded_ids:
yielded_ids.add(git_object_id)
yield git_object
pack_start_time = time.time()
dataset_code = os.path.basename(sdmx_file_path[:-len(sdmx_file_extension)])
pack_file_name = "pack-{}".format(dataset_code)
pack_file_path = os.path.abspath(os.path.join(target_dir, "objects", "pack", pack_file_name))
write_pack(pack_file_path, iter_deduped(git_objects))
write_pack(pack_file_path, git_objects)
pack_time = time.time() - pack_start_time
log.info("Git pack file %s written, took %s seconds", pack_file_path, pack_time)
# Dulwich functions slightly modified not to require passing objects length.
# To achieve that:
# - `write_pack_data` writes a dummy number (42) of Git objects, counts objects during iteration and returns the number.
# - `write_pack` overwrites this dummy number after Git objects are written, by the number of objects returned by `write_pack_data`.
# A drawback is that the checksum of the pack file is wrong now, so "git verify-pack" fails, but not other git commands.
# Dulwich functions slightly modified to accept a generator and iterate it only once.
# As a consequence, the objects given in argument no longer need to implement len(). To achieve that:
# - `write_pack_data` writes a dummy number (42) of Git objects, counts objects during iteration and returns the number,
# - `write_pack` overwrites this dummy number after Git objects are written, by the number of objects returned by `write_pack_data`,
# - write the SHA-1 checksum of the pack file afterwards.
#
# Also, adds an "append" mode to add objects to a pack file:
# - remove the SHA-1 checksum,
# - append objects,
# - recompute the checksum and write it,
# - write the index file (.idx) of the pack file (.pack) taking into account existing objects count and existing entries.
def iter_deduped(git_objects, existing_ids=None):
    """Yield each Git object at most once, keyed by its ``id``.

    :param git_objects: iterable of Git objects exposing an ``id`` attribute
    :param existing_ids: optional set of ids to treat as already yielded;
        when given, it is updated in place with every id that is yielded
    """
    seen = existing_ids if existing_ids is not None else set()
    for obj in git_objects:
        if obj.id in seen:
            continue
        seen.add(obj.id)
        yield obj
def write_pack(file_path, objects, deltify=None, delta_window_size=None):
    """Write a pack data file (.pack) and its index file (.idx).

    If ``file_path + '.pack'`` already exists, operate in "append" mode:
    strip the trailing 20-byte SHA-1 checksum, append the objects that are
    not already indexed, update the object count in the header, recompute
    the checksum, and rewrite the index from the merged entries.

    :param file_path: Path to the pack file, without the .pack extension.
    :param objects: Iterable of objects to write; may be a one-shot
        generator and does not need to implement ``len()``.
    :param deltify: Whether to deltify pack objects.
    :param delta_window_size: Delta window size.
    :return: Tuple of (pack file checksum, index file checksum).
    """
    pack_file_path = file_path + '.pack'
    if os.path.isfile(pack_file_path):
        # Append mode: merge new objects into the existing pack file.
        pack_index = load_pack_index(file_path + '.idx')
        existing_entries = {
            object_id: (offset, crc32)
            for object_id, offset, crc32 in pack_index.iterentries()
        }
        with open(pack_file_path, 'r+b') as f:
            # The object count lives at bytes 8-12 of the pack header,
            # as a big-endian unsigned 32-bit integer.
            f.seek(8)
            existing_num_objects = struct.unpack(b'>L', f.read(4))[0]
            # Drop the trailing 20-byte SHA-1 trailer so new objects can
            # be appended where it used to be.
            f.seek(-20, io.SEEK_END)
            f.truncate()
            new_entries, new_num_objects = write_pack_objects(
                f,
                iter_deduped(objects, existing_ids=set(pack_index)),
                delta_window_size=delta_window_size,
                deltify=deltify,
            )
        num_objects = existing_num_objects + new_num_objects
        entries = dict(existing_entries)
        entries.update(new_entries)
    else:
        with GitFile(pack_file_path, 'wb') as f:
            # Temporary object count; overwritten below once the real
            # number of written objects is known.
            write_pack_header(f, 42)
            entries, num_objects = write_pack_objects(
                f,
                iter_deduped(objects),
                delta_window_size=delta_window_size,
                deltify=deltify,
            )
    with open(pack_file_path, "r+b") as f:
        # Overwrite the dummy (or stale) object count in the header.
        f.seek(8)
        f.write(struct.pack(b'>L', num_objects))
        # Recompute the SHA-1 of the whole file in chunks and append it
        # as the pack trailer, so "git verify-pack" stays happy.
        f.seek(0)
        BUF_SIZE = 65536 * 1024  # 64 MiB read chunks
        sha1 = hashlib.sha1()
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            sha1.update(data)
        f.seek(0, io.SEEK_END)
        data_sum = sha1.digest()
        f.write(data_sum)
    entries = sorted((k, v[0], v[1]) for (k, v) in entries.items())
    with GitFile(file_path + '.idx', 'wb') as f:
        return data_sum, write_pack_index_v2(f, entries, data_sum)
......@@ -411,11 +462,9 @@ def write_pack_data(f, records):
"""
# Write the pack
entries = {}
f = SHA1Writer(f)
write_pack_header(f, 42) # Temporary value, overwritten after Git objects are written.
num_objects = 0
for type_num, object_id, delta_base, raw in records:
offset = f.offset()
offset = f.tell()
if delta_base is not None:
try:
base_offset, base_crc32 = entries[delta_base]
......@@ -429,7 +478,7 @@ def write_pack_data(f, records):
entries[object_id] = (offset, crc32)
num_objects += 1
return entries, f.write_sha(), num_objects
return entries, num_objects
def write_pack_objects(f, objects, delta_window_size=None, deltify=False):
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment