Commit a97ab709 authored by Christophe Benz

Find existing dataset tree in packs instead of commit

parent 60183894
Pipeline #1183 canceled
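The change replaces a lookup of the existing dataset tree in the HEAD commit with a lookup in the per-dataset pack files. As a rough sketch of the new approach (the pack naming and the fact that the dataset tree is the last object in its pack come from the script below; the helper name and usage are hypothetical):

import os
from collections import deque
from dulwich.objects import Tree
from dulwich.repo import Repo

def dataset_tree_from_packs(repo, dataset_code):
    # Each dataset is packed into objects/pack/pack-<dataset_code>.pack,
    # and its dataset tree is the last object the script wrote into that pack.
    for pack in repo.object_store.packs:
        if os.path.basename(pack._basename) == "pack-{}".format(dataset_code):
            last = deque(pack.iterobjects(), maxlen=1).pop()
            assert isinstance(last, Tree) and b"dataset.json" in last
            return last
    return None

# Hypothetical usage:
# repo = Repo(args.target_dir)
# dataset_tree = dataset_tree_from_packs(repo, "some-dataset-code")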
@@ -25,7 +25,7 @@
import argparse
from collections import OrderedDict
from collections import deque, OrderedDict
import hashlib
import io
import itertools
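The only change in this hunk is the deque import, which backs the tail() helper added further down. A self-contained illustration of the recipe (not part of the commit):

from collections import deque

def tail(n, iterable):
    "Return an iterator over the last n items"
    # A deque with maxlen=n silently discards older items as newer ones arrive.
    return iter(deque(iterable, maxlen=n))

assert list(tail(3, 'ABCDEFG')) == ['E', 'F', 'G']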
@@ -313,9 +313,6 @@ def main():
    repo = Repo(args.target_dir)
    if args.keep_packs:
        assert b'HEAD' in repo.get_refs()
    # Write provider.json, category, datasets, series and observations in repo.
    provider_tree = Tree()
@@ -373,25 +370,26 @@ def main():
    return 0

def tail(n, iterable):
    "Return an iterator over the last n items"
    # tail(3, 'ABCDEFG') --> E F G
    return iter(deque(iterable, maxlen=n))
def add_dataset_to_category(repo, dataset_code, category_tree):
    # Keep the dataset tree of the current commit for the next commit, to avoid deleting the dataset.
    dataset_tree = find_dataset_tree(repo, dataset_code, tree=repo[repo[repo.head()].tree])
    # Don't look in the HEAD commit, because the script could have failed before committing.
    if not repo.object_store.packs:
        return None
    dataset_tree = None
    for pack in repo.object_store.packs:
        if os.path.basename(pack._basename) != "pack-{}".format(dataset_code):
            continue
        # The last object yielded by iter_git_objects_in_sdmx_file is the dataset tree.
        dataset_tree = next(tail(1, pack.iterobjects()))
        assert isinstance(dataset_tree, Tree) and b"dataset.json" in dataset_tree
    assert dataset_tree is not None
    category_tree.add(dataset_code.encode('utf-8'), git_tree_filemode, dataset_tree.id)
def find_dataset_tree(repo, dataset_code, tree):
"""Find the Git tree corresponding to a dataset."""
if b"dataset.json" in tree and tree[b"dataset.json"][0] == git_blob_filemode:
dataset_json = json.loads(repo[tree[b"dataset.json"][1]].data.decode('utf-8'))
return tree \
if dataset_json["code"] == dataset_code \
else None
    for tree_entry in tree.iteritems():
        if tree_entry.mode == git_tree_filemode:
            found_tree = find_dataset_tree(repo, dataset_code, tree=repo[tree_entry.sha])
            if found_tree is not None:
                return found_tree
    return None
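For reference, the removed find_dataset_tree above recursed through dulwich Tree objects, whose entries map a name to a (mode, sha) pair. A small self-contained sketch of that structure, using an in-memory repository and hypothetical data:

from dulwich.objects import Blob, Tree
from dulwich.repo import MemoryRepo

repo = MemoryRepo()
blob = Blob.from_string(b'{"code": "MY-DATASET"}')  # hypothetical dataset.json content
repo.object_store.add_object(blob)

tree = Tree()
tree.add(b"dataset.json", 0o100644, blob.id)  # name, file mode, blob SHA
repo.object_store.add_object(tree)

mode, sha = tree[b"dataset.json"]  # tree entries are (mode, sha) pairs
assert repo[sha].data == b'{"code": "MY-DATASET"}'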
@@ -401,16 +399,18 @@ def write_dataset_pack(repo, sdmx_file_path, category_tree):
    dataset_code = os.path.basename(sdmx_file_path[:-len(sdmx_file_extension)])
    pack_file_name = "pack-{}.pack".format(dataset_code)  # Must be named this way to be recognized as a pack by dulwich.
    pack_file_path = os.path.abspath(os.path.join(args.target_dir, "objects", "pack", pack_file_name))
    if args.datasets_codes is not None and dataset_code not in args.datasets_codes:
        add_dataset_to_category(repo, dataset_code, category_tree)
    elif args.keep_packs and os.path.isfile(pack_file_path):
        log.info("Git pack file %s already exists: skipping pack generation", pack_file_path)
        add_dataset_to_category(repo, dataset_code, category_tree)
    if args.datasets_codes is None or dataset_code in args.datasets_codes:
        if args.keep_packs and os.path.isfile(pack_file_path):
            log.info("Git pack file %s already exists: skipping pack generation", pack_file_path)
            add_dataset_to_category(repo, dataset_code, category_tree)
        else:
            pack_start_time = time.time()
            write_pack(pack_file_path, git_objects)
            pack_time = time.time() - pack_start_time
            log.info("Git pack file %s written, took %s seconds", pack_file_path, pack_time)
    else:
        pack_start_time = time.time()
        write_pack(pack_file_path, git_objects)
        pack_time = time.time() - pack_start_time
        log.info("Git pack file %s written, took %s seconds", pack_file_path, pack_time)
    # In order not to delete this dataset in the new commit, add it to category_tree.
    add_dataset_to_category(repo, dataset_code, category_tree)
# Dulwich functions slightly modified to accept a generator and iterate over it only once.
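This matters because write_dataset_pack passes write_pack a generator of Git objects, and a generator can only be consumed once; the pack writer therefore cannot make one pass to count objects and a second pass to write them, which is presumably why the object count is patched into the header afterwards (see the next hunk). A trivial illustration of the single-pass constraint, with a hypothetical stand-in generator:

def git_objects_gen():
    yield "blob"
    yield "tree"

gen = git_objects_gen()
assert list(gen) == ["blob", "tree"]
assert list(gen) == []  # exhausted: a second pass yields nothing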
@@ -480,11 +480,12 @@ def write_pack(pack_file_path, objects, deltify=None, delta_window_size=None):
        deltify=deltify,
    )
    # Overwrite the dummy number of Git objects written during `write_pack_objects`.
    with open(pack_file_path, "r+b") as f:
        # Overwrite the dummy number of Git objects written during `write_pack_objects`.
        f.seek(8)
        f.write(struct.pack(b'>L', num_objects))
        # Recompute the pack SHA-1 checksum.
        f.seek(0)
        BUF_SIZE = 65536 * 1024
        sha1 = hashlib.sha1()
...
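In short: a Git pack file starts with the 4-byte signature "PACK", a 4-byte version and a 4-byte big-endian object count, and it ends with a SHA-1 checksum of everything before it. Because the object count is only known once the generator is exhausted, the code patches the count at offset 8 and then recomputes the trailing checksum. A simplified sketch of that fix-up, reading the whole file at once rather than in BUF_SIZE chunks (hypothetical helper, not the commit's exact code):

import hashlib
import struct

def fixup_pack(pack_file_path, num_objects):
    with open(pack_file_path, "r+b") as f:
        # Header: b"PACK" (4 bytes) + version (4 bytes) + object count (4 bytes, big-endian).
        f.seek(8)
        f.write(struct.pack(">L", num_objects))
        # Recompute the SHA-1 over everything except the old 20-byte trailing checksum.
        f.seek(0)
        body = f.read()[:-20]
        f.seek(len(body))
        f.write(hashlib.sha1(body).digest())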