Skip to content
Snippets Groups Projects
Commit 89bdc692 authored by Christophe Benz
Browse files

Don't index already indexed packs

parent 7ac5c38a
No related branches found
No related tags found
No related merge requests found
Pipeline #1279 failed with stage
in 1 minute and 23 seconds
......@@ -65,14 +65,30 @@ def main():
pack_dir_path = os.path.abspath(os.path.join(args.target_dir, "objects", "pack"))
# Load datasets index in Git repository.
dataset_pair_by_dataset_code = {}
dataset_index_file_path = os.path.abspath(os.path.join(args.target_dir, "objects", "info", "datasets"))
with open(dataset_index_file_path, "w") as dataset_index_file:
if os.path.isfile(dataset_index_file_path):
with open(dataset_index_file_path) as dataset_index_file:
def to_pair(line):
    """Parse one line of the datasets index file into a dict item.

    A line is made of tab-separated fields. The first field becomes the
    key and the next two fields (at most) become the value.

    Args:
        line: one raw line of the index file, with or without its
            trailing newline.

    Returns:
        A ``(first_field, [second_field, third_field])`` tuple; the list
        is shorter when the line has fewer than three fields.
    """
    # Strip only an actual line terminator: slicing with [:-1] would eat a
    # real character when the last line of the file has no final "\n".
    fragments = line.rstrip("\n").split("\t")
    return fragments[0], fragments[1:3]
dataset_pair_by_dataset_code = dict(map(to_pair, dataset_index_file.readlines()))
# Update datasets index file.
with open(dataset_index_file_path, "a") as dataset_index_file:
for pack_file_name in glob.iglob(os.path.join(pack_dir_path, "*.pack")):
dataset_code = os.path.basename(pack_file_name)[len("pack-"):-len(".pack")]
if dataset_code in dataset_pair_by_dataset_code:
log.info("Pack %s already indexed, skipping", pack_file_name)
continue
pack_file_path = os.path.join(pack_dir_path, pack_file_name)
log.info("Indexing pack %s (size: %d)", pack_file_name, os.path.getsize(pack_file_path))
# The last yield in iter_git_objects_in_sdmx_file is dataset_tree,
# because it was yielded last by iter_git_objects_in_sdmx_file.
dataset_code = os.path.basename(pack_file_name)[len("pack-"):-len(".pack")]
# Find tree ID of dataset in pack.
process = subprocess.run(
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment