Don't index already indexed packs

89bdc692 · Christophe Benz · 7ac5c38a · 89bdc692
Commit 89bdc692 authored 7 years ago by Christophe Benz
--- a/update_dataset_index.py
+++ b/update_dataset_index.py
@@ -65,14 +65,30 @@ def main():

    pack_dir_path = os.path.abspath(os.path.join(args.target_dir, "objects", "pack"))

+    # Load datasets index in Git repository.
+
+    dataset_pair_by_dataset_code = {}
    dataset_index_file_path = os.path.abspath(os.path.join(args.target_dir, "objects", "info", "datasets"))
-    with open(dataset_index_file_path, "w") as dataset_index_file:
+    if os.path.isfile(dataset_index_file_path):
+        with open(dataset_index_file_path) as dataset_index_file:
+            def to_pair(line):
+                fragments = line[:-1].split("\t")  # Remove final "\n".
+                return fragments[0], fragments[1:3]
+            dataset_pair_by_dataset_code = dict(map(to_pair, dataset_index_file.readlines()))
+
+    # Update datasets index file.
+
+    with open(dataset_index_file_path, "a") as dataset_index_file:
        for pack_file_name in glob.iglob(os.path.join(pack_dir_path, "*.pack")):
+            dataset_code = os.path.basename(pack_file_name)[len("pack-"):-len(".pack")]
+            if dataset_code in dataset_pair_by_dataset_code:
+                log.info("Pack %s already indexed, skipping", pack_file_name)
+                continue
+
            pack_file_path = os.path.join(pack_dir_path, pack_file_name)
            log.info("Indexing pack %s (size: %d)", pack_file_name, os.path.getsize(pack_file_path))
            # The last yield in iter_git_objects_in_sdmx_file is dataset_tree,
            # because it was yielded last by iter_git_objects_in_sdmx_file.
-            dataset_code = os.path.basename(pack_file_name)[len("pack-"):-len(".pack")]

            # Find tree ID of dataset in pack.
            process = subprocess.run(