#! /usr/bin/env python3

# eurostat-fetcher -- Fetch series from Eurostat database
# By: Christophe Benz <christophe.benz@cepremap.org>
#
# Copyright (C) 2017 Cepremap
# https://git.nomics.world/dbnomics-fetchers/eurostat-fetcher
#
# eurostat-fetcher is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# eurostat-fetcher is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


"""Update .git/objects/info/datasets index file, which is a JSON file containing stuff like this:

ilc_mdes01      aa1675b190b33b8780456b12be0661e36a1fe9d9        18aea78b30f0ecbb1326592bb774ddb570fb8872
met_pjanaggr3   6e0fa8b80ca7a8fe5bbfa64bce1d5eab11dbc1cd        bbf60063bb70b48da3874ec193ce35c61d6071ae
nrg_134m        e3337a0f5f57367b191e6e04205499d0ab614e5e        4a68fecda8f6a82f53e68d32d821c817f154620d
lfsq_ipga       740dfcfebb6e14b26f8605e7fd9b03ec3a30510f        a67a085713d5061096f06615c15640f9f2324700

Columns: dataset_code, dataset_tree_id, dataset_json_blob_id

This index is used by eurostat_to_dbnomics.py to quickly find the dataset tree ID of each pack.
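
A consumer can rebuild the mapping with something like this (a minimal sketch,
not necessarily the exact code used by eurostat_to_dbnomics.py):

    index = {}
    with open(".git/objects/info/datasets") as index_file:
        for line in index_file:
            dataset_code, dataset_tree_id, dataset_json_blob_id = line.rstrip("\n").split("\t")
            index[dataset_code] = (dataset_tree_id, dataset_json_blob_id)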
"""

import argparse
import glob
import logging
import os
import subprocess
import sys


log = logging.getLogger(__name__)


def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'target_dir',
        help='path of target directory containing datasets & series in DB.nomics JSON and TSV formats',
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='display logging messages from debug level',
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    pack_dir_path = os.path.abspath(os.path.join(args.target_dir, "objects", "pack"))

    # Load datasets index in Git repository.

    dataset_pair_by_dataset_code = {}
    dataset_index_file_path = os.path.abspath(os.path.join(args.target_dir, "objects", "info", "datasets"))
    if os.path.isfile(dataset_index_file_path):
        with open(dataset_index_file_path) as dataset_index_file:
            def to_pair(line):
                fragments = line.rstrip("\n").split("\t")
                return fragments[0], fragments[1:3]
            dataset_pair_by_dataset_code = dict(map(to_pair, dataset_index_file))

    # Update datasets index file.

    with open(dataset_index_file_path, "a") as dataset_index_file:
        for pack_file_name in glob.iglob(os.path.join(pack_dir_path, "*.pack")):
            dataset_code = os.path.basename(pack_file_name)[len("pack-"):-len(".pack")]
            if dataset_code in dataset_pair_by_dataset_code:
                log.info("Pack %s already indexed, skipping", pack_file_name)
                continue

            # glob.iglob returns absolute paths here, because the pattern is absolute.
            log.info("Indexing pack %s (size: %d)", pack_file_name, os.path.getsize(pack_file_name))
            # iter_git_objects_in_sdmx_file yields the dataset tree last, so the
            # last tree object listed in the pack is the dataset tree.

            # Find tree ID of dataset in pack.
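            # `git verify-pack -v` prints one line per object in the pack, in the
            # form "<sha1> <type> <size> <size-in-packfile> <offset>", e.g. (hash
            # illustrative):
            #   6e0fa8b80ca7a8fe5bbfa64bce1d5eab11dbc1cd tree   163 140 12
            # so the first space-separated field of the last "tree" line is the
            # SHA-1 of the dataset tree.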
            process = subprocess.run(
                "git verify-pack -v {} | grep tree | tail -n 1".format(pack_file_name),
                shell=True,
                cwd=args.target_dir,
                check=True,
                stdout=subprocess.PIPE,
            )
            dataset_tree_id = process.stdout.decode('utf-8').split(" ")[0]
            if not dataset_tree_id:
                log.error("Could not retrieve dataset_tree_id from pack %s, skipping", pack_file_name)
                continue

            # Find blob ID of dataset.json in dataset tree.
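            # `git ls-tree` prints one entry per line in the form
            # "<mode> <type> <sha>\t<name>", e.g. "100644 blob a67a0857...\tdataset.json",
            # hence the split on space then on tab below.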
            process = subprocess.run(
                "git ls-tree {} | grep dataset.json".format(dataset_tree_id),
                shell=True,
                cwd=args.target_dir,
                check=True,
                stdout=subprocess.PIPE,
            )
            dataset_json_blob_id = process.stdout.decode('utf-8').split(" ")[2].split("\t")[0]
            assert dataset_json_blob_id

            dataset_index_file.write("\t".join([dataset_code, dataset_tree_id, dataset_json_blob_id]) + "\n")

    log.info("Datasets index file written: %s", dataset_index_file_path)

    return 0


if __name__ == '__main__':
    sys.exit(main())