Newer
Older
#! /usr/bin/env python3
# eurostat-fetcher -- Fetch series from Eurostat database
# By: Christophe Benz <christophe.benz@cepremap.org>
#
# Copyright (C) 2017 Cepremap
# https://git.nomics.world/dbnomics-fetchers/eurostat-fetcher
#
# eurostat-fetcher is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# eurostat-fetcher is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Update .git/objects/info/datasets index file, which is a tab-separated text file containing lines like this:
ilc_mdes01 aa1675b190b33b8780456b12be0661e36a1fe9d9 18aea78b30f0ecbb1326592bb774ddb570fb8872
met_pjanaggr3 6e0fa8b80ca7a8fe5bbfa64bce1d5eab11dbc1cd bbf60063bb70b48da3874ec193ce35c61d6071ae
nrg_134m e3337a0f5f57367b191e6e04205499d0ab614e5e 4a68fecda8f6a82f53e68d32d821c817f154620d
lfsq_ipga 740dfcfebb6e14b26f8605e7fd9b03ec3a30510f a67a085713d5061096f06615c15640f9f2324700
Columns: dataset_code, dataset_tree_id, dataset_json_blob_id
This index is used by eurostat_to_dbnomics.py to quickly find the dataset tree ID of each pack.
"""
import argparse
from collections import deque
import glob
import json
import logging
import os
import subprocess
import sys
log = logging.getLogger(__name__)


def _load_indexed_dataset_codes(dataset_index_file_path):
    """Return the set of dataset codes already listed in the datasets index file.

    Each line of the index is "dataset_code<TAB>dataset_tree_id<TAB>dataset_json_blob_id".
    A missing index file yields an empty set.
    """
    if not os.path.isfile(dataset_index_file_path):
        return set()
    with open(dataset_index_file_path, encoding="utf-8") as dataset_index_file:
        # rstrip("\n") rather than line[:-1]: a last line without a trailing newline
        # must not lose its final character.
        stripped_lines = (line.rstrip("\n") for line in dataset_index_file)
        return {line.split("\t", 1)[0] for line in stripped_lines if line}


def _find_dataset_tree_id(pack_file_path):
    """Return the ID of the last tree object listed by `git verify-pack -v`, or None.

    None is returned when git reports a failure for this pack or when the pack
    lists no tree object, so that the caller can skip the pack.
    """
    try:
        process = subprocess.run(
            ["git", "verify-pack", "-v", pack_file_path],
            check=True,
            stdout=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as exc:
        log.error("git verify-pack failed for pack %s (exit status %d)", pack_file_path, exc.returncode)
        return None
    dataset_tree_id = None
    for line in process.stdout.decode("utf-8").splitlines():
        # Object lines read "<object ID> <type> <size> ..."; matching on the type column
        # (instead of grepping for "tree") ignores the trailing "<pack path>: ok" line
        # even when the path itself contains "tree".
        fields = line.split()
        if len(fields) >= 2 and fields[1] == "tree":
            dataset_tree_id = fields[0]
    return dataset_tree_id


def _find_dataset_json_blob_id(dataset_tree_id, repository_dir):
    """Return the blob ID of the "dataset.json" entry of the given tree.

    Raises:
        subprocess.CalledProcessError: if `git ls-tree` fails.
        RuntimeError: if the tree has no entry named exactly "dataset.json".
    """
    process = subprocess.run(
        ["git", "ls-tree", dataset_tree_id],
        cwd=repository_dir,
        check=True,
        stdout=subprocess.PIPE,
    )
    for line in process.stdout.decode("utf-8").splitlines():
        # Each entry reads "<mode> <type> <object ID>\t<name>".
        metadata, _, entry_name = line.partition("\t")
        if entry_name != "dataset.json":
            continue
        fields = metadata.split()
        if len(fields) == 3:
            return fields[2]
    raise RuntimeError("No dataset.json entry found in tree {}".format(dataset_tree_id))


def main():
    """Append to the datasets index one line per pack file that is not indexed yet.

    The target directory is read from the command line. Packs are looked up in
    "<target_dir>/objects/pack" and the index lives in "<target_dir>/objects/info/datasets".
    The dataset code of a pack is its file name without the "pack-" prefix and the
    ".pack" suffix.

    Returns:
        0, used as the process exit status.

    Raises:
        subprocess.CalledProcessError, RuntimeError: if the dataset.json blob of a
            dataset tree cannot be resolved.
        FileNotFoundError: if the git executable cannot be started.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'target_dir',
        help='path of target directory containing datasets & series in DB.nomics JSON and TSV formats',
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='display logging messages from debug level',
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    pack_dir_path = os.path.abspath(os.path.join(args.target_dir, "objects", "pack"))
    dataset_index_file_path = os.path.abspath(os.path.join(args.target_dir, "objects", "info", "datasets"))

    # Load datasets index in Git repository.
    indexed_dataset_codes = _load_indexed_dataset_codes(dataset_index_file_path)

    # Update datasets index file.
    with open(dataset_index_file_path, "a", encoding="utf-8") as dataset_index_file:
        # iglob already yields full paths because pack_dir_path is absolute.
        for pack_file_path in glob.iglob(os.path.join(pack_dir_path, "*.pack")):
            dataset_code = os.path.basename(pack_file_path)[len("pack-"):-len(".pack")]
            if dataset_code in indexed_dataset_codes:
                log.info("Pack %s already indexed, skipping", pack_file_path)
                continue
            log.info("Indexing pack %s (size: %d)", pack_file_path, os.path.getsize(pack_file_path))

            # The dataset tree is taken to be the last tree listed for the pack.
            # NOTE(review): this relies on the pack writer (iter_git_objects_in_sdmx_file,
            # not visible in this file) emitting the dataset tree last -- confirm.
            dataset_tree_id = _find_dataset_tree_id(pack_file_path)
            if not dataset_tree_id:
                log.error("Could not retrieve dataset_tree_id from pack %s, skipping", pack_file_path)
                continue

            # Find blob ID of dataset.json in dataset tree.
            dataset_json_blob_id = _find_dataset_json_blob_id(dataset_tree_id, args.target_dir)

            dataset_index_file.write("\t".join([dataset_code, dataset_tree_id, dataset_json_blob_id]) + "\n")

    log.info("Datasets index file written: %s", dataset_index_file_path)
    return 0
if __name__ == "__main__":
    # Run the script and hand main()'s return value to the interpreter as exit status.
    exit_status = main()
    sys.exit(exit_status)