
Draft: Read previous datetime from env

Closed Christophe Benz requested to merge 821-read-datetime-from-env into master
+6 −68
@@ -28,12 +28,9 @@ import argparse
 import logging
 import os
 import re
-import shutil
-import subprocess
 import sys
 import time
 from collections import OrderedDict
-from io import StringIO
 from pathlib import Path
 import humanize
@@ -375,12 +372,6 @@ def main():
         default=datasets_from_env,
         help="convert only the given datasets (datasets codes, space separated)",
     )
-    parser.add_argument(
-        "--full",
-        action="store_true",
-        default=os.getenv(FULL_ENV_VAR),
-        help="convert all datasets; default behavior is to convert what changed since last commit",
-    )
     parser.add_argument("--log", default="INFO", help="level of logging messages")
     parser.add_argument("--resume", action="store_true", help="do not process already written datasets")
     parser.add_argument("--start-from", metavar="DATASET_CODE", help="start indexing from dataset code")
@@ -396,40 +387,9 @@ def main():
         raise ValueError("Invalid log level: {}".format(args.log))
     logging.basicConfig(format="%(levelname)s:%(message)s", level=numeric_level)
-    if args.datasets:
-        args.full = True
-    # Ask Git which datasets directories were modified in latest commit in source-data repository.
-    if not args.full:
-        try:
-            output = subprocess.check_output(
-                ["git", "diff", "--name-status", "HEAD^", datasets_dir_name],
-                cwd=str(args.source_dir),
-                universal_newlines=True,
-            )
-        except subprocess.CalledProcessError:
-            args.full = True
-        else:
-            modified_datasets_codes = set()
-            deleted_datasets_codes = set()
-            for line in StringIO(output):
-                action, file_path = line.strip().split()
-                try:
-                    dataset_code = Path(file_path).parent.relative_to(datasets_dir_name).name
-                except ValueError:
-                    continue
-                if action in {"A", "M"}:
-                    modified_datasets_codes.add(dataset_code)
-                else:
-                    assert action == "D", action
-                    deleted_datasets_codes.add(dataset_code)
-            log.info(
-                "%d datasets were modified and %d were deleted by last download",
-                len(modified_datasets_codes),
-                len(deleted_datasets_codes),
-            )
     log.info("Command-line arguments: %r", args)
-    log.info("Mode: %s", "full" if args.full else "incremental")
     write_json_file(args.target_dir / "provider.json", provider_json)
     # Parse "table_of_contents", abbreviated "toc".
     toc_element = etree.parse(str(args.source_dir / "table_of_contents.xml")).getroot()
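
The block removed above was the core of incremental mode: it asked Git which files under the datasets directory changed in the latest commit and classified the affected dataset codes as modified or deleted. Extracted into a standalone function, the removed logic reads roughly as follows (`datasets_dir_name` is given a placeholder default here; the script defines it elsewhere):

```python
import subprocess
from io import StringIO
from pathlib import Path

def changed_dataset_codes(source_dir: Path, datasets_dir_name: str = "data"):
    """Classify dataset codes touched by the latest commit as modified or deleted."""
    output = subprocess.check_output(
        ["git", "diff", "--name-status", "HEAD^", datasets_dir_name],
        cwd=str(source_dir),
        universal_newlines=True,
    )
    modified_datasets_codes = set()
    deleted_datasets_codes = set()
    for line in StringIO(output):
        # Each line looks like "M\t<datasets_dir_name>/<dataset_code>/<file>".
        action, file_path = line.strip().split()
        try:
            dataset_code = Path(file_path).parent.relative_to(datasets_dir_name).name
        except ValueError:
            continue  # the file is outside the datasets directory
        if action in {"A", "M"}:
            modified_datasets_codes.add(dataset_code)
        else:
            assert action == "D", action
            deleted_datasets_codes.add(dataset_code)
    return modified_datasets_codes, deleted_datasets_codes
```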
@@ -439,6 +399,9 @@ def main():
     toc_dataset_json_stub_by_code = {}
     category_tree_json = toc_to_category_tree(toc_element, toc_dataset_json_stub_by_code)
+    if category_tree_json:
+        write_json_file(args.target_dir / "category_tree.json", category_tree_json)
     # Build list of datasets codes to convert
     datasets_codes_to_convert = set()
     for dataset_code in sorted(toc_dataset_json_stub_by_code):
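
`write_json_file` is a helper defined outside the lines shown in this diff. A plausible minimal implementation, assuming it serializes to UTF-8 JSON with stable key order (the real helper may differ):

```python
import json
from pathlib import Path

def write_json_file(file_path: Path, data) -> None:
    # Assumed behavior: pretty-printed, sorted keys, so regenerating a file
    # with unchanged data produces no Git diff in the target repository.
    with file_path.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2, sort_keys=True)
```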
@@ -448,12 +411,6 @@ def main():
                 dataset_code,
             )
             continue
-        if not args.full and dataset_code not in modified_datasets_codes:
-            log.debug(
-                "Skipping dataset %r because it was not modified by last download (due to incremental mode)",
-                dataset_code,
-            )
-            continue
         if args.start_from is not None and dataset_code < args.start_from:
             log.debug("Skipping dataset %r because of --start-from option", dataset_code)
             continue
@@ -474,7 +431,7 @@ def main():
             )
             continue
         dataset_dir = args.target_dir / dataset_code
-        if args.resume and (dataset_dir / "dataset.json").is_file():
+        if args.resume and dataset_dir.is_dir():
             log.debug(
                 "Skipping dataset %r because it already exists (due to --resume option)",
                 dataset_code,
@@ -484,20 +441,6 @@ def main():
     log.info("Converting %d datasets...", len(datasets_codes_to_convert))
-    # Remove directories of datasets to be converted before converting.
-    if not args.resume:
-        datasets_codes_to_delete = datasets_codes_to_convert
-        if not args.full:
-            datasets_codes_to_delete = datasets_codes_to_delete.union(deleted_datasets_codes)
-        log.info(
-            "Removing directories of deleted datasets and datasets to be converted: %r",
-            datasets_codes_to_delete,
-        )
-        for dataset_code in datasets_codes_to_delete:
-            dataset_dir = args.target_dir / dataset_code
-            if dataset_dir.is_dir():
-                shutil.rmtree(str(dataset_dir))
     # Convert SDMX files. Side-effect: write files for each dataset.
     converted_datasets_codes = set()
     for index, dataset_code in enumerate(sorted(datasets_codes_to_convert), start=1):
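
The block deleted above pre-cleaned the target directories so that a non-resume run started from a blank slate, additionally dropping directories of datasets Git reported as deleted. Its core pattern is a guarded `shutil.rmtree` that is a no-op when a directory is absent:

```python
import shutil
from pathlib import Path

def remove_dataset_dirs(target_dir: Path, dataset_codes) -> None:
    """Remove per-dataset output directories if present (no-op otherwise)."""
    for dataset_code in dataset_codes:
        dataset_dir = target_dir / dataset_code
        if dataset_dir.is_dir():
            shutil.rmtree(str(dataset_dir))
```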
@@ -524,11 +467,6 @@ def main():
         converted_datasets_codes.add(dataset_code)
-    write_json_file(args.target_dir / "provider.json", provider_json)
-    if category_tree_json:
-        write_json_file(args.target_dir / "category_tree.json", category_tree_json)
     return 0
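
The added lines that give the branch its name ("read previous datetime from env") are not visible in this excerpt. Going by the title alone, one hypothetical sketch of such a mechanism: read the previous run's datetime from an environment variable and select dataset directories whose files changed since then. Every name below is illustrative, not taken from the branch:

```python
import os
from datetime import datetime, timezone
from pathlib import Path

# Hypothetical variable name; the branch's actual choice is not shown here.
PREVIOUS_RUN_DATETIME_ENV_VAR = "PREVIOUS_RUN_DATETIME"

def datasets_modified_since_env(source_datasets_dir: Path):
    """Return codes of dataset directories modified since the datetime read
    from the environment, or None when the variable is unset (convert all)."""
    raw = os.getenv(PREVIOUS_RUN_DATETIME_ENV_VAR)
    if raw is None:
        return None
    previous_dt = datetime.fromisoformat(raw)
    if previous_dt.tzinfo is None:
        previous_dt = previous_dt.replace(tzinfo=timezone.utc)
    threshold = previous_dt.timestamp()
    return {
        child.name
        for child in source_datasets_dir.iterdir()
        if child.is_dir() and child.stat().st_mtime >= threshold
    }
```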