Commit 0ff0ac25 authored by Christophe Benz

Remove incremental mode for convert

parent 282c2a14
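
This removes the converter's incremental mode: the --full command-line flag, the `git diff --name-status HEAD^` check that detected which dataset directories were modified or deleted by the last download, and the pre-deletion of the corresponding target directories. After this change every dataset listed in the table of contents is converted on each run, narrowed only by the --datasets, --start-from and --resume options.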
@@ -28,12 +28,9 @@ import argparse
import logging
import os
import re
import shutil
import subprocess
import sys
import time
from collections import OrderedDict
from io import StringIO
from pathlib import Path
import humanize
@@ -375,12 +372,6 @@ def main():
        default=datasets_from_env,
        help="convert only the given datasets (datasets codes, space separated)",
    )
    parser.add_argument(
        "--full",
        action="store_true",
        default=os.getenv(FULL_ENV_VAR),
        help="convert all datasets; default behavior is to convert what changed since last commit",
    )
    parser.add_argument("--log", default="INFO", help="level of logging messages")
    parser.add_argument("--resume", action="store_true", help="do not process already written datasets")
    parser.add_argument("--start-from", metavar="DATASET_CODE", help="start indexing from dataset code")
@@ -396,40 +387,9 @@ def main():
        raise ValueError("Invalid log level: {}".format(args.log))
    logging.basicConfig(format="%(levelname)s:%(message)s", level=numeric_level)
    if args.datasets:
        args.full = True
    # Ask Git which datasets directories were modified in latest commit in source-data repository.
    if not args.full:
        try:
            output = subprocess.check_output(
                ["git", "diff", "--name-status", "HEAD^", datasets_dir_name],
                cwd=str(args.source_dir),
                universal_newlines=True,
            )
        except subprocess.CalledProcessError:
            args.full = True
        else:
            modified_datasets_codes = set()
            deleted_datasets_codes = set()
            for line in StringIO(output):
                action, file_path = line.strip().split()
                try:
                    dataset_code = Path(file_path).parent.relative_to(datasets_dir_name).name
                except ValueError:
                    continue
                if action in {"A", "M"}:
                    modified_datasets_codes.add(dataset_code)
                else:
                    assert action == "D", action
                    deleted_datasets_codes.add(dataset_code)
            log.info(
                "%d datasets were modified and %d were deleted by last download",
                len(modified_datasets_codes),
                len(deleted_datasets_codes),
            )
    log.info("Command-line arguments: %r", args)
    log.info("Mode: %s", "full" if args.full else "incremental")
    write_json_file(args.target_dir / "provider.json", provider_json)
    # Parse "table_of_contents", abbreviated "toc".
    toc_element = etree.parse(str(args.source_dir / "table_of_contents.xml")).getroot()
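
For readers without the surrounding file, the incremental detection deleted in the hunk above boils down to the standalone sketch below. The function name and the "data" default for datasets_dir_name are assumptions for illustration; everything else mirrors the removed lines.

import subprocess
from io import StringIO
from pathlib import Path

def detect_changed_datasets(source_dir, datasets_dir_name="data"):
    """Return (modified, deleted) dataset codes, based on the last commit of the source-data repository."""
    output = subprocess.check_output(
        ["git", "diff", "--name-status", "HEAD^", datasets_dir_name],
        cwd=str(source_dir),
        universal_newlines=True,
    )
    modified_datasets_codes = set()
    deleted_datasets_codes = set()
    for line in StringIO(output):
        action, file_path = line.strip().split()
        try:
            # The dataset code is the name of the directory containing the changed file.
            dataset_code = Path(file_path).parent.relative_to(datasets_dir_name).name
        except ValueError:
            continue  # File outside the datasets directory.
        if action in {"A", "M"}:
            modified_datasets_codes.add(dataset_code)
        else:
            assert action == "D", action
            deleted_datasets_codes.add(dataset_code)
    return modified_datasets_codes, deleted_datasets_codes
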
@@ -439,6 +399,9 @@ def main():
    toc_dataset_json_stub_by_code = {}
    category_tree_json = toc_to_category_tree(toc_element, toc_dataset_json_stub_by_code)
    if category_tree_json:
        write_json_file(args.target_dir / "category_tree.json", category_tree_json)
    # Build list of datasets codes to convert
    datasets_codes_to_convert = set()
    for dataset_code in sorted(toc_dataset_json_stub_by_code):
@@ -448,12 +411,6 @@ def main():
                dataset_code,
            )
            continue
        if not args.full and dataset_code not in modified_datasets_codes:
            log.debug(
                "Skipping dataset %r because it was not modified by last download (due to incremental mode)",
                dataset_code,
            )
            continue
        if args.start_from is not None and dataset_code < args.start_from:
            log.debug("Skipping dataset %r because of --start-from option", dataset_code)
            continue
@@ -474,7 +431,7 @@ def main():
            )
            continue
        dataset_dir = args.target_dir / dataset_code
        if args.resume and (dataset_dir / "dataset.json").is_file():
        if args.resume and dataset_dir.is_dir():
            log.debug(
                "Skipping dataset %r because it already exists (due to --resume option)",
                dataset_code,
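
This hunk also changes what --resume considers already done: the removed line (shown first) required a finished dataset.json file, while its replacement merely tests that the target directory exists. A minimal illustration of the two checks, with a hypothetical target path:

from pathlib import Path

dataset_dir = Path("json-data") / "nama_10_gdp"  # hypothetical target directory of one dataset

fully_written = (dataset_dir / "dataset.json").is_file()  # check before this commit (per the line order in the diff)
directory_exists = dataset_dir.is_dir()                   # check after this commit
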
@@ -484,20 +441,6 @@ def main():
    log.info("Converting %d datasets...", len(datasets_codes_to_convert))
    # Remove directories of datasets to be converted before converting.
    if not args.resume:
        datasets_codes_to_delete = datasets_codes_to_convert
        if not args.full:
            datasets_codes_to_delete = datasets_codes_to_delete.union(deleted_datasets_codes)
        log.info(
            "Removing directories of deleted datasets and datasets to be converted: %r",
            datasets_codes_to_delete,
        )
        for dataset_code in datasets_codes_to_delete:
            dataset_dir = args.target_dir / dataset_code
            if dataset_dir.is_dir():
                shutil.rmtree(str(dataset_dir))
    # Convert SDMX files. Side-effect: write files for each dataset.
    converted_datasets_codes = set()
    for index, dataset_code in enumerate(sorted(datasets_codes_to_convert), start=1):
@@ -524,11 +467,6 @@ def main():
        converted_datasets_codes.add(dataset_code)
    write_json_file(args.target_dir / "provider.json", provider_json)
    if category_tree_json:
        write_json_file(args.target_dir / "category_tree.json", category_tree_json)
    return 0