Commit de03b940 authored by Bruno Duyé's avatar Bruno Duyé

Convert - add errors.json artifact

parent 592a079a
......@@ -52,6 +52,8 @@ from docopt import docopt
import jsonstat
import toolz
import ujson as json
from dbnomics_json_errors import ErrorsArtifact
PROVIDER_JSON = dict(
code='CSO',
......@@ -85,31 +87,32 @@ def main():
write_json_file(target_dir / 'provider.json', PROVIDER_JSON)
datasets_files = sorted(source_dir.glob('*.json'))
ignored_datasets = total_nb_datasets = converted_datasets = 0
datasets_in_error = []
nb_ignored_datasets = nb_total_nb_datasets = nb_converted_datasets = nb_expected_datasets = 0
errors_artifact = ErrorsArtifact()
from_option_first_dataset_reached = False # Only used when "--from" option is given
for dataset_file in datasets_files:
if dataset_file.name == "category_tree.json":
continue
dataset_code = Path(dataset_file).stem # "C1323.json" => "C1323"
total_nb_datasets += 1
nb_total_nb_datasets += 1
if args["--only"] and not dataset_code in args["--only"]:
ignored_datasets += 1
nb_ignored_datasets += 1
continue
if args["--limit_nb_datasets"] and converted_datasets == int(args["--limit_nb_datasets"]):
if args["--limit_nb_datasets"] and nb_converted_datasets == int(args["--limit_nb_datasets"]):
break
if args["--from"]:
if dataset_code == args["--from"]:
from_option_first_dataset_reached = True
if not from_option_first_dataset_reached:
ignored_datasets += 1
nb_ignored_datasets += 1
continue
nb_expected_datasets += 1
try:
dataset_dict = iterate_dataset(dataset_file)
except DatasetAbortedException as e:
log.error("Error in dataset {!r} => dataset aborded".format(dataset_code))
log.error(e)
datasets_in_error.append(dataset_code)
errors_artifact.add_dataset_error(dataset_code, str(e))
continue
# Create dataset dir
log.debug("* dataset {}".format(dataset_code))
......@@ -120,11 +123,11 @@ def main():
jsonl_series_file = (dataset_dir / 'series.jsonl').open('w', encoding='utf8')
try:
for series_dict in dataset_dict['series_iter']:
series_code = series_dict['code']
# Add series information to series.jsonl
add_series_to_jsonl_file(series_dict, jsonl_series_file)
except DatasetAbortedException as e:
log.warning("{!r} dataset aborted ! - {}".format(dataset_code, e))
errors_artifact.add_dataset_error(dataset_code, str(e))
# Delete dataset dir
shutil.rmtree(str(dataset_dir))
continue
......@@ -137,23 +140,29 @@ def main():
'dimensions_codes_order': dataset_dict['dimensions_codes_order'],
'dimensions_values_labels': dataset_dict['dimensions_values_labels'],
})
converted_datasets += 1
nb_converted_datasets += 1
# Open category_tree.json file from source dir
with Path(source_dir / 'category_tree.json').open() as f:
tree = json.load(f)
# Remove "href" keys from tree
remove_href_keys(tree)
# Write filtered tree to target_dir
# Write filtered category tree to target_dir
write_json_file(Path(target_dir / 'category_tree.json'), tree)
# Write errors.json file
errors_artifact.write_json_file(target_dir, nb_expected_datasets=nb_expected_datasets)
log.info('END')
if ignored_datasets > 0:
log.info("{} ignored dataset(s) due to '--only' or '--from' options".format(ignored_datasets))
if datasets_in_error:
ignored_datasets_str = ': ' + ', '.join(datasets_in_error) if len(datasets_in_error) < 100 else ''
log.error("{}/{} dataset(s) ignored due to errors {}".format(len(datasets_in_error), total_nb_datasets,
ignored_datasets_str))
    # Display conversion statistics
if nb_ignored_datasets > 0:
log.info("{} ignored dataset(s) due to '--only' or '--from' options".format(nb_ignored_datasets))
if errors_artifact.get_nb_errors():
ignored_datasets_str = ': ' + \
', '.join(e['dataset_code']
for e in errors_artifact.datasets_errors) if errors_artifact.get_nb_errors() < 100 else ''
log.info("{}/{} dataset(s) ignored due to errors {}".format(errors_artifact.get_nb_errors(),
nb_expected_datasets, ignored_datasets_str))
def iterate_dataset(filepath):
......
......@@ -4,3 +4,4 @@ toolz
lxml
requests
jsonstat.py
git+https://git.nomics.world/dbnomics/dbnomics-json-errors.git@0.1.1#egg=dbnomics_json_errors
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment