@@ -52,6 +52,8 @@ from docopt import docopt
 import jsonstat
 import toolz
 import ujson as json
+
+from dbnomics_json_errors import ErrorsArtifact

 PROVIDER_JSON = dict(
     code='CSO',
@@ -85,31 +87,32 @@ def main():
     write_json_file(target_dir / 'provider.json', PROVIDER_JSON)
     datasets_files = sorted(source_dir.glob('*.json'))
-    ignored_datasets = total_nb_datasets = converted_datasets = 0
-    datasets_in_error = []
+    nb_ignored_datasets = nb_total_nb_datasets = nb_converted_datasets = nb_expected_datasets = 0
+    errors_artifact = ErrorsArtifact()
     from_option_first_dataset_reached = False  # Only used when "--from" option is given
     for dataset_file in datasets_files:
         if dataset_file.name == "category_tree.json":
             continue
         dataset_code = Path(dataset_file).stem  # "C1323.json" => "C1323"
-        total_nb_datasets += 1
+        nb_total_nb_datasets += 1
         if args["--only"] and not dataset_code in args["--only"]:
-            ignored_datasets += 1
+            nb_ignored_datasets += 1
             continue
-        if args["--limit_nb_datasets"] and converted_datasets == int(args["--limit_nb_datasets"]):
+        if args["--limit_nb_datasets"] and nb_converted_datasets == int(args["--limit_nb_datasets"]):
             break
         if args["--from"]:
             if dataset_code == args["--from"]:
                 from_option_first_dataset_reached = True
             if not from_option_first_dataset_reached:
-                ignored_datasets += 1
+                nb_ignored_datasets += 1
                 continue
+        nb_expected_datasets += 1
         try:
             dataset_dict = iterate_dataset(dataset_file)
         except DatasetAbortedException as e:
             log.error("Error in dataset {!r} => dataset aborted".format(dataset_code))
             log.error(e)
-            datasets_in_error.append(dataset_code)
+            errors_artifact.add_dataset_error(dataset_code, str(e))
             continue

         # Create dataset dir
         log.debug("* dataset {}".format(dataset_code))
@@ -120,11 +123,11 @@ def main():
         jsonl_series_file = (dataset_dir / 'series.jsonl').open('w', encoding='utf8')
         try:
             for series_dict in dataset_dict['series_iter']:
-                series_code = series_dict['code']
                 # Add series information to series.jsonl
                 add_series_to_jsonl_file(series_dict, jsonl_series_file)
         except DatasetAbortedException as e:
             log.warning("{!r} dataset aborted! - {}".format(dataset_code, e))
+            errors_artifact.add_dataset_error(dataset_code, str(e))
             # Delete dataset dir
             shutil.rmtree(str(dataset_dir))
             continue
@@ -137,23 +140,29 @@ def main():
             'dimensions_codes_order': dataset_dict['dimensions_codes_order'],
             'dimensions_values_labels': dataset_dict['dimensions_values_labels'],
         })
-        converted_datasets += 1
+        nb_converted_datasets += 1

     # Open category_tree.json file from source dir
     with Path(source_dir / 'category_tree.json').open() as f:
         tree = json.load(f)

     # Remove "href" keys from tree
     remove_href_keys(tree)

-    # Write filtered tree to target_dir
+    # Write filtered category tree to target_dir
     write_json_file(Path(target_dir / 'category_tree.json'), tree)

+    # Write errors.json file
+    errors_artifact.write_json_file(target_dir, nb_expected_datasets=nb_expected_datasets)
+
     log.info('END')

-    if ignored_datasets > 0:
-        log.info("{} ignored dataset(s) due to '--only' or '--from' options".format(ignored_datasets))
-    if datasets_in_error:
-        ignored_datasets_str = ': ' + ', '.join(datasets_in_error) if len(datasets_in_error) < 100 else ''
-        log.error("{}/{} dataset(s) ignored due to errors {}".format(len(datasets_in_error), total_nb_datasets,
-                                                                     ignored_datasets_str))
+    # Display conversion statistics
+    if nb_ignored_datasets > 0:
+        log.info("{} ignored dataset(s) due to '--only' or '--from' options".format(nb_ignored_datasets))
+    if errors_artifact.get_nb_errors():
+        ignored_datasets_str = ': ' + \
+            ', '.join(e['dataset_code']
+                      for e in errors_artifact.datasets_errors) if errors_artifact.get_nb_errors() < 100 else ''
+        log.info("{}/{} dataset(s) ignored due to errors {}".format(errors_artifact.get_nb_errors(),
+                                                                    nb_expected_datasets, ignored_datasets_str))
@@ -187,24 +196,6 @@ def iterate_dataset(filepath):
     assert metric_dimension, "didn't find metric dimension in dataset!"
     return time_dimension, metric_dimension, other_dimensions

-def get_dimensions_values_codes(dimensions_values_labels):
-    """Return lists of all (dimension_code, dimension_value_code) for this dataset from dimensions_values_labels
-
-    This doesn't include time and metric dimensions.
-
-    Example:
-    [
-        [ {'code': 'Sex', 'value': '1'}, {'code': 'Sex', 'value': '2'} ],
-        [ {'code': 'Broad Industrial Group', 'value': '10'}, {'code': 'Broad Industrial Group', 'value': '15'}, {'code': 'Broad Industrial Group', 'value': '40'} ],
-        [ {'code': 'Regular Unpaid Help', 'value': '-6'}, {'code': 'Regular Unpaid Help', 'value': '01'} ]
-    ]
-    """
-    dimensions_values_array = []
-    for dimension_code, values in dimensions_values_labels.items():
-        dimensions_values_array.append(list(
-            {'code': dimension_code, 'value': code}
-            for code, _label in values.items()
-        ))
-    return dimensions_values_array

 def get_series_observations(dataframe):
     """Return values corresponding to given series_dimensions_values, in an array, to be added to the dataset's series.jsonl file

     Example:
...
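The converter now delegates error bookkeeping to the ErrorsArtifact helper imported at the top. The class itself is not part of this diff; the following minimal sketch is inferred purely from the calls made above (add_dataset_error, get_nb_errors, the datasets_errors attribute, write_json_file), so the real implementation in dbnomics-json-errors 0.1.1 may well differ:

# Sketch only: interface reconstructed from usage, not the actual library code.
import json
from pathlib import Path


class ErrorsArtifact:
    def __init__(self):
        # One entry per failed dataset, e.g. {'dataset_code': 'C1323', 'message': '...'}
        self.datasets_errors = []

    def add_dataset_error(self, dataset_code, message):
        self.datasets_errors.append({'dataset_code': dataset_code, 'message': message})

    def get_nb_errors(self):
        return len(self.datasets_errors)

    def write_json_file(self, target_dir, nb_expected_datasets):
        # The diff only shows that this writes an errors.json file into
        # target_dir; the exact JSON layout below is a guess.
        with (Path(target_dir) / 'errors.json').open('w', encoding='utf8') as f:
            json.dump({
                'nb_expected_datasets': nb_expected_datasets,
                'datasets_errors': self.datasets_errors,
            }, f, ensure_ascii=False)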
@@ -4,3 +4,4 @@ toolz
 lxml
 requests
 jsonstat.py
+git+https://git.nomics.world/dbnomics/dbnomics-json-errors.git@0.1.1#egg=dbnomics_json_errors
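pip understands such git+ requirement lines natively, so a plain pip install -r requirements.txt fetches the helper pinned at ref 0.1.1, assuming the machine can reach git.nomics.world.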
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

# Quick manual check that jsonstat.py can parse a CSO source file.
import jsonstat
from pathlib import Path

file_path = Path("/home/bruno/dev/dbnomics/fetchers/cso/cso-source-data")
file_path = file_path / "EP003.json"
# file_path = file_path / "ACEN1.json"

collection = jsonstat.from_file(str(file_path))
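The script stops right after parsing, so it only proves that jsonstat.py accepts the file. A natural extra line, if one wants to eyeball what was parsed (a guess at the next step, not part of the commit):

print(collection)  # jsonstat.py collections print a summary of their datasets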