Commits (2)
@@ -52,6 +52,8 @@ from docopt import docopt
 import jsonstat
 import toolz
 import ujson as json
+from dbnomics_json_errors import ErrorsArtifact

 PROVIDER_JSON = dict(
     code='CSO',
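
Note: the new import pulls ErrorsArtifact from the dbnomics-json-errors package (pinned below in requirements.txt). Its implementation is not shown in this diff; the following is only a minimal sketch consistent with how the class is used in this script — the 'message' key and the errors.json layout are assumptions, only the call signatures are taken from the diff:

    import json
    from pathlib import Path

    class ErrorsArtifact:
        """Collects per-dataset conversion errors and dumps them to errors.json."""

        def __init__(self):
            self.datasets_errors = []  # each item: {'dataset_code': ..., 'message': ...}

        def add_dataset_error(self, dataset_code, message):
            self.datasets_errors.append({'dataset_code': dataset_code, 'message': message})

        def get_nb_errors(self):
            return len(self.datasets_errors)

        def write_json_file(self, target_dir, nb_expected_datasets):
            # Output layout is a guess; the real package may write a different schema.
            errors = {
                'nb_expected_datasets': nb_expected_datasets,
                'datasets_errors': self.datasets_errors,
            }
            with (Path(target_dir) / 'errors.json').open('w', encoding='utf8') as f:
                f.write(json.dumps(errors, ensure_ascii=False, indent=2, sort_keys=True))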
@@ -85,31 +87,32 @@ def main():
     write_json_file(target_dir / 'provider.json', PROVIDER_JSON)

     datasets_files = sorted(source_dir.glob('*.json'))
-    ignored_datasets = total_nb_datasets = converted_datasets = 0
-    datasets_in_error = []
+    nb_ignored_datasets = nb_total_datasets = nb_converted_datasets = nb_expected_datasets = 0
+    errors_artifact = ErrorsArtifact()
     from_option_first_dataset_reached = False  # Only used when "--from" option is given
     for dataset_file in datasets_files:
         if dataset_file.name == "category_tree.json":
             continue
         dataset_code = Path(dataset_file).stem  # "C1323.json" => "C1323"
-        total_nb_datasets += 1
+        nb_total_datasets += 1
         if args["--only"] and dataset_code not in args["--only"]:
-            ignored_datasets += 1
+            nb_ignored_datasets += 1
             continue
-        if args["--limit_nb_datasets"] and converted_datasets == int(args["--limit_nb_datasets"]):
+        if args["--limit_nb_datasets"] and nb_converted_datasets == int(args["--limit_nb_datasets"]):
             break
         if args["--from"]:
             if dataset_code == args["--from"]:
                 from_option_first_dataset_reached = True
             if not from_option_first_dataset_reached:
-                ignored_datasets += 1
+                nb_ignored_datasets += 1
                 continue
+        nb_expected_datasets += 1
         try:
             dataset_dict = iterate_dataset(dataset_file)
         except DatasetAbortedException as e:
             log.error("Error in dataset {!r} => dataset aborted".format(dataset_code))
             log.error(e)
-            datasets_in_error.append(dataset_code)
+            errors_artifact.add_dataset_error(dataset_code, str(e))
             continue

         # Create dataset dir
         log.debug("* dataset {}".format(dataset_code))
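
Note: DatasetAbortedException, caught here and in the next hunk, is defined elsewhere in this script (outside the diff); presumably it is a plain Exception subclass raised by iterate_dataset and by the series iterator when a source file cannot be converted, along the lines of:

    class DatasetAbortedException(Exception):
        """Raised to signal that the current dataset cannot be converted and must be skipped."""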
@@ -120,11 +123,11 @@ def main():
         jsonl_series_file = (dataset_dir / 'series.jsonl').open('w', encoding='utf8')
         try:
             for series_dict in dataset_dict['series_iter']:
                 series_code = series_dict['code']
                 # Add series information to series.jsonl
                 add_series_to_jsonl_file(series_dict, jsonl_series_file)
         except DatasetAbortedException as e:
-            log.warning("{!r} dataset aborted! - {}".format(dataset_code, e))
+            errors_artifact.add_dataset_error(dataset_code, str(e))
             # Delete dataset dir
             shutil.rmtree(str(dataset_dir))
             continue
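
Note: add_series_to_jsonl_file is also defined outside this diff. Given the series.jsonl target it presumably appends one JSON object per line (JSON Lines); a minimal sketch, assuming the series dict is written as-is (the script itself uses ujson as json):

    def add_series_to_jsonl_file(series_dict, jsonl_file):
        # One JSON document per line, keys sorted to keep the output stable.
        jsonl_file.write(json.dumps(series_dict, ensure_ascii=False, sort_keys=True))
        jsonl_file.write('\n')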
@@ -137,23 +140,29 @@ def main():
             'dimensions_codes_order': dataset_dict['dimensions_codes_order'],
             'dimensions_values_labels': dataset_dict['dimensions_values_labels'],
         })
-        converted_datasets += 1
+        nb_converted_datasets += 1

     # Open category_tree.json file from source dir
     with Path(source_dir / 'category_tree.json').open() as f:
         tree = json.load(f)

     # Remove "href" keys from tree
     remove_href_keys(tree)

-    # Write filtered tree to target_dir
+    # Write filtered category tree to target_dir
     write_json_file(Path(target_dir / 'category_tree.json'), tree)

+    # Write errors.json file
+    errors_artifact.write_json_file(target_dir, nb_expected_datasets=nb_expected_datasets)
+
     log.info('END')

-    if ignored_datasets > 0:
-        log.info("{} ignored dataset(s) due to '--only' or '--from' options".format(ignored_datasets))
-    if datasets_in_error:
-        ignored_datasets_str = ': ' + ', '.join(datasets_in_error) if len(datasets_in_error) < 100 else ''
-        log.error("{}/{} dataset(s) ignored due to errors {}".format(len(datasets_in_error), total_nb_datasets,
-                                                                     ignored_datasets_str))
+    # Display conversion statistics
+    if nb_ignored_datasets > 0:
+        log.info("{} ignored dataset(s) due to '--only' or '--from' options".format(nb_ignored_datasets))
+    if errors_artifact.get_nb_errors():
+        ignored_datasets_str = (': ' + ', '.join(e['dataset_code'] for e in errors_artifact.datasets_errors)
+                                if errors_artifact.get_nb_errors() < 100 else '')
+        log.info("{}/{} dataset(s) ignored due to errors{}".format(errors_artifact.get_nb_errors(),
+                                                                   nb_expected_datasets, ignored_datasets_str))


 def iterate_dataset(filepath):
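
Note: with the ErrorsArtifact sketch above, the errors.json written to target_dir would look something like this (the layout and the 'message' key are assumptions; the dataset code, count, and message are purely illustrative):

    {
        "nb_expected_datasets": 42,
        "datasets_errors": [
            {"dataset_code": "C1323", "message": "dataset aborted"}
        ]
    }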
@@ -187,24 +196,6 @@ def iterate_dataset(filepath):
     assert metric_dimension, "didn't find metric dimension in dataset!"
     return time_dimension, metric_dimension, other_dimensions


-def get_dimensions_values_codes(dimensions_values_labels):
-    """Return lists of all (dimension_code, dimension_value_code) pairs for this dataset, from dimensions_values_labels.
-
-    This doesn't include the time and metric dimensions.
-
-    Example:
-    [
-        [{'code': 'Sex', 'value': '1'}, {'code': 'Sex', 'value': '2'}],
-        [{'code': 'Broad Industrial Group', 'value': '10'}, {'code': 'Broad Industrial Group', 'value': '15'}, {'code': 'Broad Industrial Group', 'value': '40'}],
-        [{'code': 'Regular Unpaid Help', 'value': '-6'}, {'code': 'Regular Unpaid Help', 'value': '01'}]
-    ]
-    """
-    dimensions_values_array = []
-    for dimension_code, values in dimensions_values_labels.items():
-        dimensions_values_array.append(list(
-            {'code': dimension_code, 'value': code}
-            for code, _label in values.items()
-        ))
-    return dimensions_values_array
-
-
 def get_series_observations(dataframe):
     """Return the values corresponding to the given series_dimensions_values, as an array, to be added to the dataset's series.jsonl file.

     Example:
......
@@ -4,3 +4,4 @@ toolz
 lxml
 requests
 jsonstat.py
+git+https://git.nomics.world/dbnomics/dbnomics-json-errors.git@0.1.1#egg=dbnomics_json_errors
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

# Quick manual check: parse one CSO source file with jsonstat.py.

import jsonstat
from pathlib import Path

file_path = Path("/home/bruno/dev/dbnomics/fetchers/cso/cso-source-data")
file_path = file_path / "EP003.json"
# file_path = file_path / "ACEN1.json"

collection = jsonstat.from_file(str(file_path))
print(collection)