Commit b3297a0c authored by Pierre Dittgen

Fix 79 missing datasets in category_tree.json

parent 330ee40f
Pipeline #89745 passed with stage in 49 seconds
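Reading the diff below, the fix appears to be twofold: rename `categories_tree_json` to `category_tree_json`, and flush the *last* sub-category of each source file into the category tree after the row loop ends, where previously a sub-category was only appended when the next sub-chapter began. A minimal sketch of that flush-the-last-group pattern, with invented names and data:

```python
# Sketch of the bug pattern this commit addresses (names and rows are
# invented): a group in progress is only flushed when the NEXT group
# starts, so the final group must be flushed again after the loop.
rows = [('A', 'ds1'), ('A', 'ds2'), ('B', 'ds3')]

children = []
current = None
for group, dataset in rows:
    if current is None or current['name'] != group:
        if current is not None:
            children.append(current)  # flush the finished group
        current = {'name': group, 'datasets': []}
    current['datasets'].append(dataset)

# This post-loop flush is the essence of the fix: without it, group 'B'
# silently vanishes, just as the last sub-category of each source file
# vanished from category_tree.json.
if current is not None:
    children.append(current)

assert [c['name'] for c in children] == ['A', 'B']
```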
@@ -28,8 +28,7 @@ import argparse
 import json
 import re
 import sys
-from collections import OrderedDict
-from itertools import count, islice
+from itertools import count
 from pathlib import Path

 from slugify import slugify
@@ -168,8 +167,9 @@ def main():
         ),
         key=lambda txt_filepath: int(SOURCE_FILENAME_RE.match(txt_filepath.name).group('number')),
     )
-    categories_tree_json = root_categories.copy()
-    for root_category, source_filepath in zip(categories_tree_json, source_filepaths):
+    category_tree_json = root_categories.copy()
+    for root_category, source_filepath in zip(category_tree_json, source_filepaths):
         dataset_json = None
         sub_category = None
         with source_filepath.open('r', encoding='utf-8') as source_file:
@@ -178,11 +178,16 @@ def main():
             labels = list(filter(None, (label.strip() for label in first_line.split(';'))))
             assert labels[:5] == ['CODE', 'COUNTRY', 'SUB-CHAPTER', 'TITLE', 'UNIT'], \
                 'Unexpected labels: {}'.format(labels)
             # Parse remaining lines.
             for line in source_file:
-                row = [cell.strip() for cell in line.split(';')]
-                assert len(labels) == len(row), (labels, row)
-                entry = OrderedDict(zip(labels, row))
+                # Data row
+                data_row = [cell.strip() for cell in line.split(';')]
+                assert len(labels) == len(data_row), (labels, data_row)
+                entry = dict(zip(labels, data_row))
+
+                # Handle sub-category
                 sub_category_relative_code, sub_category_name = entry['SUB-CHAPTER'].split(None, 1)
                 if sub_category is None or sub_category_name != sub_category['name']:  # pylint: disable=unsubscriptable-object
                     if sub_category is not None:
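A side effect of the hunk above is dropping `OrderedDict`: on Python 3.7+ a plain `dict` is guaranteed to preserve insertion order, so `dict(zip(labels, data_row))` keeps columns aligned with their labels. A quick check, with an invented sample row:

```python
# Assuming Python >= 3.7: plain dicts preserve insertion order, making the
# OrderedDict import removed above redundant. The row values are invented.
labels = ['CODE', 'COUNTRY', 'SUB-CHAPTER', 'TITLE', 'UNIT', '2000', '2001']
data_row = ['ED.FR.1', 'France', '01 Population', 'Total population', 'Thousand', '60.5', '61.0']

entry = dict(zip(labels, data_row))
assert list(entry) == labels          # column order survives
assert entry['TITLE'] == 'Total population'
```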
@@ -192,10 +197,17 @@ def main():
                         "code": '{}.{}'.format(root_category['code'], sub_category_relative_code),
                         "name": sub_category_name,
                     }
                 dataset_code = entry['CODE'].rsplit('.', 1)[-1]
+
+                # Encounter a new dataset
                 if dataset_json is None or entry['TITLE'] != dataset_json['name']:  # pylint: disable=unsubscriptable-object
+                    # Save previous dataset.json
                     if dataset_json is not None:
                         write_json_file(dataset_dir / 'dataset.json', dataset_json)
+
+                    # Ensure dataset code unicity
                     if dataset_code in datasets_code:
                         # A few datasets use the same code for different titles => Generate a new code:
                         for i in count(1):
@@ -204,6 +216,8 @@ def main():
                                 break
                         dataset_code = test_dataset_code
                     datasets_code.add(dataset_code)
+
+                    # Creates dataset.json skeleton
                     dataset_json = dict(
                         code=dataset_code,
                         dimensions_codes_order=['freq', 'unit', 'geo'],
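The body of the `for i in count(1)` loop is folded out of the diff; below is a hypothetical reconstruction of the unicity scheme. The `-N` suffix is a guess, but it is at least consistent with the consistency assert later in the diff (`re.match(r'{}-\d+$'...)`). The example codes are invented.

```python
from itertools import count

# Hypothetical reconstruction of the folded unicity loop: try code-2,
# code-3, ... until an unused code is found. Example codes are made up.
datasets_code = {'nrg_100a', 'nrg_100a-2'}
dataset_code = 'nrg_100a'

if dataset_code in datasets_code:
    for i in count(1):
        test_dataset_code = '{}-{}'.format(dataset_code, i + 1)
        if test_dataset_code not in datasets_code:
            break
    dataset_code = test_dataset_code
datasets_code.add(dataset_code)

assert dataset_code == 'nrg_100a-3'
```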
@@ -218,27 +232,34 @@ def main():
                             unit={},
                         ),
                         name=entry['TITLE'],
+                        series=[]
                     )
+
+                    # Add dataset reference in category_tree
                     sub_category.setdefault('children', []).append({
                         "code": dataset_json['code'],
                         "name": dataset_json['name'],
                     })
+
+                    # Creates dataset_dir if not already done.
                     dataset_dir = target_dir / dataset_json['code']
                     if not dataset_dir.exists():
                         dataset_dir.mkdir()
                 else:
                     # 'normal' row, just check consistency
                     assert dataset_code == dataset_json['code'] or dataset_code == dataset_json['code'] + 'R' or \
                         re.match(r'{}-\d+$'.format(re.escape(dataset_code)), dataset_json['code']), \
                         'Dataset code {} differs from {} but titles are the same'.format(
                             dataset_code, dataset_json['code'])
-                dimensions_values_labels = dataset_json['dimensions_values_labels']
-                country_code = slugify(entry['CODE'].split('.', 1)[0])
-                dimensions_values_labels['geo'][country_code] = entry['COUNTRY']  # The country label
-                unit_label = entry['UNIT'] or '-'
-                unit_code = slugify(unit_label) or '-'
-                dimensions_values_labels['unit'][unit_code] = unit_label
-                # Save single series.
+
+                # Compute and collect series dimensions
+                country_code, country_label = slugify(entry['CODE'].split('.', 1)[0]), entry['COUNTRY']
+                dataset_json['dimensions_values_labels']['geo'][country_code] = country_label
+                unit_code, unit_label = slugify(entry['UNIT']) or '-', entry['UNIT'] or '-'
+                dataset_json['dimensions_values_labels']['unit'][unit_code] = unit_label
+
+                # Add series reference to dataset_json
                 series_code = entry['CODE']
                 series_json = dict(
                     code=series_code,
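The `slugify(entry['UNIT']) or '-'` fallback above exists because python-slugify returns an empty string when there is nothing slugifiable, and an empty dimension value code would be unusable. A small illustration (unit labels invented):

```python
from slugify import slugify

# Assumes python-slugify is installed (the module imported at the top of
# the script). Unit labels here are invented examples.
assert slugify('Million euro') == 'million-euro'
assert slugify('') == ''                      # empty UNIT cell -> empty slug

entry_unit = ''                               # a row with no unit
unit_code, unit_label = slugify(entry_unit) or '-', entry_unit or '-'
assert (unit_code, unit_label) == ('-', '-')  # both fall back to '-'
```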
@@ -248,22 +269,33 @@ def main():
                         unit=unit_code,
                     )
                 )
-                dataset_json.setdefault('series', []).append(series_json)
-                tsv_filepath = dataset_dir / '{}.tsv'.format(series_code)
-                with tsv_filepath.open('w', encoding='utf-8') as tsv_file:
-                    tsv_file.write('PERIOD\tVALUE\n')
-                    for period, value in islice(entry.items(), 5, None):
-                        tsv_file.write('{}\t{}\n'.format(period, value))
+                dataset_json['series'].append(series_json)
+
+                # Write observation values to TSV file
+                write_tsv(dataset_dir / '{}.tsv'.format(series_code), zip(labels[5:], data_row[5:]))
+
+            # Write dataset.json of the last dataset found
             write_json_file(dataset_dir / 'dataset.json', dataset_json)
+
+            # Append last sub_category found to category_tree
+            root_category.setdefault('children', []).append(sub_category)

     write_json_file(target_dir / 'provider.json', provider_json)
-    write_json_file(target_dir / 'category_tree.json', categories_tree_json)
+    write_json_file(target_dir / 'category_tree.json', category_tree_json)
     return 0


+def write_tsv(file_path: Path, period_value_list):
+    """Write observation file as TSV"""
+    with file_path.open('w', encoding='utf-8') as fd:
+        fd.write('PERIOD\tVALUE\n')
+        for period, value in period_value_list:
+            fd.write('{}\t{}\n'.format(period, value))
+
+
 def write_json_file(file_path: Path, data):
     """JSON dumps data in a file"""
     with file_path.open('w', encoding='utf-8') as fp:
         json.dump(data, fp, ensure_ascii=False, indent=2, sort_keys=True)
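A quick exercise of the two helpers factored out above; the directory and file names are throwaway examples. Note that `write_tsv` accepts any iterable of `(period, value)` pairs, which is why the call site can pass `zip(labels[5:], data_row[5:])` directly.

```python
import json
from pathlib import Path

# Throwaway paths; assumes write_tsv and write_json_file from above are in scope.
demo_dir = Path('/tmp/demo-dataset')
demo_dir.mkdir(parents=True, exist_ok=True)

write_tsv(demo_dir / 'ed-fr-1.tsv', zip(['2000', '2001'], ['60.5', '61.0']))
write_json_file(demo_dir / 'dataset.json', {'code': 'demo', 'name': 'Démo'})

assert (demo_dir / 'ed-fr-1.tsv').read_text() == 'PERIOD\tVALUE\n2000\t60.5\n2001\t61.0\n'
assert json.loads((demo_dir / 'dataset.json').read_text('utf-8'))['name'] == 'Démo'
```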