@@ -26,20 +26,13 @@
import argparse
import json
import os
import re
import subprocess
import sys
from itertools import count
from pathlib import Path
from slugify import slugify
economic_databases_base_url = 'https://ec.europa.eu/info/business-economy-euro/indicators-statistics/economic-databases'
provider_json = dict(
code='AMECO',
@@ -142,45 +135,59 @@ root_categories = [
),
]
SOURCE_FILENAME_RE = re.compile(r'AMECO(?P<number>\d+)\.TXT$')
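# A quick sketch of what the pattern accepts (hypothetical file names):
#   'AMECO1.TXT'  -> match, number group '1'
#   'AMECO12.TXT' -> match, number group '12'
#   'AMECO.TXT' or 'AMECO1.CSV' -> no match, skipped by the filter below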
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'source_dir', type=Path,
help='path of source directory containing Ameco series in their original format',
)
parser.add_argument(
'target_dir', type=Path,
help='path of target directory containing datasets & series in Widukind format',
)
args = parser.parse_args()
source_dir = args.source_dir
if not source_dir.exists():
parser.error("Source dir {!r} not found".format(str(source_dir)))
target_dir = args.target_dir
if not target_dir.exists():
parser.error("Target dir {!r} not found".format(str(target_dir)))
datasets_code = set()
source_filepaths = sorted(
(
txt_filepath
for txt_filepath in source_dir.glob('AMECO*.TXT')
if SOURCE_FILENAME_RE.match(txt_filepath.name) is not None
),
key=lambda txt_filepath: int(SOURCE_FILENAME_RE.match(txt_filepath.name).group('number')),
)
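# Sorting on the captured number keeps the natural order of the source files:
# e.g. a hypothetical 'AMECO2.TXT' comes before 'AMECO10.TXT' (numeric rather
# than lexicographic ordering).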
category_tree_json = root_categories.copy()
for root_category, source_filepath in zip(category_tree_json, source_filepaths):
dataset_json = None
sub_category = None
with source_filepath.open('r', encoding='utf-8') as source_file:
# Parse first line.
first_line = next(source_file)
labels = list(filter(None, (label.strip() for label in first_line.split(';'))))
assert labels[:5] == ['CODE', 'COUNTRY', 'SUB-CHAPTER', 'TITLE', 'UNIT'], \
'Unexpected labels: {}'.format(labels)
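# Beyond these 5 metadata columns, each remaining label is assumed to be a
# period (one column per year); the matching cells hold the observation
# values written to the per-series TSV files below.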
# Parse remaining lines.
for line in source_file:
# Data row
data_row = [cell.strip() for cell in line.split(';')]
assert len(labels) == len(data_row), (labels, data_row)
entry = dict(zip(labels, data_row))
# Handle sub-category
sub_category_relative_code, sub_category_name = entry['SUB-CHAPTER'].split(None, 1)
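# Illustrative value (format assumed): a SUB-CHAPTER cell such as
# '01 Population and employment' splits into relative code '01' and
# name 'Population and employment'.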
if sub_category is None or sub_category_name != sub_category['name']: # pylint: disable=unsubscriptable-object
if sub_category is not None:
@@ -190,13 +197,17 @@ def main():
"code": '{}.{}'.format(root_category['code'], sub_category_relative_code),
"name": sub_category_name,
}
dataset_code = entry['CODE'].rsplit('.', 1)[-1]
# A new dataset is encountered
if dataset_json is None or entry['TITLE'] != dataset_json['name']: # pylint: disable=unsubscriptable-object
# Save previous dataset.json
if dataset_json is not None:
write_json_file(dataset_dir / 'dataset.json', dataset_json)
# Ensure dataset code uniqueness
if dataset_code in datasets_code:
# A few datasets use the same code for different titles => Generate a new code:
for i in count(1):
@@ -205,6 +216,8 @@
break
dataset_code = test_dataset_code
datasets_code.add(dataset_code)
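# Judging from the '{}-\d+$' consistency check further down, the loop above
# appends a numeric suffix: a hypothetical second 'NPTD' dataset would get
# the code 'NPTD-1', then 'NPTD-2', and so on.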
# Create the dataset.json skeleton
dataset_json = dict(
code=dataset_code,
dimensions_codes_order=['freq', 'unit', 'geo'],
@@ -219,27 +232,34 @@
unit={},
),
name=entry['TITLE'],
series=[]
)
# Add dataset reference in category_tree
sub_category.setdefault('children', []).append({
"code": dataset_json['code'],
"name": dataset_json['name'],
})
# Create dataset_dir if it does not exist yet.
dataset_dir = target_dir / dataset_json['code']
if not dataset_dir.exists():
dataset_dir.mkdir()
else:
# 'Normal' row: just check dataset code consistency
assert dataset_code == dataset_json['code'] or dataset_code == dataset_json['code'] + 'R' or \
re.match(r'{}-\d+$'.format(re.escape(dataset_code)), dataset_json['code']), \
'Dataset code {} differs from {} but titles are the same'.format(
dataset_code, dataset_json['code'])
# Compute and collect series dimensions
country_code, country_label = slugify(entry['CODE'].split('.', 1)[0]), entry['COUNTRY']
dataset_json['dimensions_values_labels']['geo'][country_code] = country_label
unit_code, unit_label = slugify(entry['UNIT']) or '-', entry['UNIT'] or '-'
dataset_json['dimensions_values_labels']['unit'][unit_code] = unit_label
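# Illustrative mapping (the series code format is an assumption): an entry
# with CODE 'DNK.1.0.0.0.NPTD', COUNTRY 'Denmark' and UNIT '1000 persons'
# yields geo 'dnk' -> 'Denmark' and unit '1000-persons' -> '1000 persons'.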
# Add series reference to dataset_json
series_code = entry['CODE']
series_json = dict(
code=series_code,
@@ -249,23 +269,34 @@
unit=unit_code,
)
)
dataset_json['series'].append(series_json)
# Write observation values to TSV file
write_tsv(dataset_dir / '{}.tsv'.format(series_code), zip(labels[5:], data_row[5:]))
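# The call above writes one tab-separated file per series; illustrative
# content (values are made up):
#   PERIOD  VALUE
#   1960    4547.3
#   1961    4573.1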
# Write dataset.json of the last dataset found
write_json_file(dataset_dir / 'dataset.json', dataset_json)
# Append last sub_category found to category_tree
root_category.setdefault('children', []).append(sub_category)
write_json_file(target_dir / 'provider.json', provider_json)
write_json_file(target_dir / 'category_tree.json', category_tree_json)
return 0
def write_tsv(file_path: Path, period_value_list):
"""Write observation file as TSV"""
with file_path.open('w', encoding='utf-8') as fd:
fd.write('PERIOD\tVALUE\n')
for period, value in period_value_list:
fd.write('{}\t{}\n'.format(period, value))
def write_json_file(file_path: Path, data):
"""JSON dumps data in a file"""
with file_path.open('w', encoding='utf-8') as fp:
json.dump(data, fp, ensure_ascii=False, indent=2, sort_keys=True)
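# Expected layout of target_dir after a run (dataset and series codes are
# illustrative):
#   provider.json
#   category_tree.json
#   NPTD/dataset.json
#   NPTD/DNK.1.0.0.0.NPTD.tsv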