Commit 330ee40f authored by Pierre Dittgen

Use pathlib

parent b143b831
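
This commit replaces os.path string handling with pathlib.Path objects throughout the converter. A minimal sketch of the equivalences it applies, using illustrative names rather than anything from the script itself:

    import os
    from pathlib import Path

    # Before: paths are plain strings manipulated through os / os.path.
    base = '/tmp/data'
    target = os.path.join(base, 'out.json')

    # After: paths are Path objects with methods for the same operations.
    base = Path('/tmp/data')
    target = base / 'out.json'              # '/' joins path segments
    found = target.exists()                 # replaces os.path.exists(target)
    base.mkdir(exist_ok=True)               # replaces os.path.exists + os.mkdir
    texts = sorted(base.glob('*.TXT'))      # replaces filtering os.listdir(base)
    with target.open('w', encoding='utf-8') as fp:  # replaces open(target, ...)
        fp.write('{}')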
@@ -26,11 +26,11 @@
import argparse
import json
import os
import re
import sys
from collections import OrderedDict
from itertools import count, islice
from pathlib import Path
from slugify import slugify
@@ -136,35 +136,43 @@ root_categories = [
),
]
source_filename_re = re.compile(r'AMECO(?P<number>\d+)\.TXT$')
SOURCE_FILENAME_RE = re.compile(r'AMECO(?P<number>\d+)\.TXT$')
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'source_dir',
'source_dir', type=Path,
help='path of source directory containing Ameco series in their original format',
)
parser.add_argument(
'target_dir',
'target_dir', type=Path,
help='path of target directory containing datasets & series in Widukind format',
)
args = parser.parse_args()
source_dir = args.source_dir
if not source_dir.exists():
parser.error("Source dir {!r} not found".format(str(source_dir)))
target_dir = args.target_dir
if not target_dir.exists():
parser.error("Target dir {!r} not found".format(str(target_dir)))
datasets_code = set()
source_filenames = sorted(
source_filepaths = sorted(
(
source_filename
for source_filename in os.listdir(args.source_dir)
if source_filename_re.match(source_filename) is not None
txt_filepath
for txt_filepath in source_dir.glob('AMECO*.TXT')
if SOURCE_FILENAME_RE.match(txt_filepath.name) is not None
),
key=lambda filename: int(source_filename_re.match(filename).group('number')),
key=lambda txt_filepath: int(SOURCE_FILENAME_RE.match(txt_filepath.name).group('number')),
)
categories_tree_json = root_categories.copy()
for root_category, source_filename in zip(categories_tree_json, source_filenames):
for root_category, source_filepath in zip(categories_tree_json, source_filepaths):
dataset_json = None
sub_category = None
with open(os.path.join(args.source_dir, source_filename), 'r', encoding='utf-8') as source_file:
with source_filepath.open('r', encoding='utf-8') as source_file:
# Parse first line.
first_line = next(source_file)
labels = list(filter(None, (label.strip() for label in first_line.split(';'))))
@@ -187,10 +195,7 @@ def main():
dataset_code = entry['CODE'].rsplit('.', 1)[-1]
if dataset_json is None or entry['TITLE'] != dataset_json['name']: # pylint: disable=unsubscriptable-object
if dataset_json is not None:
write_json_file(
os.path.join(dataset_dir, 'dataset.json'), # pylint: disable=used-before-assignment
dataset_json,
)
write_json_file(dataset_dir / 'dataset.json', dataset_json)
if dataset_code in datasets_code:
# A few datasets use the same code for different titles => Generate a new code:
for i in count(1):
@@ -218,9 +223,9 @@ def main():
"code": dataset_json['code'],
"name": dataset_json['name'],
})
dataset_dir = os.path.join(args.target_dir, dataset_json['code'])
if not os.path.exists(dataset_dir):
os.mkdir(dataset_dir)
dataset_dir = target_dir / dataset_json['code']
if not dataset_dir.exists():
dataset_dir.mkdir()
else:
assert dataset_code == dataset_json['code'] or dataset_code == dataset_json['code'] + 'R' or \
re.match(r'{}-\d+$'.format(re.escape(dataset_code)), dataset_json['code']), \
@@ -245,20 +250,21 @@ def main():
)
dataset_json.setdefault('series', []).append(series_json)
with open(os.path.join(dataset_dir, '{}.tsv'.format(series_code)), 'w', encoding='utf-8') as tsv_file:
tsv_filepath = dataset_dir / '{}.tsv'.format(series_code)
with tsv_filepath.open('w', encoding='utf-8') as tsv_file:
tsv_file.write('PERIOD\tVALUE\n')
for period, value in islice(entry.items(), 5, None):
tsv_file.write('{}\t{}\n'.format(period, value))
write_json_file(os.path.join(dataset_dir, 'dataset.json'), dataset_json)
write_json_file(dataset_dir / 'dataset.json', dataset_json)
write_json_file(os.path.join(args.target_dir, 'provider.json'), provider_json)
write_json_file(os.path.join(args.target_dir, 'category_tree.json'), categories_tree_json)
write_json_file(target_dir / 'provider.json', provider_json)
write_json_file(target_dir / 'category_tree.json', categories_tree_json)
return 0
def write_json_file(file_path, data):
with open(file_path, 'w', encoding='utf-8') as fp:
def write_json_file(file_path: Path, data):
with file_path.open('w', encoding='utf-8') as fp:
json.dump(data, fp, ensure_ascii=False, indent=2, sort_keys=True)
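
With type=Path on the positional arguments, argparse converts the raw command-line strings itself, so main() receives Path objects and the exists() checks and '/' joins above need no further conversion. A standalone sketch of that pattern together with the numeric filename sort (the argument value '.' is just a stand-in directory):

    import argparse
    import re
    from pathlib import Path

    FILENAME_RE = re.compile(r'AMECO(?P<number>\d+)\.TXT$')

    parser = argparse.ArgumentParser()
    parser.add_argument('source_dir', type=Path)  # argparse calls Path() on the string
    args = parser.parse_args(['.'])               # stand-in for a real source directory
    assert isinstance(args.source_dir, Path)

    # glob() narrows candidates by pattern, the regex enforces the numeric suffix,
    # and the int key sorts AMECO2.TXT before AMECO10.TXT (numeric, not lexicographic).
    filepaths = sorted(
        (p for p in args.source_dir.glob('AMECO*.TXT') if FILENAME_RE.match(p.name)),
        key=lambda p: int(FILENAME_RE.match(p.name).group('number')),
    )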