Commits (2)
@@ -43,7 +43,7 @@ job:
 cd ${PROVIDER_SLUG}-source-data
 time find -not -path "./.git/*" -not -name ".git" -delete
 cd ..
-time python3 download.py ${PROVIDER_SLUG}-source-data
+time python3 download.py --log info ${PROVIDER_SLUG}-source-data
 cd ${PROVIDER_SLUG}-source-data
 time git add -A
 time git commit -m "New download" --quiet || true
@@ -56,7 +56,7 @@ job:
 cd ${PROVIDER_SLUG}-json-data
 time find -not -path "./.git/*" -not -name ".git" -delete
 cd ..
-time python3 convert.py ${PROVIDER_SLUG}-source-data ${PROVIDER_SLUG}-json-data
+time python3 convert.py --log info ${PROVIDER_SLUG}-source-data ${PROVIDER_SLUG}-json-data
 cd ${PROVIDER_SLUG}-json-data
 time git add -A
 time git commit -m "New conversion" --quiet || true
@@ -26,6 +26,7 @@ import json
 import logging
 import re
 import sys
+from operator import itemgetter
 from pathlib import Path

 from lxml import etree
@@ -50,17 +51,20 @@ DAY_RE = re.compile(r'^([12]\d{3})-([01]\d)-([0-3]\d)$')
 # Helps normalize space
 NORM_SPACES_RE = re.compile(r'\s+')


-def write_series_file(file_path: Path, periods, observations, obs_addon_cols, obs_addon_values):
-    """ write series tsv file from period and value list """
+def compute_series_observations(periods, observations, obs_addon_cols, obs_addon_values):
+    """Computes series observations"""

+    def norm_obs_value(value_str):
+        try:
+            return float(value_str)
+        except ValueError:
+            return "NA"
+
+    norm_observations = list(map(norm_obs_value, observations))
     header = ['PERIOD', 'VALUE', *obs_addon_cols]
-    values = [periods, observations, *obs_addon_values]
-    with file_path.open(mode='wt', encoding='UTF-8') as fd:
-        fd.write('{}\n'.format('\t'.join(header)))
-        for tup in zip(*values):
-            fd.write('{}\n'.format('\t'.join(tup)))
+    values = [periods, norm_observations, *obs_addon_values]
+    return [header] + list(zip(*values))


 def norm_space(str_value):
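For reference, the refactored helper no longer writes anything to disk: it returns the header row followed by one tuple per period, with observation values normalized to float or "NA". A quick illustration with invented sample data:

    # Illustration only (sample data invented):
    rows = compute_series_observations(
        periods=['2019-01', '2019-02'],
        observations=['1.5', '.'],        # non-numeric values normalize to "NA"
        obs_addon_cols=['OBS_STATUS'],
        obs_addon_values=[['A', 'M']],
    )
    # rows == [['PERIOD', 'VALUE', 'OBS_STATUS'],
    #          ('2019-01', 1.5, 'A'),
    #          ('2019-02', 'NA', 'M')]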
@@ -180,7 +184,7 @@ def load_series_xml_file(xml_file: Path):
     return info


-def generate_series(ds_dir, source_dir, dimension_codes):
+def generate_series_jsonl(ds_dir, source_dir, dimension_codes):
     """ Browse source dir for XML files, extract content and save series in dataset dir """

     def get_series_code_from_filename(filename):
@@ -213,13 +217,10 @@ def generate_series(ds_dir, source_dir, dimension_codes):
             obs = obs[::-1]
             obs_addon_values = [l[::-1] for l in obs_addon_values]

-        # Write TSV file
-        ts_filepath = ds_dir / '{}.tsv'.format(ts_code)
-        write_series_file(ts_filepath, periods, obs, obs_addon_cols, obs_addon_values)
-
         # and populates time series info list
         series_info = {
             'code': ts_code,
+            'observations': compute_series_observations(periods, obs, obs_addon_cols, obs_addon_values)
         }

         # Using structure info if available
@@ -233,7 +234,11 @@ def generate_series(ds_dir, source_dir, dimension_codes):
             series_info['attributes'] = xml_info['attributes']

         series_info_list.append(series_info)

-    return series_info_list
+    series_jsonl_filepath = ds_dir / "series.jsonl"
+    with series_jsonl_filepath.open('wt', encoding='utf-8') as fd:
+        for series_info in sorted(series_info_list, key=itemgetter('code')):
+            json.dump(series_info, fd, sort_keys=True, ensure_ascii=False)
+            fd.write('\n')


 def load_structure_xml_file(ds_code, structure_file: Path):
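Each series now ends up as one JSON object per line in series.jsonl, written with sorted keys and ordered by series code. A hypothetical line (code, periods and values invented; real objects also carry dimensions and 'attributes' when structure info is available):

    {"code": "A.B.C", "observations": [["PERIOD", "VALUE"], ["2019-01", 1.5], ["2019-02", "NA"]]}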
@@ -357,8 +362,7 @@ def load_structure_xml_file(ds_code, structure_file: Path):
 def generate_dataset(ds_code, source_dir: Path, structure_file: Path, ds_dir: Path, dataset_info_dict):
     """ Generates dataset info and time series """

-    if not ds_dir.exists():
-        ds_dir.mkdir()
+    ds_dir.mkdir(exist_ok=True)

     # Loads struct info (if exists)
     # Note: no structure info for BBK01
@@ -369,11 +373,10 @@ def generate_dataset(ds_code, source_dir: Path, structure_file: Path, ds_dir: Pa
     # Generates series
     dimension_codes = [code['id'] for code in ds_struct_info['codelists']] if ds_struct_info else []
-    series_info = generate_series(ds_dir, source_dir, dimension_codes)
+    generate_series_jsonl(ds_dir, source_dir, dimension_codes)

     dataset_info = {
         'code': ds_code,
-        'series': sorted(series_info, key=lambda s: s['code'])
     }

     if ds_struct_info:
         dataset_info = {**dataset_info, **ds_struct_info['dataset']}
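Since the series list is no longer embedded in dataset_info, consumers would stream it back from series.jsonl instead. A minimal reader sketch (function name illustrative):

    # Hypothetical consumer: iterate series objects one line at a time.
    import json
    from pathlib import Path

    def iter_series(ds_dir: Path):
        with (ds_dir / 'series.jsonl').open('rt', encoding='utf-8') as fd:
            for line in fd:
                yield json.loads(line)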