Commit 6817feef authored by Pierre Dittgen

Sanzi dataset ok

parent 24b301ef
Pipeline #19267 passed with stage in 17 seconds
@@ -23,8 +23,10 @@
 http://www.meti.go.jp
 """
 import argparse
+import csv
 import logging
 import os
+import shutil
 import convert_util as cu
 LOG = logging.Logger('meti fetcher')
@@ -46,6 +48,12 @@ PROVIDER_JSON = dict(
     website='http://www.meti.go.jp/english/'
 )
+DATASETS_INFO = {
+    'iip': 'Indices of Industrial Production by Industry (2010 = 100.0)',
+    'sanzi': 'Indices of Tertiary Industry Activity by industry (2010 = 100.0)'
+}
 FREQ_LABEL_MAP = {
     'A': 'annual',
     'Q': 'quarterly',
@@ -156,6 +164,10 @@ class CSVLineHelper:
             self.period_infos.append(p_info)
             code_set.add(code)
+
+    @staticmethod
+    def norm_obs_values(obs_values):
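+        # Map empty cells and '-' placeholders to 'NA', e.g. ['1.2', '', '-'] -> ['1.2', 'NA', 'NA']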
+        return ['NA' if elt in ('', '-') else elt for elt in obs_values]
+
     def generate_tsv_from_csv_row(self, cols):
         """ Generates all TSV from a CSV line """
@@ -171,7 +183,7 @@
             period_values = period_info['norm_period_values']
             ci = period_info['col_interval']
-            obs_values = cols[ci[0]:ci[1]]
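+            # Slice this period's columns and normalize missing values to 'NA'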
+            obs_values = CSVLineHelper.norm_obs_values(cols[ci[0]:ci[1]])
             assert len(period_values) == len(obs_values)
             # Write TSV file
@@ -215,10 +227,10 @@ def extract_timeseries_from_csv(csv_code, csv_filepath, ds_dir, with_concept=False):
         in_data = False
         csv_lh = None
+        reader = csv.reader(csv_fd, delimiter=',', quotechar='"')
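+        # csv.reader handles quoted fields (e.g. embedded commas), unlike the previous str.split(',')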
         # Reads input CSV line by line
-        for line in csv_fd.readlines():
-            # print('Line #{}'.format(idx))
-            cols = line.strip().split(',')
+        for cols in reader:
             # Most frequent use case
             if in_data:
@@ -295,7 +307,7 @@ def write_dataset_json(json_filepath, ds_code, ds_name, series_info, with_concept=False):
                 'O': 'Original',
             }
         },
-        'dimensions_code_order': [
+        'dimensions_codes_order': [
             'frequency', 'seasonal_adjustment',
         ],
         'series': []
@@ -310,7 +322,7 @@
             'PIR': "producer's inventory ratio of finished goods",
             'PIRA': "producer's inventory ratio of finished goods (average)",
         }
-        dataset_data['dimensions_code_order'].append('concept')
+        dataset_data['dimensions_codes_order'].append('concept')
     for si in series_info:
         series_dict = dict(
@@ -323,6 +335,34 @@
     cu.write_json_file(json_filepath, dataset_data)
+
+
+def clean_csv_files(source_dir):
+    """ Fix the CSV files found in source_dir, store the fixed versions in a temp directory
+    and return the temp directory path """
+    import tempfile
+    temp_dir = tempfile.mkdtemp(prefix='sanzi')
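+    # NB: the caller is responsible for removing this directory (main() calls shutil.rmtree)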
+    for filename in os.listdir(source_dir):
+        if filename[-4:] != '.csv':
+            continue
+        orig_csv_filepath = os.path.join(source_dir, filename)
+        with open(orig_csv_filepath, mode='rb') as bin_fd:
+            bcontent = bin_fd.read()
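+        # 0x81 0x6A is a stray Shift_JIS fullwidth right parenthesis left over from the source
+        # encoding; replace it with an ASCII ')' so the file decodes cleanly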
+        fixed_bcontent = bcontent.replace(b'\x81\x6A', b')')
+        fixed_csv_filepath = os.path.join(temp_dir, filename)
+        with open(fixed_csv_filepath, mode='wb') as bin_fd:
+            bin_fd.write(fixed_bcontent)
+    return temp_dir
+
+
+def write_category_tree_json(json_filepath):
+    """ Creates category_tree data and saves it as a json file """
+    category_tree_data = [
+        {'code': 'iip', 'name': DATASETS_INFO['iip']},
+        {'code': 'sanzi', 'name': DATASETS_INFO['sanzi']},
+    ]
+    cu.write_json_file(json_filepath, category_tree_data)
 def main():
     """ Converts downloaded CSV files into datasets and time series """
     parser = argparse.ArgumentParser(description=__doc__,
@@ -346,13 +386,19 @@ def main():
     # Standard metadata
     cu.write_json_file(os.path.join(target_dir, 'provider.json'), PROVIDER_JSON)
     cu.write_json_file(os.path.join(target_dir, 'datapackage.json'), DATAPACKAGE_JSON)
+    write_category_tree_json(os.path.join(target_dir, 'category_tree.json'))
+    # iip dataset
+    generate_dataset('iip', os.path.join(source_dir, 'b2010_g1e'), DATASETS_INFO['iip'], target_dir, with_concept=True)
+    # sanzi dataset
+    sanzi_source_dir = os.path.join(source_dir, 'b2010_ke')
+    # sanzi CSV files contain a bad byte sequence: clean them first and store the fixed CSVs in a temporary directory
+    temp_dir = clean_csv_files(sanzi_source_dir)
-    # Works on file
-    generate_dataset('iip', os.path.join(source_dir, 'b2010_g1e'),
-                     'Indices of Industrial Production by Industry (2010 = 100.0)', target_dir, with_concept=True)
-    # TODO: manage non ASCII characters
-    # generate_dataset('sanzi', os.path.join(source_dir, 'b2010_ke'),
-    #                  'Indices of Tertiary Industry Activity by industry (2010 = 100.0)', target_dir)
+    generate_dataset('sanzi', temp_dir, DATASETS_INFO['sanzi'], target_dir)
+    shutil.rmtree(temp_dir)
 if __name__ == '__main__':