Commit 4ed0595f authored by Bruno Duyé

Convert - first version

parent 29bf7db3
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# bea-fetcher -- Fetch series from http://www.bea.gov
# By: Bruno Duyé <bruno.duye@cepremap.org>
#
# Copyright (C) 2017 Cepremap
# https://git.nomics.world/dbnomics-fetchers/bea-fetcher
#
# This is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Convert BEA series to DBnomics format.
Usage:
{self_filename} <source_dir> <target_dir> [options]
source_dir: path of source directory containing BEA series generated by download.py script
target_dir: path of target directory to write datasets & series in DBnomics format
=> all files will be deleted
Options:
--debug show debug output and make some additional tests during the process
--only <dataset_code> only convert given dataset_code
"""
import logging
import os
import sys

from babel import numbers
from docopt import docopt
from slugify import slugify
import ujson as json

DATAPACKAGE_JSON = {
    "dbnomics": {
        "data_model_version": "0.7.5"
    }
}

PROVIDER_JSON = dict(
    code='BEA',
    name='U.S. Bureau of Economic Analysis',
    website='http://www.bea.gov',
    region='US',
)
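
# For reference: write_json_file() below serializes with UTF-8, sorted keys
# and a 2-space indent, so <target_dir>/provider.json comes out as:
#
#   {
#     "code": "BEA",
#     "name": "U.S. Bureau of Economic Analysis",
#     "region": "US",
#     "website": "http://www.bea.gov"
#   }
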
log = logging.getLogger(__name__)


def main():
    global log
    global args
    # Parse command line arguments
    args = docopt(__doc__.format(self_filename=os.path.basename(__file__)))
    source_dir = args['<source_dir>']
    assert os.path.exists(source_dir), "Error: source dir {!r} not found".format(source_dir)
    target_dir = args['<target_dir>']
    debug_mode = args['--debug']
    logging.basicConfig(level=(logging.DEBUG if debug_mode else logging.INFO), format='%(message)s')
    # Write datapackage.json
    write_json_file(os.path.join(target_dir, 'datapackage.json'), DATAPACKAGE_JSON)
    # Write provider.json
    write_json_file(os.path.join(target_dir, 'provider.json'), PROVIDER_JSON)
    # List source_dir subdirs: one per dataset
    datasets_dirnames = sorted([
        dirname for dirname in os.listdir(source_dir)
        if os.path.isdir(os.path.join(source_dir, dirname)) and not dirname.startswith('.')
    ])
    if args['--only']:
        # Honor the --only option documented above: restrict conversion to the given dataset code
        datasets_dirnames = [dirname for dirname in datasets_dirnames if dirname == args['--only']]
    for dataset_code in datasets_dirnames:
        # Create dataset dir
        log.debug("* dataset {}".format(dataset_code))
        target_dataset_path = os.path.join(target_dir, dataset_code)
        os.mkdir(target_dataset_path)
        # Iterate through tables (sets of series)
        source_dataset_dir = os.path.join(source_dir, dataset_code)
        # Open dataset.json and read tables information
        with open(os.path.join(source_dataset_dir, "dataset.json")) as dataset_json_file:
            dataset_json = json.load(dataset_json_file)
        for table_json in dataset_json:
            table_filename = table_json["filename"]
            with open(os.path.join(source_dataset_dir, table_filename)) as table_file:
                table_source_json = json.load(table_file)
            log.debug("- table {!r}".format(table_filename))
            # Read each observation and create series files; observations are
            # assumed to be grouped by series code in the source file
            current_series_code = None
            current_observations_file = None
            for observation_json in table_source_json["Data"]:
                series_code = observation_json["SeriesCode"]
                if series_code != current_series_code:
                    # Beginning of a new series (first observation, or the series code changed)
                    log.debug("  - series {!r}".format(series_code))
                    if current_observations_file:
                        current_observations_file.close()
                    current_series_code = series_code
                    current_observations_filename = "{}-{}.tsv".format(
                        slugify(current_series_code), table_json['dimension_code'])
                    current_observations_filepath = os.path.join(target_dataset_path, current_observations_filename)
                    assert not os.path.exists(current_observations_filepath), \
                        "Error: observations file already exists: {!r}".format(current_observations_filepath)
                    current_observations_file = open(current_observations_filepath, "w")
                    current_observations_file.write("PERIOD\tVALUE\n")
                # Prepare value: parse the US-formatted number (e.g. "1,234.5") to a plain decimal string
                observation_value_str = str(numbers.parse_decimal(observation_json["DataValue"], locale='en_US'))
                current_observations_file.write(observation_json["TimePeriod"] + "\t" + observation_value_str + "\n")
            if current_observations_file:
                # Close the observations file of the table's last series
                current_observations_file.close()
    log.info('\nEND')
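
# Sketch of the data layout this script assumes, inferred from the keys read
# above (the series code, period and value shown are illustrative only):
#
#   <source_dir>/<dataset_code>/dataset.json       list of table entries, each with
#                                                  "filename" and "dimension_code" keys
#   <source_dir>/<dataset_code>/<table filename>   BEA payload whose "Data" list holds
#                                                  observations like {"SeriesCode": "A191RL",
#                                                  "TimePeriod": "2016Q1", "DataValue": "1,234.5"}
#
# Each series becomes <target_dir>/<dataset_code>/<slugified-series-code>-<dimension_code>.tsv,
# starting with a PERIOD/VALUE header line.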


def write_json_file(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file_:
        json.dump(data, file_, ensure_ascii=False, indent=2, sort_keys=True)


if __name__ == '__main__':
    sys.exit(main())