Commit 5cfc3ba0 authored by Bruno Duyé

Convert: adapt to "new" source's datasets.json + rename code variables to reflect that it is finally a dataset
parent 6f99e97b
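For context, the rewritten loader in this diff expects each appendix subdirectory of `<source_dir>` to ship a `datasets.json` file. Here is a minimal sketch of the shape it reads; the field names come from the code below, but every value is invented for illustration:

```python
# Hypothetical example of an appendix's datasets.json, as consumed by treat_dataset()
# below. Field names are from the diff; the values are made up.
datasets_information = [
    {
        "dataset_code": "example-dataset",      # names the target dataset directory
        "files": [
            {
                "filename": "example-table.json",  # JSON file holding a {"Data": [...]} observations list
                "dimension_code": "A",             # appended to each SeriesCode to build the series code
            },
        ],
    },
]
```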
@@ -68,8 +68,10 @@ def main():
     global args
     # Parse command line arguments
     args = docopt(__doc__.format(self_filename=os.path.basename(__file__)))
+    global source_dir
     source_dir = args['<source_dir>']
     assert os.path.exists(source_dir)
+    global target_dir
     target_dir = args['<target_dir>']
     debug_mode = args['--debug']
     logging.basicConfig(level=(logging.DEBUG if debug_mode else logging.INFO), format='%(message)s')
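The `args` keys read in this hunk come from the script's docopt usage string, which is outside the diff. A plausible minimal form of that docstring, assuming only the two positional arguments and the one flag visible here (the real one may differ):

```python
# Hypothetical usage string that would produce args['<source_dir>'],
# args['<target_dir>'] and args['--debug'] as read above.
"""
Usage:
  {self_filename} [--debug] <source_dir> <target_dir>

Options:
  --debug    Display debug output.
"""
```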
@@ -80,48 +82,60 @@ def main():
     # Write provider.json
     write_json_file(os.path.join(target_dir, 'provider.json'), PROVIDER_JSON)
-    datasets_dirnames = sorted([dirname for dirname in os.listdir(source_dir)
+    appendix_dirnames = sorted([dirname for dirname in os.listdir(source_dir)
                                 if os.path.isdir(os.path.join(source_dir, dirname)) and not dirname.startswith('.')
                                 ])  # List source_dir subdirs
-    for dataset_code in datasets_dirnames:
-        # Create dataset dir
-        log.debug("* dataset {}".format(dataset_code))
-        target_dataset_path = os.path.join(target_dir, dataset_code)
-        os.mkdir(target_dataset_path)
-        # Iterate through tables (set of series)
-        source_dataset_dir = os.path.join(source_dir, dataset_code)
-        # open dataset.json and read tables information
-        with open(os.path.join(source_dataset_dir, "dataset.json")) as dataset_json_file:
-            dataset_json = json.load(dataset_json_file)
-        for table_json in dataset_json:
-            table_filename = table_json["filename"]
-            with open(os.path.join(source_dataset_dir, table_filename)) as table_file:
-                table_source_json = json.load(table_file)
-            log.debug("- table {!r}".format(table_filename))
-            # Read each observation and create series files
-            current_series_code = None
-            current_observations_file = None
-            for observation_json in table_source_json["Data"]:
-                series_code = observation_json["SeriesCode"]
-                if current_series_code and series_code != current_series_code or current_series_code is None:
-                    # This is the beginning of a new series
-                    log.debug(" - series {!r}".format(series_code))
-                    if current_observations_file:
-                        current_observations_file.close()
-                    current_series_code = series_code
-                    current_observations_filename = "{}-{}.tsv".format(slugify(current_series_code), table_json['dimension_code'])
-                    current_observations_filepath = os.path.join(target_dataset_path, current_observations_filename)
-                    assert not os.path.exists(current_observations_filepath), "Error: already existing observation file {!r}".format(
-                        current_observations_filepath)
-                    current_observations_file = open(current_observations_filepath, "w")
-                    current_observations_file.write("PERIOD\tVALUE\n")
-                # Prepare value
-                observation_value_str = str(numbers.parse_decimal(observation_json["DataValue"], locale='en_us'))
-                current_observations_file.write(observation_json["TimePeriod"] + "\t" + observation_value_str + "\n")
+    for appendix_code in appendix_dirnames:
+        log.info("* Appendix: {!r}".format(appendix_code))
+        # Iterate through datasets
+        appendix_path = os.path.join(source_dir, appendix_code)
+        # open datasets.json and read datasets information
+        with open(os.path.join(appendix_path, "datasets.json")) as datasets_json_file:
+            datasets_information = json.load(datasets_json_file)
+        for dataset_dict in datasets_information:
+            treat_dataset(dataset_dict, appendix_path)
+    log.info('\nEND')
+
+
+def treat_dataset(dataset_dict, appendix_path):
+    dataset_code = dataset_dict["dataset_code"]
+    log.debug('- Dataset {!r}'.format(dataset_code))
+    # Create dataset dir
+    target_dataset_path = os.path.join(target_dir, dataset_code)
+    os.mkdir(target_dataset_path)
+    # Iterate through dataset source files
+    for dataset_source_file_info in dataset_dict['files']:
+        dataset_source_filename = dataset_source_file_info['filename']
+        log.debug(dataset_source_filename)
+        observations_source_json = None
+        with open(os.path.join(appendix_path, dataset_source_filename)) as dataset_file:
+            observations_source_json = json.load(dataset_file)
+        # Read each observation and create series files
+        current_series_code = None
+        current_observations_file = None
+        for observation_json in observations_source_json["Data"]:
+            series_code = observation_json["SeriesCode"] + '-' + dataset_source_file_info['dimension_code']
+            if current_series_code and series_code != current_series_code or current_series_code is None:
+                # This is the beginning of a new series
+                log.debug(" - series {!r}".format(series_code))
+                if current_observations_file:
+                    # This is the end of a series (so, not the beginning of the first one)
+                    current_observations_file.close()
+                dimensions = toolz.keyfilter(
+                    lambda key: key not in ["DataValue", "LineDescription", "LineNumber",
+                                            "SeriesCode", "TableName", "TimePeriod", "UNIT_MULT", "NoteRef"],
+                    observation_json
+                )
+                current_series_code = series_code
+                current_observations_filename = find_available_name_for_series_file(current_series_code, target_dataset_path)
+                current_observations_filepath = os.path.join(target_dataset_path, current_observations_filename)
+                current_observations_file = open(current_observations_filepath, "w")
+                current_observations_file.write("PERIOD\tVALUE\n")
+            # Prepare value
+            observation_value_str = str(numbers.parse_decimal(observation_json["DataValue"], locale='en_us'))
+            current_observations_file.write(observation_json["TimePeriod"] + "\t" + observation_value_str + "\n")
 def write_json_file(file_path, data):
     with open(file_path, 'w', encoding='utf-8') as file_:
         json.dump(data, file_, ensure_ascii=False, indent=2, sort_keys=True)
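`find_available_name_for_series_file` is defined elsewhere in the script and not shown in this diff. A minimal sketch of what it could do, assuming it keeps the slugify-based naming of the removed code and appends a numeric suffix instead of asserting on filename collisions:

```python
import os

from slugify import slugify  # assumed import; the script already calls slugify()


def find_available_name_for_series_file(series_code, target_dataset_path):
    # Hypothetical sketch, not the script's actual implementation: use the
    # slugified series code as base name, then bump a numeric suffix until
    # no file with that name exists in the dataset directory.
    base_name = slugify(series_code)
    candidate_filename = base_name + ".tsv"
    suffix = 1
    while os.path.exists(os.path.join(target_dataset_path, candidate_filename)):
        suffix += 1
        candidate_filename = "{}-{}.tsv".format(base_name, suffix)
    return candidate_filename
```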