@@ -38,6 +38,7 @@ Options:
   --debug                              show debug output
   --only <datasets_codes>              convert given dataset(s)_code(s). Ex: "--only ABA,BGG"
   -l --limit_nb_datasets <number>      limit the number of datasets to convert
+  --from <dataset_code>                start the conversion from given dataset code
 """
@@ -97,6 +98,7 @@ def main():
     datasets_files = sorted(source_dir.glob('*.json'))
     ignored_datasets = total_nb_datasets = converted_datasets = 0
     datasets_in_error = []
+    from_option_first_dataset_reached = False  # Only used when "--from" option is given
     for dataset_file in datasets_files:
         if dataset_file.name == "category_tree.json":
             continue
@@ -107,6 +109,12 @@
             continue
         if args["--limit_nb_datasets"] and converted_datasets == int(args["--limit_nb_datasets"]):
             break
+        if args["--from"]:
+            if dataset_code == args["--from"]:
+                from_option_first_dataset_reached = True
+            if not from_option_first_dataset_reached:
+                ignored_datasets += 1
+                continue
         try:
             dataset_dict = iterate_dataset(dataset_file)
         except DatasetAbortedException as e:
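The flag reads top to bottom: every dataset sorted before the --from code is counted as ignored and skipped; once the matching code is seen, from_option_first_dataset_reached stays true for the rest of the iteration. A minimal standalone sketch of the same pattern, with invented dataset codes:

    codes = ["ABA", "BGG", "CZV"]  # sorted, like datasets_files
    start_from = "BGG"             # value of --from (hypothetical)
    reached = False
    for code in codes:
        if code == start_from:
            reached = True
        if not reached:
            continue  # ABA is skipped and counted as ignored
        print(code)   # converts BGG, then CZV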
@@ -129,7 +137,7 @@ def main():
         except DatasetAbortedException as e:
             log.warning("{!r} dataset aborted ! - {}".format(dataset_code, e))
             # Delete dataset dir
-            dataset_dir.rmdir()
+            shutil.rmtree(str(dataset_dir))
             continue
         # Write dataset.json
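This one-line fix matters because Path.rmdir(), like os.rmdir(), only removes empty directories and raises OSError as soon as the aborted dataset has already written files; shutil.rmtree() deletes the directory recursively. A quick sketch of the difference (the str() call is presumably there for Python < 3.6, where shutil.rmtree() did not accept Path objects):

    import shutil
    import tempfile
    from pathlib import Path

    dataset_dir = Path(tempfile.mkdtemp())
    (dataset_dir / "series.jsonl").write_text("partial output\n")

    # dataset_dir.rmdir()  # would raise OSError: Directory not empty
    shutil.rmtree(str(dataset_dir))  # removes the directory and its contents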
@@ -152,7 +160,7 @@ def main():
     log.info('END')
     if ignored_datasets > 0:
-        log.info("{} ignored dataset(s) due to '--only' option".format(ignored_datasets))
+        log.info("{} ignored dataset(s) due to '--only' or '--from' options".format(ignored_datasets))
     if datasets_in_error:
         log.error("{}/{} dataset(s) ignored due to errors: {}".format(len(datasets_in_error), total_nb_datasets, ', '.join(datasets_in_error)))
@@ -253,24 +261,28 @@ def iterate_dataset(filepath):
             'csv_content': 'PERIOD\tVALUE\n2006\t844.0\n2011\t539.0\n',
         }
         """
-        # Replace None values by 'NA'
-        dataframe.fillna('NA', inplace=True)
-        # Make periods format match DBnomics periods format
-        format_periods(dataframe, time_dimension)
-        # Do the hard job: cut dataframe onto series
-        dataframe = dataframe.set_index(time_dimension.label)
-        group_by = dataframe.groupby(list(real_dimensions_codes))
-        # Iterate through those sub dataframes and return series data
-        for dimensions_values_codes, sub_df in group_by:
-            # Ensure dimensions_values_codes is a list
-            dimensions_values_codes = ensure_list(dimensions_values_codes)
-            series_code = '.'.join(dimensions_values_codes)
-            log.debug("[{}]".format(series_code))
-            yield {
-                'code': series_code,
-                'dimensions': dict(zip(real_dimensions_codes, dimensions_values_codes)),
-                'observations': get_series_observations(sub_df),
-            }
+        try:
+            # Replace None values by 'NA'
+            dataframe.fillna('NA', inplace=True)
+            # Make periods format match DBnomics periods format
+            format_periods(dataframe, time_dimension)
+            # Do the hard job: cut dataframe onto series
+            dataframe = dataframe.set_index(time_dimension.label)
+            group_by = dataframe.groupby(list(real_dimensions_codes))
+            # Iterate through those sub dataframes and return series data
+            for dimensions_values_codes, sub_df in group_by:
+                # Ensure dimensions_values_codes is a list
+                dimensions_values_codes = ensure_list(dimensions_values_codes)
+                series_code = '.'.join(dimensions_values_codes)
+                log.debug("[{}]".format(series_code))
+                yield {
+                    'code': series_code,
+                    'dimensions': dict(zip(real_dimensions_codes, dimensions_values_codes)),
+                    'observations': get_series_observations(sub_df),
+                }
+        except Exception as e:
+            log.exception(e)
+            raise DatasetAbortedException(str(e))
 
     log.info("* Parsing {}".format(filepath))
     try:
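Two things are easy to miss here. First, because this is a generator body, nothing in the try block runs until the caller iterates the series, so the DatasetAbortedException is raised mid-iteration in main(), which is presumably why the rmdir-to-rmtree hunk above now has to remove a partially written, non-empty dataset directory. Second, the series split itself is plain pandas groupby; a toy illustration with invented dimension codes:

    import pandas as pd

    df = pd.DataFrame({
        'FREQ':   ['A', 'A', 'A', 'A'],
        'GEO':    ['DE', 'DE', 'FR', 'FR'],
        'PERIOD': ['2006', '2011', '2006', '2011'],
        'VALUE':  [912.0, 601.0, 844.0, 539.0],
    })
    df = df.set_index('PERIOD')
    # Each (FREQ, GEO) combination becomes one series
    for codes, sub_df in df.groupby(['FREQ', 'GEO']):
        print('.'.join(codes), sub_df['VALUE'].tolist())
    # A.DE [912.0, 601.0]
    # A.FR [844.0, 539.0]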