WIP: #45 - WTO: Write datasets in JSON repo for "annually" category

Closed Bruno Duyé requested to merge dev into master
@@ -44,7 +44,7 @@ import xlrd
 from docopt import docopt
 from slugify import slugify
-from dbnomics_converters.base import verified_value, to_float
+from dbnomics_converters.base import assert_no_error, to_float
 from dbnomics_converters.datasets import validate_dataset
 from dbnomics_converters.providers import validate_provider
 from dbnomics_converters.series import validate_series
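
Throughout this diff, `verified_value` is replaced by `assert_no_error`. Judging by the `(value, error)` tuple convention the converters use elsewhere (see `v_, error = to_float(value)` further down), the renamed helper presumably unwraps such a tuple and fails loudly when an error is present. A minimal sketch, assuming that convention (this is an illustration, not the actual `dbnomics_converters` code):

def assert_no_error(value_and_error):
    """Unwrap a (value, error) tuple, raising if the error slot is set."""
    value, error = value_and_error
    assert not error, "Validation error: {!r}".format(error)
    return value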
@@ -55,8 +55,8 @@ log = logging.getLogger(__name__)
 PROVIDER = dict(
-    name='WTO',
-    long_name='World Trade Organization',
+    code='WTO',
+    name='World Trade Organization',
     region='World',
     website='https://www.wto.org/',
     terms_of_use='https://www.wto.org/english/tratop_e/trips_e/intel2_e.htm'
@@ -213,7 +213,7 @@ def main():
     # Create provider.json
     provider_json_data = PROVIDER
     provider_json_data['categories'] = [category['name'] for category in CATEGORIES]
-    provider_json_data = verified_value(validate_provider(provider_json_data, format='json'))
+    provider_json_data = assert_no_error(validate_provider(provider_json_data, format='json'))
     write_json_file(os.path.join(target_dir, 'provider.json'), provider_json_data)
     for category in CATEGORIES:
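
With the `name`/`long_name` pair replaced by `code`/`name`, the provider.json written by main() would take roughly this shape (the `categories` entries below are hypothetical; the real list comes from CATEGORIES):

# Illustrative content of provider.json after this change; category
# names are made up -- only "annually" is suggested by the MR title.
provider_json_data = {
    'code': 'WTO',
    'name': 'World Trade Organization',
    'region': 'World',
    'website': 'https://www.wto.org/',
    'terms_of_use': 'https://www.wto.org/english/tratop_e/trips_e/intel2_e.htm',
    'categories': ['annually'],  # hypothetical
}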
@@ -242,7 +242,7 @@ def create_directories_subtree(category_or_dataset, parent_category_path):
             ],
             'category_code': category['code']
         }
-        # category_json_data = verified_value(validate_category(category_json_data, format='json', used_categories_code=used_categories_code))
+        # category_json_data = assert_no_error(validate_category(category_json_data, format='json', used_categories_code=used_categories_code))
         write_json_file(os.path.join(category_path, element_dirname, 'category.json'), category_json_data)

     element_type = category_or_dataset['type']
@@ -262,7 +262,7 @@ def create_directories_subtree(category_or_dataset, parent_category_path):
         elif element['type'] == 'dataset':
             create_dataset_and_series(element, element_path)
         else:
-            raise "Unexpted type: {}".format(element['type'])
+            raise ValueError("Unexpected type: {}".format(element['type']))


 def create_dataset_and_series(dataset, parent_category_path):
@@ -281,7 +281,7 @@ def create_dataset_and_series(dataset, parent_category_path):
     elif filename.endswith('.xls'):
         create_dataset_and_series_from_xls(dataset, dataset_path)
     else:
-        raise ValueError("Unexepted file format: {} for dataset \"{}\"".format(filename, dataset['name']))
+        raise ValueError("Unexpected file format: {} for dataset {!r}".format(filename, dataset['name']))


 def create_dataset_and_series_from_csv(dataset, dataset_path):
@@ -382,7 +382,7 @@ def create_dataset_and_series_from_csv(dataset, dataset_path):
             for dimension_code, label in dimensions_key_labels_by_dimension_code.items()
         ]
         series = {
-            'key': '-'.join(dimensions_key_values_codes),
+            'code': '-'.join(dimensions_key_values_codes),
             'dimensions': dict(zip(dimensions_codes, dimensions_key_values_codes)),
         }
@@ -400,29 +400,29 @@ def create_dataset_and_series_from_csv(dataset, dataset_path):
         )
         # Create series.json
-        series = verified_value(validate_series(series, format='json'))
+        series = assert_no_error(validate_series(series, format='json'))
         write_json_file(os.path.join(series_dir_path, 'series.json'), series)

     # prepare data to be written in dataset.json
     dataset_json_data = {
         'name': dataset['name'],
-        'dataset_code': dataset['code'],
-        'codelists': {
+        'code': dataset['code'],
+        'dimensions_values_labels': {
             dimension_code: {
                 dimension_value_code: dimensions_values_labels[dimension_code][dimension_value_code]
                 for dimension_value_code in dimensions_values_labels_codes
             }
             for dimension_code, dimensions_values_labels_codes in found_dimensions_values_codes.items()
         },
-        'concepts': {
+        'dimensions_labels': {
             cached_slugify(dimension_label): dimension_label
             for dimension_label in dimensions_labels
         },
-        'dimension_keys': dimensions_codes,
+        'dimensions_codes_order': dimensions_codes,
     }
     dataset_json_data["series"] = list(sorted(series_directories_names))
-    dataset_json_data = verified_value(validate_dataset(dataset_json_data, format='json',
-                                                        skip_series_duplicates_checking=True))
+    dataset_json_data = assert_no_error(validate_dataset(dataset_json_data, format='json',
+                                                         skip_series_duplicates_check=True))
     write_json_file(os.path.join(dataset_path, 'dataset.json'), dataset_json_data)
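
The renames in this hunk (`dataset_code` to `code`, `codelists` to `dimensions_values_labels`, `concepts` to `dimensions_labels`, `dimension_keys` to `dimensions_codes_order`) change the shape of the written dataset.json. A sketch of the resulting structure, with made-up values for illustration only:

# Hypothetical dataset.json content after this rename; every value below
# is invented -- only the key names come from the diff.
dataset_json_data = {
    'name': 'Merchandise trade',           # hypothetical
    'code': 'merchandise_trade',           # hypothetical
    'dimensions_codes_order': ['flow', 'region'],
    'dimensions_labels': {'flow': 'Flow', 'region': 'Region'},
    'dimensions_values_labels': {
        'flow': {'X': 'Exports', 'M': 'Imports'},
        'region': {'world': 'World'},
    },
    'series': ['M-world', 'X-world'],      # sorted series directory names
}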
@@ -450,10 +450,10 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
     # Ensure all elements match regexp and there are no "holes"
     for col_num, value in enumerate(periods_row):
         row_coords = (row_num, first_non_empty_cell_col_number + col_num)  # coords in file
-        assert value, "Unexepted empty cell at position {} ({}) in {} sheet".format(
+        assert value, "Unexpected empty cell at position {} ({}) in {} sheet".format(
             row_coords, xlrd.cellname(*row_coords), sheet.name)
         assert re.match(period_regexp, value), \
-            "Unexepted period \"{}\" at position {} ({}) in {} sheet".format(
+            "Unexpected period {!r} at position {} ({}) in {} sheet".format(
                 value, row_coords, xlrd.cellname(*row_coords), sheet.name)
     return first_non_empty_cell_col_number, periods_row
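
These assertions rely on a `period_regexp` defined outside this diff. For the "annually" category named in the MR title, a four-digit-year pattern seems plausible; a self-contained sketch of the check under that assumption:

import re

# Hypothetical annual-period pattern; the real period_regexp is defined
# elsewhere in the converter and is not shown in this diff.
period_regexp = r'^\d{4}$'

periods_row = ['2014', '2015', '2016']
for col_num, value in enumerate(periods_row):
    assert value, "Unexpected empty cell in column {}".format(col_num)
    assert re.match(period_regexp, value), \
        "Unexpected period {!r} in column {}".format(value, col_num)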
@@ -464,7 +464,7 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
     series_dir_path = os.path.join(dataset_path, series_directory_name)
     os.mkdir(series_dir_path)
     # Create series.json
-    series = verified_value(validate_series(series, format='json'))
+    series = assert_no_error(validate_series(series, format='json'))
     write_json_file(os.path.join(series_dir_path, 'series.json'), series)
     # Write series observations
     write_series_tsv_file(series_dir_path, observations, unit)
@@ -488,14 +488,14 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
             continue
         # Stop parsing if dataset defines a "stop_text" and this text is found in row
         if dataset.get('stop_text') and dataset.get('stop_text') in row:
-            print("Info: reached stop text \"{}\" at line {}".format(dataset.get('stop_text'), row_num))
+            print("Info: reached stop text {!r} at line {}".format(dataset.get('stop_text'), row_num))
             break
         # Parse data rows
         region_code = row[xls_constants['regions_codes_col_num']].strip()
         region_label = row[xls_constants['regions_labels_col_num']].strip()
         # Skip rows without data
         if set(row[first_value_col_num:]) == {''}:
-            print("Info: row {} - ignoring {}".format(row_num, "\"{}\"".format(region_label)
+            print("Info: row {} - ignoring {}".format(row_num, "{!r}".format(region_label)
                 if region_label else "line {}".format(row_num)))
             continue
         assert region_label, "No region label found at row {}".format(row_num)
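
Several messages in this diff switch from escaped quotes to the `{!r}` conversion, which formats the value's repr. Besides being shorter, it quotes strings automatically and makes stray whitespace visible:

# {!r} formats the repr of the argument, so strings come out quoted and
# trailing whitespace shows up in the message:
print("Info: reached stop text {!r} at line {}".format("Total ", 42))
# -> Info: reached stop text 'Total ' at line 42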
@@ -508,7 +508,7 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
             v_, error = to_float(value)
             if error:
                 row_coords = (row_num, first_value_col_num + i)
-                assert not error, "Unexpected value \"{}\" at position {} ({}) in sheet \"{}\"".format(
+                assert not error, "Unexpected value {!r} at position {} ({}) in sheet {!r}".format(
                     value, row_coords, xlrd.cellname(*row_coords), sheet.name)
             else:
                 converted_values.append(str(value))
@@ -520,7 +520,7 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
         generated_region_code = region_code or slugify(region_label)
         regions_codes.add((region_label, generated_region_code))
         series = dict(
-            key=series_directory_name,
+            code=series_directory_name,
             dimensions=dict(
                 Flow=flow_code,
                 Region=generated_region_code
@@ -535,9 +535,9 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
     dimension_codes = [slugify(label) for label in xls_constants['dimensions_labels']]
     dataset_json_data = {
         'name': dataset['name'],
-        'dataset_code': dataset['code'],
-        'dimension_keys': dimension_codes,
-        'codelists': {
+        'code': dataset['code'],
+        'dimensions_codes_order': dimension_codes,
+        'dimensions_values_labels': {
             'flow': {
                 label_and_code[1]: label_and_code[0]
                 for label_and_code in xls_constants['flow_codes_and_names_by_sheet_names'].values()
@@ -547,14 +547,14 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
                 for region in regions_codes
             }
         },
-        'concepts': dict(zip(
+        'dimensions_labels': dict(zip(
             dimension_codes,
             xls_constants['dimensions_labels'],
         )),
     }
     # Finally, write dataset.json
     dataset_json_data["series"] = list(sorted(series_directories_names))
-    dataset_json_data = verified_value(validate_dataset(dataset_json_data, format='json'))
+    dataset_json_data = assert_no_error(validate_dataset(dataset_json_data, format='json'))
     write_json_file(os.path.join(dataset_path, 'dataset.json'), dataset_json_data)
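
`write_json_file` is called throughout but defined outside this diff. A minimal sketch of what such a helper typically looks like in a converter that writes a JSON repository (assumed, not the project's actual definition):

import json

# Assumed helper, since its definition is not part of this diff. Sorting
# keys and keeping non-ASCII characters readable are common choices for
# producing diff-friendly JSON files.
def write_json_file(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=2, sort_keys=True)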