Commit 08e18a3f authored by Bruno Duyé

Add handling of "tree" datasets (#217)

parent 9f8ffa82
......@@ -2,6 +2,10 @@
Common code between download and convert
"""
# List of datasets that are presented as a tree in the XLS files => an extra dimension must be added
TREE_DATASETS = ["GDPbyIndustry-6", "GDPbyIndustry-7", "GDPbyIndustry-30", "GDPbyIndustry-29",
"GDPbyIndustry-27", "GDPbyIndustry-26", "GDPbyIndustry-25"]
APPENDIX_TO_DOWNLOAD = [
{
'code': 'NIPA',
......@@ -16,16 +20,14 @@ APPENDIX_TO_DOWNLOAD = [
'dimensions_values_keyname': 'FrequencyID',
'series_data_url': 'https://www.bea.gov/api/data/?&UserID={api_user_id}&method=GetData&DataSetName={appendix_code}&TableName={dataset_code}&Frequency={dimension_value_code}&Year=ALL',
},
'returned_data': {
'dimensions': [
{
'dimension_code': 'concept',
'dimension_label': 'Concept',
'dimensions_value_code_field': None,
'dimensions_value_label_field': "LineDescription",
},
]
},
'dimensions': [
{
'dimension_code': 'concept',
'dimension_label': 'Concept',
'dimensions_value_code_field': None,
'dimensions_value_label_field': "LineDescription",
},
],
},
{
'code': 'NIUnderlyingDetail',
......@@ -40,16 +42,14 @@ APPENDIX_TO_DOWNLOAD = [
'dimensions_values_keyname': 'FrequencyID',
'series_data_url': 'https://www.bea.gov/api/data/?&UserID={api_user_id}&method=GetData&DataSetName={appendix_code}&TableName={dataset_code}&Frequency={dimension_value_code}&Year=ALL',
},
'returned_data': {
'dimensions': [
{
'dimension_code': 'concept',
'dimension_label': 'Concept',
'dimensions_value_code_field': None,
'dimensions_value_label_field': "LineDescription",
},
]
},
'dimensions': [
{
'dimension_code': 'concept',
'dimension_label': 'Concept',
'dimensions_value_code_field': None,
'dimensions_value_label_field': "LineDescription",
},
],
},
{
'code': 'FixedAssets',
......@@ -59,16 +59,14 @@ APPENDIX_TO_DOWNLOAD = [
'datasets_list_url': 'https://bea.gov/api/data/?&UserID={api_user_id}&method=GetParameterValues&DataSetName={appendix_code}&ParameterName=TableID',
'series_data_url': 'https://www.bea.gov/api/data/?&UserID={api_user_id}&method=GetData&DataSetName={appendix_code}&TableID={dataset_code}&Year=X',
},
'returned_data': {
'dimensions': [
{
'dimension_code': 'concept',
'dimension_label': 'Concept',
'dimensions_value_code_field': None,
'dimensions_value_label_field': "LineDescription",
},
]
},
'dimensions': [
{
'dimension_code': 'concept',
'dimension_label': 'Concept',
'dimensions_value_code_field': None,
'dimensions_value_label_field': "LineDescription",
},
]
},
# 'MNE', # Appendix E – Direct Investment and Multinational Enterprises (MNEs)
# Direct Investment (DI) OR Activities of Multinational Enterprises (AMNE)
......@@ -76,6 +74,8 @@ APPENDIX_TO_DOWNLOAD = [
{
'code': 'GDPbyIndustry',
'name': 'Appendix F – Gross Domestic Product by Industry (GDPbyIndustry)',
# Special case for GDPbyIndustry (#217): series are presented as a tree => the same industry code can have more than one "IndustrYDescription"
'industries_labels_url': "https://bea.gov/api/data/?UserID={api_user_id}&method=GetParameterValues&DataSetName=GDPBYINDUSTRY&ParameterName=INDUSTRY",
'cut_on': {
'dimension_code': 'FREQ',
'dimension_label': 'Frequency',
......@@ -87,16 +87,21 @@ APPENDIX_TO_DOWNLOAD = [
'dimensions_values_keyname': 'Key',
'series_data_url': 'https://www.bea.gov/api/data/?&UserID={api_user_id}&method=GetData&DataSetName={appendix_code}&TableID={dataset_code}&Frequency={dimension_value_code}&Industry=ALL&Year=ALL',
},
'returned_data': {
'dimensions': [
{
'dimension_code': 'industry',
'dimension_label': 'Industry',
'dimensions_value_code_field': 'Industry',
'dimensions_value_label_field': 'IndustrYDescription',
},
]
}
'dimensions': [
{
'dimension_code': 'industry',
'dimension_label': 'Industry',
'dimensions_value_code_field': 'Industry',
'dimensions_value_label_field': "IndustrYDescription", # Except for TREE_DATASETS
}
],
'additional_dimension': {
# This dimension is only used for datasets presented as a tree in the XLS files (TREE_DATASETS); see the sketch after this file's diff
'dimension_code': 'sub-industry',
'dimension_label': 'Industry precision',
'dimensions_value_code_field': None,
'dimensions_value_label_field': 'IndustrYDescription',
},
},
# 'ITA', # Appendix G – ITA (International Transactions)
......@@ -124,16 +129,14 @@ APPENDIX_TO_DOWNLOAD = [
'dimensions_values_keyname': 'Key',
'series_data_url': 'https://www.bea.gov/api/data/?&UserID={api_user_id}&method=GetData&DataSetName=underlyingGDPbyIndustry&Year=ALL&Industry=ALL&tableID={dataset_code}&Frequency={dimension_value_code}'
},
'returned_data': {
'dimensions': [
{
'dimension_code': 'industry',
'dimension_label': 'Industry',
'dimensions_value_code_field': 'Industry',
'dimensions_value_label_field': 'IndustrYDescription',
},
]
}
'dimensions': [
{
'dimension_code': 'industry',
'dimension_label': 'Industry',
'dimensions_value_code_field': 'Industry',
'dimensions_value_label_field': 'IndustrYDescription',
},
],
},
# {
# 'code': 'IntlServTrade',
......
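As a rough illustration of the 'additional_dimension' entry above, here is a sketch (under assumptions, not the commit's actual converter) of how a convert step could build the dimensions of one observation: the regular 'industry' dimension keeps its 'Industry' code, and TREE_DATASETS tables gain a 'sub-industry' value derived from 'IndustrYDescription', since no code field is configured for the extra dimension. The observation format and the code-from-label derivation are hypothetical.

def dimensions_for_observation(observation, appendix_dict, is_tree):
    # Regular dimensions, driven by the 'dimensions' config entries
    dims = {}
    for dim in appendix_dict['dimensions']:
        code_field = dim['dimensions_value_code_field']
        dims[dim['dimension_code']] = observation[code_field] if code_field else None
    if is_tree:
        # Extra dimension for TREE_DATASETS: no code field is configured,
        # so derive a value code from the label
        extra = appendix_dict['additional_dimension']
        label = observation[extra['dimensions_value_label_field']]
        dims[extra['dimension_code']] = label.strip().lower().replace(' ', '-')
    return dims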
......@@ -67,7 +67,8 @@ def main():
requests_session = Session()
# http://www.coglib.com/~icordasc/blog/2014/12/retries-in-requests.html
# backoff_factor=2 makes it sleep for 2 * (2 ^ (retry_number - 1)) seconds, i.e. 0, 2, 4, 8, 16, 32 ...
requests_session.mount('http://', HTTPAdapter(max_retries=Retry(total=50, backoff_factor=2, status_forcelist=[500, 503, 504])))
requests_session.mount('http://', HTTPAdapter(max_retries=Retry(total=50,
backoff_factor=2, status_forcelist=[500, 503, 504])))
# Parse command line arguments
args = docopt(__doc__.format(self_filename=os.path.basename(__file__)))
target_dir = args['<target_dir>']
......@@ -85,14 +86,17 @@ def main():
appendix_path = os.path.join(target_dir, appendix_code)
os.mkdir(appendix_path)
log.info("** {}".format(appendix_code))
datasets_list_url = appendix_dict['api']['datasets_list_url'].format(api_user_id=api_user_id, appendix_code=appendix_code)
datasets_list_url = appendix_dict['api']['datasets_list_url'].format(
api_user_id=api_user_id, appendix_code=appendix_code)
datasets_list = get_from_api(datasets_list_url)
nb_tables_downloaded = 0
datasets_json = [] # Data to be written to datasets.json
# For each dataset
for dataset_info in datasets_list["ParamValue"]:
dataset_code = dataset_info.get('TableName') or dataset_info.get('TableID') or dataset_info.get('TableNumber') or dataset_info.get('Key')
assert dataset_code, "Error: can't find dataset code in dataset_info (dataset {!r}):\n{!r}".format(appendix_code, dataset_info)
dataset_code = dataset_info.get('TableName') or dataset_info.get(
'TableID') or dataset_info.get('TableNumber') or dataset_info.get('Key')
assert dataset_code, "Error: can't find dataset code in dataset_info (dataset {!r}):\n{!r}".format(
appendix_code, dataset_info)
# Download series data and write series file
if args['--limit_nb_datasets'] and nb_tables_downloaded >= int(args['--limit_nb_datasets']):
break
......@@ -116,10 +120,19 @@ def main():
}))
# Write datasets.json
write_json_file(os.path.join(appendix_path, "datasets.json"), datasets_json)
# Special case for GDPbyIndustry: download "Industry" labels
if appendix_code == "GDPbyIndustry":
industries_labels = get_gdp_by_industry_labels_from_api(appendix_dict['industries_labels_url'])
log.debug("industries_labels: {!r}".format(industries_labels))
write_json_file(os.path.join(appendix_path, "industries_labels.json"), industries_labels["ParamValue"])
log.info('END')
def get_gdp_by_industry_labels_from_api(url):
return get_from_api(url.format(api_user_id=get_api_user_id_from_env()))
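# Editor's sketch (not part of this commit): how a convert step might read the
# industries_labels.json file written above to label the "industry" dimension
# of TREE_DATASETS tables. The "Key" and "Desc" field names are an assumption
# about the GetParameterValues response; the helper name is hypothetical.
def load_industries_labels(appendix_path):
    with open(os.path.join(appendix_path, "industries_labels.json")) as industries_file:
        entries = json.load(industries_file)
    return {entry["Key"].strip(): entry["Desc"].strip() for entry in entries}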
def iter_data(dataset_code, appendix_dict):
"""Yields dimension_value_code, dataset_observations_json tuples
=> in case of no "cut" on any dimension, dimension_value_code is None
......@@ -141,7 +154,8 @@ def iter_data(dataset_code, appendix_dict):
yield dimension_value_code, dataset_observations_json
else:
# No dimension to "cut" on
url = appendix_dict['api']['series_data_url'].format(api_user_id=api_user_id, appendix_code=appendix_code, dataset_code=dataset_code)
url = appendix_dict['api']['series_data_url'].format(
api_user_id=api_user_id, appendix_code=appendix_code, dataset_code=dataset_code)
dataset_observations_json = get_from_api(url, raise_on_errors=False)
if dataset_observations_json:
yield None, dataset_observations_json
......@@ -168,7 +182,8 @@ def get_from_api(url, raise_on_errors=True):
for location in error_locations:
if dicttoolz.get_in(location, json):
error_dict = dicttoolz.get_in(location, json)
error_description = dicttoolz.get_in(['ErrorDetail', 'Description'], error_dict) or error_dict.get('APIErrorDescription')
error_description = dicttoolz.get_in(['ErrorDetail', 'Description'],
error_dict) or error_dict.get('APIErrorDescription')
break
if error_dict:
if raise_on_errors:
......@@ -206,7 +221,8 @@ def get_from_api(url, raise_on_errors=True):
try:
content_dict = json.loads(content_str)
except Exception as e:
log.exception("API call: {!r}\nException during conversion of API result to json. Result saved to last_api_result.json file.".format(url))
log.exception(
"API call: {!r}\nException during conversion of API result to json. Result saved to last_api_result.json file.".format(url))
with open('last_api_result.json', 'wb') as _f:
_f.write(content_str)
raise e
......