Commit 8db87886 authored by Pierre Dittgen

Filter category_tree to retain only existing datasets

parent 21d6881a
Pipeline #89988 passed with stage
in 3 minutes and 6 seconds
......@@ -37,6 +37,7 @@ Options:
"""
import copy
import logging
import os
import sys
......@@ -61,6 +62,36 @@ PROVIDER_JSON = dict(
log = logging.getLogger(__name__)
def filter_category_tree(dataset_codes):
    """Return a pruned copy of the category tree limited to given datasets.

    Walks ``ine_fetcher_common.CATEGORY_TREE`` as three levels
    (category -> sub-category -> dataset) and keeps only the datasets whose
    ``'code'`` is contained in ``dataset_codes``. A sub-category left with
    no dataset is dropped, and so is a category left with no sub-category.

    The source tree is not modified: each kept sub-category is a deep copy
    and each kept category a shallow copy, both with their ``'children'``
    replaced. The dataset entries themselves are the same objects as in the
    source tree.
    """
    pruned_tree = []
    # 1st level: categories
    for category in ine_fetcher_common.CATEGORY_TREE:
        kept_sub_categories = []
        # 2nd level: sub-categories
        for sub_category in category.get('children', []):
            # 3rd level: retain only the datasets that really exist
            kept_datasets = [
                entry
                for entry in sub_category.get('children', [])
                if entry['code'] in dataset_codes
            ]
            if not kept_datasets:
                # Empty branch: drop this sub-category
                continue
            sub_category_copy = copy.deepcopy(sub_category)
            sub_category_copy['children'] = kept_datasets
            kept_sub_categories.append(sub_category_copy)
        if not kept_sub_categories:
            # Empty branch: drop this category
            continue
        category_copy = copy.copy(category)
        category_copy['children'] = kept_sub_categories
        pruned_tree.append(category_copy)
    return pruned_tree
def main():
global log
global args
......@@ -79,6 +110,11 @@ def main():
datasets_list = sorted(f for f in os.listdir(source_dir) if os.path.isfile(
os.path.join(source_dir, f))) # List source_dir sub-dirs
# dataset_code accumulator
dataset_codes = set()
# For each dataset
for table_filename in datasets_list:
dataset_code = table_filename.replace('.json', '')
if args["--only"] and not dataset_code in args["--only"]:
......@@ -95,6 +131,7 @@ def main():
dimensions_values_labels = defaultdict(dict)
nb_series_converted = 1
series_iterator = iter_series(source_filepath)
for series_dict in series_iterator:
series_code = generate_series_code({k: list(v.keys())[0]
for k, v in series_dict['dimensions_values'].items()})
......@@ -127,8 +164,11 @@ def main():
'dimensions_values_labels': dimensions_values_labels,
})
# Write categories_tree.json file
write_json_file(os.path.join(target_dir, 'categories_tree.json'), ine_fetcher_common.CATEGORY_TREE)
# accumulate
dataset_codes.add(dataset_code)
# Write category_tree.json file
write_json_file(os.path.join(target_dir, 'category_tree.json'), filter_category_tree(dataset_codes))
log.info('\nEND')
......
......@@ -86,7 +86,7 @@ def download_table(table_code):
try:
json_response = json.loads(response.content)
except ValueError as e:
log.info('ERROR: response is not JSON, ignoring table {}!'.format(table_code))
log.warning('Response is not JSON, ignoring table %s!', table_code)
raise e
if isinstance(json_response, list):
# Series list
......@@ -101,7 +101,7 @@ def download_table(table_code):
assert table_code
url = DOWNLOAD_TABLE_URL.format(table_code)
print('Downloading {}'.format(url))
log.info('Downloading %s', url)
# Try to get table, waiting a little each time to let the server process request
log.debug("Get {!r}".format(url))
waited_time = 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment