From 03c751a5f055e7ff90cd324c03cede6ea22a72c3 Mon Sep 17 00:00:00 2001
From: Christophe Benz <christophe.benz@cepremap.org>
Date: Fri, 1 Dec 2017 10:07:28 +0100
Subject: [PATCH] Log datasets counter

---
 eurostat_to_dbnomics.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/eurostat_to_dbnomics.py b/eurostat_to_dbnomics.py
index 9b69ca6..893cfeb 100755
--- a/eurostat_to_dbnomics.py
+++ b/eurostat_to_dbnomics.py
@@ -281,14 +281,17 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_blob_name, dsd_
 
 
 def toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_dataset_code, data_package_tree, xml_element,
-                            processed_datasets_codes):
+                            processed_datasets_codes, leaf_index):
+    """
+    Note: leaf_index is a singleton list because the function parameter must be modified between recursive calls.
+    """
     xml_element_tag = xml_element.tag[len(toc_nsmap["nt"]) + 2:]
     if xml_element_tag == "tree":
         return list(filter(
             None,
             (
                 toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_dataset_code, data_package_tree,
-                                        child_element, processed_datasets_codes)
+                                        child_element, processed_datasets_codes, leaf_index)
                 for child_element in xml_element
             )
         ))
@@ -297,7 +300,7 @@ def toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_data
             None,
             (
                 toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_dataset_code, data_package_tree,
-                                        child_element, processed_datasets_codes)
+                                        child_element, processed_datasets_codes, leaf_index)
                 for child_element in xml_element.iterfind("nt:children/*", namespaces=toc_nsmap)
             )
         ))
@@ -326,6 +329,7 @@ def toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_data
             data_package_tree.add(dataset_code.encode('utf-8'), git_tree_filemode, dataset_pair[0].encode('utf-8'))
         return categories_tree_dataset_json
     else:
+        leaf_index[0] += 1
         if (args.datasets is None or dataset_code in args.datasets) and \
                 (args.exclude_datasets is None or dataset_code not in args.exclude_datasets) and \
                 (args.start_from is None or dataset_code == args.start_from):
@@ -342,7 +346,8 @@ def toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_data
                 if sdmx_blob is not None:
                     if dataset_code not in processed_datasets_codes:
                         sdmx_blob_data = sdmx_blob.data
-                        log.info("Converting SDMX source file %s (size: %d)", sdmx_blob_name, len(sdmx_blob_data))
+                        log.info("Converting SDMX source file %s (nb %d, size: %d)",
+                                 sdmx_blob_name, leaf_index[0], len(sdmx_blob_data))
                         pack_start_time = time.time()
                         write_pack(
                             pack_file_path,
@@ -459,7 +464,7 @@ def main():
     processed_datasets_codes = set()
     categories_tree_json = toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_dataset_code,
                                                    data_package_tree, xml_element=toc_element,
-                                                   processed_datasets_codes=processed_datasets_codes)
+                                                   processed_datasets_codes=processed_datasets_codes, leaf_index=[0])
 
     # Write datasets index in Git repository, which was modified above by a side-effect.
     # TODO Write during iteration in case script crashes.
-- 
GitLab
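
The docstring added by the patch motivates passing leaf_index as a one-element ("singleton") list so that the counter incremented in one recursive call remains visible to the caller and to later calls. The following minimal sketch, separate from the patch, illustrates the same pattern under simplified assumptions; the names count_leaves, tree and counter are hypothetical and do not exist in eurostat_to_dbnomics.py.

def count_leaves(node, leaf_index):
    """Walk a nested list, bumping leaf_index[0] at every leaf.

    leaf_index is a one-element list: mutating its single cell is shared by
    every recursion frame, whereas rebinding an int parameter
    (leaf_index += 1) would only change the local name in that frame.
    """
    if isinstance(node, list):
        # Branch node: recurse into children, sharing the same counter.
        for child in node:
            count_leaves(child, leaf_index)
    else:
        # Leaf node: count it, analogous to the leaf datasets counted in the patch.
        leaf_index[0] += 1
        print("leaf nb %d: %r" % (leaf_index[0], node))


if __name__ == "__main__":
    tree = ["a", ["b", ["c", "d"]], "e"]
    counter = [0]
    count_leaves(tree, counter)
    print("total leaves: %d" % counter[0])  # -> total leaves: 5

An alternative would be to return the updated count from each call or to use a module-level counter, but threading a small mutable cell through the calls keeps the change local to toc_xml_element_to_json, which is presumably why the patch takes this route.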