Skip to content
Snippets Groups Projects
Commit dc86fd7a authored by Christophe Benz's avatar Christophe Benz
Browse files

Handle case where leaves have children in table of contents

parent 93249b6d
No related branches found
No related tags found
No related merge requests found
Pipeline #429612 canceled with stages
......@@ -218,70 +218,47 @@ def iter_datasets_to_convert(
yield dataset_code, source_dataset_dir
def toc_to_category_tree(source_dir: Path, dataset_codes_to_convert: set[str]):
def toc_to_category_tree(source_dir: Path):
"""Walk recursively table_of_contents.xml and return category_tree_json and dataset.json stubs."""
# Parse "table_of_contents", abbreviated "toc".
toc_element = etree.parse(str(source_dir / "table_of_contents.xml")).getroot()
dataset_json_stubs = {}
category_tree_json = toc_element_to_category_tree(toc_element, dataset_json_stubs, dataset_codes_to_convert)
category_tree_json = list(iter_category_tree_nodes(toc_element, dataset_json_stubs))
return category_tree_json, dataset_json_stubs
def toc_element_to_category_tree(xml_element, dataset_json_stubs, dataset_codes_to_convert: set[str]):
def iter_category_tree_nodes(xml_element, dataset_json_stubs) -> Iterator[dict[str, Any]]:
"""Walk recursively xml_element (table_of_contents.xml) and return category_tree_json.
Side-effects: fill dataset_json_stubs.
"""
xml_element_tag = xml_element.tag[len("urn:eu.europa.ec.eurostat.navtree") + 2 :]
if xml_element_tag == "tree":
return list(
filter(
None,
(
toc_element_to_category_tree(child_element, dataset_json_stubs, dataset_codes_to_convert)
for child_element in xml_element
),
)
)
for child_element in xml_element:
yield from iter_category_tree_nodes(child_element, dataset_json_stubs)
elif xml_element_tag == "branch":
children = list(
filter(
None,
(
toc_element_to_category_tree(child_element, dataset_json_stubs, dataset_codes_to_convert)
for child_element in xml_element.iterfind("{*}children/*")
),
)
)
return (
without_falsy_values(
children = [
child
for child_element in xml_element.iterfind("{*}children/*")
for child in iter_category_tree_nodes(child_element, dataset_json_stubs)
]
if children:
yield without_falsy_values(
{
"code": xml_element.findtext("{*}code"),
"name": xml_element.findtext("{*}title[@language='en']"),
"children": children,
}
)
if children
else None
)
elif xml_element_tag == "leaf" and xml_element.attrib["type"] in (
"dataset",
"table",
):
dataset_code = xml_element.findtext("{*}code")
if dataset_code not in dataset_codes_to_convert:
return None
dataset_url = xml_element.findtext("{*}downloadLink[@format='sdmx']")
if not dataset_url:
log.debug(
"Ignoring the dataset %r from the category tree because it does not provide a SDMX download link",
dataset_code,
)
return None
elif xml_element_tag == "leaf" and xml_element.attrib["type"] in {"dataset", "table"}:
dataset_code = xml_element.findtext("{*}code")
dataset_name = xml_element.findtext("{*}title[@language='en']")
# Datasets can appear multiple times in the category tree
if dataset_code not in dataset_json_stubs:
dataset_json_stubs[dataset_code] = {
"code": dataset_code,
......@@ -289,19 +266,22 @@ def toc_element_to_category_tree(xml_element, dataset_json_stubs, dataset_codes_
"description": xml_element.findtext("{*}shortDescription[@language='en']") or None,
"doc_href": xml_element.findtext("{*}metadata[@format='html']") or None,
}
return {
yield {
"code": dataset_code,
"name": dataset_name,
}
for child_element in xml_element.iterfind("{*}children/*"):
yield from iter_category_tree_nodes(child_element, dataset_json_stubs)
else:
log.warning(
"Unexpected node type: {!r}, type {!r} (code {!r})".format(
xml_element_tag,
xml_element.attrib["type"],
xml_element.findtext("{*}code"),
)
"Unexpected node type: %r, type %r (code %r)",
xml_element_tag,
xml_element.attrib["type"],
xml_element.findtext("{*}code"),
)
return None
def main() -> int:
......@@ -355,10 +335,7 @@ def main() -> int:
)
)
dataset_codes_to_convert = {dataset_code for (dataset_code, _) in datasets_to_convert}
category_tree_json, dataset_json_stubs = toc_to_category_tree(
source_dir=args.source_dir, dataset_codes_to_convert=dataset_codes_to_convert
)
category_tree_json, dataset_json_stubs = toc_to_category_tree(source_dir=args.source_dir)
convert_datasets(
datasets_to_convert=datasets_to_convert, dataset_json_stubs=dataset_json_stubs, target_dir=args.target_dir
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment