Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
eurostat-fetcher
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dbnomics-fetchers
eurostat-fetcher
Commits
dc86fd7a
Commit
dc86fd7a
authored
1 year ago
by
Christophe Benz
Browse files
Options
Downloads
Patches
Plain Diff
Handle case where leaves have children in table of contents
parent
93249b6d
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Pipeline
#429612
canceled with stages
Changes
1
Pipelines
16
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
convert.py
+29
-52
29 additions, 52 deletions
convert.py
with
29 additions
and
52 deletions
convert.py
+
29
−
52
View file @
dc86fd7a
...
...
@@ -218,70 +218,47 @@ def iter_datasets_to_convert(
yield
dataset_code
,
source_dataset_dir
def
toc_to_category_tree
(
source_dir
:
Path
,
dataset_codes_to_convert
:
set
[
str
]
):
def
toc_to_category_tree
(
source_dir
:
Path
):
"""
Walk recursively table_of_contents.xml and return category_tree_json and dataset.json stubs.
"""
# Parse "table_of_contents", abbreviated "toc".
toc_element
=
etree
.
parse
(
str
(
source_dir
/
"
table_of_contents.xml
"
)).
getroot
()
dataset_json_stubs
=
{}
category_tree_json
=
toc_element_to
_category_tree
(
toc_element
,
dataset_json_stubs
,
dataset_codes_to_convert
)
category_tree_json
=
list
(
iter
_category_tree
_nodes
(
toc_element
,
dataset_json_stubs
)
)
return
category_tree_json
,
dataset_json_stubs
def
toc_element_to
_category_tree
(
xml_element
,
dataset_json_stubs
,
dataset_codes_to_convert
:
set
[
str
])
:
def
iter
_category_tree
_nodes
(
xml_element
,
dataset_json_stubs
)
->
Iterator
[
dict
[
str
,
Any
]]
:
"""
Walk recursively xml_element (table_of_contents.xml) and return category_tree_json.
Side-effects: fill dataset_json_stubs.
"""
xml_element_tag
=
xml_element
.
tag
[
len
(
"
urn:eu.europa.ec.eurostat.navtree
"
)
+
2
:]
if
xml_element_tag
==
"
tree
"
:
return
list
(
filter
(
None
,
(
toc_element_to_category_tree
(
child_element
,
dataset_json_stubs
,
dataset_codes_to_convert
)
for
child_element
in
xml_element
),
)
)
for
child_element
in
xml_element
:
yield
from
iter_category_tree_nodes
(
child_element
,
dataset_json_stubs
)
elif
xml_element_tag
==
"
branch
"
:
children
=
list
(
filter
(
None
,
(
toc_element_to_category_tree
(
child_element
,
dataset_json_stubs
,
dataset_codes_to_convert
)
for
child_element
in
xml_element
.
iterfind
(
"
{*}children/*
"
)
),
)
)
return
(
without_falsy_values
(
children
=
[
child
for
child_element
in
xml_element
.
iterfind
(
"
{*}children/*
"
)
for
child
in
iter_category_tree_nodes
(
child_element
,
dataset_json_stubs
)
]
if
children
:
yield
without_falsy_values
(
{
"
code
"
:
xml_element
.
findtext
(
"
{*}code
"
),
"
name
"
:
xml_element
.
findtext
(
"
{*}title[@language=
'
en
'
]
"
),
"
children
"
:
children
,
}
)
if
children
else
None
)
elif
xml_element_tag
==
"
leaf
"
and
xml_element
.
attrib
[
"
type
"
]
in
(
"
dataset
"
,
"
table
"
,
):
dataset_code
=
xml_element
.
findtext
(
"
{*}code
"
)
if
dataset_code
not
in
dataset_codes_to_convert
:
return
None
dataset_url
=
xml_element
.
findtext
(
"
{*}downloadLink[@format=
'
sdmx
'
]
"
)
if
not
dataset_url
:
log
.
debug
(
"
Ignoring the dataset %r from the category tree because it does not provide a SDMX download link
"
,
dataset_code
,
)
return
None
elif
xml_element_tag
==
"
leaf
"
and
xml_element
.
attrib
[
"
type
"
]
in
{
"
dataset
"
,
"
table
"
}:
dataset_code
=
xml_element
.
findtext
(
"
{*}code
"
)
dataset_name
=
xml_element
.
findtext
(
"
{*}title[@language=
'
en
'
]
"
)
# Datasets can appear multiple time in the category tree
if
dataset_code
not
in
dataset_json_stubs
:
dataset_json_stubs
[
dataset_code
]
=
{
"
code
"
:
dataset_code
,
...
...
@@ -289,19 +266,22 @@ def toc_element_to_category_tree(xml_element, dataset_json_stubs, dataset_codes_
"
description
"
:
xml_element
.
findtext
(
"
{*}shortDescription[@language=
'
en
'
]
"
)
or
None
,
"
doc_href
"
:
xml_element
.
findtext
(
"
{*}metadata[@format=
'
html
'
]
"
)
or
None
,
}
return
{
yield
{
"
code
"
:
dataset_code
,
"
name
"
:
dataset_name
,
}
for
child_element
in
xml_element
.
iterfind
(
"
{*}children/*
"
):
yield
from
iter_category_tree_nodes
(
child_element
,
dataset_json_stubs
)
else
:
log
.
warning
(
"
Unexpected node type: {!r}, type {!r} (code {!r})
"
.
format
(
xml_element_tag
,
xml_element
.
attrib
[
"
type
"
],
xml_element
.
findtext
(
"
{*}code
"
),
)
"
Unexpected node type: %r, type %r (code %r)
"
,
xml_element_tag
,
xml_element
.
attrib
[
"
type
"
],
xml_element
.
findtext
(
"
{*}code
"
),
)
return
None
def
main
()
->
int
:
...
...
@@ -355,10 +335,7 @@ def main() -> int:
)
)
dataset_codes_to_convert
=
{
dataset_code
for
(
dataset_code
,
_
)
in
datasets_to_convert
}
category_tree_json
,
dataset_json_stubs
=
toc_to_category_tree
(
source_dir
=
args
.
source_dir
,
dataset_codes_to_convert
=
dataset_codes_to_convert
)
category_tree_json
,
dataset_json_stubs
=
toc_to_category_tree
(
source_dir
=
args
.
source_dir
)
convert_datasets
(
datasets_to_convert
=
datasets_to_convert
,
dataset_json_stubs
=
dataset_json_stubs
,
target_dir
=
args
.
target_dir
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment