Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
eurostat-fetcher
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dbnomics-fetchers
eurostat-fetcher
Commits
d7765803
Commit
d7765803
authored
7 years ago
by
Christophe Benz
Browse files
Options
Downloads
Patches
Plain Diff
Read source data from Git repo
parent
f6da856e
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
eurostat_to_dbnomics.py
+64
-40
64 additions, 40 deletions
eurostat_to_dbnomics.py
with
64 additions
and
40 deletions
eurostat_to_dbnomics.py
+
64
−
40
View file @
d7765803
...
...
@@ -81,23 +81,35 @@ def fast_iter(context, func, *args, **kwargs):
del
context
def
iter_git_objects_in_sdmx_element
(
element
,
sdmx_file_path
,
sdmx_nsmap
,
dsd_file_path
,
dsd_nsmap
,
dsd_tree
,
def find_git_object(repo, tree, fragments):
    """Resolve a path, given as a list of name fragments, to a Git object.

    Starting from ``tree``, each fragment is encoded to UTF-8 and looked up
    as an entry name; the entry's object id (second item of the entry) is
    then fetched from ``repo``.

    Parameters:
        repo: object store indexable by object id (``repo[object_id]``).
        tree: tree-like mapping from entry name (bytes) to an entry whose
            first item is the file mode and second item is the object id.
        fragments: list of path components as ``str``,
            e.g. ``["data", dataset_code]``.

    Returns:
        The object found at the end of the path, or ``None`` when a
        component is missing or when a non-final component is not a
        directory (so the path cannot be walked any further). An empty
        ``fragments`` list returns ``tree`` itself.
    """
    import stat

    current = tree
    last_index = len(fragments) - 1
    for index, fragment in enumerate(fragments):
        entry_name = fragment.encode('utf-8')
        if entry_name not in current:
            return None
        entry = current[entry_name]
        mode, object_id = entry[0], entry[1]
        # Only descend through directory entries: a blob has no children, and
        # testing membership on it would raise instead of reporting "not found".
        if index < last_index and not stat.S_ISDIR(mode):
            return None
        current = repo[object_id]
    return current
def
iter_git_objects_in_sdmx_element
(
element
,
sdmx_blob_name
,
sdmx_nsmap
,
dsd_blob_name
,
dsd_nsmap
,
dsd_element
,
dataset_json
,
dataset_tree
,
observations_tsv
,
yielded_git_object_ids
):
if
element
.
tag
==
"
{{{}}}Series
"
.
format
(
sdmx_nsmap
[
"
data
"
]):
yield
from
iter_git_objects_in_sdmx_series_element
(
element
,
sdmx_file_path
,
dsd_file_path
,
dsd_nsmap
,
dsd_tree
,
dataset_json
,
dataset_tree
,
observations_tsv
,
yielded_git_object_ids
)
yield
from
iter_git_objects_in_sdmx_series_element
(
element
,
sdmx_blob_name
,
dsd_blob_name
,
dsd_nsmap
,
dsd_element
,
dataset_json
,
dataset_tree
,
observations_tsv
,
yielded_git_object_ids
)
observations_tsv
.
clear
()
elif
element
.
tag
==
"
{{{}}}Obs
"
.
format
(
sdmx_nsmap
[
"
data
"
]):
observations_tsv
.
append
(
dict
(
element
.
attrib
))
def
iter_git_objects_in_sdmx_file
(
sdmx_file_path
,
data_package_tree
,
dataset_pair_by_dataset_code
,
dataset_json_stub
):
def
iter_git_objects_in_sdmx_file
(
source_repo
,
dataset_code
,
dataset_tree
,
sdmx_blob_name
,
sdmx_blob_data
,
data_package_tree
,
dataset_pair_by_dataset_code
,
dataset_json_stub
):
# Load DSD
dsd_
file_path
=
"
{}.dsd.xml
"
.
format
(
sdmx_file_path
[:
-
len
(
sdmx_file_extension
)]
)
with
open
(
dsd_file_path
)
as
dsd_file
:
dsd_
tree
=
etree
.
parse
(
dsd_file
)
dsd_nsmap
=
dsd_
tree
.
getroot
()
.
nsmap
.
copy
()
dsd_
blob_name
=
"
{}.dsd.xml
"
.
format
(
dataset_code
)
dsd_blob
=
find_git_object
(
source_repo
,
dataset_tree
,
[
dsd_blob_name
])
dsd_
element
=
etree
.
fromstring
(
dsd_blob
.
data
)
dsd_nsmap
=
dsd_
element
.
nsmap
.
copy
()
dsd_nsmap
[
'
message
'
]
=
dsd_nsmap
.
pop
(
None
)
dsd_nsmap
[
'
xml
'
]
=
"
http://www.w3.org/XML/1998/namespace
"
...
...
@@ -119,12 +131,13 @@ def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pai
}
dataset_tree
=
Tree
()
context
=
etree
.
iterparse
(
sdmx_file_path
,
events
=
(
"
end
"
,))
context
=
etree
.
iterparse
(
io
.
BytesIO
(
sdmx_blob_data
)
,
events
=
(
"
end
"
,))
observations_tsv
=
[]
yielded_git_object_ids
=
set
()
yield
from
fast_iter
(
context
,
iter_git_objects_in_sdmx_element
,
sdmx_file_path
,
sdmx_nsmap
,
dsd_file_path
,
dsd_nsmap
,
dsd_tree
,
dataset_json
,
dataset_tree
,
observations_tsv
,
yielded_git_object_ids
)
yield
from
fast_iter
(
context
,
iter_git_objects_in_sdmx_element
,
sdmx_blob_name
,
sdmx_nsmap
,
dsd_blob_name
,
dsd_nsmap
,
dsd_element
,
dataset_json
,
dataset_tree
,
observations_tsv
,
yielded_git_object_ids
)
dataset_json
=
without_falsy_keys
(
dataset_json
)
if
args
.
validate_json
:
...
...
@@ -142,7 +155,7 @@ def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pai
yield
dataset_tree
def
iter_git_objects_in_sdmx_series_element
(
series_element
,
sdmx_
file_path
,
dsd_file_path
,
dsd_nsmap
,
dsd_
tree
,
def
iter_git_objects_in_sdmx_series_element
(
series_element
,
sdmx_
blob_name
,
dsd_blob_name
,
dsd_nsmap
,
dsd_
element
,
dataset_json
,
dataset_tree
,
observations_tsv
,
yielded_git_object_ids
):
# Ignore some specific XML element attributes corresponding to series SDMX attributes,
# because series SDMX attributes do not exist in DB.nomics.
...
...
@@ -158,7 +171,7 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
else
:
# dimensions_codes_order must not change between series.
assert
dataset_json
[
"
dimensions_codes_order
"
]
==
dimensions_codes_order
,
\
(
sdmx_
file_path
,
dataset_json
[
"
dimensions_codes_order
"
],
dimensions_codes_order
)
(
sdmx_
blob_name
,
dataset_json
[
"
dimensions_codes_order
"
],
dimensions_codes_order
)
# Fill series dimensions labels in dataset.json.
for
dimension_code
,
dimension_value_code
in
series_element_attributes
.
items
():
...
...
@@ -166,7 +179,7 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
dimension_label_xpath
=
'
./message:Concepts/structure:ConceptScheme[@id=
"
CONCEPTS
"
]/structure:Concept[@id=
"
{}
"
]/structure:Name[@xml:lang=
"
en
"
]
'
.
format
(
dimension_code
)
dimension_label
=
dsd_
tree
.
findtext
(
dimension_label_xpath
,
namespaces
=
dsd_nsmap
)
dimension_label
=
dsd_
element
.
findtext
(
dimension_label_xpath
,
namespaces
=
dsd_nsmap
)
if
dimension_label
:
# Some dimensions labels are an empty string: e.g. bs_bs12_04.sdmx.xml
dataset_json
[
"
dimensions_labels
"
][
dimension_code
]
=
dimension_label
...
...
@@ -175,17 +188,17 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
dimension_element_xpath
=
'
./message:KeyFamilies/structure:KeyFamily/structure:Components/structure:Dimension[@conceptRef=
"
{}
"
]
'
.
format
(
dimension_code
)
dimension_element
=
dsd_
tree
.
find
(
dimension_element_xpath
,
namespaces
=
dsd_nsmap
)
assert
dimension_element
is
not
None
,
(
dsd_
file_path
,
dimension_element_xpath
)
dimension_element
=
dsd_
element
.
find
(
dimension_element_xpath
,
namespaces
=
dsd_nsmap
)
assert
dimension_element
is
not
None
,
(
dsd_
blob_name
,
dimension_element_xpath
)
codelist_code
=
dimension_element
.
attrib
[
"
codelist
"
]
dimension_value_label_xpath
=
'
./message:CodeLists/structure:CodeList[@id=
"
{}
"
]/structure:Code[@value=
"
{}
"
]/structure:Description[@xml:lang=
"
en
"
]
'
.
format
(
codelist_code
,
dimension_value_code
,
)
dimension_value_label
=
dsd_
tree
.
findtext
(
dimension_value_label_xpath
,
namespaces
=
dsd_nsmap
)
dimension_value_label
=
dsd_
element
.
findtext
(
dimension_value_label_xpath
,
namespaces
=
dsd_nsmap
)
# Some descriptions are empty string: just ensure it's a string, but do not store empty descriptions.
assert
isinstance
(
dimension_value_label
,
str
),
\
(
dsd_
file_path
,
dimension_value_label_xpath
,
dimension_value_label
)
(
dsd_
blob_name
,
dimension_value_label_xpath
,
dimension_value_label
)
if
dimension_value_label
:
dataset_json
[
"
dimensions_values_labels
"
].
setdefault
(
dimension_code
,
{})[
dimension_value_code
]
=
dimension_value_label
...
...
@@ -197,8 +210,8 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
attribute_label_xpath
=
'
./message:Concepts/structure:ConceptScheme[@id=
"
CONCEPTS
"
]/structure:Concept[@id=
"
{}
"
]/structure:Name[@xml:lang=
"
en
"
]
'
.
format
(
attribute_code
)
attribute_label
=
dsd_
tree
.
findtext
(
attribute_label_xpath
,
namespaces
=
dsd_nsmap
)
assert
attribute_label
,
(
dsd_
file_path
,
attribute_label_xpath
,
attribute_label
)
attribute_label
=
dsd_
element
.
findtext
(
attribute_label_xpath
,
namespaces
=
dsd_nsmap
)
assert
attribute_label
,
(
dsd_
blob_name
,
attribute_label_xpath
,
attribute_label
)
dataset_json
[
"
attributes_labels
"
][
attribute_code
]
=
attribute_label
# Some attributes values codes are multi-valued and concatenated into the same string.
attribute_codes
=
list
(
attribute_code
)
\
...
...
@@ -215,18 +228,18 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
attribute_element_xpath
=
'
./message:KeyFamilies/structure:KeyFamily/structure:Components/structure:Attribute[@conceptRef=
"
{}
"
]
'
.
format
(
attribute_code
)
attribute_element
=
dsd_
tree
.
find
(
attribute_element_xpath
,
namespaces
=
dsd_nsmap
)
attribute_element
=
dsd_
element
.
find
(
attribute_element_xpath
,
namespaces
=
dsd_nsmap
)
if
attribute_element
is
not
None
:
codelist_code
=
attribute_element
.
attrib
[
"
codelist
"
]
attribute_value_label_xpath
=
'
./message:CodeLists/structure:CodeList[@id=
"
{}
"
]/structure:Code[@value=
"
{}
"
]/structure:Description[@xml:lang=
"
en
"
]
'
.
format
(
codelist_code
,
attribute_value_code
,
)
attribute_value_label
=
dsd_
tree
.
findtext
(
attribute_value_label
=
dsd_
element
.
findtext
(
attribute_value_label_xpath
,
namespaces
=
dsd_nsmap
,
)
assert
attribute_value_label
,
(
dsd_
file_path
,
attribute_code
,
attribute_value_code
)
assert
attribute_value_label
,
(
dsd_
blob_name
,
attribute_code
,
attribute_value_code
)
dataset_json
[
"
attributes_values_labels
"
].
setdefault
(
attribute_code
,
{})[
attribute_value_code
]
=
attribute_value_label
...
...
@@ -267,14 +280,15 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
dataset_tree
.
add
(
"
{}.tsv
"
.
format
(
series_code
).
encode
(
'
utf-8
'
),
git_blob_filemode
,
observations_tsv_blob_id
)
def
toc_xml_element_to_json
(
repo
,
dataset_pair_by_dataset_code
,
data_package_tree
,
xml_element
,
processed_datasets_codes
):
def
toc_xml_element_to_json
(
source_repo
,
source_tree
,
repo
,
dataset_pair_by_dataset_code
,
data_package_tree
,
xml_element
,
processed_datasets_codes
):
xml_element_tag
=
xml_element
.
tag
[
len
(
toc_nsmap
[
"
nt
"
])
+
2
:]
if
xml_element_tag
==
"
tree
"
:
return
list
(
filter
(
None
,
(
toc_xml_element_to_json
(
repo
,
dataset_pair_by_dataset_code
,
data_package_tree
,
child_element
,
processed_datasets_codes
)
toc_xml_element_to_json
(
source_repo
,
source_tree
,
repo
,
dataset_pair_by_dataset_code
,
data_package_tree
,
child_element
,
processed_datasets_codes
)
for
child_element
in
xml_element
)
))
...
...
@@ -282,8 +296,8 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre
children
=
list
(
filter
(
None
,
(
toc_xml_element_to_json
(
repo
,
dataset_pair_by_dataset_code
,
data_package_tree
,
child_element
,
processed_datasets_codes
)
toc_xml_element_to_json
(
source_repo
,
source_tree
,
repo
,
dataset_pair_by_dataset_code
,
data_package_tree
,
child_element
,
processed_datasets_codes
)
for
child_element
in
xml_element
.
iterfind
(
"
nt:children/*
"
,
namespaces
=
toc_nsmap
)
)
))
...
...
@@ -295,10 +309,7 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre
elif
xml_element_tag
==
"
leaf
"
and
xml_element
.
attrib
[
"
type
"
]
==
"
dataset
"
:
dataset_code
=
xml_element
.
findtext
(
"
nt:code
"
,
namespaces
=
toc_nsmap
)
dataset_name
=
xml_element
.
findtext
(
"
nt:title[@language=
'
en
'
]
"
,
namespaces
=
toc_nsmap
)
# Side-effect: generate Git pack corresponding to current dataset.
sdmx_file_path
=
os
.
path
.
abspath
(
os
.
path
.
join
(
args
.
source_dir
,
"
data
"
,
dataset_code
,
dataset_code
+
sdmx_file_extension
))
# Must be named like "pack-foo.pack" to be recognized as a pack by dulwich.
pack_file_path
=
os
.
path
.
abspath
(
os
.
path
.
join
(
args
.
target_dir
,
"
objects
"
,
"
pack
"
,
"
pack-{}.pack
"
.
format
(
dataset_code
)))
...
...
@@ -317,15 +328,26 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre
else
:
if
(
args
.
datasets_codes
is
None
or
dataset_code
in
args
.
datasets_codes
)
and
\
args
.
exclude_datasets_codes
is
None
or
dataset_code
not
in
args
.
exclude_datasets_codes
:
if
os
.
path
.
isfile
(
sdmx_file_path
):
sdmx_blob_name
=
dataset_code
+
sdmx_file_extension
sdmx_entry_name
=
sdmx_blob_name
.
encode
(
'
utf-8
'
)
dataset_tree
=
find_git_object
(
source_repo
,
source_tree
,
[
"
data
"
,
dataset_code
])
sdmx_blob
=
source_repo
[
dataset_tree
[
sdmx_entry_name
][
1
]]
\
if
dataset_tree
is
not
None
\
else
None
if
sdmx_blob
is
not
None
:
if
dataset_code
not
in
processed_datasets_codes
:
log
.
info
(
"
Converting SDMX source file %s (size: %d)
"
,
sdmx_file_path
,
os
.
path
.
getsize
(
sdmx_file_p
at
h
))
sdmx_blob_data
=
sdmx_blob
.
data
log
.
info
(
"
Converting SDMX source file %s (size: %d)
"
,
sdmx_blob_name
,
len
(
sdmx_blob_d
at
a
))
pack_start_time
=
time
.
time
()
write_pack
(
pack_file_path
,
objects
=
iter_git_objects_in_sdmx_file
(
sdmx_file_path
,
source_repo
,
dataset_code
,
dataset_tree
,
sdmx_blob_name
,
sdmx_blob_data
,
data_package_tree
,
dataset_pair_by_dataset_code
,
dataset_json_stub
=
{
...
...
@@ -345,7 +367,7 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre
processed_datasets_codes
.
add
(
dataset_code
)
return
categories_tree_dataset_json
else
:
log
.
debug
(
"
SDMX file %s was not downloaded, skipping
"
,
sdmx_
file_path
)
log
.
debug
(
"
SDMX file %s was not downloaded, skipping
"
,
sdmx_
blob_name
)
return
None
...
...
@@ -438,9 +460,11 @@ def main():
# Parse table_of_contents.xml.
xml_file_path
=
os
.
path
.
join
(
args
.
source_dir
,
'
table_of_contents.xml
'
)
source_repo
=
Repo
(
args
.
source_dir
)
source_tree
=
source_repo
[
source_repo
[
source_repo
.
head
()].
tree
]
# "table_of_contents" is abbreviated starting from below "toc".
toc_element
=
etree
.
parse
(
xml_file_path
)
toc_blob
=
source_repo
[
source_tree
[
b
'
table_of_contents.xml
'
][
1
]]
toc_element
=
etree
.
fromstring
(
toc_blob
.
data
)
# Load datasets index in Git repository.
...
...
@@ -456,8 +480,8 @@ def main():
# Walk recursively in table_of_contents.xml and return categories_tree_json.
# Side-effects: write dataset Git packs, update dataset_pair_by_dataset_code and data_package_tree.
processed_datasets_codes
=
set
()
categories_tree_json
=
toc_xml_element_to_json
(
repo
,
dataset_pair_by_dataset_code
,
data_package_tree
,
xml_element
=
toc_element
.
getroot
()
,
categories_tree_json
=
toc_xml_element_to_json
(
source_repo
,
source_tree
,
repo
,
dataset_pair_by_dataset_code
,
data_package_tree
,
xml_element
=
toc_element
,
processed_datasets_codes
=
processed_datasets_codes
)
# Write datasets index in Git repository, which was modified above by a side-effect.
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment