Skip to content
Snippets Groups Projects
Commit d7765803 authored by Christophe Benz's avatar Christophe Benz
Browse files

Read source data from Git repo

parent f6da856e
No related branches found
No related tags found
No related merge requests found
......@@ -81,23 +81,35 @@ def fast_iter(context, func, *args, **kwargs):
del context
def iter_git_objects_in_sdmx_element(element, sdmx_file_path, sdmx_nsmap, dsd_file_path, dsd_nsmap, dsd_tree,
def find_git_object(repo, tree, fragments):
    """Resolve a path inside a Git tree and return the object it names.

    Walks *fragments* (an iterable of path components, e.g. ``["data",
    dataset_code]``) down from *tree*. Each component is UTF-8 encoded and
    looked up as an entry name in the current tree; the entry's object id
    (``tree[name][1]`` — presumably the dulwich ``(mode, sha)`` pair,
    confirm against the dulwich Tree API) is resolved through *repo* to
    descend one level.

    Returns the object reached by the full path, *tree* itself when
    *fragments* is empty (generalization: the original raised IndexError
    for an empty path), or ``None`` as soon as any component is missing.
    """
    node = tree
    for fragment in fragments:
        entry_name = fragment.encode('utf-8')
        if entry_name not in node:
            return None
        node = repo[node[entry_name][1]]
    return node
def iter_git_objects_in_sdmx_element(element, sdmx_blob_name, sdmx_nsmap, dsd_blob_name, dsd_nsmap, dsd_element,
dataset_json, dataset_tree, observations_tsv, yielded_git_object_ids):
if element.tag == "{{{}}}Series".format(sdmx_nsmap["data"]):
yield from iter_git_objects_in_sdmx_series_element(element, sdmx_file_path, dsd_file_path, dsd_nsmap, dsd_tree, dataset_json,
dataset_tree, observations_tsv, yielded_git_object_ids)
yield from iter_git_objects_in_sdmx_series_element(
element, sdmx_blob_name, dsd_blob_name, dsd_nsmap, dsd_element, dataset_json, dataset_tree, observations_tsv,
yielded_git_object_ids)
observations_tsv.clear()
elif element.tag == "{{{}}}Obs".format(sdmx_nsmap["data"]):
observations_tsv.append(dict(element.attrib))
def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pair_by_dataset_code, dataset_json_stub):
def iter_git_objects_in_sdmx_file(source_repo, dataset_code, dataset_tree, sdmx_blob_name, sdmx_blob_data,
data_package_tree, dataset_pair_by_dataset_code, dataset_json_stub):
# Load DSD
dsd_file_path = "{}.dsd.xml".format(sdmx_file_path[:-len(sdmx_file_extension)])
with open(dsd_file_path) as dsd_file:
dsd_tree = etree.parse(dsd_file)
dsd_nsmap = dsd_tree.getroot().nsmap.copy()
dsd_blob_name = "{}.dsd.xml".format(dataset_code)
dsd_blob = find_git_object(source_repo, dataset_tree, [dsd_blob_name])
dsd_element = etree.fromstring(dsd_blob.data)
dsd_nsmap = dsd_element.nsmap.copy()
dsd_nsmap['message'] = dsd_nsmap.pop(None)
dsd_nsmap['xml'] = "http://www.w3.org/XML/1998/namespace"
......@@ -119,12 +131,13 @@ def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pai
}
dataset_tree = Tree()
context = etree.iterparse(sdmx_file_path, events=("end",))
context = etree.iterparse(io.BytesIO(sdmx_blob_data), events=("end",))
observations_tsv = []
yielded_git_object_ids = set()
yield from fast_iter(context, iter_git_objects_in_sdmx_element, sdmx_file_path, sdmx_nsmap, dsd_file_path, dsd_nsmap, dsd_tree,
dataset_json, dataset_tree, observations_tsv, yielded_git_object_ids)
yield from fast_iter(context, iter_git_objects_in_sdmx_element, sdmx_blob_name, sdmx_nsmap,
dsd_blob_name, dsd_nsmap, dsd_element, dataset_json, dataset_tree, observations_tsv,
yielded_git_object_ids)
dataset_json = without_falsy_keys(dataset_json)
if args.validate_json:
......@@ -142,7 +155,7 @@ def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pai
yield dataset_tree
def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_file_path, dsd_nsmap, dsd_tree,
def iter_git_objects_in_sdmx_series_element(series_element, sdmx_blob_name, dsd_blob_name, dsd_nsmap, dsd_element,
dataset_json, dataset_tree, observations_tsv, yielded_git_object_ids):
# Ignore some specific XML element attributes corresponding to series SDMX attributes,
# because series SDMX attributes do not exist in DB.nomics.
......@@ -158,7 +171,7 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
else:
# dimensions_codes_order must not change between series.
assert dataset_json["dimensions_codes_order"] == dimensions_codes_order, \
(sdmx_file_path, dataset_json["dimensions_codes_order"], dimensions_codes_order)
(sdmx_blob_name, dataset_json["dimensions_codes_order"], dimensions_codes_order)
# Fill series dimensions labels in dataset.json.
for dimension_code, dimension_value_code in series_element_attributes.items():
......@@ -166,7 +179,7 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
dimension_label_xpath = './message:Concepts/structure:ConceptScheme[@id="CONCEPTS"]/structure:Concept[@id="{}"]/structure:Name[@xml:lang="en"]'.format(
dimension_code
)
dimension_label = dsd_tree.findtext(dimension_label_xpath, namespaces=dsd_nsmap)
dimension_label = dsd_element.findtext(dimension_label_xpath, namespaces=dsd_nsmap)
if dimension_label:
# Some dimensions labels are an empty string: e.g. bs_bs12_04.sdmx.xml
dataset_json["dimensions_labels"][dimension_code] = dimension_label
......@@ -175,17 +188,17 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
dimension_element_xpath = './message:KeyFamilies/structure:KeyFamily/structure:Components/structure:Dimension[@conceptRef="{}"]'.format(
dimension_code
)
dimension_element = dsd_tree.find(dimension_element_xpath, namespaces=dsd_nsmap)
assert dimension_element is not None, (dsd_file_path, dimension_element_xpath)
dimension_element = dsd_element.find(dimension_element_xpath, namespaces=dsd_nsmap)
assert dimension_element is not None, (dsd_blob_name, dimension_element_xpath)
codelist_code = dimension_element.attrib["codelist"]
dimension_value_label_xpath = './message:CodeLists/structure:CodeList[@id="{}"]/structure:Code[@value="{}"]/structure:Description[@xml:lang="en"]'.format(
codelist_code,
dimension_value_code,
)
dimension_value_label = dsd_tree.findtext(dimension_value_label_xpath, namespaces=dsd_nsmap)
dimension_value_label = dsd_element.findtext(dimension_value_label_xpath, namespaces=dsd_nsmap)
# Some descriptions are empty string: just ensure it's a string, but do not store empty descriptions.
assert isinstance(dimension_value_label, str), \
(dsd_file_path, dimension_value_label_xpath, dimension_value_label)
(dsd_blob_name, dimension_value_label_xpath, dimension_value_label)
if dimension_value_label:
dataset_json["dimensions_values_labels"].setdefault(
dimension_code, {})[dimension_value_code] = dimension_value_label
......@@ -197,8 +210,8 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
attribute_label_xpath = './message:Concepts/structure:ConceptScheme[@id="CONCEPTS"]/structure:Concept[@id="{}"]/structure:Name[@xml:lang="en"]'.format(
attribute_code
)
attribute_label = dsd_tree.findtext(attribute_label_xpath, namespaces=dsd_nsmap)
assert attribute_label, (dsd_file_path, attribute_label_xpath, attribute_label)
attribute_label = dsd_element.findtext(attribute_label_xpath, namespaces=dsd_nsmap)
assert attribute_label, (dsd_blob_name, attribute_label_xpath, attribute_label)
dataset_json["attributes_labels"][attribute_code] = attribute_label
# Some attributes values codes are multi-valued and concatenated into the same string.
attribute_codes = list(attribute_code) \
......@@ -215,18 +228,18 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
attribute_element_xpath = './message:KeyFamilies/structure:KeyFamily/structure:Components/structure:Attribute[@conceptRef="{}"]'.format(
attribute_code
)
attribute_element = dsd_tree.find(attribute_element_xpath, namespaces=dsd_nsmap)
attribute_element = dsd_element.find(attribute_element_xpath, namespaces=dsd_nsmap)
if attribute_element is not None:
codelist_code = attribute_element.attrib["codelist"]
attribute_value_label_xpath = './message:CodeLists/structure:CodeList[@id="{}"]/structure:Code[@value="{}"]/structure:Description[@xml:lang="en"]'.format(
codelist_code,
attribute_value_code,
)
attribute_value_label = dsd_tree.findtext(
attribute_value_label = dsd_element.findtext(
attribute_value_label_xpath,
namespaces=dsd_nsmap,
)
assert attribute_value_label, (dsd_file_path, attribute_code, attribute_value_code)
assert attribute_value_label, (dsd_blob_name, attribute_code, attribute_value_code)
dataset_json["attributes_values_labels"].setdefault(
attribute_code, {})[attribute_value_code] = attribute_value_label
......@@ -267,14 +280,15 @@ def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_
dataset_tree.add("{}.tsv".format(series_code).encode('utf-8'), git_blob_filemode, observations_tsv_blob_id)
def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tree, xml_element, processed_datasets_codes):
def toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_dataset_code, data_package_tree, xml_element,
processed_datasets_codes):
xml_element_tag = xml_element.tag[len(toc_nsmap["nt"]) + 2:]
if xml_element_tag == "tree":
return list(filter(
None,
(
toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tree, child_element,
processed_datasets_codes)
toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_dataset_code, data_package_tree,
child_element, processed_datasets_codes)
for child_element in xml_element
)
))
......@@ -282,8 +296,8 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre
children = list(filter(
None,
(
toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tree, child_element,
processed_datasets_codes)
toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_dataset_code, data_package_tree,
child_element, processed_datasets_codes)
for child_element in xml_element.iterfind("nt:children/*", namespaces=toc_nsmap)
)
))
......@@ -295,10 +309,7 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre
elif xml_element_tag == "leaf" and xml_element.attrib["type"] == "dataset":
dataset_code = xml_element.findtext("nt:code", namespaces=toc_nsmap)
dataset_name = xml_element.findtext("nt:title[@language='en']", namespaces=toc_nsmap)
# Side-effect: generate Git pack corresponding to current dataset.
sdmx_file_path = os.path.abspath(
os.path.join(args.source_dir, "data", dataset_code, dataset_code + sdmx_file_extension))
# Must be named like "pack-foo.pack" to be recognized as a pack by dulwich.
pack_file_path = os.path.abspath(
os.path.join(args.target_dir, "objects", "pack", "pack-{}.pack".format(dataset_code)))
......@@ -317,15 +328,26 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre
else:
if (args.datasets_codes is None or dataset_code in args.datasets_codes) and \
args.exclude_datasets_codes is None or dataset_code not in args.exclude_datasets_codes:
if os.path.isfile(sdmx_file_path):
sdmx_blob_name = dataset_code + sdmx_file_extension
sdmx_entry_name = sdmx_blob_name.encode('utf-8')
dataset_tree = find_git_object(source_repo, source_tree, ["data", dataset_code])
sdmx_blob = source_repo[dataset_tree[sdmx_entry_name][1]] \
if dataset_tree is not None \
else None
if sdmx_blob is not None:
if dataset_code not in processed_datasets_codes:
log.info("Converting SDMX source file %s (size: %d)",
sdmx_file_path, os.path.getsize(sdmx_file_path))
sdmx_blob_data = sdmx_blob.data
log.info("Converting SDMX source file %s (size: %d)", sdmx_blob_name, len(sdmx_blob_data))
pack_start_time = time.time()
write_pack(
pack_file_path,
objects=iter_git_objects_in_sdmx_file(
sdmx_file_path,
source_repo,
dataset_code,
dataset_tree,
sdmx_blob_name,
sdmx_blob_data,
data_package_tree,
dataset_pair_by_dataset_code,
dataset_json_stub={
......@@ -345,7 +367,7 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre
processed_datasets_codes.add(dataset_code)
return categories_tree_dataset_json
else:
log.debug("SDMX file %s was not downloaded, skipping", sdmx_file_path)
log.debug("SDMX file %s was not downloaded, skipping", sdmx_blob_name)
return None
......@@ -438,9 +460,11 @@ def main():
# Parse table_of_contents.xml.
xml_file_path = os.path.join(args.source_dir, 'table_of_contents.xml')
source_repo = Repo(args.source_dir)
source_tree = source_repo[source_repo[source_repo.head()].tree]
# "table_of_contents" is abbreviated starting from below "toc".
toc_element = etree.parse(xml_file_path)
toc_blob = source_repo[source_tree[b'table_of_contents.xml'][1]]
toc_element = etree.fromstring(toc_blob.data)
# Load datasets index in Git repository.
......@@ -456,8 +480,8 @@ def main():
# Walk recursively in table_of_contents.xml and return categories_tree_json.
# Side-effects: write dataset Git packs, update dataset_pair_by_dataset_code and data_package_tree.
processed_datasets_codes = set()
categories_tree_json = toc_xml_element_to_json(repo, dataset_pair_by_dataset_code,
data_package_tree, xml_element=toc_element.getroot(),
categories_tree_json = toc_xml_element_to_json(source_repo, source_tree, repo, dataset_pair_by_dataset_code,
data_package_tree, xml_element=toc_element,
processed_datasets_codes=processed_datasets_codes)
# Write datasets index in Git repository, which was modified above by a side-effect.
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.