diff --git a/eurostat_to_dbnomics.py b/eurostat_to_dbnomics.py index 51e3840f83cf5d4545f9e1f0fbc0eef766d335fd..62228a72bdcf1461e12da0ade2380268758b0ada 100755 --- a/eurostat_to_dbnomics.py +++ b/eurostat_to_dbnomics.py @@ -82,136 +82,17 @@ def fast_iter(context, func, *args, **kwargs): del context -def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pair_by_dataset_code, dataset_json_stub): - def process_element(element): - if element.tag == "{{{}}}Series".format(sdmx_nsmap["data"]): - yield from process_series_element(element) - observations_tsv.clear() - elif element.tag == "{{{}}}Obs".format(sdmx_nsmap["data"]): - observations_tsv.append(dict(element.attrib)) - - def process_series_element(series_element): - # Ignore some specific XML element attributes corresponding to series SDMX attributes, - # because series SDMX attributes do not exist in DB.nomics. - series_element_attributes = OrderedDict([ - (attribute_key, attribute_value) - for attribute_key, attribute_value in series_element.attrib.items() - if attribute_key not in {"TIME_FORMAT"} - ]) - - dimensions_codes_order = list(series_element_attributes.keys()) - if dataset_json["dimensions_codes_order"] is None: - dataset_json["dimensions_codes_order"] = dimensions_codes_order - else: - # dimensions_codes_order must not change between series. - assert dataset_json["dimensions_codes_order"] == dimensions_codes_order, \ - (sdmx_file_path, dataset_json["dimensions_codes_order"], dimensions_codes_order) - - # Series code is not given by SDMX source files: create it from dimensions values codes. - series_code = ".".join( - series_element_attributes[dimension_code] - for dimension_code in dimensions_codes_order - ) - dataset_json["series"].append({ - "code": series_code, - "dimensions": dict(series_element_attributes), - }) - - # Fill series dimensions labels in dataset.json. - for dimension_code, dimension_value_code in series_element_attributes.items(): - if dimension_code not in dataset_json["dimensions_labels"]: - dimension_label_xpath = './message:Concepts/structure:ConceptScheme[@id="CONCEPTS"]/structure:Concept[@id="{}"]/structure:Name[@xml:lang="en"]'.format( - dimension_code - ) - dimension_label = dsd_tree.findtext(dimension_label_xpath, namespaces=dsd_nsmap) - if dimension_label: - # Some dimensions labels are an empty string: e.g. bs_bs12_04.sdmx.xml - dataset_json["dimensions_labels"][dimension_code] = dimension_label - if dimension_code not in dataset_json["dimensions_values_labels"] or \ - dimension_value_code not in dataset_json["dimensions_values_labels"][dimension_code]: - dimension_element_xpath = './message:KeyFamilies/structure:KeyFamily/structure:Components/structure:Dimension[@conceptRef="{}"]'.format( - dimension_code - ) - dimension_element = dsd_tree.find(dimension_element_xpath, namespaces=dsd_nsmap) - assert dimension_element is not None, (dsd_file_path, dimension_element_xpath) - codelist_code = dimension_element.attrib["codelist"] - dimension_value_label_xpath = './message:CodeLists/structure:CodeList[@id="{}"]/structure:Code[@value="{}"]/structure:Description[@xml:lang="en"]'.format( - codelist_code, - dimension_value_code, - ) - dimension_value_label = dsd_tree.findtext(dimension_value_label_xpath, namespaces=dsd_nsmap) - # Some descriptions are empty string: just ensure it's a string, but do not store empty descriptions. - assert isinstance(dimension_value_label, str), \ - (dsd_file_path, dimension_value_label_xpath, dimension_value_label) - if dimension_value_label: - dataset_json["dimensions_values_labels"].setdefault( - dimension_code, {})[dimension_value_code] = dimension_value_label - - # Fill observations attributes labels in dataset.json. - for observation in observations_tsv: - for attribute_code, attribute_value_code in observation.items(): - if attribute_code not in dataset_json["attributes_labels"]: - attribute_label_xpath = './message:Concepts/structure:ConceptScheme[@id="CONCEPTS"]/structure:Concept[@id="{}"]/structure:Name[@xml:lang="en"]'.format( - attribute_code - ) - attribute_label = dsd_tree.findtext(attribute_label_xpath, namespaces=dsd_nsmap) - assert attribute_label, (dsd_file_path, attribute_label_xpath, attribute_label) - dataset_json["attributes_labels"][attribute_code] = attribute_label - # Some attributes values codes are multi-valued and concatenated into the same string. - attribute_codes = list(attribute_code) \ - if attribute_code == "OBS_STATUS" \ - else [attribute_code] - for attribute_code in attribute_codes: - # Ignore period and value observations XML attributes, - # because they don't provide any codes but values. - if attribute_code not in required_observations_tsv_column_names and \ - ( - attribute_code not in dataset_json["attributes_values_labels"] or - attribute_value_code not in dataset_json["attributes_values_labels"][attribute_code] - ): - attribute_element_xpath = './message:KeyFamilies/structure:KeyFamily/structure:Components/structure:Attribute[@conceptRef="{}"]'.format( - attribute_code - ) - attribute_element = dsd_tree.find(attribute_element_xpath, namespaces=dsd_nsmap) - if attribute_element is not None: - codelist_code = attribute_element.attrib["codelist"] - attribute_value_label_xpath = './message:CodeLists/structure:CodeList[@id="{}"]/structure:Code[@value="{}"]/structure:Description[@xml:lang="en"]'.format( - codelist_code, - attribute_value_code, - ) - attribute_value_label = dsd_tree.findtext( - attribute_value_label_xpath, - namespaces=dsd_nsmap, - ) - assert attribute_value_label, (dsd_file_path, attribute_code, attribute_value_code) - dataset_json["attributes_values_labels"].setdefault( - attribute_code, {})[attribute_value_code] = attribute_value_label - - # Write observations.tsv - other_column_names = set() - for observation in observations_tsv: - for column_name in observation.keys(): - if column_name not in required_observations_tsv_column_names: - other_column_names.add(column_name) - observations_tsv_column_names = required_observations_tsv_column_names + list(sorted(other_column_names)) - observations_tsv_str = "\n".join([ - "\t".join(observations_tsv_column_names), - ] + [ - "\t".join( - observation.get(column_name, "") - for column_name in observations_tsv_column_names - ) - for observation in observations_tsv - ]) +def iter_git_objects_in_sdmx_element(element, sdmx_file_path, sdmx_nsmap, dsd_file_path, dsd_nsmap, dsd_tree, + dataset_json, dataset_tree, observations_tsv, yielded_git_object_ids): + if element.tag == "{{{}}}Series".format(sdmx_nsmap["data"]): + yield from iter_git_objects_in_sdmx_series_element(element, sdmx_file_path, dsd_file_path, dsd_nsmap, dsd_tree, dataset_json, + dataset_tree, observations_tsv, yielded_git_object_ids) + observations_tsv.clear() + elif element.tag == "{{{}}}Obs".format(sdmx_nsmap["data"]): + observations_tsv.append(dict(element.attrib)) - observations_tsv_blob = Blob.from_string(observations_tsv_str.encode('utf-8')) - observations_tsv_blob_id = observations_tsv_blob.id - # Some TSV observations files are exactly the same, and Git packs don't tolerate duplicate objects. - if observations_tsv_blob_id not in yielded_git_object_ids: - yielded_git_object_ids.add(observations_tsv_blob_id) - yield observations_tsv_blob - dataset_tree.add("{}.tsv".format(series_code).encode('utf-8'), git_blob_filemode, observations_tsv_blob_id) +def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pair_by_dataset_code, dataset_json_stub): # Load DSD dsd_file_path = "{}.dsd.xml".format(sdmx_file_path[:-len(sdmx_file_extension)]) @@ -242,9 +123,9 @@ def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pai context = etree.iterparse(sdmx_file_path, events=("end",)) observations_tsv = [] yielded_git_object_ids = set() - yield from fast_iter(context, process_element) - # Write dataset.json + yield from fast_iter(context, iter_git_objects_in_sdmx_element, sdmx_file_path, sdmx_nsmap, dsd_file_path, dsd_nsmap, dsd_tree, + dataset_json, dataset_tree, observations_tsv, yielded_git_object_ids) dataset_json = without_falsy_keys(dataset_json) if args.validate_json: @@ -262,13 +143,138 @@ def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pai yield dataset_tree -def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tree, xml_element): +def iter_git_objects_in_sdmx_series_element(series_element, sdmx_file_path, dsd_file_path, dsd_nsmap, dsd_tree, + dataset_json, dataset_tree, observations_tsv, yielded_git_object_ids): + # Ignore some specific XML element attributes corresponding to series SDMX attributes, + # because series SDMX attributes do not exist in DB.nomics. + series_element_attributes = OrderedDict([ + (attribute_key, attribute_value) + for attribute_key, attribute_value in series_element.attrib.items() + if attribute_key not in {"TIME_FORMAT"} + ]) + + dimensions_codes_order = list(series_element_attributes.keys()) + if dataset_json["dimensions_codes_order"] is None: + dataset_json["dimensions_codes_order"] = dimensions_codes_order + else: + # dimensions_codes_order must not change between series. + assert dataset_json["dimensions_codes_order"] == dimensions_codes_order, \ + (sdmx_file_path, dataset_json["dimensions_codes_order"], dimensions_codes_order) + + # Series code is not given by SDMX source files: create it from dimensions values codes. + series_code = ".".join( + series_element_attributes[dimension_code] + for dimension_code in dimensions_codes_order + ) + dataset_json["series"].append({ + "code": series_code, + "dimensions": dict(series_element_attributes), + }) + + # Fill series dimensions labels in dataset.json. + for dimension_code, dimension_value_code in series_element_attributes.items(): + if dimension_code not in dataset_json["dimensions_labels"]: + dimension_label_xpath = './message:Concepts/structure:ConceptScheme[@id="CONCEPTS"]/structure:Concept[@id="{}"]/structure:Name[@xml:lang="en"]'.format( + dimension_code + ) + dimension_label = dsd_tree.findtext(dimension_label_xpath, namespaces=dsd_nsmap) + if dimension_label: + # Some dimensions labels are an empty string: e.g. bs_bs12_04.sdmx.xml + dataset_json["dimensions_labels"][dimension_code] = dimension_label + if dimension_code not in dataset_json["dimensions_values_labels"] or \ + dimension_value_code not in dataset_json["dimensions_values_labels"][dimension_code]: + dimension_element_xpath = './message:KeyFamilies/structure:KeyFamily/structure:Components/structure:Dimension[@conceptRef="{}"]'.format( + dimension_code + ) + dimension_element = dsd_tree.find(dimension_element_xpath, namespaces=dsd_nsmap) + assert dimension_element is not None, (dsd_file_path, dimension_element_xpath) + codelist_code = dimension_element.attrib["codelist"] + dimension_value_label_xpath = './message:CodeLists/structure:CodeList[@id="{}"]/structure:Code[@value="{}"]/structure:Description[@xml:lang="en"]'.format( + codelist_code, + dimension_value_code, + ) + dimension_value_label = dsd_tree.findtext(dimension_value_label_xpath, namespaces=dsd_nsmap) + # Some descriptions are empty string: just ensure it's a string, but do not store empty descriptions. + assert isinstance(dimension_value_label, str), \ + (dsd_file_path, dimension_value_label_xpath, dimension_value_label) + if dimension_value_label: + dataset_json["dimensions_values_labels"].setdefault( + dimension_code, {})[dimension_value_code] = dimension_value_label + + # Fill observations attributes labels in dataset.json. + for observation in observations_tsv: + for attribute_code, attribute_value_code in observation.items(): + if attribute_code not in dataset_json["attributes_labels"]: + attribute_label_xpath = './message:Concepts/structure:ConceptScheme[@id="CONCEPTS"]/structure:Concept[@id="{}"]/structure:Name[@xml:lang="en"]'.format( + attribute_code + ) + attribute_label = dsd_tree.findtext(attribute_label_xpath, namespaces=dsd_nsmap) + assert attribute_label, (dsd_file_path, attribute_label_xpath, attribute_label) + dataset_json["attributes_labels"][attribute_code] = attribute_label + # Some attributes values codes are multi-valued and concatenated into the same string. + attribute_codes = list(attribute_code) \ + if attribute_code == "OBS_STATUS" \ + else [attribute_code] + for attribute_code in attribute_codes: + # Ignore period and value observations XML attributes, + # because they don't provide any codes but values. + if attribute_code not in required_observations_tsv_column_names and \ + ( + attribute_code not in dataset_json["attributes_values_labels"] or + attribute_value_code not in dataset_json["attributes_values_labels"][attribute_code] + ): + attribute_element_xpath = './message:KeyFamilies/structure:KeyFamily/structure:Components/structure:Attribute[@conceptRef="{}"]'.format( + attribute_code + ) + attribute_element = dsd_tree.find(attribute_element_xpath, namespaces=dsd_nsmap) + if attribute_element is not None: + codelist_code = attribute_element.attrib["codelist"] + attribute_value_label_xpath = './message:CodeLists/structure:CodeList[@id="{}"]/structure:Code[@value="{}"]/structure:Description[@xml:lang="en"]'.format( + codelist_code, + attribute_value_code, + ) + attribute_value_label = dsd_tree.findtext( + attribute_value_label_xpath, + namespaces=dsd_nsmap, + ) + assert attribute_value_label, (dsd_file_path, attribute_code, attribute_value_code) + dataset_json["attributes_values_labels"].setdefault( + attribute_code, {})[attribute_value_code] = attribute_value_label + + # Write observations.tsv + other_column_names = set() + for observation in observations_tsv: + for column_name in observation.keys(): + if column_name not in required_observations_tsv_column_names: + other_column_names.add(column_name) + observations_tsv_column_names = required_observations_tsv_column_names + list(sorted(other_column_names)) + observations_tsv_str = "\n".join([ + "\t".join(observations_tsv_column_names), + ] + [ + "\t".join( + observation.get(column_name, "") + for column_name in observations_tsv_column_names + ) + for observation in observations_tsv + ]) + + observations_tsv_blob = Blob.from_string(observations_tsv_str.encode('utf-8')) + observations_tsv_blob_id = observations_tsv_blob.id + # Some TSV observations files are exactly the same, and Git packs don't tolerate duplicate objects. + if observations_tsv_blob_id not in yielded_git_object_ids: + yielded_git_object_ids.add(observations_tsv_blob_id) + yield observations_tsv_blob + dataset_tree.add("{}.tsv".format(series_code).encode('utf-8'), git_blob_filemode, observations_tsv_blob_id) + + +def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tree, xml_element, processed_datasets_codes): xml_element_tag = xml_element.tag[len(toc_nsmap["nt"]) + 2:] if xml_element_tag == "tree": return list(filter( None, ( - toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tree, child_element) + toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tree, child_element, + processed_datasets_codes) for child_element in xml_element ) )) @@ -276,7 +282,8 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre children = list(filter( None, ( - toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tree, child_element) + toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tree, child_element, + processed_datasets_codes) for child_element in xml_element.iterfind("nt:children/*", namespaces=toc_nsmap) ) )) @@ -310,30 +317,32 @@ def toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, data_package_tre else: if args.datasets_codes is None or dataset_code in args.datasets_codes: if os.path.isfile(sdmx_file_path): - log.info("Converting SDMX source file %s (size: %d)", - sdmx_file_path, os.path.getsize(sdmx_file_path)) - pack_start_time = time.time() - write_pack( - pack_file_path, - objects=iter_git_objects_in_sdmx_file( - sdmx_file_path, - data_package_tree, - dataset_pair_by_dataset_code, - dataset_json_stub={ - "code": dataset_code, - "name": dataset_name, - "description": xml_element.findtext("nt:shortDescription[@language='en']", - namespaces=toc_nsmap) or None, - "doc_href": xml_element.findtext("nt:metadata[@format='html']", - namespaces=toc_nsmap) or None, - }, - ), - ) - log.info("Git pack file {!r} written, took {:.2f} seconds".format( - pack_file_path, - time.time() - pack_start_time, - )) - return categories_tree_dataset_json + if dataset_code not in processed_datasets_codes: + log.info("Converting SDMX source file %s (size: %d)", + sdmx_file_path, os.path.getsize(sdmx_file_path)) + pack_start_time = time.time() + write_pack( + pack_file_path, + objects=iter_git_objects_in_sdmx_file( + sdmx_file_path, + data_package_tree, + dataset_pair_by_dataset_code, + dataset_json_stub={ + "code": dataset_code, + "name": dataset_name, + "description": xml_element.findtext("nt:shortDescription[@language='en']", + namespaces=toc_nsmap) or None, + "doc_href": xml_element.findtext("nt:metadata[@format='html']", + namespaces=toc_nsmap) or None, + }, + ), + ) + log.info("Git pack file {!r} written, took {:.2f} seconds".format( + pack_file_path, + time.time() - pack_start_time, + )) + processed_datasets_codes.add(dataset_code) + return categories_tree_dataset_json else: log.debug("SDMX file %s was not downloaded, skipping", sdmx_file_path) @@ -439,11 +448,13 @@ def main(): # Walk recursively in table_of_contents.xml and return categories_tree_json. # Side-effects: write dataset Git packs, update dataset_pair_by_dataset_code and data_package_tree. - + processed_datasets_codes = set() categories_tree_json = toc_xml_element_to_json(repo, dataset_pair_by_dataset_code, - data_package_tree, xml_element=toc_element.getroot()) + data_package_tree, xml_element=toc_element.getroot(), + processed_datasets_codes=processed_datasets_codes) # Write datasets index in Git repository, which was modified above by a side-effect. + # TODO Write during iteration in case script crashes. with open(dataset_index_file_path, "w") as dataset_index_file: dataset_index_file.writelines(