Use regexes to normalize data

3e753731 · Christophe Benz · 8afd265f · 3e753731
Commit 3e753731 authored 8 years ago by Christophe Benz
--- a/eurostat_bulk_fetcher.py
+++ b/eurostat_bulk_fetcher.py
@@ -38,6 +38,7 @@ EUROSTAT SDMX documentation:
 import argparse
 import io
 import os
+import re
 import shutil
 import subprocess
 import sys
@@ -51,6 +52,7 @@ data_repository_url = 'git@git.nomics.world:dbnomics-source-data/eurostat-source
 eurostat_namespace_url_by_name = dict(
    nt='urn:eu.europa.ec.eurostat.navtree',
 )
+prepared_element_re = re.compile('<Prepared>.+</Prepared>')
 sdmx_namespace_url_by_name = dict(
    common='http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common',
    footer='http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message/footer',
@@ -176,9 +178,8 @@ def main():
        for metadata_zip_info in metadata_zip_file.infolist():
            if metadata_zip_info.filename.endswith('.xml'):
                with metadata_zip_file.open(metadata_zip_info) as metadata_file:
-                    xml_tree = etree.parse(metadata_file)
                    xml_file_path = os.path.join(metadata_dir, metadata_zip_info.filename)
-                    write_normalized_xml_element(xml_file_path, xml_tree.getroot())
+                    write_normalized_xml_element(xml_file_path, metadata_file)
            else:
                metadata_zip_file.extract(metadata_zip_info, metadata_dir)

@@ -199,9 +200,8 @@ def main():
        for data_zip_info in data_zip_file.infolist():
            if data_zip_info.filename.endswith('.xml'):
                with data_zip_file.open(data_zip_info) as data_file:
-                    xml_tree = etree.parse(data_file)
                    xml_file_path = os.path.join(dataset_dir, data_zip_info.filename)
-                    write_normalized_xml_element(xml_file_path, xml_tree.getroot())
+                    write_normalized_xml_element(xml_file_path, data_file)
            else:
                data_zip_file.extract(data_zip_info, dataset_dir)

@@ -232,26 +232,15 @@ def main():
    return 0


-def write_normalized_xml_element(xml_file_path, xml_element):
+def write_normalized_xml_element(xml_file_path, source_file):
    """Normalize data that changes at each download, like today date,
    in order to avoid triggering a false commit in source data.
-    """
-    # id_element = xml_element.xpath(
-    #     './/message:Header/message:ID',
-    #     namespaces=sdmx_namespace_url_by_name,
-    #     )[0]
-    # splitted_id = id_element.text.rsplit('_', 1)
-    # id_element.text = '{}_{}'.format(splitted_id[0], '1' * len(splitted_id[1]))
-
-    prepared_element = xml_element.find(
-        './/message:Header/message:Prepared',
-        namespaces=sdmx_namespace_url_by_name,
-    )
-    if prepared_element is not None:
-        prepared_element.text = '1111-11-11T11:11:11'

-    with open(xml_file_path, 'wb') as xml_file:
-        etree.ElementTree(xml_element).write(xml_file, encoding='utf-8', xml_declaration=True)
+    Use regexes because lxml fails with too large files.
+    """
+    xml_str = source_file.read().decode('utf-8')
+    with open(xml_file_path, mode="w", encoding='utf-8') as xml_file:
+        xml_file.write(prepared_element_re.sub("<Prepared>1111-11-11T11:11:11</Prepared>", xml_str, 1))


 if __name__ == '__main__':