Skip to content
Snippets Groups Projects
Commit 3e753731 authored by Christophe Benz
Browse files

Use regexes to normalize data

parent 8afd265f
No related branches found
No related tags found
No related merge requests found
Pipeline #1096 failed with stage
in 31 minutes and 43 seconds
......@@ -38,6 +38,7 @@ EUROSTAT SDMX documentation:
import argparse
import io
import os
import re
import shutil
import subprocess
import sys
......@@ -51,6 +52,7 @@ data_repository_url = 'git@git.nomics.world:dbnomics-source-data/eurostat-source
# XML namespace URLs used by Eurostat-specific documents, keyed by the prefix used in queries.
eurostat_namespace_url_by_name = dict(
    nt='urn:eu.europa.ec.eurostat.navtree',
)
# Matches one `<Prepared>...</Prepared>` element so its content can be replaced by a constant.
# The quantifier is non-greedy so that, when a whole document sits on a single line,
# the match stops at the first closing tag instead of swallowing everything up to the last one.
prepared_element_re = re.compile(r'<Prepared>.+?</Prepared>')
sdmx_namespace_url_by_name = dict(
common='http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common',
footer='http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message/footer',
......@@ -176,9 +178,8 @@ def main():
for metadata_zip_info in metadata_zip_file.infolist():
if metadata_zip_info.filename.endswith('.xml'):
with metadata_zip_file.open(metadata_zip_info) as metadata_file:
xml_tree = etree.parse(metadata_file)
xml_file_path = os.path.join(metadata_dir, metadata_zip_info.filename)
write_normalized_xml_element(xml_file_path, xml_tree.getroot())
write_normalized_xml_element(xml_file_path, metadata_file)
else:
metadata_zip_file.extract(metadata_zip_info, metadata_dir)
......@@ -199,9 +200,8 @@ def main():
for data_zip_info in data_zip_file.infolist():
if data_zip_info.filename.endswith('.xml'):
with data_zip_file.open(data_zip_info) as data_file:
xml_tree = etree.parse(data_file)
xml_file_path = os.path.join(dataset_dir, data_zip_info.filename)
write_normalized_xml_element(xml_file_path, xml_tree.getroot())
write_normalized_xml_element(xml_file_path, data_file)
else:
data_zip_file.extract(data_zip_info, dataset_dir)
......@@ -232,26 +232,15 @@ def main():
return 0
def write_normalized_xml_element(xml_file_path, source_file):
    """Copy an XML document to `xml_file_path`, normalizing data that changes at each download,
    like today's date, in order to avoid triggering a false commit in source data.

    The first `<Prepared>...</Prepared>` element (as matched by the module-level
    `prepared_element_re`) is replaced by a constant placeholder; everything else is
    copied through unchanged.

    Use regexes because lxml fails with too large files. For the same reason the document
    is streamed line by line instead of being loaded (and duplicated) in memory as a whole.

    Parameters:
        xml_file_path: path of the file to write; it is created or overwritten.
        source_file: binary file object yielding UTF-8 encoded lines when iterated
            (for example a member opened from a zip archive).

    Raises:
        UnicodeDecodeError: if a line of `source_file` is not valid UTF-8.
    """
    placeholder = "<Prepared>1111-11-11T11:11:11</Prepared>"
    already_normalized = False
    # newline='' disables newline translation so that line endings are written exactly as read.
    with open(xml_file_path, mode="w", encoding='utf-8', newline='') as xml_file:
        # Binary iteration splits on b'\n' only, which never occurs inside a multi-byte
        # UTF-8 sequence, so each line can be decoded independently. The pattern's `.` does
        # not match a newline, so a match can never span two lines.
        for raw_line in source_file:
            line = raw_line.decode('utf-8')
            if not already_normalized:
                # Only the first occurrence in the document is replaced.
                line, nb_replacements = prepared_element_re.subn(placeholder, line, 1)
                already_normalized = nb_replacements > 0
            xml_file.write(line)
if __name__ == '__main__':
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment