Skip to content
Snippets Groups Projects
Commit 3bcc5183 authored by Christophe Benz
Browse files

Stop formatting XML files with xmlstarlet...

because with huge files (e.g. 17 GB) the process is killed by the kernel OOM killer
parent dc86fd7a
No related branches found
No related tags found
No related merge requests found
Pipeline #475577 failed with stages
in 2 hours, 51 minutes, and 53 seconds
......@@ -7,7 +7,7 @@ ENV PYTHONUNBUFFERED=1
WORKDIR /app
RUN apt-get update \
&& apt-get install --yes --no-install-recommends --no-install-suggests wget xmlstarlet \
&& apt-get install --yes --no-install-recommends --no-install-suggests wget \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
......
......@@ -150,7 +150,7 @@ def main() -> int:
else:
log.info(f"Fetching data resource of dataset {index}/{len(dataset_codes)} {dataset_code}")
data_req = estat_client.data(dataset_code, dry_run=True)
download_xml(data_req.url, data_file_path)
download(data_req.url, data_file_path)
dataflow_file_path = dataset_dir / f"{dataset_code}.dsd.xml"
if args.resume and dataflow_file_path.is_file():
......@@ -158,7 +158,7 @@ def main() -> int:
else:
log.info(f"Fetching dataflow resource of dataset {index}/{len(dataset_codes)} {dataset_code}")
dataflow_req = estat_client.dataflow(dataset_code, dry_run=True)
download_xml(dataflow_req.url, dataflow_file_path)
download(dataflow_req.url, dataflow_file_path)
return 0
......@@ -177,18 +177,6 @@ def download(url: str, output_file: Path) -> None:
subprocess.run(["/usr/bin/wget", "--no-verbose", url, "-O", str(output_file)], check=True) # noqa: S603
def download_xml(url: str, output_file: Path) -> None:
    """Download *url* into *output_file*, then reformat the saved file.

    This is a two-step convenience wrapper: the resource is first fetched
    with ``download`` and the resulting file is then passed to
    ``format_xml``, which rewrites it in place.
    """
    # Step 1: fetch the raw resource to disk.
    download(url, output_file)
    # Step 2: rewrite the file that was just saved.
    format_xml(output_file)
def format_xml(file: Path) -> None:
    """Reformat the XML document at *file* in place with ``xmlstarlet format``.

    The formatter's standard output is captured in a temporary file created
    in the same directory as *file*; once the command has succeeded, the
    temporary file is renamed over *file*.

    If anything fails (xmlstarlet is missing, exits with a non-zero status,
    or the rename itself fails), the temporary file is removed and *file* is
    left untouched, so no stray temp files accumulate next to the data.

    NOTE: on very large inputs (e.g. 17 GB) the xmlstarlet process has been
    killed by the kernel OOM killer.

    Raises:
        subprocess.CalledProcessError: if xmlstarlet exits with a non-zero status.
        OSError: if xmlstarlet cannot be executed or the rename fails.
    """
    tmp_path = None
    try:
        # delete=False because the temp file must survive the ``with`` block
        # in order to be renamed over the original afterwards.
        with NamedTemporaryFile(delete=False, dir=file.parent) as fp:
            tmp_path = Path(fp.name)
            subprocess.run(["/usr/bin/xmlstarlet", "format", str(file)], check=True, stdout=fp)  # noqa: S603
        # The handle is closed here, so every byte is flushed before the swap.
        tmp_path.rename(file)
    except BaseException:
        # Clean up on any failure (including interruption) and re-raise unchanged.
        if tmp_path is not None:
            tmp_path.unlink(missing_ok=True)
        raise
def parse_table_of_contents(file: Path) -> Element:
    """Parse the XML document stored at *file* and return the parse result.

    The parser is created with ``remove_blank_text=True`` and the path is
    handed to ``etree.parse`` as a string.

    NOTE(review): ``etree.parse`` normally yields a tree object rather than a
    single element -- confirm that the ``Element`` annotation is intended.
    """
    return etree.parse(str(file), parser=etree.XMLParser(remove_blank_text=True))
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment