Commit 0ce1f52a authored by Christophe Benz

Add logging, use dbnomics-git-storage, simplify code

parent eb1471c9
Pipeline #7138 failed in 39 seconds
@@ -37,6 +37,7 @@ EUROSTAT SDMX documentation:
 import argparse
 import io
+import logging
 import os
 import re
 import shutil
@@ -44,11 +45,13 @@ import subprocess
 import sys
 import zipfile
 
-import requests
-from dulwich.repo import Repo
 from lxml import etree
+import requests
+
+import dbnomics_git_storage as git_storage
+
+log = logging.getLogger(__name__)
 
 nsmap = dict(
     nt='urn:eu.europa.ec.eurostat.navtree',
 )
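The import hunk above pairs a named module-level logger with the basicConfig call added in main() further down. A minimal standalone sketch of that pattern, using the same format string as the diff (the sample message is invented):

# Sketch of the logging pattern this commit introduces: a module-level
# logger named after the module, configured once at startup.
import logging
import sys

log = logging.getLogger(__name__)

if __name__ == '__main__':
    logging.basicConfig(
        format="%(levelname)s: %(message)s",
        level=logging.INFO,  # the real script switches to DEBUG when --verbose is passed
        stream=sys.stdout,
    )
    log.info("messages now go to stdout with a level prefix")  # invented sample message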
@@ -86,23 +89,41 @@ def main():
     parser.add_argument('--incremental', action='store_true',
                         help='download only datasets that changed since the last commit')
     parser.add_argument('--keep-files', action='store_true', help='keep existing files in target directory')
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='display debug logging messages')
     args = parser.parse_args()
+    logging.basicConfig(
+        format="%(levelname)s: %(message)s",
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        stream=sys.stdout,
+    )
     assert os.path.exists(args.target_dir)
 
     old_xml_element = None
+    table_of_contents_file_name = 'table_of_contents.xml'
     if args.incremental:
         repo = Repo(args.target_dir)
-        assert b'HEAD' in repo.get_refs()
-        old_xml_element = etree.fromstring(repo[repo[repo[repo.head()].tree][b"table_of_contents.xml"][1]].data)
+        tree = git_storage.get_latest_commit_tree(repo)
+        if tree is None:
+            log.error("Incremental mode can't be used when source data repository has no commit.")
+        old_xml_element = etree.fromstring(git_storage.load_text_blob(repo, tree, table_of_contents_file_name))
 
     # Fetch list of datasets.
-    xml_url = 'http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=table_of_contents.xml'
-    print('Fetching table of content {}'.format(xml_url))
-    response = requests.get(xml_url)
-    xml_element = etree.fromstring(response.content, parser=etree.XMLParser(remove_blank_text=True))
-    xml_file_path = os.path.join(args.target_dir, 'table_of_contents.xml')
-    with open(xml_file_path, 'wb') as xml_file:
-        etree.ElementTree(xml_element).write(xml_file, encoding='utf-8', pretty_print=True, xml_declaration=True)
+    xml_file_path = os.path.join(args.target_dir, table_of_contents_file_name)
+    parser = etree.XMLParser(remove_blank_text=True)
+    if args.keep_files and os.path.exists(xml_file_path):
+        log.info("Skipping existing file {}".format(table_of_contents_file_name))
+        xml_element = etree.parse(xml_file_path, parser=parser)
+    else:
+        xml_url = 'http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file={}'.format(
+            table_of_contents_file_name)
+        log.info('Fetching table of content {}'.format(xml_url))
+        response = requests.get(xml_url)
+        xml_element = etree.fromstring(response.content, parser=parser)
+        with open(xml_file_path, 'wb') as xml_file:
+            etree.ElementTree(xml_element).write(xml_file, encoding='utf-8', pretty_print=True, xml_declaration=True)
 
     # Fetch datasets.
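For readers puzzled by the removed one-liner repo[repo[repo[repo.head()].tree][b"table_of_contents.xml"][1]].data: it walks HEAD to commit to tree to (mode, blob id) to blob bytes using dulwich. The dbnomics_git_storage helpers called above presumably wrap that same walk; here is a hypothetical reimplementation consistent with the call sites, not the library's actual code:

# Hypothetical sketch of the two dbnomics_git_storage helpers, inferred from
# how this diff calls them; the real library may differ.
from dulwich.repo import Repo


def get_latest_commit_tree(repo):
    """Return the tree object of the latest commit, or None if the repo has no commit."""
    if b'HEAD' not in repo.get_refs():
        return None  # empty repository: nothing committed yet
    commit = repo[repo.head()]  # HEAD ref -> commit object
    return repo[commit.tree]    # commit -> tree object


def load_text_blob(repo, tree, file_name):
    """Return the raw bytes of the blob named file_name at the root of tree."""
    mode, blob_id = tree[file_name.encode('utf-8')]  # tree entry -> (mode, sha)
    return repo[blob_id].data                        # blob object -> bytes

# Example use (the repository path is hypothetical):
# repo = Repo('eurostat-source-data')
# tree = get_latest_commit_tree(repo)
# xml_bytes = load_text_blob(repo, tree, 'table_of_contents.xml')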
@@ -118,32 +139,25 @@ def main():
     else:
         os.mkdir(data_dir)
 
-    data_urls = set(
-        data_url
-        for data_url in (
-            leaf_element.findtext('./nt:downloadLink[@format="sdmx"]', namespaces=nsmap)
-            for leaf_element in iter_datasets(xml_element, old_xml_element)
-        )
-        if data_url
-    )
-    for data_url in data_urls:
+    data_urls = set()
+    metadata_urls = set()
+    for leaf_element in iter_datasets(xml_element, old_xml_element):
+        data_url = leaf_element.findtext('./nt:downloadLink[@format="sdmx"]', namespaces=nsmap)
+        if data_url:
+            data_urls.add(data_url)
+        metadata_url = leaf_element.findtext('./nt:metadata[@format="sdmx"]', namespaces=nsmap)
+        if metadata_url:
+            metadata_urls.add(metadata_url)
+
+    for index, data_url in enumerate(data_urls, start=1):
         dataset_dir = os.path.join(data_dir, data_url.rsplit('/', 1)[-1].split('.', 1)[0])
         if os.path.exists(dataset_dir):
-            print('Skipping existing dataset {}'.format(data_url))
+            log.info('Skipping existing dataset {}'.format(data_url))
         else:
-            print('Fetching dataset {}'.format(data_url))
+            os.mkdir(dataset_dir)
+            log.info('Fetching dataset {}/{} {}'.format(index, len(data_urls), data_url))
             response = requests.get(data_url)
             data_zip_file = zipfile.ZipFile(io.BytesIO(response.content))
-            if os.path.exists(dataset_dir):
-                for node_name in os.listdir(dataset_dir):
-                    node_path = os.path.join(dataset_dir, node_name)
-                    if os.path.isdir(node_path):
-                        shutil.rmtree(node_path)
-                    else:
-                        os.remove(node_path)
-            else:
-                os.mkdir(dataset_dir)
             for data_zip_info in data_zip_file.infolist():
                 if data_zip_info.filename.endswith('.xml'):
                     with data_zip_file.open(data_zip_info) as data_file:
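The rewritten loop above collects both kinds of URLs in a single pass over the navtree instead of two separate generator expressions. A self-contained sketch of that collection step; the sample XML document is invented for illustration, while the element names and namespace come from the diff:

from lxml import etree

nsmap = {'nt': 'urn:eu.europa.ec.eurostat.navtree'}
sample = b"""<tree xmlns="urn:eu.europa.ec.eurostat.navtree">
  <leaf type="dataset">
    <downloadLink format="sdmx">http://example.org/data/nama_10_gdp.sdmx.zip</downloadLink>
    <metadata format="sdmx">http://example.org/metadata/nama_esms.sdmx.zip</metadata>
  </leaf>
</tree>"""

data_urls, metadata_urls = set(), set()
# Stand-in for iter_datasets(): visit every dataset leaf in the tree.
for leaf_element in etree.fromstring(sample).iterfind('.//nt:leaf[@type="dataset"]', namespaces=nsmap):
    data_url = leaf_element.findtext('./nt:downloadLink[@format="sdmx"]', namespaces=nsmap)
    if data_url:
        data_urls.add(data_url)
    metadata_url = leaf_element.findtext('./nt:metadata[@format="sdmx"]', namespaces=nsmap)
    if metadata_url:
        metadata_urls.add(metadata_url)
print(data_urls, metadata_urls)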
@@ -166,32 +180,15 @@ def main():
     else:
         os.mkdir(data_structures_dir)
 
-    metadata_urls = set(
-        metadata_url
-        for metadata_url in (
-            leaf_element.findtext('./nt:metadata[@format="sdmx"]', namespaces=nsmap)
-            for leaf_element in iter_datasets(xml_element, old_xml_element)
-        )
-        if metadata_url
-    )
-    for metadata_url in metadata_urls:
+    for index, metadata_url in enumerate(metadata_urls, start=1):
         metadata_dir = os.path.join(data_structures_dir, metadata_url.rsplit('/', 1)[-1].split('.', 1)[0])
         if os.path.exists(metadata_dir):
-            print('Skipping existing data structure {}'.format(metadata_url))
+            log.info('Skipping existing data structure {}'.format(metadata_url))
         else:
-            print('Fetching data structure {}'.format(metadata_url))
+            os.mkdir(metadata_dir)
+            log.info('Fetching data structure {}/{} {}'.format(index, len(metadata_urls), metadata_url))
             response = requests.get(metadata_url)
             metadata_zip_file = zipfile.ZipFile(io.BytesIO(response.content))
-            if os.path.exists(metadata_dir):
-                for node_name in os.listdir(metadata_dir):
-                    node_path = os.path.join(metadata_dir, node_name)
-                    if os.path.isdir(node_path):
-                        shutil.rmtree(node_path)
-                    else:
-                        os.remove(node_path)
-            else:
-                os.mkdir(metadata_dir)
             for metadata_zip_info in metadata_zip_file.infolist():
                 if metadata_zip_info.filename.endswith('.xml'):
                     with metadata_zip_file.open(metadata_zip_info) as metadata_file:
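Both loops derive a directory name from the download URL with rsplit('/', 1)[-1].split('.', 1)[0]. A quick check of what that yields; the URL is a made-up example shaped like Eurostat bulk-download links:

# Everything after the last '/' and before the first '.' becomes the directory name.
data_url = 'http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/nama_10_gdp.sdmx.zip'
print(data_url.rsplit('/', 1)[-1].split('.', 1)[0])  # -> nama_10_gdp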
......
+dbnomics-git-storage >= 0.0, < 0.1
 lxml
 requests
-dulwich
......