diff --git a/download.py b/download.py index 2c4946e2879092711dee0a2d852f36799b8bed4b..a51d6a2f79a980a86e317ed637f867283d4172d2 100755 --- a/download.py +++ b/download.py @@ -37,6 +37,7 @@ EUROSTAT SDMX documentation: import argparse import io +import logging import os import re import shutil @@ -44,11 +45,13 @@ import subprocess import sys import zipfile +import requests from dulwich.repo import Repo from lxml import etree -import requests +import dbnomics_git_storage as git_storage +log = logging.getLogger(__name__) nsmap = dict( nt='urn:eu.europa.ec.eurostat.navtree', ) @@ -86,23 +89,41 @@ def main(): parser.add_argument('--incremental', action='store_true', help='download only datasets that changed since the last commit') parser.add_argument('--keep-files', action='store_true', help='keep existing files in target directory') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='display debug logging messages') args = parser.parse_args() + logging.basicConfig( + format="%(levelname)s: %(message)s", + level=logging.DEBUG if args.verbose else logging.INFO, + stream=sys.stdout + ) + + assert os.path.exists(args.target_dir) old_xml_element = None + table_of_contents_file_name = 'table_of_contents.xml' if args.incremental: repo = Repo(args.target_dir) - assert b'HEAD' in repo.get_refs() - old_xml_element = etree.fromstring(repo[repo[repo[repo.head()].tree][b"table_of_contents.xml"][1]].data) + tree = git_storage.get_latest_commit_tree(repo) + if tree is None: + log.error("Incremental mode can't be used when source data repository has no commit.") + old_xml_element = etree.fromstring(git_storage.load_text_blob(repo, tree, table_of_contents_file_name)) # Fetch list of datasets. 
- xml_url = 'http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=table_of_contents.xml' - print('Fetching table of content {}'.format(xml_url)) - response = requests.get(xml_url) - xml_element = etree.fromstring(response.content, parser=etree.XMLParser(remove_blank_text=True)) - xml_file_path = os.path.join(args.target_dir, 'table_of_contents.xml') - with open(xml_file_path, 'wb') as xml_file: - etree.ElementTree(xml_element).write(xml_file, encoding='utf-8', pretty_print=True, xml_declaration=True) + + xml_file_path = os.path.join(args.target_dir, table_of_contents_file_name) + parser = etree.XMLParser(remove_blank_text=True) + if args.keep_files and os.path.exists(xml_file_path): + log.info("Skipping existing file {}".format(table_of_contents_file_name)) + xml_element = etree.parse(xml_file_path, parser=parser) + else: + xml_url = 'http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file={}'.format( + table_of_contents_file_name) + log.info('Fetching table of contents {}'.format(xml_url)) + response = requests.get(xml_url) + xml_element = etree.fromstring(response.content, parser=parser) + with open(xml_file_path, 'wb') as xml_file: + etree.ElementTree(xml_element).write(xml_file, encoding='utf-8', pretty_print=True, xml_declaration=True) # Fetch datasets. 
@@ -118,32 +139,25 @@ def main(): else: os.mkdir(data_dir) - data_urls = set( - data_url - for data_url in ( - leaf_element.findtext('./nt:downloadLink[@format="sdmx"]', namespaces=nsmap) - for leaf_element in iter_datasets(xml_element, old_xml_element) - ) - if data_url - ) - - for data_url in data_urls: + data_urls = set() + metadata_urls = set() + for leaf_element in iter_datasets(xml_element, old_xml_element): + data_url = leaf_element.findtext('./nt:downloadLink[@format="sdmx"]', namespaces=nsmap) + if data_url: + data_urls.add(data_url) + metadata_url = leaf_element.findtext('./nt:metadata[@format="sdmx"]', namespaces=nsmap) + if metadata_url: + metadata_urls.add(metadata_url) + + for index, data_url in enumerate(data_urls, start=1): dataset_dir = os.path.join(data_dir, data_url.rsplit('/', 1)[-1].split('.', 1)[0]) if os.path.exists(dataset_dir): - print('Skipping existing dataset {}'.format(data_url)) + log.info('Skipping existing dataset {}'.format(data_url)) else: - print('Fetching dataset {}'.format(data_url)) + os.mkdir(dataset_dir) + log.info('Fetching dataset {}/{} {}'.format(index, len(data_urls), data_url)) response = requests.get(data_url) data_zip_file = zipfile.ZipFile(io.BytesIO(response.content)) - if os.path.exists(dataset_dir): - for node_name in os.listdir(dataset_dir): - node_path = os.path.join(dataset_dir, node_name) - if os.path.isdir(node_path): - shutil.rmtree(node_path) - else: - os.remove(node_path) - else: - os.mkdir(dataset_dir) for data_zip_info in data_zip_file.infolist(): if data_zip_info.filename.endswith('.xml'): with data_zip_file.open(data_zip_info) as data_file: @@ -166,32 +180,15 @@ def main(): else: os.mkdir(data_structures_dir) - metadata_urls = set( - metadata_url - for metadata_url in ( - leaf_element.findtext('./nt:metadata[@format="sdmx"]', namespaces=nsmap) - for leaf_element in iter_datasets(xml_element, old_xml_element) - ) - if metadata_url - ) - - for metadata_url in metadata_urls: + for index, metadata_url in 
enumerate(metadata_urls, start=1): metadata_dir = os.path.join(data_structures_dir, metadata_url.rsplit('/', 1)[-1].split('.', 1)[0]) if os.path.exists(metadata_dir): - print('Skipping existing data structure {}'.format(metadata_url)) + log.info('Skipping existing data structure {}'.format(metadata_url)) else: - print('Fetching data structure {}'.format(metadata_url)) + os.mkdir(metadata_dir) + log.info('Fetching data structure {}/{} {}'.format(index, len(metadata_urls), metadata_url)) response = requests.get(metadata_url) metadata_zip_file = zipfile.ZipFile(io.BytesIO(response.content)) - if os.path.exists(metadata_dir): - for node_name in os.listdir(metadata_dir): - node_path = os.path.join(metadata_dir, node_name) - if os.path.isdir(node_path): - shutil.rmtree(node_path) - else: - os.remove(node_path) - else: - os.mkdir(metadata_dir) for metadata_zip_info in metadata_zip_file.infolist(): if metadata_zip_info.filename.endswith('.xml'): with metadata_zip_file.open(metadata_zip_info) as metadata_file: diff --git a/requirements.txt b/requirements.txt index fda990cb9665dd4ca6d74b87a919e2e9cc0a9c4a..8adf2f99646720bbeab9644fb8de3cce1ae81baf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +dbnomics-git-storage >= 0.0, < 0.1 lxml requests dulwich