#! /usr/bin/env python3


# eurostat-fetcher -- Fetch series from Eurostat database
# By: Emmanuel Raviart <emmanuel.raviart@cepremap.org>
#
# Copyright (C) 2017 Cepremap
# https://git.nomics.world/dbnomics-fetchers/eurostat-fetcher
#
# eurostat-fetcher is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# eurostat-fetcher is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


"""Fetch series from Eurostat, the statistical office of the European Union, using bulk download and SDMX formats.

http://ec.europa.eu/eurostat/data/database

EUROSTAT bulk download:
- http://ec.europa.eu/eurostat/fr/data/bulkdownload
- http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing

EUROSTAT SDMX documentation:
- http://ec.europa.eu/eurostat/web/sdmx-infospace/welcome
- http://ec.europa.eu/eurostat/web/sdmx-web-services/rest-sdmx-2.1
"""


import argparse
import io
import logging
import os
import re
import shutil
import subprocess
import sys
import zipfile

from dulwich.repo import Repo
from lxml import etree
import requests

import dbnomics_git_storage as git_storage

log = logging.getLogger(__name__)

# Namespace prefixes used by the XPath queries run against table_of_contents.xml.
nsmap = dict(
    nt='urn:eu.europa.ec.eurostat.navtree',
)

# Matches the <Prepared> element that write_normalized_xml_file() overwrites with a constant value.
prepared_element_re = re.compile('<Prepared>.+</Prepared>')


def iter_datasets(xml_element, old_xml_element=None):
    """Yield the dataset leaves of a table of contents.

    Without old_xml_element every dataset leaf of xml_element is yielded.
    With it, a leaf is yielded only when its code has no recorded lastUpdate
    in old_xml_element, or when its lastUpdate differs from the recorded one.
    """
    dataset_leaf_path = './/nt:leaf[@type="dataset"]'

    if old_xml_element is None:
        # Nothing to compare against: every dataset is a candidate.
        yield from xml_element.iterfind(dataset_leaf_path, namespaces=nsmap)
        return

    # Map each dataset code of the old table of contents to its lastUpdate text.
    previous_update_by_code = {
        old_leaf.findtext("nt:code", namespaces=nsmap): old_leaf.findtext("nt:lastUpdate", namespaces=nsmap)
        for old_leaf in old_xml_element.iterfind(dataset_leaf_path, namespaces=nsmap)
    }

    for leaf in xml_element.iterfind(dataset_leaf_path, namespaces=nsmap):
        code = leaf.findtext('nt:code', namespaces=nsmap)
        previous_update = previous_update_by_code.get(code)
        # A missing previous value means the leaf is new in this version of the table of contents.
        if previous_update is None or leaf.findtext("nt:lastUpdate", namespaces=nsmap) != previous_update:
            yield leaf


def main():
    """Download the Eurostat table of contents, datasets and data structures.

    Command-line arguments:
        target_dir: existing directory receiving ``table_of_contents.xml`` and
            the ``data`` and ``datastructure`` sub-directories.
        --incremental: only consider datasets whose ``lastUpdate`` differs from
            the table of contents stored in the latest commit of the Git
            repository located at ``target_dir``.
        --keep-files: reuse an existing ``table_of_contents.xml`` and do not
            empty the ``data`` and ``datastructure`` directories.
        -v, --verbose: log at DEBUG level instead of INFO.

    Returns:
        0 on success, 1 when incremental mode is requested on a repository
        that has no commit. The value is meant to be given to ``sys.exit``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('target_dir', help='path of target directory containing Eurostat series')
    parser.add_argument('--incremental', action='store_true',
                        help='download only datasets that changed since the last commit')
    parser.add_argument('--keep-files', action='store_true', help='keep existing files in target directory')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='display debug logging messages')
    args = parser.parse_args()
    logging.basicConfig(
        format="%(levelname)s: %(message)s",
        level=logging.DEBUG if args.verbose else logging.INFO,
        stream=sys.stdout
    )

    # Explicit check instead of `assert`, which is stripped when Python runs with -O.
    if not os.path.exists(args.target_dir):
        parser.error('target directory {} does not exist'.format(args.target_dir))

    old_xml_element = None
    table_of_contents_file_name = 'table_of_contents.xml'

    if args.incremental:
        repo = Repo(args.target_dir)
        tree = git_storage.get_latest_commit_tree(repo)
        if tree is None:
            log.error("Incremental mode can't be used when source data repository has no commit.")
            # Without a commit there is no previous table of contents to compare with.
            return 1
        old_xml_element = etree.fromstring(git_storage.load_text_blob(repo, tree, table_of_contents_file_name))

    # Fetch list of datasets.

    xml_file_path = os.path.join(args.target_dir, table_of_contents_file_name)
    xml_parser = etree.XMLParser(remove_blank_text=True)
    if args.keep_files and os.path.exists(xml_file_path):
        log.info("Skipping existing file {}".format(table_of_contents_file_name))
        xml_element = etree.parse(xml_file_path, parser=xml_parser)
    else:
        xml_url = 'http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file={}'.format(
            table_of_contents_file_name)
        log.info('Fetching table of content {}'.format(xml_url))
        response = requests.get(xml_url)
        # Fail on HTTP errors instead of parsing and storing an error page.
        response.raise_for_status()
        xml_element = etree.fromstring(response.content, parser=xml_parser)
        with open(xml_file_path, 'wb') as xml_file:
            etree.ElementTree(xml_element).write(xml_file, encoding='utf-8', pretty_print=True, xml_declaration=True)

    # Existing content is only discarded for a full download from scratch.
    clear_existing = not args.keep_files and not args.incremental

    # Fetch datasets.

    data_dir = os.path.join(args.target_dir, 'data')
    _prepare_directory(data_dir, clear_existing)

    data_urls = set()
    metadata_urls = set()
    for leaf_element in iter_datasets(xml_element, old_xml_element):
        data_url = leaf_element.findtext('./nt:downloadLink[@format="sdmx"]', namespaces=nsmap)
        if data_url:
            data_urls.add(data_url)
        metadata_url = leaf_element.findtext('./nt:metadata[@format="sdmx"]', namespaces=nsmap)
        if metadata_url:
            metadata_urls.add(metadata_url)

    # NOTE(review): directories are not emptied in --incremental mode, so an updated dataset whose
    # directory is still present is skipped -- confirm the working tree is expected to be empty.
    _download_archives(data_urls, data_dir, 'dataset')

    # Fetch datasets definitions.

    data_structures_dir = os.path.join(args.target_dir, 'datastructure')
    _prepare_directory(data_structures_dir, clear_existing)
    _download_archives(metadata_urls, data_structures_dir, 'data structure')

    return 0


def _prepare_directory(dir_path, clear):
    """Create dir_path when it is missing; otherwise delete its content when clear is true."""
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
        return
    if not clear:
        return
    for node_name in os.listdir(dir_path):
        node_path = os.path.join(dir_path, node_name)
        if os.path.isdir(node_path):
            shutil.rmtree(node_path)
        else:
            os.remove(node_path)


def _download_archives(urls, parent_dir, label):
    """Download each ZIP archive of urls and unpack it into its own sub-directory of parent_dir.

    The sub-directory is named after the last path segment of the URL, up to
    its first dot. URLs whose sub-directory already exists are skipped.
    Members ending with ``.xml`` are written with write_normalized_xml_file,
    the others are extracted as they are. label only appears in log messages.
    """
    for index, url in enumerate(urls, start=1):
        archive_dir = os.path.join(parent_dir, url.rsplit('/', 1)[-1].split('.', 1)[0])
        if os.path.exists(archive_dir):
            log.info('Skipping existing {} {}'.format(label, url))
            continue
        log.info('Fetching {} {}/{} {}'.format(label, index, len(urls), url))
        response = requests.get(url)
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
            # Create the directory only once a valid archive has been received, so that a failed
            # download is not mistaken for an existing item on the next run.
            os.mkdir(archive_dir)
            for zip_info in zip_file.infolist():
                if zip_info.filename.endswith('.xml'):
                    with zip_file.open(zip_info) as member_file:
                        member_path = os.path.join(archive_dir, zip_info.filename)
                        write_normalized_xml_file(member_path, member_file)
                else:
                    zip_file.extract(zip_info, archive_dir)


def write_normalized_xml_file(xml_file_path, source_file):
    """Write the XML read from source_file to xml_file_path, normalizing volatile data.

    Data that changes at each download, like today's date, is normalized in
    order to avoid triggering a false commit in source data: the first match
    of prepared_element_re is replaced by a constant ``<Prepared>`` element.

    Use regexes because lxml raises SerialisationError with too large files.

    Args:
        xml_file_path: path of the file to write.
        source_file: binary file object whose content is UTF-8 encoded XML.
    """
    xml_str = source_file.read().decode('utf-8')
    normalized_xml_str = prepared_element_re.sub("<Prepared>1111-11-11T11:11:11</Prepared>", xml_str, 1)
    # Write with the encoding used for decoding, and without newline translation, so that the
    # output does not depend on the platform's locale or line-ending convention.
    with open(xml_file_path, mode="w", encoding="utf-8", newline="") as xml_file:
        xml_file.write(normalized_xml_str)


# Script entry point: run the fetcher and use the value returned by main() as the process exit status.
if __name__ == '__main__':
    sys.exit(main())