From 0ce1f52aeeab89d66f1a824a668b3ca7b4f28959 Mon Sep 17 00:00:00 2001
From: Christophe Benz <christophe.benz@cepremap.org>
Date: Fri, 18 May 2018 15:22:04 +0200
Subject: [PATCH] Add logging, use dbnomics-git-storage, simplify code

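Replace the print() calls with the logging module, behind a new
-v/--verbose flag, and load the table of contents from the latest
commit through the dbnomics-git-storage helpers instead of raw dulwich
object lookups. The table of contents is no longer re-downloaded when
--keep-files is set and the file already exists, data and metadata
URLs are collected in a single pass over the tree, and dataset and
data structure fetches log a progress counter.

Usage sketch (assuming the existing target_dir positional argument;
"eurostat-data" is only an example directory name):

    python download.py --incremental --verbose eurostat-data
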
---
 download.py      | 105 +++++++++++++++++++++++++++---------------------------
 requirements.txt |   1 +
 2 files changed, 54 insertions(+), 52 deletions(-)

diff --git a/download.py b/download.py
index 2c4946e..a51d6a2 100755
--- a/download.py
+++ b/download.py
@@ -37,6 +37,7 @@ EUROSTAT SDMX documentation:
 
 import argparse
 import io
+import logging
 import os
 import re
 import shutil
@@ -44,11 +45,13 @@ import subprocess
 import sys
 import zipfile
 
+import requests
 from dulwich.repo import Repo
 from lxml import etree
-import requests
 
+import dbnomics_git_storage as git_storage
 
+log = logging.getLogger(__name__)
 nsmap = dict(
     nt='urn:eu.europa.ec.eurostat.navtree',
 )
@@ -86,23 +89,43 @@ def main():
     parser.add_argument('--incremental', action='store_true',
                         help='download only datasets that changed since the last commit')
     parser.add_argument('--keep-files', action='store_true', help='keep existing files in target directory')
+    parser.add_argument('-v', '--verbose', action='store_true', help='display debug logging messages')
     args = parser.parse_args()
+    logging.basicConfig(
+        format="%(levelname)s: %(message)s",
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        stream=sys.stdout
+    )
+
+    assert os.path.exists(args.target_dir), args.target_dir
 
     old_xml_element = None
+    table_of_contents_file_name = 'table_of_contents.xml'
 
     if args.incremental:
         repo = Repo(args.target_dir)
-        assert b'HEAD' in repo.get_refs()
-        old_xml_element = etree.fromstring(repo[repo[repo[repo.head()].tree][b"table_of_contents.xml"][1]].data)
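+        # The table of contents from the latest commit lets iter_datasets detect which datasets changed.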
+        tree = git_storage.get_latest_commit_tree(repo)
+        if tree is None:
+            log.error("Incremental mode can't be used when the source data repository has no commit.")
+            sys.exit(1)
+        old_xml_element = etree.fromstring(git_storage.load_text_blob(repo, tree, table_of_contents_file_name))
 
     # Fetch list of datasets.
-    xml_url = 'http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=table_of_contents.xml'
-    print('Fetching table of content {}'.format(xml_url))
-    response = requests.get(xml_url)
-    xml_element = etree.fromstring(response.content, parser=etree.XMLParser(remove_blank_text=True))
-    xml_file_path = os.path.join(args.target_dir, 'table_of_contents.xml')
-    with open(xml_file_path, 'wb') as xml_file:
-        etree.ElementTree(xml_element).write(xml_file, encoding='utf-8', pretty_print=True, xml_declaration=True)
+
+    xml_file_path = os.path.join(args.target_dir, table_of_contents_file_name)
+    parser = etree.XMLParser(remove_blank_text=True)
+    if args.keep_files and os.path.exists(xml_file_path):
+        log.info("Skipping existing file {}".format(table_of_contents_file_name))
+        xml_element = etree.parse(xml_file_path, parser=parser).getroot()
+    else:
+        xml_url = 'http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file={}'.format(
+            table_of_contents_file_name)
+        log.info('Fetching table of contents {}'.format(xml_url))
+        response = requests.get(xml_url)
+        xml_element = etree.fromstring(response.content, parser=parser)
+        with open(xml_file_path, 'wb') as xml_file:
+            etree.ElementTree(xml_element).write(xml_file, encoding='utf-8', pretty_print=True, xml_declaration=True)
 
     # Fetch datasets.
 
@@ -118,32 +141,26 @@
     else:
         os.mkdir(data_dir)
 
-    data_urls = set(
-        data_url
-        for data_url in (
-            leaf_element.findtext('./nt:downloadLink[@format="sdmx"]', namespaces=nsmap)
-            for leaf_element in iter_datasets(xml_element, old_xml_element)
-        )
-        if data_url
-    )
-
-    for data_url in data_urls:
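+    # Collect data and metadata URLs in a single pass over the table of contents.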
+    data_urls = set()
+    metadata_urls = set()
+    for leaf_element in iter_datasets(xml_element, old_xml_element):
+        data_url = leaf_element.findtext('./nt:downloadLink[@format="sdmx"]', namespaces=nsmap)
+        if data_url:
+            data_urls.add(data_url)
+        metadata_url = leaf_element.findtext('./nt:metadata[@format="sdmx"]', namespaces=nsmap)
+        if metadata_url:
+            metadata_urls.add(metadata_url)
+
+    for index, data_url in enumerate(sorted(data_urls), start=1):
         dataset_dir = os.path.join(data_dir, data_url.rsplit('/', 1)[-1].split('.', 1)[0])
         if os.path.exists(dataset_dir):
-            print('Skipping existing dataset {}'.format(data_url))
+            log.info('Skipping existing dataset {}'.format(data_url))
         else:
-            print('Fetching dataset {}'.format(data_url))
+            os.mkdir(dataset_dir)
+            log.info('Fetching dataset {}/{} {}'.format(index, len(data_urls), data_url))
             response = requests.get(data_url)
             data_zip_file = zipfile.ZipFile(io.BytesIO(response.content))
-            if os.path.exists(dataset_dir):
-                for node_name in os.listdir(dataset_dir):
-                    node_path = os.path.join(dataset_dir, node_name)
-                    if os.path.isdir(node_path):
-                        shutil.rmtree(node_path)
-                    else:
-                        os.remove(node_path)
-            else:
-                os.mkdir(dataset_dir)
             for data_zip_info in data_zip_file.infolist():
                 if data_zip_info.filename.endswith('.xml'):
                     with data_zip_file.open(data_zip_info) as data_file:
@@ -166,32 +183,16 @@
     else:
         os.mkdir(data_structures_dir)
 
-    metadata_urls = set(
-        metadata_url
-        for metadata_url in (
-            leaf_element.findtext('./nt:metadata[@format="sdmx"]', namespaces=nsmap)
-            for leaf_element in iter_datasets(xml_element, old_xml_element)
-        )
-        if metadata_url
-    )
-
-    for metadata_url in metadata_urls:
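+    # Same pattern as for datasets: create the directory first, then fetch with a progress counter.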
+    for index, metadata_url in enumerate(sorted(metadata_urls), start=1):
         metadata_dir = os.path.join(data_structures_dir, metadata_url.rsplit('/', 1)[-1].split('.', 1)[0])
         if os.path.exists(metadata_dir):
-            print('Skipping existing data structure {}'.format(metadata_url))
+            log.info('Skipping existing data structure {}'.format(metadata_url))
         else:
-            print('Fetching data structure {}'.format(metadata_url))
+            os.mkdir(metadata_dir)
+            log.info('Fetching data structure {}/{} {}'.format(index, len(metadata_urls), metadata_url))
             response = requests.get(metadata_url)
             metadata_zip_file = zipfile.ZipFile(io.BytesIO(response.content))
-            if os.path.exists(metadata_dir):
-                for node_name in os.listdir(metadata_dir):
-                    node_path = os.path.join(metadata_dir, node_name)
-                    if os.path.isdir(node_path):
-                        shutil.rmtree(node_path)
-                    else:
-                        os.remove(node_path)
-            else:
-                os.mkdir(metadata_dir)
             for metadata_zip_info in metadata_zip_file.infolist():
                 if metadata_zip_info.filename.endswith('.xml'):
                     with metadata_zip_file.open(metadata_zip_info) as metadata_file:
diff --git a/requirements.txt b/requirements.txt
index fda990c..8adf2f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+dbnomics-git-storage >= 0.0, < 0.1
 lxml
 requests
 dulwich
-- 
GitLab