Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Projects: dbnomics-fetchers/bceao-fetcher · MichelJuillard/bceao-fetcher
Commits on Source (5)
@@ -83,39 +83,6 @@ def get_countries(source_dir):
     return countries


-def build_categories_tree(source_dir, target_dir):
-    '''build categories > datasets from index.html'''
-    with open(os.path.join(source_dir, "index.html"), "r", encoding='utf-8') as f:
-        soup = bs(f.read(), "lxml")
-    categories_tree = []
-    for ul in soup.find_all("ul", class_="extend"):
-        cat_name = clean_special_char(ul.parent.a.text)
-        category = {
-            # ugly fix of broken unicode char
-            "name": clean_special_char(ul.parent.a.text),
-            "code": slugify.slugify(clean_special_char(ul.parent.a.text))
-        }
-        category["children"] = []
-        for li in ul.find_all("li"):
-            series = list(ast.literal_eval(li.a.get("onclick").replace("soumettreTab", "")))
-            code, name = series[-1].split(" - ")
-            category["children"].append({
-                "code": code,
-                "name": name
-            })
-            # for country_code, country_label in countries.items():
-            #     categories[cat].append({
-            #         "code": "_".join([code, country_code]),
-            #         "name": " - ".join([name, country_label])
-            #     })
-        categories_tree.append(category)
-    write_json_file(os.path.join(target_dir, "category_tree.json"), categories_tree)
-    write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
-    return categories_tree


 def detect_unit_label(category_table):
     # detect unit_label from title NOT USED
@@ -153,14 +120,15 @@ def define_label(label, sub_label, top_label):
     return None, None, last_label


-def build_series(source_dir, target_dir):
+def build_series(source_dir, target_dir, category_tree):
     '''
     From datasets < dataset_code > .html to series
     '''
-    countries = get_countries(source_dir)
-    categories = build_categories_tree(source_dir, target_dir)
-    datasets = itertools.chain.from_iterable([category["children"] for category in categories])
+    datasets = itertools.chain.from_iterable([category["children"] for category in category_tree])
     datasets_dict = {n["code"]: n["name"] for n in datasets}
+    for n in datasets_dict.items():
+        print(n)
     for f in os.listdir(source_dir):
         if f.endswith(".html") and f != "index.html":
             dataset_file = os.path.join(source_dir, f)
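In this hunk the category tree is now passed in rather than rebuilt, and the datasets_dict lookup simply flattens every category's children into one code → name mapping. A minimal sketch of that flattening on a made-up two-category tree (the tree contents here are invented for illustration):

import itertools

# Hypothetical category_tree, shaped like the JSON the download step writes.
category_tree = [
    {"code": "cat-a", "name": "Category A",
     "children": [{"code": "D1", "name": "D1 - Dataset one"}]},
    {"code": "cat-b", "name": "Category B",
     "children": [{"code": "D2", "name": "D2 - Dataset two"}]},
]

datasets = itertools.chain.from_iterable(c["children"] for c in category_tree)
datasets_dict = {d["code"]: d["name"] for d in datasets}
print(datasets_dict)  # {'D1': 'D1 - Dataset one', 'D2': 'D2 - Dataset two'}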
@@ -245,7 +213,13 @@ def main():
     source_dir = os.path.abspath(args["<source_dir>"])
     target_dir = os.path.abspath(args["<target_dir>"])
     assert os.path.exists(source_dir), source_dir
-    build_series(source_dir, target_dir)
+    write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
+    category_tree_filename_in = os.path.join(source_dir, "category_tree.json")
+    with open(category_tree_filename_in) as f:
+        category_tree = json.load(f)
+    category_tree_filename_out = os.path.join(source_dir, "category_tree.json")
+    write_json_file(category_tree_filename_out, category_tree)
+    build_series(source_dir, target_dir, category_tree)


 if __name__ == "__main__":
......
@@ -28,13 +28,35 @@ Usage:
 """

 import ast
 import json
+import logging
 import os
+import slugify
 import sys

 import requests
 from bs4 import BeautifulSoup as bs
 from docopt import docopt

+log = logging.getLogger(__name__)
+
+
+def clean_special_char(string):
+    '''ugly fix of encoding error of HTML source file
+    can be fixed replacing unrecognized char into é because 98,79% are é
+    two exceptions:
+    - Côte d'Ivoire
+    - impôts
+    '''
+    if "C�te" in string:
+        return string.replace("�", "ô")
+    elif "imp�ts" in string:
+        return string.replace("�", "ô")
+    else:
+        return string.replace("�", "é")
+
+
+def write_json_file(file_path, data):
+    with open(file_path, 'w', encoding='utf-8') as file_:
+        json.dump(data, file_, ensure_ascii=False, indent=2, sort_keys=True)
+
+
 def download_datasets(target_dir):
     '''download provider root url and build the datasets source page'''
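The clean_special_char helper added above maps the Unicode replacement character (U+FFFD, rendered "�") back to the accented letter the provider's labels originally carried, defaulting to "é" and special-casing the two known "ô" words. A small sketch of the intended behaviour, on hypothetical labels:

# "�" stands for U+FFFD as left by the provider's broken encoding; labels are invented.
clean_special_char("C�te d'Ivoire")            # -> "Côte d'Ivoire"
clean_special_char("recettes d'imp�ts")        # -> "recettes d'impôts"
clean_special_char("Cr�ances sur l'�conomie")  # -> "Créances sur l'économie"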
@@ -47,20 +69,27 @@ def download_datasets(target_dir):
     # 'User-Agent': 'DBNomics Downloader BOT see next.dbnomics.world',
     # }
     # download the main index 'index.html'
-    resp = requests.get(provider_root_url)
+    resp0 = requests.get(provider_root_url)
+    resp = requests.get(provider_root_url, cookies=resp0.cookies)
     resp.raise_for_status()
     with open(os.path.join(target_dir, "index.html"), "w") as _file:
         _file.write(resp.text)
     headers = resp.headers
     soup = bs(resp.text, "lxml")
-    categories = {}
+    categories = []
     for ul in soup.find_all("ul", class_="extend"):
-        cat = ul.parent.a.text.encode("utf-8").decode("utf-8", errors="ignore")
-        categories[cat] = []
+        # ugly fix of broken unicode char
+        cat_name = clean_special_char(ul.parent.a.text)
+        category = {
+            "name": cat_name,
+            "code": slugify.slugify(clean_special_char(ul.parent.a.text))
+        }
+        category["datasets"] = []
         for li in ul.find_all("li"):
             series = list(ast.literal_eval(li.a.get("onclick").replace("soumettreTab", "")))
-            categories[cat].append(series)
+            category["datasets"].append(series)
+        categories.append(category)
     # no need to have country names
     # encoding problems: broken utf-8 from website app: countries and categories é and ô same
     countries = {n.text.strip().encode('utf-8').decode("utf-8", errors="ignore"): n.find("input")
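The paired GETs in this hunk first hit the provider root to pick up its session cookies, then repeat the request with those cookies so the index page is served fully. A requests.Session sketch gets the same effect by carrying cookies automatically; this is an alternative idiom, not what the commit does, and the root URL value shown is an assumption (the script defines provider_root_url elsewhere):

import requests

provider_root_url = "https://edenpub.bceao.int/"  # assumed value; defined elsewhere in the script

with requests.Session() as session:
    session.get(provider_root_url)          # first hit primes the session cookies
    resp = session.get(provider_root_url)   # second hit is sent with those cookies
    resp.raise_for_status()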
@@ -98,31 +127,47 @@ def download_datasets(target_dir):
     country = {n[0]: n[1] for n in zip(countries.keys(), countries.keys())}
     post_data.update(country)
-    for cat, series_l in categories.items():
-        for series in series_l:
-            freq, id_tab, s_name_id = series
-            s_id, s_name = s_name_id.split(" - ")
+    categories_tree = []
+    for cat in categories:
+        category = {
+            "code": cat["code"],
+            "name": cat["name"]
+        }
+        category["children"] = []
+        for freq, id_tab, s_name_id in cat["datasets"]:
             post_data["idTab"] = id_tab
             data = [(k, v) for k, v in post_data.items()]
             # html_download
-            resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php",
-                                 data=data)
-            assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
-            assert resp.status_code in range(200, 399), \
-                "requests response.status_code == %s" % resp.status_code
-            with open(os.path.join(target_dir, s_id + ".html"), "w") as _f:
-                _f.write(resp.text)
-            # xls_download
-            post_data["export"] = ""
-            xport_data = [(k, v) for k, v in post_data.items() if k in ["params", "export", "idTab"]]
-            resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php",
-                                 data=data)
-            assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
-            assert resp.status_code in range(200, 399), \
-                "requests response.status_code == %s" % resp.status_code
-            with open(os.path.join(target_dir, s_id + ".xls"), "w") as _f:
-                _f.write(resp.text)
+            try:
+                resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php",
+                                     data=data)
+                assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
+                assert resp.status_code in range(200, 399), \
+                    "requests response.status_code == %s" % resp.status_code
+            except:
+                log.warning("Dataset {} is missing".format(s_name_id))
+            else:
+                soup = bs(resp.text, "lxml")
+                s_id = soup.find("h2").text.split(' - ')[0]
+                category["children"].append({
+                    "code": s_id,
+                    "name": s_name_id
+                })
+                with open(os.path.join(target_dir, s_id + ".html"), "w") as _f:
+                    _f.write(resp.text)
+                # xls_download
+                post_data["export"] = ""
+                xport_data = [(k, v) for k, v in post_data.items() if k in ["params", "export", "idTab"]]
+                resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php",
+                                     data=data)
+                assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
+                assert resp.status_code in range(200, 399), \
+                    "requests response.status_code == %s" % resp.status_code
+                with open(os.path.join(target_dir, s_id + ".xls"), "w") as _f:
+                    _f.write(resp.text)
+        categories_tree.append(category)
+    write_json_file(os.path.join(target_dir, "category_tree.json"), categories_tree)


 def main():
     args = docopt(__doc__)
......
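For orientation, the category_tree.json that download_datasets now writes (and that the convert step reads back in its main) nests datasets under categories; dataset names keep the "CODE - label" string scraped from the onclick handler, while codes come from each page's h2 title. A hypothetical excerpt of its shape, with invented codes and labels:

# Hypothetical shape of target_dir/category_tree.json:
[
    {
        "code": "some-category",
        "name": "Some category",
        "children": [
            {"code": "001TM", "name": "001TM - Some dataset label"}
        ]
    }
]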