Skip to content
Snippets Groups Projects

Fixing BCEAO

Closed Michel Juillard requested to merge MichelJuillard/bceao-fetcher:master into master
2 files changed: +83 −64
Compare changes
  • Side-by-side
  • Inline
Files changed: 2 (+11 −37 in this file)
@@ -83,39 +83,6 @@ def get_countries(source_dir):
return countries
def build_categories_tree(source_dir, target_dir):
    '''Build the categories > datasets tree from index.html.

    Parses ``<source_dir>/index.html`` with BeautifulSoup: each
    ``<ul class="extend">`` element becomes one category, and each ``<li>``
    inside it becomes one dataset child.  Writes ``category_tree.json`` and
    ``provider.json`` into *target_dir* as a side effect.

    :param source_dir: directory containing the scraped index.html
    :param target_dir: directory where the JSON output files are written
    :return: list of category dicts, each with "name", "code", "children"
    '''
    with open(os.path.join(source_dir, "index.html"), "r", encoding='utf-8') as f:
        soup = bs(f.read(), "lxml")
    categories_tree = []
    for ul in soup.find_all("ul", class_="extend"):
        # clean_special_char fixes broken unicode chars in the raw label;
        # compute it once and reuse it for both the name and the slug code
        # (the original recomputed it three times and left cat_name unused).
        cat_name = clean_special_char(ul.parent.a.text)
        category = {
            "name": cat_name,
            "code": slugify.slugify(cat_name),
            "children": [],
        }
        for li in ul.find_all("li"):
            # The dataset code/name pair is embedded in the onclick handler,
            # e.g. onclick="soumettreTab(..., '<code> - <name>')"; stripping
            # the function name leaves a parenthesized tuple that
            # ast.literal_eval can parse safely (no arbitrary code execution).
            series = list(ast.literal_eval(li.a.get("onclick").replace("soumettreTab", "")))
            # maxsplit=1 guards against dataset names that themselves contain
            # " - ": only the first separator splits code from name.
            code, name = series[-1].split(" - ", 1)
            category["children"].append({
                "code": code,
                "name": name
            })
        categories_tree.append(category)
    write_json_file(os.path.join(target_dir, "category_tree.json"), categories_tree)
    write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
    return categories_tree
def detect_unit_label(category_table):
# detect unit_label from title NOT USED
@@ -153,14 +120,15 @@ def define_label(label, sub_label, top_label):
return None, None, last_label
def build_series(source_dir, target_dir):
def build_series(source_dir, target_dir, category_tree):
'''
From datasets < dataset_code > .html to series
'''
countries = get_countries(source_dir)
categories = build_categories_tree(source_dir, target_dir)
datasets = itertools.chain.from_iterable([category["children"] for category in categories])
datasets = itertools.chain.from_iterable([category["children"] for category in category_tree])
datasets_dict = {n["code"]: n["name"] for n in datasets}
for n in datasets_dict.items():
print(n)
for f in os.listdir(source_dir):
if f.endswith(".html") and f != "index.html":
dataset_file = os.path.join(source_dir, f)
@@ -245,7 +213,13 @@ def main():
source_dir = os.path.abspath(args["<source_dir>"])
target_dir = os.path.abspath(args["<target_dir>"])
assert os.path.exists(source_dir), source_dir
build_series(source_dir, target_dir)
write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
category_tree_filename_in = os.path.join(source_dir, "category_tree.json")
with open(category_tree_filename_in) as f:
category_tree = json.load(f)
category_tree_filename_out = os.path.join(source_dir, "category_tree.json")
write_json_file(category_tree_filename_out, category_tree)
build_series(source_dir, target_dir, category_tree)
if __name__ == "__main__":