Skip to content
Snippets Groups Projects

Fixing BCEAO

Closed Michel Juillard requested to merge MichelJuillard/bceao-fetcher:master into master
2 files changed: +83 −64
Compare changes
  • Side-by-side
  • Inline
Files changed: 2 (+11 −37 in this file)
@@ -83,39 +83,6 @@ def get_countries(source_dir):
return countries
def build_categories_tree(source_dir, target_dir):
    '''Build the categories > datasets tree from index.html.

    Parses ``<source_dir>/index.html`` with BeautifulSoup: each
    ``<ul class="extend">`` element becomes one category, and each ``<li>``
    inside it becomes one dataset child.  Writes ``category_tree.json`` and
    ``provider.json`` into *target_dir* as a side effect.

    :param source_dir: directory containing the scraped index.html
    :param target_dir: directory where the JSON output files are written
    :return: list of category dicts, each with "name", "code", "children"
    '''
    with open(os.path.join(source_dir, "index.html"), "r", encoding='utf-8') as f:
        soup = bs(f.read(), "lxml")
    categories_tree = []
    for ul in soup.find_all("ul", class_="extend"):
        # clean_special_char fixes broken unicode chars in the raw label;
        # compute it once and reuse it for both the name and the slug code
        # (the original recomputed it three times and left cat_name unused).
        cat_name = clean_special_char(ul.parent.a.text)
        category = {
            "name": cat_name,
            "code": slugify.slugify(cat_name),
            "children": [],
        }
        for li in ul.find_all("li"):
            # The dataset code/name pair is embedded in the onclick handler,
            # e.g. onclick="soumettreTab(..., '<code> - <name>')"; stripping
            # the function name leaves a parenthesized tuple that
            # ast.literal_eval can parse safely (no arbitrary code execution).
            series = list(ast.literal_eval(li.a.get("onclick").replace("soumettreTab", "")))
            # maxsplit=1 guards against dataset names that themselves contain
            # " - ": only the first separator splits code from name.
            code, name = series[-1].split(" - ", 1)
            category["children"].append({
                "code": code,
                "name": name
            })
        categories_tree.append(category)
    write_json_file(os.path.join(target_dir, "category_tree.json"), categories_tree)
    write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
    return categories_tree
def detect_unit_label(category_table):
# detect unit_label from title NOT USED
@@ -153,14 +120,15 @@ def define_label(label, sub_label, top_label):
return None, None, last_label
def build_series(source_dir, target_dir):
def build_series(source_dir, target_dir, category_tree):
'''
From datasets < dataset_code > .html to series
'''
countries = get_countries(source_dir)
categories = build_categories_tree(source_dir, target_dir)
datasets = itertools.chain.from_iterable([category["children"] for category in categories])
datasets = itertools.chain.from_iterable([category["children"] for category in category_tree])
datasets_dict = {n["code"]: n["name"] for n in datasets}
for n in datasets_dict.items():
print(n)
for f in os.listdir(source_dir):
if f.endswith(".html") and f != "index.html":
dataset_file = os.path.join(source_dir, f)
@@ -245,7 +213,13 @@ def main():
source_dir = os.path.abspath(args["<source_dir>"])
target_dir = os.path.abspath(args["<target_dir>"])
assert os.path.exists(source_dir), source_dir
build_series(source_dir, target_dir)
write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
category_tree_filename_in = os.path.join(source_dir, "category_tree.json")
with open(category_tree_filename_in) as f:
category_tree = json.load(f)
category_tree_filename_out = os.path.join(source_dir, "category_tree.json")
write_json_file(category_tree_filename_out, category_tree)
build_series(source_dir, target_dir, category_tree)
if __name__ == "__main__":