Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Projects: dbnomics-fetchers/bceao-fetcher · MichelJuillard/bceao-fetcher
Commits on Source (5)
@@ -83,39 +83,6 @@ def get_countries(source_dir):
     return countries


-def build_categories_tree(source_dir, target_dir):
-    '''build categories > datasets from index.html'''
-    with open(os.path.join(source_dir, "index.html"), "r", encoding='utf-8') as f:
-        soup = bs(f.read(), "lxml")
-    categories_tree = []
-    for ul in soup.find_all("ul", class_="extend"):
-        cat_name = clean_special_char(ul.parent.a.text)
-        category = {
-            # ugly fix of broken unicode char
-            "name": clean_special_char(ul.parent.a.text),
-            "code": slugify.slugify(clean_special_char(ul.parent.a.text))
-        }
-        category["children"] = []
-        for li in ul.find_all("li"):
-            series = list(ast.literal_eval(li.a.get("onclick").replace("soumettreTab", "")))
-            code, name = series[-1].split(" - ")
-            category["children"].append({
-                "code": code,
-                "name": name
-            })
-            # for country_code, country_label in countries.items():
-            #     categories[cat].append({
-            #         "code": "_".join([code, country_code]),
-            #         "name": " - ".join([name, country_label])
-            #     })
-        categories_tree.append(category)
-    write_json_file(os.path.join(target_dir, "category_tree.json"), categories_tree)
-    write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
-    return categories_tree


 def detect_unit_label(category_table):
     # detect unit_label from title NOT USED
@@ -153,14 +120,15 @@ def define_label(label, sub_label, top_label):
     return None, None, last_label


-def build_series(source_dir, target_dir):
+def build_series(source_dir, target_dir, category_tree):
     '''
     From datasets < dataset_code > .html to series
     '''
-    countries = get_countries(source_dir)
-    categories = build_categories_tree(source_dir, target_dir)
-    datasets = itertools.chain.from_iterable([category["children"] for category in categories])
+    datasets = itertools.chain.from_iterable([category["children"] for category in category_tree])
     datasets_dict = {n["code"]: n["name"] for n in datasets}
+    for n in datasets_dict.items():
+        print(n)
     for f in os.listdir(source_dir):
         if f.endswith(".html") and f != "index.html":
             dataset_file = os.path.join(source_dir, f)
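In this hunk the category tree is now passed in rather than rebuilt, and the datasets_dict lookup simply flattens every category's children into one code → name mapping. A minimal sketch of that flattening on a made-up two-category tree (the tree contents here are invented for illustration):

import itertools

# Hypothetical category_tree, shaped like the JSON the download step writes.
category_tree = [
    {"code": "cat-a", "name": "Category A",
     "children": [{"code": "D1", "name": "D1 - Dataset one"}]},
    {"code": "cat-b", "name": "Category B",
     "children": [{"code": "D2", "name": "D2 - Dataset two"}]},
]

datasets = itertools.chain.from_iterable(c["children"] for c in category_tree)
datasets_dict = {d["code"]: d["name"] for d in datasets}
print(datasets_dict)  # {'D1': 'D1 - Dataset one', 'D2': 'D2 - Dataset two'}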
@@ -245,7 +213,13 @@ def main():
     source_dir = os.path.abspath(args["<source_dir>"])
     target_dir = os.path.abspath(args["<target_dir>"])
     assert os.path.exists(source_dir), source_dir
-    build_series(source_dir, target_dir)
+    write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
+    category_tree_filename_in = os.path.join(source_dir, "category_tree.json")
+    with open(category_tree_filename_in) as f:
+        category_tree = json.load(f)
+    category_tree_filename_out = os.path.join(source_dir, "category_tree.json")
+    write_json_file(category_tree_filename_out, category_tree)
+    build_series(source_dir, target_dir, category_tree)


 if __name__ == "__main__":
......
@@ -28,13 +28,35 @@ Usage:
 """

 import ast
 import json
+import logging
 import os
+import slugify
 import sys

 import requests
 from bs4 import BeautifulSoup as bs
 from docopt import docopt

+log = logging.getLogger(__name__)
+
+
+def clean_special_char(string):
+    '''ugly fix of encoding error of HTML source file
+    can be fixed replacing unrecognized char into é because 98,79% are é
+    two exceptions:
+    - Côte d'Ivoire
+    - impôts
+    '''
+    if "C�te" in string:
+        return string.replace("�", "ô")
+    elif "imp�ts" in string:
+        return string.replace("�", "ô")
+    else:
+        return string.replace("�", "é")
+
+
+def write_json_file(file_path, data):
+    with open(file_path, 'w', encoding='utf-8') as file_:
+        json.dump(data, file_, ensure_ascii=False, indent=2, sort_keys=True)
+
+
 def download_datasets(target_dir):
     '''download provider root url and build the datasets source page'''
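The clean_special_char helper added above maps the Unicode replacement character (U+FFFD, rendered "�") back to the accented letter the provider's labels originally carried, defaulting to "é" and special-casing the two known "ô" words. A small sketch of the intended behaviour, on hypothetical labels:

# "�" stands for U+FFFD as left by the provider's broken encoding; labels are invented.
clean_special_char("C�te d'Ivoire")            # -> "Côte d'Ivoire"
clean_special_char("recettes d'imp�ts")        # -> "recettes d'impôts"
clean_special_char("Cr�ances sur l'�conomie")  # -> "Créances sur l'économie"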
@@ -47,20 +69,27 @@ def download_datasets(target_dir):
     # 'User-Agent': 'DBNomics Downloader BOT see next.dbnomics.world',
     # }
     # download the main index 'index.html'
-    resp = requests.get(provider_root_url)
+    resp0 = requests.get(provider_root_url)
+    resp = requests.get(provider_root_url, cookies=resp0.cookies)
     resp.raise_for_status()
     with open(os.path.join(target_dir, "index.html"), "w") as _file:
         _file.write(resp.text)
     headers = resp.headers
     soup = bs(resp.text, "lxml")
-    categories = {}
+    categories = []
     for ul in soup.find_all("ul", class_="extend"):
-        cat = ul.parent.a.text.encode("utf-8").decode("utf-8", errors="ignore")
-        categories[cat] = []
+        # ugly fix of broken unicode char
+        cat_name = clean_special_char(ul.parent.a.text)
+        category = {
+            "name": cat_name,
+            "code": slugify.slugify(clean_special_char(ul.parent.a.text))
+        }
+        category["datasets"] = []
         for li in ul.find_all("li"):
             series = list(ast.literal_eval(li.a.get("onclick").replace("soumettreTab", "")))
-            categories[cat].append(series)
+            category["datasets"].append(series)
+        categories.append(category)
     # no need to have country names
     # encoding problems: broken utf-8 from website app: countries and categories é and ô same
     countries = {n.text.strip().encode('utf-8').decode("utf-8", errors="ignore"): n.find("input")
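The paired GETs in this hunk first hit the provider root to pick up its session cookies, then repeat the request with those cookies so the index page is served fully. A requests.Session sketch gets the same effect by carrying cookies automatically; this is an alternative idiom, not what the commit does, and the root URL value shown is an assumption (the script defines provider_root_url elsewhere):

import requests

provider_root_url = "https://edenpub.bceao.int/"  # assumed value; defined elsewhere in the script

with requests.Session() as session:
    session.get(provider_root_url)          # first hit primes the session cookies
    resp = session.get(provider_root_url)   # second hit is sent with those cookies
    resp.raise_for_status()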
@@ -98,31 +127,47 @@ def download_datasets(target_dir):
     country = {n[0]: n[1] for n in zip(countries.keys(), countries.keys())}
     post_data.update(country)
-    for cat, series_l in categories.items():
-        for series in series_l:
-            freq, id_tab, s_name_id = series
-            s_id, s_name = s_name_id.split(" - ")
+    categories_tree = []
+    for cat in categories:
+        category = {
+            "code": cat["code"],
+            "name": cat["name"]
+        }
+        category["children"] = []
+        for freq, id_tab, s_name_id in cat["datasets"]:
             post_data["idTab"] = id_tab
             data = [(k, v) for k, v in post_data.items()]
             # html_download
-            resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php",
-                                 data=data)
-            assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
-            assert resp.status_code in range(200, 399), \
-                "requests response.status_code == %s" % resp.status_code
-            with open(os.path.join(target_dir, s_id + ".html"), "w") as _f:
-                _f.write(resp.text)
-            # xls_download
-            post_data["export"] = ""
-            xport_data = [(k, v) for k, v in post_data.items() if k in ["params", "export", "idTab"]]
-            resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php",
-                                 data=data)
-            assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
-            assert resp.status_code in range(200, 399), \
-                "requests response.status_code == %s" % resp.status_code
-            with open(os.path.join(target_dir, s_id + ".xls"), "w") as _f:
-                _f.write(resp.text)
+            try:
+                resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php",
+                                     data=data)
+                assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
+                assert resp.status_code in range(200, 399), \
+                    "requests response.status_code == %s" % resp.status_code
+            except:
+                log.warning("Dataset {} is missing".format(s_name_id))
+            else:
+                soup = bs(resp.text, "lxml")
+                s_id = soup.find("h2").text.split(' - ')[0]
+                category["children"].append({
+                    "code": s_id,
+                    "name": s_name_id
+                })
+                with open(os.path.join(target_dir, s_id + ".html"), "w") as _f:
+                    _f.write(resp.text)
+                # xls_download
+                post_data["export"] = ""
+                xport_data = [(k, v) for k, v in post_data.items() if k in ["params", "export", "idTab"]]
+                resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php",
+                                     data=data)
+                assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
+                assert resp.status_code in range(200, 399), \
+                    "requests response.status_code == %s" % resp.status_code
+                with open(os.path.join(target_dir, s_id + ".xls"), "w") as _f:
+                    _f.write(resp.text)
+        categories_tree.append(category)
+    write_json_file(os.path.join(target_dir, "category_tree.json"), categories_tree)


 def main():
     args = docopt(__doc__)
......
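For orientation, the category_tree.json that download_datasets now writes (and that the convert step reads back in its main) nests datasets under categories; dataset names keep the "CODE - label" string scraped from the onclick handler, while codes come from each page's h2 title. A hypothetical excerpt of its shape, with invented codes and labels:

# Hypothetical shape of target_dir/category_tree.json:
[
    {
        "code": "some-category",
        "name": "Some category",
        "children": [
            {"code": "001TM", "name": "001TM - Some dataset label"}
        ]
    }
]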