
Implement download script and more

Closed Christophe Benz requested to merge dev into init
Files: 7 · +203 −0
#!/usr/bin/env python3
# coding: utf-8
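"""Build tree.json describing the DARES "statistiques de A à Z" article pages.

For each entry in PAGES, fetch the article page, collect its xls/xlsx
attachments, and write the resulting category tree to ./tree.json.
"""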
import json
import os
import requests
from bs4 import BeautifulSoup as bs
from slugify import slugify
ROOT_PROVIDER_URL = "http://dares.travail-emploi.gouv.fr/"
ENTRY_POINT_URL = os.path.join(ROOT_PROVIDER_URL, "dares-etudes-et-statistiques/statistiques-de-a-a-z/article/")
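
# Each PAGES entry pairs a series name with the slug of its article page;
# "file_nb" is the number of xls/xlsx files attached to that page (pages
# with more than one file are split into sub-categories).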
PAGES = [
    {
        "name": "L’activité partielle",
        "page_slug": "l-activite-partielle",
        "file_nb": 1
    },
    {
        "name": "Les demandeurs d’emploi inscrits à Pôle emploi : données nationales",
        "page_slug": "les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales",
        "file_nb": 3
    },
    {
        "name": "Les demandeurs d’emploi inscrits à Pôle emploi : données régionales",
        "page_slug": "les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales",
        "file_nb": 3
    },
    {
        "name": "La durée collective hebdomadaire de travail",
        "page_slug": "la-duree-collective-hebdomadaire-de-travail",
        "file_nb": 2
    },
    {
        "name": "La durée individuelle du travail",
        "page_slug": "la-duree-individuelle-du-travail",
        "file_nb": 1
    },
    {
        "name": "L’emploi salarié",
        "page_slug": "l-emploi-salarie",
        "file_nb": 1
    },
    {
        "name": "Les emplois vacants",
        "page_slug": "les-emplois-vacants",
        "file_nb": 1
    },
    {
        "name": "L’emploi intérimaire",
        "page_slug": "l-emploi-interimaire",
        "file_nb": 3
    },
    {
        "name": "Les journées individuelles non travaillées (JINT)",
        "page_slug": "les-journees-individuelles-non-travaillees-jint",
        "file_nb": 1
    },
    {
        "name": "Les heures supplémentaires",
        "page_slug": "les-heures-supplementaires",
        "file_nb": 1
    },
    {
        "name": "Les heures supplémentaires rémunérées",
        "page_slug": "les-heures-supplementaires-remunerees",
        "file_nb": 1
    },
    {
        "name": "Les offres collectées et satisfaites par Pôle emploi",
        "page_slug": "les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles",
        "file_nb": 1
    },
    {
        "name": "La participation, l’intéressement et l’épargne salariale",
        "page_slug": "la-participation-l-interessement-et-l-epargne-salariale",
        "file_nb": 1
    },
    {
        "name": "Les dispositifs publics d’accompagnement des restructurations",
        "page_slug": "les-dispositifs-publics-d-accompagnement-des-restructurations",
        "file_nb": 2
    },
    {
        "name": "Les ruptures conventionnelles",
        "page_slug": "les-ruptures-conventionnelles",
        "file_nb": 2
    },
    {
        "name": "Les indices de salaire de base",
        "page_slug": "les-indices-de-salaire-de-base",
        "file_nb": 2
    },
    {
        "name": "Les sortants des listes de demandeurs d’emploi inscrits à Pôle emploi",
        "page_slug": "donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a",
        "file_nb": 2
    },
    {
        "name": "Le temps partiel",
        "page_slug": "le-temps-partiel",
        "file_nb": 1
    },
    {
        "name": "Les tensions sur le marché du travail par métier",
        "page_slug": "les-tensions-sur-le-marche-du-travail-par-metier",
        "file_nb": 2
    }
]
def build_root_categorie(i, cat):
    '''Build a root category (code, slug, doc page URL) from a PAGES entry.'''
    root_cat = {
        "code": i + 1,
        "slug": cat["page_slug"],
        "doc_href": os.path.join(ENTRY_POINT_URL, cat["page_slug"])
    }
    if i + 1 == 19:
        # The last page is kept as a flat dataset list even though it has
        # several attached files.
        root_cat["datasets"] = build_datasets(root_cat["doc_href"])
    elif cat["file_nb"] > 1:
        root_cat["sub-categories"] = build_categories(root_cat["doc_href"], root_cat["code"])
    else:
        root_cat["datasets"] = build_datasets(root_cat["doc_href"])
    return root_cat
def build_datasets(doc_href):
    '''Load a category page and return the file names of its xls/xlsx attachments.'''
    try:
        resp = requests.get(doc_href)
        assert resp.status_code in range(200, 399), "requests response.status_code == %s" % resp.status_code
    except (requests.RequestException, AssertionError):
        # Mark the page as unreachable instead of aborting the whole run.
        return [{"doc_href": doc_href, "status": False}]
    soup = bs(resp.text, "lxml")
    sidebar = soup.find("aside")
    doc_list = sidebar.find("ul", {"class": "docs-joints__liste"})
    documents = [doc for doc in doc_list.findAll("li") if doc.span.text in ["xls", "xlsx"]]
    f_docs = [doc.find("a").get("href").split("/")[-1] for doc in documents]
    return f_docs
def build_categories(doc_href, code):
    '''Load a category page and build one sub-category per attached xls/xlsx file.

    Sub-category slugs are derived from the slugified title of each document.
    '''
    try:
        resp = requests.get(doc_href)
        assert resp.status_code in range(200, 399), "requests response.status_code == %s" % resp.status_code
    except (requests.RequestException, AssertionError):
        # Mark the page as unreachable instead of aborting the whole run.
        return [{"doc_href": doc_href, "status": False}]
    soup = bs(resp.text, "lxml")
    doc_list = soup.find("ul", {"class": "docs-joints__liste"})
    categories = []
    ii = 0
    for doc in doc_list.find_all("li"):
        if doc.span.text in ["xls", "xlsx"]:
            ii = ii + 1
            sub_code = "%i.%i" % (int(code), ii)
            doc_file = doc.find("a").get("href").split("/")[-1]
            title = doc.findAll("span")[1].text
            slug = slugify(title)
            categories.append({
                "code": sub_code,
                "slug": slug,
                "datasets": [doc_file]
            })
    return categories
def build_file_tree():
    '''Alternative output: wrap the category tree in a top-level "CATEGORIES" key.'''
    data = {"CATEGORIES": [build_root_categorie(i, cat) for i, cat in enumerate(PAGES)]}
    with open("./tree.json", "w") as f:
        pdata = json.dumps(data, sort_keys=True, indent=4, ensure_ascii=False)
        f.write(pdata)
def datasets_tree():
    '''Build the category tree for every PAGES entry and write it to ./tree.json.'''
    root = [build_root_categorie(i, cat) for i, cat in enumerate(PAGES)]
    with open("./tree.json", "w") as f:
        jdata = json.dumps(root, sort_keys=True,
                           indent=4, separators=(',', ': '), ensure_ascii=False)
        f.write(jdata)
    print("New Tree available ./tree.json")
if __name__ == "__main__":
    datasets_tree()