Skip to content
Snippets Groups Projects
Commit 1e133b6e authored by Constance de Quatrebarbes's avatar Constance de Quatrebarbes
Browse files

FIX nb_files >> acceptance tests see issue #37

parent 3092ea68
No related branches found
No related tags found
1 merge request: !1 "Implement download script and more"
#!/usr/bin/env python3
'''DARES settings.

Define the constants that are common to:
- dares_to_source_data.py
- dares_to_dbnomics.py
and run some assertions on import as acceptance tests for the fetcher.
'''
import os
from urllib.parse import urljoin

### GIT REPOSITORIES
SOURCE_REPOSITORY_URL = 'git@git.nomics.world:dbnomics-source-data/dares-source-data.git'
TARGET_REPOSITORY_URL = 'git@git.nomics.world:dbnomics-json-data/dares-json-data.git'

### PROVIDER
# Root URL of the provider website; the trailing slash is required so that
# relative paths are appended to it rather than replacing a path segment.
PROVIDER_URL = "http://dares.travail-emploi.gouv.fr/"
# URLs are joined with urljoin, not os.path.join: os.path.join uses the
# platform path separator, which is a backslash on Windows.
SOURCE_FILES_URL = urljoin(
    PROVIDER_URL,
    "dares-etudes-et-statistiques/statistiques-de-a-a-z/article/",
)
# Provider metadata.
PROVIDER = {
    "long_name": "Direction de l'Animation de la Recherche des Etudes et des Statistiques",
    "name": "DARES",
    "region": "France",
    "slug": "dares",
    "terms_of_use": "http://dares.travail-emploi.gouv.fr/dares-etudes-et-statistiques/article/mentions-legales",
    "website": PROVIDER_URL,
}
### CATEGORIES and FILES
#### REQUIRED FILES AND CATs
# Expected totals, checked against TOP_CATEGORIES by the acceptance tests.
TOP_CATEGORIES_NB = 19
SOURCE_FILES_NB = 32

#### TOP CATEGORIES WITH FILE_NB and PAGE_SLUG
# One entry per provider page:
# - "name": category title
# - "page_slug": last URL segment of the category page
# - "file_nb": number of source files expected for that category
TOP_CATEGORIES = [
    {
        "name": "L’activité partielle",
        "page_slug": "l-activite-partielle",
        "file_nb": 1,
    },
    {
        "name": "Les demandeurs d’emploi inscrits à Pôle emploi : données nationales",
        "page_slug": "les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales",
        "file_nb": 3,
    },
    {
        # The name previously duplicated the "données nationales" entry above
        # although the slug points to the regional series.
        "name": "Les demandeurs d’emploi inscrits à Pôle emploi : données régionales, départementales et par zone d’emploi",
        "page_slug": "les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales",
        "file_nb": 3,
    },
    {
        # The name previously repeated the regional job-seekers title, which
        # does not match this slug.
        # NOTE(review): name derived from the slug -- confirm the exact title
        # against the provider page.
        "name": "La durée collective hebdomadaire",
        "page_slug": "la-duree-collective-hebdomadaire",
        "file_nb": 2,
    },
    {
        "name": "La durée individuelle du travail",
        "page_slug": "la-duree-individuelle-du-travail",
        "file_nb": 1,
    },
    {
        "name": "L’emploi salarié",
        "page_slug": "l-emploi-salarie",
        "file_nb": 1,
    },
    {
        "name": "Les emplois vacants",
        "page_slug": "les-emplois-vacants",
        "file_nb": 1,
    },
    {
        "name": "L’emploi intérimaire",
        "page_slug": "l-emploi-interimaire",
        "file_nb": 3,
    },
    {
        "name": "Les journées individuelles non travaillées (JINT)",
        "page_slug": "les-journees-individuelles-non-travaillees-jint",
        "file_nb": 1,
    },
    {
        "name": "Les heures supplémentaires",
        "page_slug": "les-heures-supplementaires",
        "file_nb": 1,
    },
    {
        "name": "Les heures supplémentaires rémunérées",
        "page_slug": "les-heures-supplementaires-remunerees",
        "file_nb": 1,
    },
    {
        "name": "Les offres collectées et satisfaites par Pôle emploi",
        "page_slug": "les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles",
        "file_nb": 1,
    },
    {
        "name": "La participation, l’intéressement et l’épargne salariale",
        "page_slug": "la-participation-l-interessement-et-l-epargne-salariale",
        "file_nb": 1,
    },
    {
        "name": "Les dispositifs publics d’accompagnement des restructurations",
        "page_slug": "les-dispositifs-publics-d-accompagnement-des-restructurations",
        "file_nb": 2,
    },
    {
        "name": "Les ruptures conventionnelles",
        "page_slug": "les-ruptures-conventionnelles",
        "file_nb": 2,
    },
    {
        "name": "Les indices de salaire de base",
        "page_slug": "les-indices-de-salaire-de-base",
        "file_nb": 3,
    },
    {
        "name": "Les sortants des listes de demandeurs d’emploi inscrits à Pôle emploi",
        "page_slug": "donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a",
        "file_nb": 2,
    },
    {
        "name": "Le temps partiel",
        "page_slug": "le-temps-partiel",
        "file_nb": 1,
    },
    {
        "name": "Les tensions sur le marché du travail par métier",
        "page_slug": "les-tensions-sur-le-marche-du-travail-par-metier",
        "file_nb": 2,
    },
]
### Acceptance TESTS
# Run on import. Explicit raises replace the `assert` statements so the checks
# still run under `python -O`, which strips asserts. AssertionError and the
# messages are kept so the failure seen by callers is unchanged.
if len(TOP_CATEGORIES) != TOP_CATEGORIES_NB:
    raise AssertionError("Wrong number of top categories required")
if sum(top_cat["file_nb"] for top_cat in TOP_CATEGORIES) != SOURCE_FILES_NB:
    raise AssertionError("Wrong number of source files required")
......@@ -37,13 +37,17 @@ from bs4 import BeautifulSoup as bs
from docopt import docopt
from dares_settings import SOURCE_REPOSITORY_URL
from dares_settings import SOURCE_FILES_URL, PROVIDER_URL
from dares_settings import TOP_CATEGORIES
def write_source_html(html_file_path, data):
    '''Write data to html_file_path, replacing any existing file.

    The file is opened in binary mode, so data must be bytes-like.
    '''
    with open(html_file_path, "wb") as html_file:
        html_file.write(data)
def fetch(dataset):
url = os.path.join(ENTRY_POINT_URL, dataset["page_slug"])
url = os.path.join(SOURCE_FILES_URL, dataset["page_slug"])
resp = requests.get(url)
assert resp is not None, "requests.get() failed with url entrypoint %s" %url
assert resp.status_code in range(200, 399), "requests response.status_code == %s" %resp.status_code
......@@ -56,7 +60,7 @@ def fetch(dataset):
ext, title = [n.text for n in doc.findAll("span")[0:2]]
if ext in ["xls", "xlsx", "xlsm"]:
raw_url = doc.find('a', {"class": ext}).get("href")
f_url = os.path.join(ROOT_PROVIDER_URL, raw_url)
f_url = os.path.join(PROVIDER_URL, raw_url)
f_name = f_url.split("/")[-1]
target_files.append({
"f_name": f_name,
......@@ -98,7 +102,7 @@ def main():
# else:
# #reset the repo by changing remote url of the git and push ?
# pass
for ds in TARGET_EXCEL_FILES:
for ds in TOP_CATEGORIES:
excel_data = fetch(ds)
for file in excel_data:
response = requests.get(file["f_url"])
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment