Skip to content
Snippets Groups Projects
Commit aa149099 authored by Constance de Quatrebarbes's avatar Constance de Quatrebarbes
Browse files

CHANGE FILENAME pattern

parent 632f97a4
No related branches found
No related tags found
1 merge request: !1 Implement download script and more
......@@ -28,63 +28,194 @@ Usage:
"""
import sys
import os
import subprocess
import sys
import requests
from docopt import docopt
from bs4 import BeautifulSoup as bs
from docopt import docopt
# SSH remote of the git repository that receives the downloaded source data.
TARGET_REPOSITORY_URL = 'git@git.nomics.world:dbnomics-source-data/dares-source-data.git'
# main() still passes DATA_REPOSITORY_URL to `git clone`; keep that name bound
# (as an alias of the renamed constant) so the clone step does not raise NameError.
DATA_REPOSITORY_URL = TARGET_REPOSITORY_URL
# Base URL of the provider's website; fetch() joins relative file hrefs onto it.
ROOT_PROVIDER_URL = "http://dares.travail-emploi.gouv.fr/"
# Prefix of every article page; fetch() appends a dataset's "page_slug" to it.
ENTRY_POINT_URL = os.path.join(ROOT_PROVIDER_URL, "dares-etudes-et-statistiques/statistiques-de-a-a-z/article/")
# Article pages to scrape. Each entry pairs the page's URL slug with the number
# of xls/xlsx links fetch() must find on that page (it asserts on a mismatch).
TARGET_EXCEL_FILES = [
    {"page_slug": slug, "file_nb": expected_count}
    for slug, expected_count in (
        ('l-activite-partielle', 1),
        ('les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales', 3),
        ('les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales', 3),
        ('la-duree-collective-hebdomadaire', 2),
        ('la-duree-individuelle-du-travail', 1),
        ('l-emploi-salarie', 1),
        ('les-emplois-vacants', 1),
        ('l-emploi-interimaire', 3),
        ('les-journees-individuelles-non-travaillees-jint', 1),
        ('les-heures-supplementaires', 1),
        ('les-heures-supplementaires-remunerees', 1),
        ('les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles', 1),
        ("la-participation-l-interessement-et-l-epargne-salariale", 1),
        ("les-ruptures-conventionnelles", 2),
        ("les-indices-de-salaire-de-base", 2),
        ("donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a", 2),
        ("le-temps-partiel", 1),
        ("les-tensions-sur-le-marche-du-travail-par-metier", 2),
    )
]
# Base URL of the provider's website; fetch() joins relative file hrefs onto it.
ROOT_PROVIDER_URL = "http://dares.travail-emploi.gouv.fr/"
# Prefix of every article page; fetch() appends a dataset's "page_slug" to it.
ENTRY_POINT_URL = os.path.join(ROOT_PROVIDER_URL, "dares-etudes-et-statistiques/statistiques-de-a-a-z/article/")
# Article pages to scrape. For each entry:
#   "name"      - human-readable title of the page,
#   "page_slug" - last URL segment, appended to ENTRY_POINT_URL,
#   "file_nb"   - number of xls/xlsx links fetch() must find on the page
#                 (fetch() asserts on a mismatch).
TARGET_EXCEL_FILES = [
    {
        "name": "L’activité partielle",
        "page_slug": 'l-activite-partielle',
        "file_nb": 1,
    },
    {
        "name": "Les demandeurs d’emploi inscrits à Pôle emploi : données nationales",
        "page_slug": 'les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales',
        "file_nb": 3,
    },
    {
        "name": "Les demandeurs d’emploi inscrits à Pôle emploi : données régionales, départementales et par zone d’emploi",
        "page_slug": 'les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales',
        "file_nb": 3,
    },
    {
        # NOTE(review): title reconstructed from the slug; the previous value was
        # the regional job-seekers title, which belongs to the entry above.
        # Confirm the exact wording against the provider's page.
        "name": "La durée collective hebdomadaire",
        "page_slug": 'la-duree-collective-hebdomadaire',
        "file_nb": 2,
    },
    {
        "name": "La durée individuelle du travail",
        "page_slug": 'la-duree-individuelle-du-travail',
        "file_nb": 1,
    },
    {
        "name": "L’emploi salarié",
        "page_slug": 'l-emploi-salarie',
        "file_nb": 1,
    },
    {
        "name": "Les emplois vacants",
        "page_slug": 'les-emplois-vacants',
        "file_nb": 1,
    },
    {
        "name": "L’emploi intérimaire",
        "page_slug": 'l-emploi-interimaire',
        "file_nb": 3,
    },
    {
        "name": "Les journées individuelles non travaillées (JINT)",
        "page_slug": 'les-journees-individuelles-non-travaillees-jint',
        "file_nb": 1,
    },
    {
        "name": "Les heures supplémentaires",
        "page_slug": 'les-heures-supplementaires',
        "file_nb": 1,
    },
    {
        "name": "Les heures supplémentaires rémunérées",
        "page_slug": 'les-heures-supplementaires-remunerees',
        "file_nb": 1,
    },
    {
        "name": "Les offres collectées et satisfaites par Pôle emploi",
        "page_slug": 'les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles',
        "file_nb": 1,
    },
    {
        "name": "La participation, l’intéressement et l’épargne salariale",
        "page_slug": "la-participation-l-interessement-et-l-epargne-salariale",
        "file_nb": 1,
    },
    {
        "name": "Les dispositifs publics d’accompagnement des restructurations",
        "page_slug": "les-dispositifs-publics-d-accompagnement-des-restructurations",
        "file_nb": 2,
    },
    {
        "name": "Les ruptures conventionnelles",
        "page_slug": "les-ruptures-conventionnelles",
        "file_nb": 2,
    },
    {
        "name": "Les indices de salaire de base",
        "page_slug": "les-indices-de-salaire-de-base",
        "file_nb": 2,
    },
    {
        "name": "Les sortants des listes de demandeurs d’emploi inscrits à Pôle emploi",
        "page_slug": "donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a",
        "file_nb": 2,
    },
    {
        "name": "Le temps partiel",
        "page_slug": "le-temps-partiel",
        "file_nb": 1,
    },
    {
        "name": "Les tensions sur le marché du travail par métier",
        "page_slug": "les-tensions-sur-le-marche-du-travail-par-metier",
        "file_nb": 2,
    },
]
# Article pages to scrape: "page_slug" is the page's URL slug and "file_nb" the
# number of xls/xlsx links fetch() must find there (it asserts on a mismatch).
TARGET_EXCEL_FILES = [
    dict(page_slug='l-activite-partielle', file_nb=1),
    dict(page_slug='les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales', file_nb=3),
    dict(page_slug='les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales', file_nb=3),
    dict(page_slug='la-duree-collective-hebdomadaire', file_nb=2),
    dict(page_slug='la-duree-individuelle-du-travail', file_nb=1),
    dict(page_slug='l-emploi-salarie', file_nb=1),
    dict(page_slug='les-emplois-vacants', file_nb=1),
    # Original author's flag: "warning pdf" - presumably this page also lists
    # PDF documents next to the spreadsheets; confirm on the page.
    dict(page_slug='l-emploi-interimaire', file_nb=3),
    dict(page_slug='les-journees-individuelles-non-travaillees-jint', file_nb=1),
    dict(page_slug='les-heures-supplementaires', file_nb=1),
    dict(page_slug='les-heures-supplementaires-remunerees', file_nb=1),
    dict(page_slug='les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles', file_nb=1),
    dict(page_slug="la-participation-l-interessement-et-l-epargne-salariale", file_nb=1),
    dict(page_slug="les-ruptures-conventionnelles", file_nb=2),
    dict(page_slug="les-indices-de-salaire-de-base", file_nb=2),
    dict(page_slug="donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a", file_nb=2),
    dict(page_slug="le-temps-partiel", file_nb=1),
    dict(page_slug="les-tensions-sur-le-marche-du-travail-par-metier", file_nb=2),
]
def fetch(dataset):
url = os.path.join(ENTRY_POINT_URL, dataset["page_slug"])
......@@ -98,25 +229,28 @@ def fetch(dataset):
for doc in doc_list.findAll("li"):
ext, title = [n.text for n in doc.findAll("span")[0:2]]
if ext in ["xls", "xlsx"]:
raw_url = doc.find('a',{"class":ext}).get("href")
raw_url = doc.find('a', {"class": ext}).get("href")
f_url = os.path.join(ROOT_PROVIDER_URL, raw_url)
f_name = f_url.split("/")[-1]
target_files.append({
"f_name":f_name,
"f_url":f_url,
"f_name": f_name,
"f_url": f_url,
"f_title": title,
"f_ext": ext})
"f_ext": ext})
assert len(target_files) == dataset["file_nb"], \
"Fetcher Error: url %s should retrieve %i xls docs instead of %i" %(url, dataset["file_nb"], len(target_files))
"Fetcher Error: url %s should retrieve %i xls docs instead of %i"\
% (url, dataset["file_nb"], len(target_files))
return(target_files)
def is_git_repo(path):
    """Return True when git recognises *path* as being inside a repository.

    Runs ``git -C <path> rev-parse`` and checks its exit status: 0 means git
    found a repository at (or above) *path*; any other status, including the
    one git reports for a missing directory, yields False.

    The previous implementation compared the captured output of
    ``check_output`` with 0, which is never equal, and let
    ``CalledProcessError`` propagate for non-repositories.

    Raises:
        OSError: if the ``git`` executable itself cannot be started.
    """
    # The probe's output is irrelevant; silence it so it does not clutter the
    # script's own output.
    exit_code = subprocess.call(
        ["git", "-C", path, "rev-parse"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return exit_code == 0
def main():
args = docopt(__doc__.format(self_filename=os.path.basename(__file__)))
target_dir = os.path.abspath(args['<target_dir>'])
......@@ -139,13 +273,12 @@ def main():
subprocess.check_call(['git', 'clone', DATA_REPOSITORY_URL, target_dir])
# else:
# #reset the repo by changing remote url of the git and push ?
# pass
# pass
for ds in TARGET_EXCEL_FILES:
excel_data = fetch(ds)
for file in excel_data:
response = requests.get(file["f_url"])
excel_file_path = os.path.join(target_dir, file["f_name"])
excel_file_path = os.path.join(target_dir, file["f_title"], file["f_ext"])
with open(excel_file_path, "wb") as excel_file:
excel_file.write(response.content)
subprocess.check_call(
......@@ -164,6 +297,5 @@ def main():
cwd=target_dir,
)
if __name__ == '__main__':
sys.exit(main())
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment