Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
dares-fetcher
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dbnomics-fetchers
dares-fetcher
Commits
aa149099
Commit
aa149099
authored
7 years ago
by
Constance de Quatrebarbes
Browse files
Options
Downloads
Patches
Plain Diff
CHANGE FILENAME pattern
parent
632f97a4
No related branches found
Branches containing commit
No related tags found
1 merge request
!1
Implement download script and more
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
dares_to_source_data.py
+189
-57
189 additions, 57 deletions
dares_to_source_data.py
with
189 additions
and
57 deletions
dares_to_source_data.py
+
189
−
57
View file @
aa149099
...
...
@@ -28,63 +28,194 @@ Usage:
"""
import
sys
import
os
import
subprocess
import
sys
import
requests
from
docopt
import
docopt
from
bs4
import
BeautifulSoup
as
bs
from
docopt
import
docopt
# Git remote of the repository that receives the downloaded source data.
TARGET_REPOSITORY_URL = 'git@git.nomics.world:dbnomics-source-data/dares-source-data.git'
# main() still hands DATA_REPOSITORY_URL to `git clone`; keep that name bound
# (same URL as before) so the clone step does not fail with a NameError.
DATA_REPOSITORY_URL = TARGET_REPOSITORY_URL

# Root of the provider web site; relative download links are resolved against it.
ROOT_PROVIDER_URL = "http://dares.travail-emploi.gouv.fr/"
# Base URL of the article pages; a dataset's "page_slug" is appended to it.
# Built by plain concatenation: os.path.join uses the platform path separator,
# which would put a backslash into the URL on Windows.
ENTRY_POINT_URL = ROOT_PROVIDER_URL + "dares-etudes-et-statistiques/statistiques-de-a-a-z/article/"
TARGET_EXCEL_FILES
=
[
{
"
page_slug
"
:
'
l-activite-partielle
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales
'
,
"
file_nb
"
:
3
},
{
"
page_slug
"
:
'
les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales
'
,
"
file_nb
"
:
3
},
{
"
page_slug
"
:
'
la-duree-collective-hebdomadaire
'
,
"
file_nb
"
:
2
},
{
"
page_slug
"
:
'
la-duree-individuelle-du-travail
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
l-emploi-salarie
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-emplois-vacants
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
l-emploi-interimaire
'
,
"
file_nb
"
:
3
},
{
"
page_slug
"
:
'
les-journees-individuelles-non-travaillees-jint
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-heures-supplementaires
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-heures-supplementaires-remunerees
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
"
la-participation-l-interessement-et-l-epargne-salariale
"
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
"
les-ruptures-conventionnelles
"
,
"
file_nb
"
:
2
},
{
"
page_slug
"
:
"
les-indices-de-salaire-de-base
"
,
"
file_nb
"
:
2
},
{
"
page_slug
"
:
"
donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a
"
,
"
file_nb
"
:
2
},
{
"
page_slug
"
:
"
le-temps-partiel
"
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
"
les-tensions-sur-le-marche-du-travail-par-metier
"
,
"
file_nb
"
:
2
}
]
ROOT_PROVIDER_URL
=
"
http://dares.travail-emploi.gouv.fr/
"
ENTRY_POINT_URL
=
os
.
path
.
join
(
ROOT_PROVIDER_URL
,
"
dares-etudes-et-statistiques/statistiques-de-a-a-z/article/
"
)
# Datasets to download. For each entry:
#   "name"      - human-readable title of the statistics page
#   "page_slug" - last URL segment of the page, appended to ENTRY_POINT_URL by fetch()
#   "file_nb"   - number of xls/xlsx documents fetch() asserts it finds on that page
# The comma after the "name" value was missing in three entries, which made the
# whole literal a syntax error; the values themselves are unchanged.
TARGET_EXCEL_FILES = [
    {"name": "L’activité partielle",
     "page_slug": 'l-activite-partielle', "file_nb": 1},
    {"name": "Les demandeurs d’emploi inscrits à Pôle emploi : données nationales",
     "page_slug": 'les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales',
     "file_nb": 3},
    # NOTE(review): this name repeats the previous entry's ("nationales") while
    # the slug is the "regionales" page - looks shifted; confirm the title.
    {"name": "Les demandeurs d’emploi inscrits à Pôle emploi : données nationales",
     "page_slug": 'les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales',
     "file_nb": 3},
    # NOTE(review): this name describes the regional job-seeker series, but the
    # slug is the weekly collective working-time page - confirm the title.
    {"name": "Les demandeurs d’emploi inscrits à Pôle emploi : données régionales, départementales et par zone d’emploi",
     "page_slug": 'la-duree-collective-hebdomadaire', "file_nb": 2},
    {"name": "La durée individuelle du travail",
     "page_slug": 'la-duree-individuelle-du-travail', "file_nb": 1},
    {"name": "L’emploi salarié",
     "page_slug": 'l-emploi-salarie', "file_nb": 1},
    {"name": "Les emplois vacants",
     "page_slug": 'les-emplois-vacants', "file_nb": 1},
    {"name": "L’emploi intérimaire",
     "page_slug": 'l-emploi-interimaire', "file_nb": 3},
    {"name": "Les journées individuelles non travaillées (JINT)",
     "page_slug": 'les-journees-individuelles-non-travaillees-jint', "file_nb": 1},
    {"name": "Les heures supplémentaires",
     "page_slug": 'les-heures-supplementaires', "file_nb": 1},
    {"name": "Les heures supplémentaires rémunérées",
     "page_slug": 'les-heures-supplementaires-remunerees', "file_nb": 1},
    {"name": "Les offres collectées et satisfaites par Pôle emploi",
     "page_slug": 'les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles',
     "file_nb": 1},
    {"name": "La participation, l’intéressement et l’épargne salariale",
     "page_slug": "la-participation-l-interessement-et-l-epargne-salariale", "file_nb": 1},
    {"name": "Les dispositifs publics d’accompagnement des restructurations",
     "page_slug": "les-dispositifs-publics-d-accompagnement-des-restructurations", "file_nb": 2},
    {"name": "Les ruptures conventionnelles",
     "page_slug": "les-ruptures-conventionnelles", "file_nb": 2},
    {"name": "Les indices de salaire de base",
     "page_slug": "les-indices-de-salaire-de-base", "file_nb": 2},
    {"name": "Les sortants des listes de demandeurs d’emploi inscrits à Pôle emploi",
     "page_slug": "donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a",
     "file_nb": 2},
    {"name": "Le temps partiel",
     "page_slug": "le-temps-partiel", "file_nb": 1},
    {"name": "Les tensions sur le marché du travail par métier",
     "page_slug": "les-tensions-sur-le-marche-du-travail-par-metier", "file_nb": 2},
]
TARGET_EXCEL_FILES
=
[
{
"
page_slug
"
:
'
l-activite-partielle
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales
'
,
"
file_nb
"
:
3
},
{
"
page_slug
"
:
'
les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales
'
,
"
file_nb
"
:
3
},
{
"
page_slug
"
:
'
la-duree-collective-hebdomadaire
'
,
"
file_nb
"
:
2
},
{
"
page_slug
"
:
'
la-duree-individuelle-du-travail
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
l-emploi-salarie
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-emplois-vacants
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
l-emploi-interimaire
'
,
"
file_nb
"
:
3
},
#! warning pdf
{
"
page_slug
"
:
'
les-journees-individuelles-non-travaillees-jint
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-heures-supplementaires
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-heures-supplementaires-remunerees
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
'
les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles
'
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
"
la-participation-l-interessement-et-l-epargne-salariale
"
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
"
les-ruptures-conventionnelles
"
,
"
file_nb
"
:
2
},
{
"
page_slug
"
:
"
les-indices-de-salaire-de-base
"
,
"
file_nb
"
:
2
},
{
"
page_slug
"
:
"
donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a
"
,
"
file_nb
"
:
2
},
{
"
page_slug
"
:
"
le-temps-partiel
"
,
"
file_nb
"
:
1
},
{
"
page_slug
"
:
"
les-tensions-sur-le-marche-du-travail-par-metier
"
,
"
file_nb
"
:
2
},
]
def
fetch
(
dataset
):
url
=
os
.
path
.
join
(
ENTRY_POINT_URL
,
dataset
[
"
page_slug
"
])
...
...
@@ -98,25 +229,28 @@ def fetch(dataset):
for
doc
in
doc_list
.
findAll
(
"
li
"
):
ext
,
title
=
[
n
.
text
for
n
in
doc
.
findAll
(
"
span
"
)[
0
:
2
]]
if
ext
in
[
"
xls
"
,
"
xlsx
"
]:
raw_url
=
doc
.
find
(
'
a
'
,{
"
class
"
:
ext
}).
get
(
"
href
"
)
raw_url
=
doc
.
find
(
'
a
'
,
{
"
class
"
:
ext
}).
get
(
"
href
"
)
f_url
=
os
.
path
.
join
(
ROOT_PROVIDER_URL
,
raw_url
)
f_name
=
f_url
.
split
(
"
/
"
)[
-
1
]
target_files
.
append
({
"
f_name
"
:
f_name
,
"
f_url
"
:
f_url
,
"
f_name
"
:
f_name
,
"
f_url
"
:
f_url
,
"
f_title
"
:
title
,
"
f_ext
"
:
ext
})
"
f_ext
"
:
ext
})
assert
len
(
target_files
)
==
dataset
[
"
file_nb
"
],
\
"
Fetcher Error: url %s should retrieve %i xls docs instead of %i
"
%
(
url
,
dataset
[
"
file_nb
"
],
len
(
target_files
))
"
Fetcher Error: url %s should retrieve %i xls docs instead of %i
"
\
%
(
url
,
dataset
[
"
file_nb
"
],
len
(
target_files
))
return
(
target_files
)
def is_git_repo(path):
    """Tell whether ``git -C path rev-parse`` succeeds for *path*.

    The exit status of the command is what is tested. The previous version
    compared the *output* of ``subprocess.check_output`` (a bytes object) to
    ``0``, which is never equal, and ``check_output`` raised
    ``CalledProcessError`` whenever the command exited with a non-zero status.

    Args:
        path: directory to probe; passed to git through ``-C``.

    Returns:
        True if the command exits with status 0, False otherwise.

    Raises:
        OSError: if the ``git`` executable cannot be started.
    """
    exit_status = subprocess.call(
        ["git", "-C", path, "rev-parse"],
        # Only the exit status matters; keep git's messages off the console.
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return exit_status == 0
def
main
():
args
=
docopt
(
__doc__
.
format
(
self_filename
=
os
.
path
.
basename
(
__file__
)))
target_dir
=
os
.
path
.
abspath
(
args
[
'
<target_dir>
'
])
...
...
@@ -139,13 +273,12 @@ def main():
subprocess
.
check_call
([
'
git
'
,
'
clone
'
,
DATA_REPOSITORY_URL
,
target_dir
])
# else:
# #reset the repo by changing remote url of the git and push ?
# pass
# pass
for
ds
in
TARGET_EXCEL_FILES
:
excel_data
=
fetch
(
ds
)
for
file
in
excel_data
:
response
=
requests
.
get
(
file
[
"
f_url
"
])
excel_file_path
=
os
.
path
.
join
(
target_dir
,
file
[
"
f_
name
"
])
excel_file_path
=
os
.
path
.
join
(
target_dir
,
file
[
"
f_
title
"
],
file
[
"
f_ext
"
])
with
open
(
excel_file_path
,
"
wb
"
)
as
excel_file
:
excel_file
.
write
(
response
.
content
)
subprocess
.
check_call
(
...
...
@@ -164,6 +297,5 @@ def main():
cwd
=
target_dir
,
)
if
__name__
==
'
__main__
'
:
sys
.
exit
(
main
())
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment