Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
dares-fetcher
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dbnomics-fetchers
dares-fetcher
Commits
1e133b6e
Commit
1e133b6e
authored
7 years ago
by
Constance de Quatrebarbes
Browse files
Options
Downloads
Patches
Plain Diff
FIX nb_files >> acceptance tests see issue #37
parent
3092ea68
No related branches found
Branches containing commit
No related tags found
1 merge request
!1
Implement download script and more
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
dares_settings.py
+136
-0
136 additions, 0 deletions
dares_settings.py
dares_to_source_data.py
+7
-3
7 additions, 3 deletions
dares_to_source_data.py
with
143 additions
and
3 deletions
dares_settings.py
0 → 100644
+
136
−
0
View file @
1e133b6e
#!/usr/bin/env python3
'''
DARES SETTINGS
define some CONSTANTS that are commons to:
- dares_to_source_data.py
- dares_to_dbnomics.py
and make some Assertion test on import as acceptance tests for the fetcher
'''
import os
from urllib.parse import urljoin
### GIT REPOSITORIES
# SSH remote of the source-data repository (imported by dares_to_source_data.py).
SOURCE_REPOSITORY_URL = (
    "git@git.nomics.world:dbnomics-source-data/dares-source-data.git"
)
# SSH remote of the json-data repository.
TARGET_REPOSITORY_URL = (
    "git@git.nomics.world:dbnomics-json-data/dares-json-data.git"
)
### PROVIDER
# Root of the provider website. The trailing slash is significant: relative
# paths are resolved against it below.
PROVIDER_URL = "http://dares.travail-emploi.gouv.fr/"

# Base URL of the article pages; the fetcher appends each category's
# "page_slug" to it.
# Built with urljoin instead of os.path.join: os.path.join is meant for
# filesystem paths and uses the platform separator, so it is not a safe way
# to assemble URLs.
SOURCE_FILES_URL = urljoin(
    PROVIDER_URL,
    "dares-etudes-et-statistiques/statistiques-de-a-a-z/article/",
)

# Provider metadata record.
PROVIDER = {
    "long_name": "Direction de l'Animation de la Recherche des Etudes et des Statistiques",
    "name": "DARES",
    "region": "France",
    "slug": "dares",
    "terms_of_use": "http://dares.travail-emploi.gouv.fr/dares-etudes-et-statistiques/article/mentions-legales",
    "website": PROVIDER_URL,
}
### CATEGORIES and FILES
#### REQUIRED FILES AND CATs
# Expected totals, checked against TOP_CATEGORIES by the import-time
# acceptance assertions of this module.
TOP_CATEGORIES_NB = 19
SOURCE_FILES_NB = 32

#### TOP CATEGORIES WITH FILE_NB and PAGE_SLUG
# One entry per top category:
#   - "name":      display name of the category
#   - "page_slug": last path segment of the category page (appended to
#                  SOURCE_FILES_URL by the fetcher)
#   - "file_nb":   number of source files required for this category
#
# NOTE(review): the third and fourth entries previously carried names shifted
# by one (a duplicated "données nationales" name on the "-regionales" slug,
# and the job-seekers regional name on the "la-duree-collective-hebdomadaire"
# slug). The names below are realigned with their slugs; confirm the exact
# wording of "La durée collective hebdomadaire" against the provider page.
TOP_CATEGORIES = [
    {
        "name": "L’activité partielle",
        "page_slug": "l-activite-partielle",
        "file_nb": 1,
    },
    {
        "name": "Les demandeurs d’emploi inscrits à Pôle emploi : données nationales",
        "page_slug": "les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales",
        "file_nb": 3,
    },
    {
        "name": "Les demandeurs d’emploi inscrits à Pôle emploi : données régionales, départementales et par zone d’emploi",
        "page_slug": "les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales",
        "file_nb": 3,
    },
    {
        "name": "La durée collective hebdomadaire",
        "page_slug": "la-duree-collective-hebdomadaire",
        "file_nb": 2,
    },
    {
        "name": "La durée individuelle du travail",
        "page_slug": "la-duree-individuelle-du-travail",
        "file_nb": 1,
    },
    {
        "name": "L’emploi salarié",
        "page_slug": "l-emploi-salarie",
        "file_nb": 1,
    },
    {
        "name": "Les emplois vacants",
        "page_slug": "les-emplois-vacants",
        "file_nb": 1,
    },
    {
        "name": "L’emploi intérimaire",
        "page_slug": "l-emploi-interimaire",
        "file_nb": 3,
    },
    {
        "name": "Les journées individuelles non travaillées (JINT)",
        "page_slug": "les-journees-individuelles-non-travaillees-jint",
        "file_nb": 1,
    },
    {
        "name": "Les heures supplémentaires",
        "page_slug": "les-heures-supplementaires",
        "file_nb": 1,
    },
    {
        "name": "Les heures supplémentaires rémunérées",
        "page_slug": "les-heures-supplementaires-remunerees",
        "file_nb": 1,
    },
    {
        "name": "Les offres collectées et satisfaites par Pôle emploi",
        "page_slug": "les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles",
        "file_nb": 1,
    },
    {
        "name": "La participation, l’intéressement et l’épargne salariale",
        "page_slug": "la-participation-l-interessement-et-l-epargne-salariale",
        "file_nb": 1,
    },
    {
        "name": "Les dispositifs publics d’accompagnement des restructurations",
        "page_slug": "les-dispositifs-publics-d-accompagnement-des-restructurations",
        "file_nb": 2,
    },
    {
        "name": "Les ruptures conventionnelles",
        "page_slug": "les-ruptures-conventionnelles",
        "file_nb": 2,
    },
    {
        "name": "Les indices de salaire de base",
        "page_slug": "les-indices-de-salaire-de-base",
        "file_nb": 3,
    },
    {
        "name": "Les sortants des listes de demandeurs d’emploi inscrits à Pôle emploi",
        "page_slug": "donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a",
        "file_nb": 2,
    },
    {
        "name": "Le temps partiel",
        "page_slug": "le-temps-partiel",
        "file_nb": 1,
    },
    {
        "name": "Les tensions sur le marché du travail par métier",
        "page_slug": "les-tensions-sur-le-marche-du-travail-par-metier",
        "file_nb": 2,
    },
]
### Acceptance TESTS
# Evaluated when the module is imported: the category list must match the
# declared totals, otherwise the import fails with an AssertionError.
assert len(TOP_CATEGORIES) == TOP_CATEGORIES_NB, (
    "Wrong number of top categories required"
)
# Sum of the per-category "file_nb" values must equal the declared file total.
assert SOURCE_FILES_NB == sum(
    category["file_nb"] for category in TOP_CATEGORIES
), "Wrong number of source files required"
This diff is collapsed.
Click to expand it.
dares_to_source_data.py
+
7
−
3
View file @
1e133b6e
...
...
@@ -37,13 +37,17 @@ from bs4 import BeautifulSoup as bs
from
docopt
import
docopt
from
dares_settings
import
SOURCE_REPOSITORY_URL
from
dares_settings
import
SOURCE_FILES_URL
,
PROVIDER_URL
from
dares_settings
import
TOP_CATEGORIES
def write_source_html(html_file_path, data):
    """Save a fetched HTML page to disk as source data.

    The file is opened in binary write mode, so ``data`` must be a
    bytes-like object and any existing file at ``html_file_path`` is
    overwritten.
    """
    with open(html_file_path, mode="wb") as html_file:
        html_file.write(data)
def
fetch
(
dataset
):
url
=
os
.
path
.
join
(
ENTRY_POINT
_URL
,
dataset
[
"
page_slug
"
])
url
=
os
.
path
.
join
(
SOURCE_FILES
_URL
,
dataset
[
"
page_slug
"
])
resp
=
requests
.
get
(
url
)
assert
resp
is
not
None
,
"
requests.get() failed with url entrypoint %s
"
%
url
assert
resp
.
status_code
in
range
(
200
,
399
),
"
requests response.status_code == %s
"
%
resp
.
status_code
...
...
@@ -56,7 +60,7 @@ def fetch(dataset):
ext
,
title
=
[
n
.
text
for
n
in
doc
.
findAll
(
"
span
"
)[
0
:
2
]]
if
ext
in
[
"
xls
"
,
"
xlsx
"
,
"
xlsm
"
]:
raw_url
=
doc
.
find
(
'
a
'
,
{
"
class
"
:
ext
}).
get
(
"
href
"
)
f_url
=
os
.
path
.
join
(
ROOT_
PROVIDER_URL
,
raw_url
)
f_url
=
os
.
path
.
join
(
PROVIDER_URL
,
raw_url
)
f_name
=
f_url
.
split
(
"
/
"
)[
-
1
]
target_files
.
append
({
"
f_name
"
:
f_name
,
...
...
@@ -98,7 +102,7 @@ def main():
# else:
# #reset the repo by changing remote url of the git and push ?
# pass
for
ds
in
T
ARGET_EXCEL_FIL
ES
:
for
ds
in
T
OP_CATEGORI
ES
:
excel_data
=
fetch
(
ds
)
for
file
in
excel_data
:
response
=
requests
.
get
(
file
[
"
f_url
"
])
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment