dbnomics-fetchers / dares-fetcher · Merge request !1

Implement download script and more

Closed · Christophe Benz requested to merge dev into init · 7 years ago
53 commits · 0 pipelines · 7 changed files

Related to management#37 (closed)
Edited 7 years ago by Christophe Benz
7 files changed: +1005 −145
snippets/build_tree.py (new file, +203 −0)
#!/usr/bin/env python3
# coding: utf-8

import json
import os

import requests
from bs4 import BeautifulSoup as bs
from slugify import slugify

ROOT_PROVIDER_URL = "http://dares.travail-emploi.gouv.fr/"
ENTRY_POINT_URL = os.path.join(
    ROOT_PROVIDER_URL,
    "dares-etudes-et-statistiques/statistiques-de-a-a-z/article/",
)

PAGES = [
    {"name": "L’activité partielle",
     "page_slug": "l-activite-partielle", "file_nb": 1},
    {"name": "Les demandeurs d’emploi inscrits à Pôle emploi : données nationales",
     "page_slug": "les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-nationales",
     "file_nb": 3},
    {"name": "Les demandeurs d’emploi inscrits à Pôle emploi : données régionales",
     "page_slug": "les-demandeurs-d-emploi-inscrits-a-pole-emploi-les-series-mensuelles-regionales",
     "file_nb": 3},
    {"name": "La durée collective hebdomadaire de travail",
     "page_slug": "la-duree-collective-hebdomadaire-de-travail", "file_nb": 2},
    {"name": "La durée individuelle du travail",
     "page_slug": "la-duree-individuelle-du-travail", "file_nb": 1},
    {"name": "L’emploi salarié",
     "page_slug": "l-emploi-salarie", "file_nb": 1},
    {"name": "Les emplois vacants",
     "page_slug": "les-emplois-vacants", "file_nb": 1},
    {"name": "L’emploi intérimaire",
     "page_slug": "l-emploi-interimaire", "file_nb": 3},
    {"name": "Les journées individuelles non travaillées (JINT)",
     "page_slug": "les-journees-individuelles-non-travaillees-jint", "file_nb": 1},
    {"name": "Les heures supplémentaires",
     "page_slug": "les-heures-supplementaires", "file_nb": 1},
    {"name": "Les heures supplémentaires rémunérées",
     "page_slug": "les-heures-supplementaires-remunerees", "file_nb": 1},
    {"name": "Les offres collectées et satisfaites par Pôle emploi",
     "page_slug": "les-offres-collectees-et-satisfaites-par-pole-emploi-les-series-mensuelles",
     "file_nb": 1},
    {"name": "La participation, l’intéressement et l’épargne salariale",
     "page_slug": "la-participation-l-interessement-et-l-epargne-salariale", "file_nb": 1},
    {"name": "Les dispositifs publics d’accompagnement des restructurations",
     "page_slug": "les-dispositifs-publics-d-accompagnement-des-restructurations", "file_nb": 2},
    {"name": "Les ruptures conventionnelles",
     "page_slug": "les-ruptures-conventionnelles", "file_nb": 2},
    {"name": "Les indices de salaire de base",
     "page_slug": "les-indices-de-salaire-de-base", "file_nb": 2},
    {"name": "Les sortants des listes de demandeurs d’emploi inscrits à Pôle emploi",
     "page_slug": "donnees-statistiques-les-sortants-des-listes-de-demandeurs-d-emploi-inscrits-a",
     "file_nb": 2},
    {"name": "Le temps partiel",
     "page_slug": "le-temps-partiel", "file_nb": 1},
    {"name": "Les tensions sur le marché du travail par métier",
     "page_slug": "les-tensions-sur-le-marche-du-travail-par-metier", "file_nb": 2},
]


def build_root_categorie(i, cat):
    '''Build a root category (code, slug, doc_href) from a level-0 PAGES entry.'''
    # cat_dir = "%i%s" % (i+1, cat['page_slug'])
    root_cat = {
        "code": i + 1,
        "slug": cat["page_slug"],
        "doc_href": os.path.join(ENTRY_POINT_URL, cat["page_slug"]),
    }
    if i + 1 == 19:
        root_cat["datasets"] = build_datasets(root_cat["doc_href"])
    elif cat["file_nb"] > 1:
        root_cat["sub-categories"] = build_categories(root_cat["doc_href"], root_cat["code"])
    else:
        root_cat["datasets"] = build_datasets(root_cat["doc_href"])
    return root_cat


def build_datasets(doc_href):
    '''Load a category page and return the names of its attached Excel files
    (the raw files behind the datasets).'''
    try:
        # url = os.path.join(ENTRY_POINT_URL, cat["page_slug"])
        resp = requests.get(doc_href)
        # print(resp)
        assert resp.status_code in range(200, 400), \
            "requests response.status_code == %s" % resp.status_code
    except Exception:
        # The page could not be fetched; return no datasets for this category.
        return []
    soup = bs(resp.text, "lxml")
    sidebar = soup.find("aside")
    doc_list = sidebar.find("ul", {"class": "docs-joints__liste"})
    documents = [
        doc for doc in doc_list.find_all("li")
        if doc.span.text in ["xls", "xlsx"]
    ]
    f_docs = [doc.find("a").get("href").split("/")[-1] for doc in documents]
    # f_titles = [slugify(doc.find_all("span")[1].text) for doc in documents]
    return f_docs


def build_categories(doc_href, code):
    '''Load a category page and build its sub-categories and datasets.

    Sub-category slugs are derived from the slugified title of each document.
    '''
    try:
        resp = requests.get(doc_href)
        # print(resp)
        assert resp.status_code in range(200, 400), "%s" % doc_href
    except Exception:
        # The page could not be fetched; return no sub-categories for this category.
        return []
    soup = bs(resp.text, "lxml")
    doc_list = soup.find("ul", {"class": "docs-joints__liste"})
    categories = []
    ii = 0
    for doc in doc_list.find_all("li"):
        if doc.span.text in ["xls", "xlsx"]:
            ii = ii + 1
            sub_code = "%i.%i" % (int(code), ii)
            doc_file = doc.find("a").get("href").split("/")[-1]
            title = doc.find_all("span")[1].text
            dataset_name = ".".join(doc_file.split(".")[:-1])
            dataset_slug = slugify(dataset_name)
            slug = slugify(title)
            categories.append({
                "code": sub_code,
                "slug": slug,
                "datasets": [doc_file],
            })
    return categories


def build_file_tree():
    # NOTE: build_categorie_d and CATEGORIES are not defined in this module.
    data = {"CATEGORIES": [build_categorie_d(i + 1, c) for i, c in enumerate(CATEGORIES)]}
    with open("./tree.json", "w") as f:
        pdata = json.dumps(data, sort_keys=True, indent=4, ensure_ascii=False)
        f.write(pdata)


def datasets_tree():
    '''Build the category/dataset tree from PAGES and write it to ./tree.json.'''
    root = [build_root_categorie(i, cat) for i, cat in enumerate(PAGES)]
    with open("./tree.json", "w") as f:
        jdata = json.dumps(root, sort_keys=True, indent=4,
                           separators=(',', ': '), ensure_ascii=False)
        f.write(jdata)
    print("New Tree available ./tree.json")


if __name__ == "__main__":
    datasets_tree()
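
For reference, a minimal sketch of how the generated ./tree.json could be inspected once the script has run; the loop and output format below are illustrative, not part of the merge request, and rely only on the "code", "slug", "datasets", and "sub-categories" keys built above.

#!/usr/bin/env python3
# Illustrative only: read the tree written by snippets/build_tree.py and
# print each root category with the number of datasets or sub-categories.
import json

with open("./tree.json") as f:
    tree = json.load(f)

for cat in tree:
    children = cat.get("datasets") or cat.get("sub-categories") or []
    print("%s  %s  (%d item(s))" % (cat["code"], cat["slug"], len(children)))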