Michel Juillard / bceao-fetcher · Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Source: MichelJuillard/bceao-fetcher, branch master
Target: dbnomics-fetchers/bceao-fetcher, branch master
Commits on Source (5)

add second initial request with cookies to get the web site · e28184d6 (Michel Juillard authored 5 years ago)
find dataset code (s_id) another way · a8903ae9 (Michel Juillard authored 5 years ago)
moved creation category_tree to download.py · 3935d068 (Michel Juillard authored 5 years ago)
refactoring category_tree · 16d48830 (Michel Juillard authored 5 years ago)
moved building category tree to download.py · f96bf5af (Michel Juillard authored 5 years ago)
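The first commit listed (e28184d6) appears to work around the provider site only serving its index page once a session cookie has been obtained: a first GET is issued just to collect cookies, and those cookies are sent with the second GET that actually fetches index.html. A minimal sketch of that pattern as it shows up in the diff below; the value of provider_root_url is an assumption, and a requests.Session would be an equivalent alternative:

    import requests

    provider_root_url = "https://edenpub.bceao.int/"  # assumed root URL, not taken from the diff

    resp0 = requests.get(provider_root_url)                        # first request: only to obtain session cookies
    resp = requests.get(provider_root_url, cookies=resp0.cookies)  # second request: carries those cookies
    resp.raise_for_status()                                        # fail loudly if the site still refuses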
Showing 2 changed files with 83 additions and 64 deletions:

convert.py   +11 -37 (11 additions, 37 deletions)
download.py  +72 -27 (72 additions, 27 deletions)
convert.py · View file @ f96bf5af
@@ -83,39 +83,6 @@ def get_countries(source_dir):
     return countries
 
-def build_categories_tree(source_dir, target_dir):
-    '''
-    build categories > datasets from index.html
-    '''
-    with open(os.path.join(source_dir, "index.html"), "r", encoding='utf-8') as f:
-        soup = bs(f.read(), "lxml")
-    categories_tree = []
-    for ul in soup.find_all("ul", class_="extend"):
-        cat_name = clean_special_char(ul.parent.a.text)
-        category = {
-            # ugly fix of broken unicode char
-            "name": clean_special_char(ul.parent.a.text),
-            "code": slugify.slugify(clean_special_char(ul.parent.a.text))
-        }
-        category["children"] = []
-        for li in ul.find_all("li"):
-            series = list(ast.literal_eval(li.a.get("onclick").replace("soumettreTab", "")))
-            code, name = series[-1].split("-")
-            category["children"].append({
-                "code": code,
-                "name": name
-            })
-            # for country_code, country_label in countries.items():
-            #     categories[cat].append({
-            #         "code": "_".join([code, country_code]),
-            #         "name": " - ".join([name, country_label])
-            #     })
-        categories_tree.append(category)
-    write_json_file(os.path.join(target_dir, "category_tree.json"), categories_tree)
-    write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
-    return categories_tree
 
 def detect_unit_label(category_table):
     # detect unit_label from title NOT USED

@@ -153,14 +120,15 @@ def define_label(label, sub_label, top_label):
     return None, None, last_label
 
-def build_series(source_dir, target_dir):
+def build_series(source_dir, target_dir, category_tree):
     '''
    From datasets <dataset_code>.html to series
     '''
     countries = get_countries(source_dir)
-    categories = build_categories_tree(source_dir, target_dir)
-    datasets = itertools.chain.from_iterable([category["children"] for category in categories])
+    datasets = itertools.chain.from_iterable([category["children"] for category in category_tree])
     datasets_dict = {n["code"]: n["name"] for n in datasets}
     for n in datasets_dict.items():
         print(n)
     for f in os.listdir(source_dir):
         if f.endswith(".html") and f != "index.html":
             dataset_file = os.path.join(source_dir, f)

@@ -245,7 +213,13 @@ def main():
     source_dir = os.path.abspath(args["<source_dir>"])
     target_dir = os.path.abspath(args["<target_dir>"])
     assert os.path.exists(source_dir), source_dir
-    build_series(source_dir, target_dir)
+    write_json_file(os.path.join(target_dir, "provider.json"), provider_json)
+    category_tree_filename_in = os.path.join(source_dir, "category_tree.json")
+    with open(category_tree_filename_in) as f:
+        category_tree = json.load(f)
+    category_tree_filename_out = os.path.join(source_dir, "category_tree.json")
+    write_json_file(category_tree_filename_out, category_tree)
+    build_series(source_dir, target_dir, category_tree)
 
 if __name__ == "__main__":
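For reference, a rough illustration of the data build_series now expects from category_tree.json: a list of category objects whose "children" carry dataset codes and names. The category and dataset values below are invented; only the shape follows the diff, and the flattening step mirrors the code in build_series.

    # Hypothetical category_tree content (invented values, shape as written by download.py)
    category_tree = [
        {
            "code": "secteur-reel",   # slugified category name (illustrative value)
            "name": "Secteur réel",   # illustrative value
            "children": [
                {"code": "021", "name": "021 - Exemple de tableau mensuel"},
            ],
        },
    ]

    # build_series flattens every category's children into one code -> name mapping
    import itertools
    datasets = itertools.chain.from_iterable([category["children"] for category in category_tree])
    datasets_dict = {n["code"]: n["name"] for n in datasets}
    # datasets_dict == {"021": "021 - Exemple de tableau mensuel"}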
download.py · View file @ f96bf5af
@@ -28,13 +28,35 @@ Usage:
 """
 
 import ast
 import json
+import logging
 import os
+import slugify
 import sys
 
 import requests
 from bs4 import BeautifulSoup as bs
 from docopt import docopt
 
+log = logging.getLogger(__name__)
+
+
+def clean_special_char(string):
+    '''ugly fix of encoding error of HTML source file
+    can be fixed replacing unrecognized char into é because 98,79% are é
+    two exceptions:
+    - Côte d'Ivoire
+    - impôts
+    '''
+    if "C�te" in string:
+        return string.replace("�", "ô")
+    elif "imp�ts" in string:
+        return string.replace("�", "ô")
+    else:
+        return string.replace("�", "é")
+
+
+def write_json_file(file_path, data):
+    with open(file_path, 'w', encoding='utf-8') as file_:
+        json.dump(data, file_, ensure_ascii=False, indent=2, sort_keys=True)
+
 
 def download_datasets(target_dir):
     '''
    download provider root url and build the datasets source page
     '''
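A quick usage sketch of the two helpers added above. The input strings are invented examples containing the U+FFFD replacement character that the provider pages produce; the file path is only for illustration, and the sketch assumes clean_special_char and write_json_file are in scope as defined in this hunk.

    # Invented inputs exercising the three branches of clean_special_char
    print(clean_special_char("C\ufffdte d'Ivoire"))    # -> "Côte d'Ivoire"
    print(clean_special_char("imp\ufffdts directs"))   # -> "impôts directs"
    print(clean_special_char("S\ufffdrie mensuelle"))  # -> "Série mensuelle" (default branch: é)

    # write_json_file keeps accents readable (ensure_ascii=False) and output deterministic (sort_keys=True)
    write_json_file("/tmp/example.json", {"name": "Série mensuelle"})  # illustrative path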
@@ -47,20 +69,27 @@ def download_datasets(target_dir):
     #     'User-Agent': 'DBNomics Downloader BOT see next.dbnomics.world',
     # }
     # download the main index 'index.html'
-    resp = requests.get(provider_root_url)
+    resp0 = requests.get(provider_root_url)
+    resp = requests.get(provider_root_url, cookies=resp0.cookies)
     resp.raise_for_status()
     with open(os.path.join(target_dir, "index.html"), "w") as _file:
         _file.write(resp.text)
     headers = resp.headers
     soup = bs(resp.text, "lxml")
-    categories = {}
+    categories = []
     for ul in soup.find_all("ul", class_="extend"):
-        cat = ul.parent.a.text.encode("utf-8").decode("utf-8", errors="ignore")
-        categories[cat] = []
+        # ugly fix of broken unicode char
+        cat_name = clean_special_char(ul.parent.a.text)
+        category = {
+            "name": cat_name,
+            "code": slugify.slugify(clean_special_char(ul.parent.a.text))
+        }
+        category["datasets"] = []
         for li in ul.find_all("li"):
             series = list(ast.literal_eval(li.a.get("onclick").replace("soumettreTab", "")))
-            categories[cat].append(series)
+            category["datasets"].append(series)
+        categories.append(category)
     # no need to have country names
     # encoding problems: broken utf-8 from website app: countries and categories é and ô same
     countries = {n.text.strip().encode('utf-8').decode("utf-8", errors="ignore"): n.find("input")
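The category loop above relies on each dataset link carrying a soumettreTab(...) call in its onclick attribute: stripping the function name leaves a plain tuple literal that ast.literal_eval can parse. A small self-contained sketch of that step; the onclick value is a made-up example, only the transformation is taken from the diff.

    import ast

    # Hypothetical onclick attribute as it might appear on a dataset link in index.html
    onclick = "soumettreTab('MENSUELLES', 12, '021 - Exemple de tableau')"

    # Same transformation as in the loop: drop the function name, parse the remaining
    # "('MENSUELLES', 12, '021 - Exemple de tableau')" as a literal tuple
    series = list(ast.literal_eval(onclick.replace("soumettreTab", "")))
    freq, id_tab, s_name_id = series
    # freq == 'MENSUELLES', id_tab == 12, s_name_id == '021 - Exemple de tableau'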
@@ -98,31 +127,47 @@ def download_datasets(target_dir):
     country = {n[0]: n[1] for n in zip(countries.keys(), countries.keys())}
     post_data.update(country)
-    for cat, series_l in categories.items():
-        for series in series_l:
-            freq, id_tab, s_name_id = series
-            s_id, s_name = s_name_id.split("-")
+    categories_tree = []
+    for cat in categories:
+        category = {
+            "code": cat["code"],
+            "name": cat["name"]
+        }
+        category["children"] = []
+        for freq, id_tab, s_name_id in cat["datasets"]:
             post_data["idTab"] = id_tab
             data = [(k, v) for k, v in post_data.items()]
             # html_download
-            resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php", data=data)
-            assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
-            assert resp.status_code in range(200, 399), \
-                "requests response.status_code == %s" % resp.status_code
-            with open(os.path.join(target_dir, s_id + ".html"), "w") as _f:
-                _f.write(resp.text)
-            # xls_download
-            post_data["export"] = ""
-            xport_data = [(k, v) for k, v in post_data.items() if k in ["params", "export", "idTab"]]
-            resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php", data=data)
-            assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
-            assert resp.status_code in range(200, 399), \
-                "requests response.status_code == %s" % resp.status_code
-            with open(os.path.join(target_dir, s_id + ".xls"), "w") as _f:
-                _f.write(resp.text)
+            try:
+                resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php", data=data)
+                assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
+                assert resp.status_code in range(200, 399), \
+                    "requests response.status_code == %s" % resp.status_code
+            except:
+                log.warning("Dataset {} is missing".format(s_name_id))
+            else:
+                soup = bs(resp.text, "lxml")
+                s_id = soup.find("h2").text.split('-')[0]
+                category["children"].append({"code": s_id, "name": s_name_id})
+                with open(os.path.join(target_dir, s_id + ".html"), "w") as _f:
+                    _f.write(resp.text)
+                # xls_download
+                post_data["export"] = ""
+                xport_data = [(k, v) for k, v in post_data.items() if k in ["params", "export", "idTab"]]
+                resp = requests.post("https://edenpub.bceao.int/rapportPredefini.php", data=data)
+                assert resp is not None, "requests.get() failed with url entrypoint %s" % provider_root_url
+                assert resp.status_code in range(200, 399), \
+                    "requests response.status_code == %s" % resp.status_code
+                with open(os.path.join(target_dir, s_id + ".xls"), "w") as _f:
+                    _f.write(resp.text)
+        categories_tree.append(category)
+    write_json_file(os.path.join(target_dir, "category_tree.json"), categories_tree)
 
 
 def main():
     args = docopt(__doc__)
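The commit "find dataset code (s_id) another way" changes where the dataset code comes from: instead of splitting it out of s_name_id on the index page, it is now read from the <h2> heading of the downloaded report page. A minimal sketch of that extraction; the HTML below is an invented example, only the split('-')[0] step follows the diff.

    from bs4 import BeautifulSoup as bs

    # Invented report page heading
    html = "<html><body><h2>021 - Indice harmonisé des prix (exemple)</h2></body></html>"
    soup = bs(html, "lxml")
    s_id = soup.find("h2").text.split('-')[0]
    # s_id == '021 ' (the code keeps a trailing space unless it is stripped)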