Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
dbnomics-pipeline-ng (experimental)
fetchers
wto
wto-fetcher
Commits
1ad895b0
Commit
1ad895b0
authored
Oct 12, 2017
by
Bruno Duyé
Browse files
WIP: get datasets descriptions from html
parent
4a23be15
Changes
2
Hide whitespace changes
Inline
Side-by-side
requirements.txt
View file @
1ad895b0
...
...
@@ -4,3 +4,4 @@ lxml
python-slugify
requests
xlrd
cssselect
wto_to_dbnomics.py
View file @
1ad895b0
...
...
@@ -33,7 +33,6 @@ target_dir: path of target directory containing datasets & series in DBnomics fo
"""
from
collections
import
defaultdict
,
OrderedDict
import
csv
import
json
import
logging
...
...
@@ -41,18 +40,19 @@ import os
import
re
import
shutil
import
sys
from
collections
import
OrderedDict
,
defaultdict
import
lxml.html
import
xlrd
from
docopt
import
docopt
from
lxml.cssselect
import
CSSSelector
from
slugify
import
slugify
import
xlrd
from
dbnomics_converters.base
import
assert_no_error
,
to_float
from
dbnomics_converters.categories
import
validate_category
from
dbnomics_converters.datasets
import
validate_dataset
from
dbnomics_converters.providers
import
validate_provider
from
dbnomics_converters.series
import
validate_series
from
dbnomics_converters.categories
import
validate_category
log
=
logging
.
getLogger
(
__name__
)
...
...
@@ -109,7 +109,12 @@ CATEGORIES = [
[
'Reporter'
,
(
'Reporter_Description'
,
'Reporter_Code'
)],
[
'Partner'
,
(
'Partner_Description'
,
'Partner_Code'
)],
[
'Source'
,
(
'Source_Description'
,
None
)],
])
]),
description_html_css_selectors
=
(
'annual.html'
,
# description, updates informations
[
'p.paranormaltext:nth-child(3)'
,
'.centerCol > p:nth-child(2)'
]
)
),
dict
(
type
=
'dataset'
,
...
...
@@ -122,7 +127,12 @@ CATEGORIES = [
[
'Flow'
,
(
'Flow_Description'
,
None
)],
[
'Partner'
,
(
'Partner_description'
,
'Partner_code'
)],
[
'Indicator'
,
(
'Indicator_description'
,
'Indicator_code'
)]
])
]),
description_html_css_selectors
=
(
'annual.html'
,
# description, updates informations
[
'p.paranormaltext:nth-child(7)'
,
'div.centerCol > p.paranormaltext:nth-child(6)'
]
)
),
dict
(
type
=
'dataset'
,
...
...
@@ -135,7 +145,12 @@ CATEGORIES = [
[
'Flow'
,
(
'Flow_Description'
,
None
)],
[
'Partner'
,
(
'Partner_description'
,
'Partner_code'
)],
[
'Unit'
,
(
'Unit'
,
None
)]
])
]),
description_html_css_selectors
=
(
'annual.html'
,
# description, updates informations
[
'p.paranormaltext:nth-child(11)'
,
'p.paranormaltext:nth-child(11)'
]
)
),
]
# ),
...
...
@@ -441,6 +456,7 @@ def create_dataset_and_series_from_csv(dataset, dataset_path):
dataset_json_data
=
{
'name'
:
dataset
[
'name'
],
'code'
:
dataset
[
'code'
],
'description'
:
get_dataset_description_from_html
(
dataset
,
source_dir
),
'dimensions_values_labels'
:
{
dimension_code
:
{
dimension_value_code
:
dimensions_values_labels
[
dimension_code
][
dimension_value_code
]
...
...
@@ -618,6 +634,35 @@ def write_series_tsv_file(series_dir_path, observations, header):
file_
.
write
(
"
\t
"
.
join
(
observation
)
+
"
\n
"
)
def get_dataset_description_from_html(dataset, source_dir):
    """Return the description of dataset, extracted from its source html file.

    dataset['description_html_css_selectors'] is read as a pair: the name of
    the html file (looked up inside source_dir) followed by the list of css
    selectors locating the description texts in that file.

    Raises AssertionError when the html file does not exist.
    """
    html_spec = dataset['description_html_css_selectors']
    html_name, selectors = html_spec[0], html_spec[1]
    html_path = os.path.join(source_dir, html_name)
    assert os.path.isfile(html_path), "Source html file not found: {!r}".format(html_path)
    with open(html_path) as html_file:
        html_source = html_file.read()
    return get_text_from_css_selectors(html_source, selectors)
def get_text_from_css_selectors(html, css_selectors, join_string="\n"):
    """Return the texts matched by css_selectors in html, joined by join_string.

    Each selector must match exactly one element of the document, and that
    element must have a non-empty text; the texts are collected in the order
    of css_selectors.

    Parameters:
        html (string): source html document
        css_selectors (list of strings): list of css selectors
        join_string (string): separator inserted between the matching texts

    Returns:
        The matching texts concatenated with join_string.

    Raises:
        AssertionError: if a selector matches zero or several elements, or if
            the matched element has no text.
    """
    html_element = lxml.html.document_fromstring(html)
    text_parts = []
    for css_selector in css_selectors:
        elements = html_element.cssselect(css_selector)
        # The check also fails when nothing matches, so the message reports the count.
        assert len(elements) == 1, \
            "Expected exactly one element matching {!r} in html, found {}".format(css_selector, len(elements))
        # NOTE(review): `.text` only holds the text located before the element's
        # first child tag; text nested in child elements is not collected here.
        text = elements[0].text
        assert text, "Empty text from css selector {!r}".format(css_selector)
        text_parts.append(text)
    # Honor the caller-supplied separator (it used to be ignored in favor of "\n").
    return join_string.join(text_parts)
def write_json_file(file_path, data):
    """Write data to file_path as indented JSON with sorted keys.

    Parameters:
        file_path (string): path of the file to create or overwrite
        data: JSON-serializable value to dump

    The file is always encoded in UTF-8.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding is
    # set explicitly instead of depending on the platform's locale default.
    with open(file_path, 'w', encoding='utf-8') as file_:
        json.dump(data, file_, ensure_ascii=False, indent=2, sort_keys=True)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment