Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
dbnomics-pipeline-ng (experimental)
fetchers
wto
wto-fetcher
Commits
2cc34f06
Commit
2cc34f06
authored
Nov 07, 2017
by
Bruno Duyé
Browse files
Add notes to series ('a', 'b', 'c' after country name)
parent
26ee3a94
Changes
1
Hide whitespace changes
Inline
Side-by-side
wto_to_dbnomics.py
View file @
2cc34f06
...
...
@@ -187,6 +187,14 @@ CATEGORIES = [
},
unit
=
'million dollars'
,
unknown_value
=
'...'
,
series_notes
=
{
# sheet name: ((row number, note code, expected note text), ...)
"Export data"
:
((
100
,
'a'
,
'FAS (Free Alongside Ship)'
),),
"Import data"
:
(
(
100
,
'a'
,
'imports are valued FOB'
),
(
101
,
'b'
,
'includes an adjustment for under-recorded intra-EU imports'
),
(
102
,
'c'
,
'excludes military imports'
),
),
}
)
),
dict
(
...
...
@@ -204,8 +212,19 @@ CATEGORIES = [
"Export data"
:
"Exports"
,
"Import data"
:
"Imports"
},
unknown_value
=
'…'
,
unit
=
'million dollars'
,
unknown_value
=
'…'
,
series_notes
=
{
# sheet name: ((row number, note code, expected note text), ...)
"Export data"
:
(
(
50
,
'a'
,
'figures follow BPM5 presentation'
),
(
51
,
'b'
,
'seasonally adjusted'
),
),
"Import data"
:
(
(
50
,
'a'
,
'figures follow BPM5 presentation'
),
(
51
,
'b'
,
'seasonally adjusted'
),
(
52
,
'c'
,
'data do not include part of the costs for freight transport and insurance (i.e. cif/fob adjustment)'
),
)
}
)
),
dict
(
...
...
@@ -474,6 +493,24 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
global
source_dir
global
target_dir
def check_series_notes(sheet, sheet_series_notes_definitions):
    """Check that the given notes definitions are found in the sheet.

    Each definition is a ``(row_id, code, text)`` tuple; the check passes
    when at least one cell of row ``row_id`` contains ``text``.

    - sheet: object exposing ``row(row_id)`` (iterable of cells having a
      ``value`` attribute) and ``name``.
    - sheet_series_notes_definitions: something like :
        (
            (100, 'a', 'imports are valued FOB'),
            (101, 'b', 'includes an adjustment for under-recorded intra-EU imports'),
        )

    Raises AssertionError when the definitions are not a tuple or when a
    note text is not found in its row.
    """
    # Explicit raise (rather than `assert`) so the check survives `python -O`.
    if not isinstance(sheet_series_notes_definitions, tuple):
        raise AssertionError(
            "sheet_series_notes_definitions: expected: tuple, received: {}".format(
                type(sheet_series_notes_definitions)))
    for row_id, _code, text in sheet_series_notes_definitions:
        # Use `in` instead of `str.find()`: find() returns -1 (truthy) when the
        # text is absent and 0 (falsy) when it starts the cell, which inverted
        # the result. Non-string cells (numbers, empty values) cannot hold a note.
        found = any(
            isinstance(cell.value, str) and text in cell.value
            for cell in sheet.row(row_id)
        )
        if not found:
            raise AssertionError(
                "Series note with text {!r} not found in sheet {!r} (looking at row {!r})".format(
                    text, sheet.name, row_id))
def
get_series_from_horizontal_xls
(
dataset
):
"""Return a dict containing series from dataset where series are presented horizontally:
Example: {
...
...
@@ -498,7 +535,8 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
flow = "X",
region = "european_union"
},
values = ['-0,2', '1,6', '-1,8', '-2,5', ...]
values = ['-0,2', '1,6', '-1,8', '-2,5', ...],
notes = "includes an adjustment for under-recorded intra-EU imports",
},
{
name = 'Reported exports - Switzer-land',
...
...
@@ -526,6 +564,13 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
sheet
=
doc
.
sheet_by_name
(
sheet_name
)
periods
=
xlrd_tools
.
get_periods
(
sheet
,
xls_constants
[
'periods_first_cell'
],
'H'
)
row_num
=
xls_constants
[
'periods_first_cell'
][
0
]
excepted_series_notes
=
()
sheet_series_notes_definitions
=
None
if
'series_notes'
in
xls_constants
and
xls_constants
[
'series_notes'
].
get
(
sheet_name
):
sheet_series_notes_definitions
=
xls_constants
[
'series_notes'
][
sheet_name
]
check_series_notes
(
sheet
,
sheet_series_notes_definitions
)
# list of note codes expected in this sheet. Ex: ('a', 'b')
excepted_series_notes
=
tuple
(
definition
[
1
]
for
definition
in
sheet_series_notes_definitions
)
while
row_num
<
sheet
.
nrows
-
1
:
row_num
+=
1
row
=
sheet
.
row_values
(
row_num
)
...
...
@@ -544,10 +589,26 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
# Pass through rows without data
first_value_col_num
=
xls_constants
[
'periods_first_cell'
][
1
]
if
set
(
row
[
first_value_col_num
:])
==
{
''
}:
print
(
"Info: row {} - ignoring {}"
.
format
(
row_num
,
"{!r}"
.
format
(
region_label
)
if
region_label
else
"line {}"
.
format
(
row_num
)))
if
sheet_series_notes_definitions
and
not
row_num
in
(
d
[
0
]
for
d
in
sheet_series_notes_definitions
):
print
(
"Info: row {} - ignoring {}"
.
format
(
row_num
,
"{!r}"
.
format
(
region_label
)
if
region_label
else
"line {}"
.
format
(
row_num
)))
continue
assert
region_label
,
"No region label found at row {}"
.
format
(
row_num
)
# Search for notes from region label, and remove them from label ("Italy a,c" => "Italy")
match
=
re
.
search
(
' ([a-z](,\s*[a-z])*)$'
,
region_label
)
# "a", " a, c"
series_notes
=
None
if
match
:
# label ends with one or more single-character note codes
series_note_codes
=
match
.
group
(
1
).
replace
(
' '
,
''
).
split
(
','
)
assert
set
(
series_note_codes
)
<=
set
(
excepted_series_notes
),
\
"[row {}]: Unexpected note codes: {!r} in {!r}. Expected codes are: {!r} (sheet {!r})"
\
.
format
(
row_num
,
set
(
series_note_codes
)
-
set
(
excepted_series_notes
),
region_label
,
excepted_series_notes
,
sheet
.
name
)
series_notes
=
" -- "
.
join
(
definition
[
2
]
for
definition
in
sheet_series_notes_definitions
if
definition
[
1
]
in
series_note_codes
)
# Remove notes from end of region_label
region_label
=
region_label
[:
-
len
(
match
.
group
(
1
))].
strip
()
generated_region_code
=
region_code
or
slugify
(
region_label
)
dimensions_values_labels
[
'region'
].
add
((
generated_region_code
,
region_label
))
# Try to convert data to float and check that there is at least one valid "data" (float or "...") in row
...
...
@@ -569,8 +630,10 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
flow
=
FLOWS_CODES
[
flow_label
],
region
=
generated_region_code
),
values
=
tuple
(
converted_values
)
values
=
tuple
(
converted_values
)
,
)
if
series_notes
:
series_data
[
'notes'
]
=
series_notes
series_datas
.
append
(
series_data
)
return
dict
(
periods
=
periods
,
...
...
@@ -728,8 +791,10 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
# Create series.json
series_json_data
=
dict
(
code
=
series_directory_name
,
dimensions
=
series_data
[
'dimensions'
]
dimensions
=
series_data
[
'dimensions'
]
,
)
if
'notes'
in
series_data
:
series_json_data
[
'notes'
]
=
series_data
[
'notes'
]
validators
.
validate_series
(
series_json_data
)
write_json_file
(
os
.
path
.
join
(
series_dir_path
,
'series.json'
),
series_json_data
)
# Write series observations
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment