Commit 2cc34f06 authored by Bruno Duyé's avatar Bruno Duyé
Browse files

Add notes to series ('a', 'b', 'c' after country name)

parent 26ee3a94
......@@ -187,6 +187,14 @@ CATEGORIES = [
},
unit='million dollars',
unknown_value='...',
series_notes={ # sheet name: ((row number, note code, expected text), ...)
"Export data": ((100, 'a', 'FAS (Free Alongside Ship)'),),
"Import data": (
(100, 'a', 'imports are valued FOB'),
(101, 'b', 'includes an adjustment for under-recorded intra-EU imports'),
(102, 'c', 'excludes military imports'),
),
}
)
),
dict(
......@@ -204,8 +212,19 @@ CATEGORIES = [
"Export data": "Exports",
"Import data": "Imports"
},
unknown_value='…',
unit='million dollars',
unknown_value='…',
series_notes={ # sheet name: ((row number, note code, expected text), ...)
"Export data": (
(50, 'a', 'figures follow BPM5 presentation'),
(51, 'b', 'seasonally adjusted'),
),
"Import data": (
(50, 'a', 'figures follow BPM5 presentation'),
(51, 'b', 'seasonally adjusted'),
(52, 'c', 'data do not include part of the costs for freight transport and insurance (i.e. cif/fob adjustment)'),
)
}
)
),
dict(
......@@ -474,6 +493,24 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
global source_dir
global target_dir
def check_series_notes(sheet, sheet_series_notes_definitions):
    """Check that given notes definitions are found in file

    - sheet: object exposing `row(row_id)` (an iterable of cells having a `value`
      attribute) and a `name` attribute (used in the error message)
    - sheet_series_notes_definitions: tuple of (row number, note code, expected text),
      something like :
        (
            (100, 'a', 'imports are valued FOB'),
            (101, 'b', 'includes an adjustment for under-recorded intra-EU imports'),
        )

    Raise AssertionError if the definitions are not given as a tuple, or if the
    expected text of a note is not contained in any cell of its row.
    """
    assert isinstance(sheet_series_notes_definitions, tuple), \
        "sheet_series_notes_definitions: expected: tuple, received: {}".format(type(sheet_series_notes_definitions))
    # The note code is not needed here: only the presence of the text in the row is checked
    for row_id, _code, text in sheet_series_notes_definitions:
        # Use `in` rather than the truthiness of str.find(): find() returns -1 (truthy)
        # when the text is absent and 0 (falsy) when the cell starts with the text.
        # Non-string cells (numbers, empty cells) can't hold a note and are skipped.
        found = any(
            isinstance(cell.value, str) and text in cell.value
            for cell in sheet.row(row_id)
        )
        assert found, "Series note with text {!r} not found in sheet {!r} (looking at row {!r})".format(text, sheet.name, row_id)
def get_series_from_horizontal_xls(dataset):
"""Return a dict containing series from dataset where series are presented horizontally:
Example: {
......@@ -498,7 +535,8 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
flow = "X",
region = "european_union"
},
values = ['-0,2', '1,6', '-1,8', '-2,5', ...]
values = ['-0,2', '1,6', '-1,8', '-2,5', ...],
notes = "includes an adjustment for under-recorded intra-EU imports",
},
{
name = 'Reported exports - Switzer-land',
......@@ -526,6 +564,13 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
sheet = doc.sheet_by_name(sheet_name)
periods = xlrd_tools.get_periods(sheet, xls_constants['periods_first_cell'], 'H')
row_num = xls_constants['periods_first_cell'][0]
excepted_series_notes = ()
sheet_series_notes_definitions = None
if 'series_notes' in xls_constants and xls_constants['series_notes'].get(sheet_name):
sheet_series_notes_definitions = xls_constants['series_notes'][sheet_name]
check_series_notes(sheet, sheet_series_notes_definitions)
# list of note codes expected in this sheet. Ex: ('a', 'b')
excepted_series_notes = tuple(definition[1] for definition in sheet_series_notes_definitions)
while row_num < sheet.nrows - 1:
row_num += 1
row = sheet.row_values(row_num)
......@@ -544,10 +589,26 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
# Pass through rows without data
first_value_col_num = xls_constants['periods_first_cell'][1]
if set(row[first_value_col_num:]) == {''}:
print("Info: row {} - ignoring {}".format(row_num, "{!r}".format(region_label)
if region_label else "line {}".format(row_num)))
if sheet_series_notes_definitions and not row_num in (d[0] for d in sheet_series_notes_definitions):
print("Info: row {} - ignoring {}".format(row_num, "{!r}".format(region_label)
if region_label else "line {}".format(row_num)))
continue
assert region_label, "No region label found at row {}".format(row_num)
# Search for notes from region label, and remove them from label ("Italy a,c" => "Italy")
match = re.search(' ([a-z](,\s*[a-z])*)$', region_label) # "a", " a, c"
series_notes = None
if match:
# label ends by one or more single char(s)
series_note_codes = match.group(1).replace(' ', '').split(',')
assert set(series_note_codes) <= set(excepted_series_notes), \
"[row {}]: Unexpected note codes: {!r} in {!r}. Expected codes are: {!r} (sheet {!r})" \
.format(row_num, set(series_note_codes) - set(excepted_series_notes), region_label, excepted_series_notes, sheet.name)
series_notes = " -- ".join(
definition[2] for definition in sheet_series_notes_definitions
if definition[1] in series_note_codes
)
# Remove notes from end of region_label
region_label = region_label[:-len(match.group(1))].strip()
generated_region_code = region_code or slugify(region_label)
dimensions_values_labels['region'].add((generated_region_code, region_label))
# Try to convert data to float and check that there is at least one valid "data" (float or "...") in row
......@@ -569,8 +630,10 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
flow=FLOWS_CODES[flow_label],
region=generated_region_code
),
values=tuple(converted_values)
values=tuple(converted_values),
)
if series_notes:
series_data['notes'] = series_notes
series_datas.append(series_data)
return dict(
periods=periods,
......@@ -728,8 +791,10 @@ def create_dataset_and_series_from_xls(dataset, dataset_path):
# Create series.json
series_json_data = dict(
code=series_directory_name,
dimensions=series_data['dimensions']
dimensions=series_data['dimensions'],
)
if 'notes' in series_data:
series_json_data['notes'] = series_data['notes']
validators.validate_series(series_json_data)
write_json_file(os.path.join(series_dir_path, 'series.json'), series_json_data)
# Write series observations
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment