Commit 104876b4 authored by Pierre Dittgen

Fix validation errors

parent 2f0ad8bf
Pipeline #162513 passed with stage
in 2 minutes and 13 seconds
......@@ -94,7 +94,7 @@ YEAR_DETECT_RE = re.compile(r"201\d")
log = logging.getLogger(__name__)
def retrieve_period_info_from_header(header_values):
def retrieve_period_info_from_header(header_values, df, max_cols):
"""Retrieve (freq, period) from excel row values."""
# Single row header
......@@ -106,19 +106,51 @@ def retrieve_period_info_from_header(header_values):
yield None
# double row header
# Regex to match 'ITEMS' or 'GROUP OF COUNTRIES'
# or whatever is not a date and means that we have reached the end
# of the period columns
STOPPER_RE = re.compile(r"^([A-Z]+\s*)+$")
def unique_period(period, period_set):
"""Return period if not already in period_set else None.
Side-effect: add period to period_set if not already present.
if period not in period_set:
return period
return None
previous_y, current_y, previous_mq = None, None, None
qset = set()
for (y, mq) in header_values:
period_set = set()
for i, (y, mq) in enumerate(header_values):
# Set current year
if isinstance(y, int):
current_y = str(y)
elif y == "ITEMS":
# Stop going right when we encounter something different from years
elif isinstance(y, str) and STOPPER_RE.match(y):
if not isinstance(mq, str):
# Test if column contains no observation values
empty_col = i < max_cols and all(
isinstance(elt, float) and np.isnan(elt) for elt in df.iloc[2:, i].tolist()
# If so, just ignore it.
if empty_col:
yield None
# Handle annual column after monthly or quarterly columns
if isinstance(mq, float) and np.isnan(mq):
if previous_mq in ("Dec", "Q4"):
yield ("A", current_y)
yield unique_period(("A", current_y), period_set)
yield None
# Monthly or quarterly case
mq = mq.strip("*")
if mq.startswith("Q"):
......@@ -127,17 +159,18 @@ def retrieve_period_info_from_header(header_values):
if period_val in qset:
period_val = "{}-{}".format(int(current_y) + 1, mq)
yield ("Q", period_val)
yield unique_period(("Q", period_val), period_set)
# Hack to fix a problem with Year cells in TABEL8_2
if mq == "Jan" and previous_y == current_y:
current_y = str(int(current_y) + 1)
yield (
period = (
datetime.strptime("{} {}".format(current_y, mq), "%Y %b").strftime(
yield unique_period(period, period_set)
previous_mq = mq
previous_y = current_y
......@@ -357,18 +390,16 @@ def convert_excel_to_dataset(ds_dir: Path, dataset_code, excel_filepath: Path):
df.columns = pd.MultiIndex.from_arrays(header.values)
# Computes period info list from header
period_info_list = list(retrieve_period_info_from_header(df.columns.tolist()))
period_info_list = list(
retrieve_period_info_from_header(df.columns.tolist(), df, sheet.ncols)
ds_freqs = {pi[0] for pi in period_info_list if pi is not None}
period_list_map = {
freq: [tup[1] for tup in period_info_list if tup is not None and tup[0] == freq]
for freq in ds_freqs
period_id_map = {
freq: [
for i, tup in enumerate(period_info_list)
if tup is not None and tup[0] == freq
freq: [i for i, tup in enumerate(period_info_list) if tup and tup[0] == freq]
for freq in ds_freqs
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment