Commit 39b236f1 authored by Christophe Benz's avatar Christophe Benz

Merge branch 'fix-weeks' into 'master'

Enforce ISO weeks and period normalization

Closes dbnomics-fetchers/management#635

See merge request !31
parents 97520381 17ee8734
Pipeline #153365 failed with stage
in 13 seconds
stages:
- test
- deploy
- validate
Test:
stage: test
image: python:3.7
only:
- pushes
before_script:
- pip install --editable .
- pip install pytest
script:
- pytest
Publish on PyPI:
stage: deploy
image: python:3.7
......@@ -18,7 +34,7 @@ Publish on PyPI:
url: https://pypi.org/project/dbnomics-data-model/$CI_COMMIT_TAG
Validate:
stage: test
stage: validate
except:
- pushes
tags:
......
# Changelog
### 0.13.9
Non-breaking changes:
- Fix weeks handling (cf https://git.nomics.world/dbnomics-fetchers/management/issues/635)
### 0.13.8
Non-breaking changes in validation script:
......
......@@ -59,6 +59,7 @@ class Frequency(Enum):
day_range_re = "0[1-9]|[1-2][0-9]|3[0-1]" # 01-31
month_range_re = "0[1-9]|1[0-2]" # 01-12
# Cf https://docs.python.org/3/library/datetime.html#datetime.date.isocalendar
week_range_re = "0[1-9]|[1-4][0-9]|5[0-3]" # 01-53
......@@ -196,8 +197,9 @@ def complete_series_list(periods_dates_and_values_list, frequency: Frequency,
def detect_frequency(period1, period2):
"""Return a tuple like `(frequency, normalize_period)` for the given `period`.
`normalize_period` can be `None` if periods are already normalized,
or a function taking a period as `str` and returning a normalized period as `str`.
If periods are already normalized or if `period1` and `period2` are not start days
of the inferred frequency, `normalize_period` will be `None`.
Otherwise it will be a function taking a period `str` and returning a normalized period `str`.
When the format of the periods is "daily", the frequency can be different,
depending on the interval between the days (e.g. the frequency can be "quarterly" if the interval is 3 months).
......@@ -248,8 +250,18 @@ def detect_frequency(period1, period2):
(<Frequency.BI_MONTHLY: 'bi-monthly'>, ...)
>>> detect_frequency('2014-01-01', '2014-02-01') # doctest: +ELLIPSIS
(<Frequency.MONTHLY: 'monthly'>, ...)
>>> detect_frequency('2014-01-01', '2014-01-08') # doctest: +ELLIPSIS
>>> detect_frequency('2014-01-15', '2014-02-15') # doctest: +ELLIPSIS
(<Frequency.MONTHLY: 'monthly'>, None)
>>> detect_frequency('2014-01-01', '2014-01-31') # doctest: +ELLIPSIS
(<Frequency.DAILY: 'daily'>, None)
>>> detect_frequency('2014-01-06', '2014-01-13') # doctest: +ELLIPSIS
(<Frequency.WEEKLY: 'weekly'>, ...)
>>> detect_frequency('2014-01-07', '2014-01-14') # doctest: +ELLIPSIS
(<Frequency.WEEKLY: 'weekly'>, None)
>>> detect_frequency('2014-01-06', '2014-01-14') # doctest: +ELLIPSIS
(<Frequency.DAILY: 'daily'>, None)
>>> detect_frequency('2014-01-03', '2014-01-11') # doctest: +ELLIPSIS
(<Frequency.DAILY: 'daily'>, None)
# Invalid or different period formats:
>>> detect_frequency('2014', '2015-02')
......@@ -262,11 +274,17 @@ def detect_frequency(period1, period2):
(None, None)
"""
def make_normalize_period(frequency):
def make_normalize_period(frequency: Frequency, date_1: date, date_2: date):
def normalize_period(period):
period_date = datetime.fromisoformat(period)
return start_day_to_period(period_date, frequency)
return normalize_period
return (
normalize_period
if is_start_day(date_1, frequency) and is_start_day(date_2, frequency)
else None
)
period1_format, normalize_period = detect_period_format(period1)
period2_format, _ = detect_period_format(period2)
......@@ -283,18 +301,18 @@ def detect_frequency(period1, period2):
date_diff = relativedelta(date_2, date_1)
if date_diff.days == 1:
return (Frequency.DAILY, None)
elif date_diff.weeks == 1:
return (Frequency.WEEKLY, make_normalize_period(Frequency.WEEKLY))
elif date_diff.days == 7:
return (Frequency.WEEKLY, make_normalize_period(Frequency.WEEKLY, date_1, date_2))
elif date_diff.months == 1:
return (Frequency.MONTHLY, make_normalize_period(Frequency.MONTHLY))
return (Frequency.MONTHLY, make_normalize_period(Frequency.MONTHLY, date_1, date_2))
elif date_diff.months == 2:
return (Frequency.BI_MONTHLY, make_normalize_period(Frequency.BI_MONTHLY))
return (Frequency.BI_MONTHLY, make_normalize_period(Frequency.BI_MONTHLY, date_1, date_2))
elif date_diff.months == 3:
return (Frequency.QUARTERLY, make_normalize_period(Frequency.QUARTERLY))
return (Frequency.QUARTERLY, make_normalize_period(Frequency.QUARTERLY, date_1, date_2))
elif date_diff.months == 6:
return (Frequency.BI_ANNUAL, make_normalize_period(Frequency.BI_ANNUAL))
return (Frequency.BI_ANNUAL, make_normalize_period(Frequency.BI_ANNUAL, date_1, date_2))
elif date_diff.years == 1:
return (Frequency.ANNUAL, make_normalize_period(Frequency.ANNUAL))
return (Frequency.ANNUAL, make_normalize_period(Frequency.ANNUAL, date_1, date_2))
else:
# No specific interval detected, keep daily.
return (Frequency.DAILY, None)
......@@ -595,6 +613,10 @@ def normalize_observations(observations):
# Period format different than frequency:
>>> normalize_observations([['PERIOD', 'VALUE'], ['2010-01-01', 1], ['2010-04-01', 2]])
(<Frequency.QUARTERLY: 'quarterly'>, [['PERIOD', 'VALUE'], ['2010-Q1', 1.0], ['2010-Q2', 2.0]])
>>> normalize_observations([['PERIOD', 'VALUE'], ['2010-01-01', 1], ['2010-01-08', 2]])
(<Frequency.WEEKLY: 'weekly'>, [['PERIOD', 'VALUE'], ['2010-01-01', 1.0], ['2010-01-08', 2.0]])
>>> normalize_observations([['PERIOD', 'VALUE'], ['2010-01-04', 1], ['2010-01-11', 2]])
(<Frequency.WEEKLY: 'weekly'>, [['PERIOD', 'VALUE'], ['2010-W01', 1.0], ['2010-W02', 2.0]])
"""
if not observations:
return (None, [])
......@@ -629,10 +651,30 @@ def normalize_observations(observations):
return (frequency, [header] + normalized_rows)
def is_start_day(date: date, frequency: Frequency):
    """Tell whether `date` is the first day of a period of the given `frequency`.

    - daily: every day is a start day;
    - weekly: the day must be a Monday (`isoweekday() == 1`);
    - monthly: the day must be the 1st of any month;
    - bi-monthly, quarterly, bi-annual, annual: the day must be the 1st of one
      of the months opening such a period (e.g. January, April, July or
      October for quarterly).

    Note that inside this function the `date` parameter hides the `date` class
    used in its annotation.

    Return a `bool`.
    Raise `ValueError` if `frequency` is not one of the frequencies listed above.
    """
    if frequency == Frequency.DAILY:
        # A daily period is a single day, so any day starts one.
        return True
    if frequency == Frequency.WEEKLY:
        return date.isoweekday() == 1  # monday

    # The remaining frequencies all start on the 1st day of a month;
    # `None` means "any month".
    first_months_by_frequency = (
        (Frequency.ANNUAL, (1,)),
        (Frequency.BI_ANNUAL, (1, 7)),
        (Frequency.QUARTERLY, (1, 4, 7, 10)),
        (Frequency.BI_MONTHLY, (1, 3, 5, 7, 9, 11)),
        (Frequency.MONTHLY, None),
    )
    for known_frequency, first_months in first_months_by_frequency:
        if frequency == known_frequency:
            if date.day != 1:
                return False
            return first_months is None or date.month in first_months

    raise ValueError("Unsupported frequency: {}".format(frequency))
def period_to_start_day(period):
"""Return the start day of `period` as ISO-8601 date string.
>>> period_to_start_day("")
>>> period_to_start_day("2000-W00")
>>> period_to_start_day("2001")
datetime.date(2001, 1, 1)
>>> period_to_start_day("2001-S1")
......@@ -696,7 +738,7 @@ def period_to_start_day(period):
year, month = match.groups()
return date(int(year), int(month), 1)
elif period_format == Frequency.WEEKLY:
return datetime.strptime("{}-1".format(period), "%Y-W%W-%w").date()
return datetime.strptime("{}-1".format(period), "%G-W%V-%u").date()
elif period_format == Frequency.DAILY:
year, month, day = match.groups()
return date(int(year), int(month), int(day))
......@@ -707,12 +749,18 @@ def period_to_start_day(period):
def start_day_to_period(period_date: date, frequency: Frequency):
"""Return a period as `str` from `period_date` start day given `frequency`.
"""Simplfy the `str` representation of `period_date` based on `frequency`.
If `period_date` is the start day of the period defined by `frequency`,
return a simpler representation of `period_date` as `str`,
otherwise return `period_date` as ISO format.
>>> start_day_to_period(date(2000, 1, 1), Frequency.ANNUAL)
'2000'
>>> start_day_to_period(date(2000, 1, 1), Frequency.BI_ANNUAL)
'2000-S1'
>>> start_day_to_period(date(2000, 7, 1), Frequency.BI_ANNUAL)
'2000-S2'
>>> start_day_to_period(date(2000, 1, 1), Frequency.BI_MONTHLY)
'2000-B1'
>>> start_day_to_period(date(2000, 1, 1), Frequency.QUARTERLY)
......@@ -721,37 +769,42 @@ def start_day_to_period(period_date: date, frequency: Frequency):
'2000-Q2'
>>> start_day_to_period(date(2000, 1, 1), Frequency.MONTHLY)
'2000-01'
>>> start_day_to_period(date(2000, 1, 1), Frequency.WEEKLY)
'2000-W00'
>>> start_day_to_period(date(2000, 1, 3), Frequency.WEEKLY)
'2000-W01'
>>> start_day_to_period(date(2000, 1, 1), Frequency.DAILY)
'2000-01-01'
# `period_date` is not the start day:
>>> start_day_to_period(date(2000, 2, 3), Frequency.ANNUAL)
'2000'
'2000-02-03'
>>> start_day_to_period(date(2000, 2, 3), Frequency.BI_ANNUAL)
'2000-S1'
'2000-02-03'
>>> start_day_to_period(date(2000, 6, 8), Frequency.BI_ANNUAL)
'2000-S1'
'2000-06-08'
>>> start_day_to_period(date(2000, 8, 21), Frequency.BI_ANNUAL)
'2000-S2'
'2000-08-21'
>>> start_day_to_period(date(2000, 12, 31), Frequency.BI_ANNUAL)
'2000-S2'
'2000-12-31'
>>> start_day_to_period(date(2000, 2, 3), Frequency.BI_MONTHLY)
'2000-B1'
'2000-02-03'
>>> start_day_to_period(date(2000, 3, 7), Frequency.BI_MONTHLY)
'2000-B2'
'2000-03-07'
>>> start_day_to_period(date(2000, 2, 3), Frequency.QUARTERLY)
'2000-Q1'
'2000-02-03'
>>> start_day_to_period(date(2000, 4, 6), Frequency.QUARTERLY)
'2000-Q2'
'2000-04-06'
>>> start_day_to_period(date(2000, 2, 3), Frequency.MONTHLY)
'2000-02'
'2000-02-03'
>>> start_day_to_period(date(2000, 5, 8), Frequency.MONTHLY)
'2000-05'
'2000-05-08'
>>> start_day_to_period(date(2000, 1, 1), Frequency.WEEKLY)
'2000-01-01'
>>> start_day_to_period(date(2000, 2, 3), Frequency.WEEKLY)
'2000-W05'
'2000-02-03'
"""
if not is_start_day(period_date, frequency):
return period_date.isoformat()
if frequency == Frequency.ANNUAL:
return period_date.strftime("%Y")
elif frequency == Frequency.BI_ANNUAL:
......@@ -763,13 +816,11 @@ def start_day_to_period(period_date: date, frequency: Frequency):
elif frequency == Frequency.MONTHLY:
return period_date.strftime("%Y-%m")
elif frequency == Frequency.WEEKLY:
return period_date.strftime("%Y-W%W")
return period_date.strftime("%G-W%V")
elif frequency == Frequency.DAILY:
return period_date.isoformat()
else:
assert False, frequency
assert False, "Should never reach this line"
raise ValueError("Unsupported frequency: {}".format(frequency))
def value_to_float(value):
......
......@@ -45,7 +45,7 @@ with readme_filepath.open('rt', encoding='utf-8') as fd:
setup(
name='dbnomics-data-model',
version='0.13.8',
version='0.13.9',
author='DBnomics Team',
author_email='contact@nomics.world',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment