Skip to content
Snippets Groups Projects
Commit 202a3b30 authored by Christophe Benz's avatar Christophe Benz
Browse files

Parse periods in Converter

parent bda69eff
No related branches found
No related tags found
No related merge requests found
Pipeline #486981 pending with stages
import re
from collections.abc import Iterator
from typing import TYPE_CHECKING
......@@ -8,7 +9,9 @@ from dbnomics_data_model.model import (
DatasetMetadata,
DatasetReference,
Observation,
Period,
ProviderMetadata,
QuarterPeriod,
Series,
)
......@@ -81,9 +84,23 @@ class Converter:
df = self._source_data_loader.load_march24_dataset()
for series_name, column in df.items():
observations = [Observation.create(period=period, value=value) for period, value in column.items()]
observations = [
Observation.create(period=self._parse_period(source_period), value=value)
for source_period, value in column.items()
]
yield Series.create(
code="E",
name=series_name,
observations=observations,
)
def _parse_period(self, source_period: str) -> Period:
    """Convert a source period label such as ``"2024Q1"`` into a quarter period.

    The label must consist of exactly four digits (the year), the letter
    ``Q``, and a quarter number from 1 to 4.

    Args:
        source_period: the period label as found in the source data.

    Returns:
        A ``QuarterPeriod`` built from the parsed year and quarter.

    Raises:
        ValueError: if ``source_period`` does not have the expected shape.
    """
    # Only quarters 1-4 are accepted, so labels like "2024Q0" or "2024Q7" are
    # rejected here with a clear message. re.ASCII keeps \d from matching
    # non-Latin digit characters.
    match = re.fullmatch(r"(\d{4})Q([1-4])", source_period, flags=re.ASCII)
    if match is None:
        msg = f"Not a period: {source_period!r}"
        raise ValueError(msg)
    year, quarter = (int(group) for group in match.groups())
    return QuarterPeriod(year, quarter)
......@@ -77,26 +77,9 @@ class SourceDataLoader:
# Read sheet "1.6" of the given workbook, keep a fixed row/column window, and
# return it as a DataFrame indexed by its first column.
# NOTE(review): this listing looks like a rendered diff with removed and added
# lines interleaved (the iloc slice is assigned twice in a row) — confirm which
# lines exist in the current file before relying on these notes.
def _load_march24_from_xlsx(self, xlsx_file: Path) -> DataFrame:
# header=None: no spreadsheet row is used as column labels at load time.
df = pd.read_excel(xlsx_file, header=None, sheet_name="1.6")
# Positional slice: rows 2..87 and columns 1..2 of the raw sheet.
employment_df = df.iloc[2:88, 1:3]
employment_df = df.iloc[2:88, 1:3] # keep "quarterly" section only
# Use the first row of the slice as column labels, then drop that row.
employment_df.rename(columns=employment_df.iloc[0], inplace=True)
employment_df = employment_df[1:]
employment_df.rename(columns={employment_df.columns[0]: "Period"}, inplace=True)
# Rebuild each period label, inserting "-" before every "Q" character.
formatted_periods: list[str] = []
for period in employment_df["Period"]:
res = ""
for char in period:
if char == "Q":
res += "-"
res += char
formatted_periods.append(res)
employment_df["Period"] = formatted_periods
# Replace NaN cells with the string "NA".
employment_df.replace(float("nan"), "NA", inplace=True)
# Make the first column (renamed to "Period" above) the index.
employment_df.set_index(employment_df.columns[0], inplace=True)
return employment_df
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment