Skip to content
Snippets Groups Projects
Commit 3c6297e6 authored by Pierre Dittgen's avatar Pierre Dittgen
Browse files

1st convert

parent aa9049a3
No related branches found
No related tags found
No related merge requests found
......@@ -40,10 +40,11 @@ import sys
from operator import itemgetter
from pathlib import Path
from collections import defaultdict
from typing import List, Dict
from typing import List, Dict, Iterator
import pycountry
import pandas as pd
from slugify import slugify
PROVIDER_DATA = {
"code": "ENTSOE",
......@@ -56,28 +57,54 @@ PROVIDER_DATA = {
log = logging.getLogger(__name__)
def iter_series_info(
    country_code: str, csv_files: List[Path], dim_acc: Dict[str, Dict]
) -> Iterator[Dict]:
    """Iterate series information from given CSV files.

    Update dimensions accumulator (dim_acc) as a side effect: registers the
    country name plus every (type, indicator) code/label pair encountered.

    Yields one series dict per (type, indicator) column of the concatenated
    CSV data, with daily ("D") frequency observations.
    """
    # Set country name for series
    country = pycountry.countries.get(alpha_2=country_code)
    dim_acc["country"][country_code] = country.name

    # Aggregate dataframe along years: one CSV file per year, same
    # two-level (type, indicator) column layout, first column as index.
    frames = [
        pd.read_csv(csv_file, index_col=0, header=[0, 1]) for csv_file in csv_files
    ]
    df_total = pd.concat(frames)

    period_list = df_total.index.to_list()
    # Skip the first column (period metadata); each remaining column is a
    # (type_label, indicator_label) pair from the two-level header.
    for col_name in df_total.columns[1:]:
        type_label, indicator_label = col_name
        type_code = slugify(type_label, separator="_")
        indicator_code = slugify(indicator_label, separator="_")
        dim_acc["type"][type_code] = type_label
        dim_acc["indicator"][indicator_code] = indicator_label
        series_info = {
            "code": ".".join([country_code, type_code, indicator_code, "D"]),
            "dimensions": {
                "country": country_code,
                "type": type_code,
                "indicator": indicator_code,
                "frequency": "D",
            },
            # First row is the header tuple; missing values are emitted as "NA".
            "observations": [("PERIOD", "VALUE")]
            + list(zip(period_list, df_total[col_name].fillna("NA").values.tolist())),
        }
        yield series_info
def convert_agpt_dataset(source_dir: Path, target_dir: Path):
target_dir.mkdir(exist_ok=True)
dimension_code_list = ["country", "type", "indicator"]
dim_acc = {dim: {} for dim in dimension_code_list}
dimension_code_list = ["country", "type", "indicator", "frequency"]
dim_acc: Dict[str, Dict[str, str]] = {dim: {} for dim in dimension_code_list}
dim_acc["frequency"] = {"D": "daily"}
dataset_json_data = {
"code": "AGPT",
"name": "Actual generation per type, realised, daily",
......@@ -95,11 +122,10 @@ def convert_agpt_dataset(source_dir: Path, target_dir: Path):
continue
csv_files_by_country_code[m.group(1)].append(csv_file)
breakpoint()
series_info_list = []
for country_code, csv_files in sorted(csv_files_by_country_code.items()):
series_info_list.extend(compute_series_info(country_code, csv_files, dim_acc))
for series_info in iter_series_info(country_code, csv_files, dim_acc):
series_info_list.append(series_info)
# dataset.json
write_json_file(target_dir / "dataset.json", dataset_json_data)
......@@ -110,6 +136,9 @@ def convert_agpt_dataset(source_dir: Path, target_dir: Path):
json.dump(series_info, fd, sort_keys=True, ensure_ascii=False)
fd.write("\n")
# return dataset info for category_tree
return {"code": dataset_json_data["code"], "name": dataset_json_data["name"]}
def main():
parser = argparse.ArgumentParser(
......@@ -137,11 +166,15 @@ def main():
if not target_dir.exists():
parser.error("Target dir %r not found", target_dir)
convert_agpt_dataset(source_dir / "AGPT", target_dir / "AGPT")
# provider.json
write_json_file(target_dir / "provider.json", PROVIDER_DATA)
# AGPT dataset
dataset_info = convert_agpt_dataset(source_dir / "AGPT", target_dir / "AGPT")
# category_tree.json
write_json_file(target_dir / "category_tree.json", [dataset_info])
return 0
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment