Commit 281f62c0 authored by Pierre Dittgen

1st commit

.env
image: dbnomics/dbnomics-gitlab-ci:latest

variables:
  # Can be "download" or "convert".
  JOB: convert
  PROVIDER_SLUG: entsoe

before_script:
  # Display info about the environment.
  - date
  - locale
  - echo "Running job ${JOB} for provider ${PROVIDER_SLUG}"
  # Install fetcher dependencies.
  - '[ -f requirements.txt ] && pip3 install --requirement requirements.txt'
  # Run ssh-agent (inside the build environment).
  - eval $(ssh-agent -s)
  # Add the SSH key stored in the SSH_PRIVATE_KEY variable to the agent store.
  - ssh-add <(echo "$SSH_PRIVATE_KEY")
  # Add the SSH keys of remote Git servers, to disable host key checking questions.
  - mkdir -p ~/.ssh
  - ssh-keyscan -t rsa git.nomics.world >> ~/.ssh/known_hosts
  - git config --global push.default simple
  - git config --global user.email "${PROVIDER_SLUG}-fetcher@db.nomics.world"
  - git config --global user.name "${PROVIDER_SLUG} fetcher"

job:
  stage: build
  except:
    - pushes
  tags:
    - docker
  script:
    - set -x
    - |
      if [ "${JOB}" == "download" ]; then
        time git clone --quiet --depth=1 git@git.nomics.world:dbnomics-source-data/${PROVIDER_SLUG}-source-data.git
        cd ${PROVIDER_SLUG}-source-data
        time find -not -path "./.git/*" -not -name ".git" -delete
        cd ..
        time python3 download.py ${PROVIDER_SLUG}-source-data
        cd ${PROVIDER_SLUG}-source-data
        time git add -A
        time git commit -m "New download" --quiet || true
        time git push
        # Move the errors.json file, if present, to be kept as an artifact.
        [ -f errors.json ] && mv errors.json .. || true
      fi
    - |
      if [ "${JOB}" == "convert" ]; then
        time git clone --quiet --depth=1 https://git.nomics.world/dbnomics-source-data/${PROVIDER_SLUG}-source-data.git
        time git clone --quiet --depth=1 git@git.nomics.world:dbnomics-json-data/${PROVIDER_SLUG}-json-data.git
        cd ${PROVIDER_SLUG}-json-data
        # If not on master, checkout the corresponding branch on json-data.
        git checkout -B ${CI_COMMIT_REF_NAME}
        # Delete all existing files.
        time find -not -path "./.git/*" -not -name ".git" -delete
        cd ..
        time python3 convert.py ${PROVIDER_SLUG}-source-data ${PROVIDER_SLUG}-json-data
        cd ${PROVIDER_SLUG}-json-data
        # Commit the conversion result, ignoring absent datasets.
        time git add --ignore-removal .
        time git commit -m "New conversion..." -m "from source-data $(git -C ../${PROVIDER_SLUG}-source-data rev-parse HEAD)" --quiet || true
        time git push origin ${CI_COMMIT_REF_NAME}
        # Move the errors.json file, if present, to be kept as an artifact.
        [ -f errors.json ] && mv errors.json .. || true
      fi
  artifacts:
    paths:
      - errors.json
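
# Note: the JOB variable above selects which step runs ("download" or
# "convert"). Since the job is excluded on pushes, it is meant to be started
# through the GitLab pipeline trigger API; a hypothetical example (trigger
# token and project ID are placeholders):
#
#   curl -X POST \
#     -F token=<TRIGGER_TOKEN> \
#     -F ref=master \
#     -F "variables[JOB]=download" \
#     "https://git.nomics.world/api/v4/projects/<PROJECT_ID>/trigger/pipeline"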
# ENTSOE fetcher
Download and convert data from ENTSOE.
## Code quality
See https://git.nomics.world/dbnomics-fetchers/documentation/wikis/code-style
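
## Usage

A minimal local run, assuming an ENTSOE API token in the `WEB_SECURITY_TOKEN` environment variable (the directory names below are illustrative):

```sh
pip3 install --requirement requirements.txt

# Download source data (CSV files) from the ENTSOE API.
export WEB_SECURITY_TOKEN=<your-api-token>
mkdir -p entsoe-source-data
python3 download.py entsoe-source-data

# Convert the source data to the DBnomics data model (JSON files).
mkdir -p entsoe-json-data
python3 convert.py entsoe-source-data entsoe-json-data
```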
#! /usr/bin/env python3
# entsoe-fetcher -- Download and convert data from ENTSOE
# By: DBnomics team <contact@nomics.world>
#
# Copyright (C) 2020 Cepremap
# https://git.nomics.world/dbnomics-fetchers/entsoe-fetcher
#
# entsoe-fetcher is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# entsoe-fetcher is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Convert data from ENTSOE to DBnomics data model.
See https://git.nomics.world/dbnomics/dbnomics-data-model/.
Read source data from a source directory, write converted data to a target directory.
See also `.gitlab-ci.yml` in which data is committed
to a Git repository of converted data.
"""
import argparse
import json
import logging
import re
import sys
from collections import defaultdict
from operator import itemgetter
from pathlib import Path
from typing import Dict, List

import pandas as pd
import pycountry

PROVIDER_DATA = {
    "code": "ENTSOE",
    "name": "European Network of Transmission System Operators for Electricity",
    "region": "World",
    "terms_of_use": "https://transparency.entsoe.eu/content/static_content/Static%20content/terms%20and%20conditions/terms%20and%20conditions.html",
    "website": "https://transparency.entsoe.eu/",
}

log = logging.getLogger(__name__)

def compute_series_info(
    country_code: str, csv_files: List[Path], dim_acc: Dict[str, Dict]
) -> List[Dict]:
    """Compute the series info list for one country from its yearly CSV files."""
    country = pycountry.countries.get(alpha_2=country_code)
    dim_acc["country"][country_code] = country.name
    df_total = None
    for csv_file in csv_files:
        df = pd.read_csv(csv_file, index_col=1, header=[0, 1])
        df_total = df if df_total is None else df_total.append(df)
    # TODO: generate the series info entries and update dim_acc.
    # Return an empty list for now so the caller's `extend` does not fail on None.
    return []
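

# A minimal sketch of what the TODO above could return, assuming the DBnomics
# data model (one series per dimension combination, observations given as
# [period, value] rows after a ["PERIOD", "VALUE"] header). The series code
# scheme and the `_to_code` helper are illustrative assumptions, not part of
# this commit.
def _to_code(label: str) -> str:
    """Turn a human-readable label into a dimension value code (assumed scheme)."""
    return label.strip().replace(" ", "_").upper()


def compute_series_info_sketch(
    country_code: str, df_total: pd.DataFrame, dim_acc: Dict[str, Dict]
) -> List[Dict]:
    series_info_list = []
    # Columns of the source CSV form a (production type, indicator) MultiIndex.
    for type_label, indicator_label in df_total.columns:
        type_code = _to_code(type_label)
        indicator_code = _to_code(indicator_label)
        dim_acc["type"][type_code] = type_label
        dim_acc["indicator"][indicator_code] = indicator_label
        observations = [
            # Keep the "YYYY-MM-DD" part of the period; cast to float for JSON.
            [str(period)[:10], float(value)]
            for period, value in df_total[(type_label, indicator_label)].items()
            if pd.notna(value)
        ]
        series_info_list.append(
            {
                "code": f"{country_code}.{type_code}.{indicator_code}",
                "dimensions": [country_code, type_code, indicator_code],
                "observations": [["PERIOD", "VALUE"], *observations],
            }
        )
    return series_info_list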


def convert_agpt_dataset(source_dir: Path, target_dir: Path):
    """Convert the AGPT dataset from source CSV files to DBnomics JSON files."""
    target_dir.mkdir(exist_ok=True)
    dimension_code_list = ["country", "type", "indicator"]
    dim_acc = {dim: {} for dim in dimension_code_list}
    dataset_json_data = {
        "code": "AGPT",
        "name": "Actual generation per type, realised, daily",
        "dimensions_codes_order": dimension_code_list,
        "dimensions_labels": {dim: dim.title() for dim in dimension_code_list},
        "dimensions_values_labels": dim_acc,
    }

    # Build the country code list from file names (e.g. "AGPT_FR_2020.csv").
    csv_files_by_country_code = defaultdict(list)
    country_year_re = re.compile(r"^.*([A-Z]{2})_\d{4}$")
    for csv_file in sorted(source_dir.glob("*.csv")):
        m = country_year_re.match(csv_file.stem)
        if not m:
            continue
        csv_files_by_country_code[m.group(1)].append(csv_file)

    series_info_list = []
    for country_code, csv_files in sorted(csv_files_by_country_code.items()):
        series_info_list.extend(compute_series_info(country_code, csv_files, dim_acc))

    # dataset.json
    write_json_file(target_dir / "dataset.json", dataset_json_data)

    # series.jsonl
    with (target_dir / "series.jsonl").open("wt", encoding="utf-8") as fd:
        for series_info in sorted(series_info_list, key=itemgetter("code")):
            json.dump(series_info, fd, sort_keys=True, ensure_ascii=False)
            fd.write("\n")


def main():
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("source_dir", type=Path, help="path of source directory")
    parser.add_argument("target_dir", type=Path, help="path of target directory")
    parser.add_argument("--log", default="WARNING", help="level of logging messages")
    args = parser.parse_args()

    numeric_level = getattr(logging, args.log.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {args.log}")
    logging.basicConfig(
        format="%(levelname)s:%(name)s:%(asctime)s:%(message)s",
        level=numeric_level,
        stream=sys.stdout,  # Use stderr if script outputs data to stdout.
    )

    source_dir = args.source_dir
    if not source_dir.exists():
        parser.error(f"Source dir {str(source_dir)!r} not found")
    target_dir = args.target_dir
    if not target_dir.exists():
        parser.error(f"Target dir {str(target_dir)!r} not found")

    convert_agpt_dataset(source_dir / "AGPT", target_dir / "AGPT")

    # provider.json
    write_json_file(target_dir / "provider.json", PROVIDER_DATA)

    return 0


def write_json_file(file_path: Path, data):
    """Write data as JSON to file_path."""
    with file_path.open("w", encoding="utf-8") as json_fd:
        json.dump(data, json_fd, ensure_ascii=False, indent=2, sort_keys=True)


if __name__ == "__main__":
    sys.exit(main())
#! /usr/bin/env python3
# entsoe-fetcher -- Download and convert data from ENTSOE
# By: DBnomics team <contact@nomics.world>
#
# Copyright (C) 2020 Cepremap
# https://git.nomics.world/dbnomics-fetchers/entsoe-fetcher
#
# entsoe-fetcher is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# entsoe-fetcher is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Download data from ENTSOE, write it in a target directory.
See also `.gitlab-ci.yml` in which data is committed to a Git repository of source data.
"""
import argparse
import http.client
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import List
import pandas as pd
import entsoe
API_KEY_NAME = "WEB_SECURITY_TOKEN"
log = logging.getLogger(__name__)
COUNTRY_CODES = ["FR", "IT", "DE"]

def download_actual_generation_per_type_daily_per_country_year(
    client, country_code, year
):
    # Date interval: the whole civil year, in the CET/CEST time zone.
    start = pd.Timestamp(f"{year}01010000", tz="Europe/Brussels")
    end = pd.Timestamp(f"{year}12312359", tz="Europe/Brussels")
    # Query the ENTSOE API.
    df = client.query_generation(country_code, start=start, end=end)
    # Resample hourly data to daily data.
    return df.resample("D").sum()


def download_actual_generation_per_type_daily(
    api_key, country_code_list: List[str], year_interval: List[int], target_dir: Path
):
    target_dir.mkdir(exist_ok=True)
    client = entsoe.EntsoePandasClient(api_key=api_key)
    for country_code in country_code_list:
        for year in year_interval:
            log.info("Download data for %s (%d)", country_code, year)
            csv_filepath = target_dir / f"AGPT_{country_code}_{year}.csv"
            # Skip files already downloaded by a previous run.
            if not csv_filepath.exists():
                df = download_actual_generation_per_type_daily_per_country_year(
                    client, country_code, year
                )
                df.to_csv(csv_filepath, date_format="%Y-%m-%d")


def main():
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("target_dir", type=Path, help="path of target directory")
    parser.add_argument(
        "--start-year", type=int, help="start year if different from current year"
    )
    parser.add_argument(
        "--debug-http", action="store_true", help="display http.client debug messages"
    )
    parser.add_argument("--log", default="WARNING", help="level of logging messages")
    args = parser.parse_args()

    numeric_level = getattr(logging, args.log.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {args.log}")
    logging.basicConfig(
        format="%(levelname)s:%(name)s:%(asctime)s:%(message)s",
        level=numeric_level,
        stream=sys.stdout,  # Use stderr if script outputs data to stdout.
    )
    logging.getLogger("urllib3").setLevel(
        logging.DEBUG if args.debug_http else logging.WARNING
    )
    if args.debug_http:
        http.client.HTTPConnection.debuglevel = 1

    target_dir = args.target_dir
    if not target_dir.exists():
        parser.error(f"Target dir {str(target_dir)!r} not found")

    current_year = datetime.today().year
    start_year = (
        args.start_year
        if args.start_year and args.start_year < current_year
        else current_year
    )
    year_interval = list(range(start_year, current_year + 1))

    # Get the API token.
    api_key = os.environ.get(API_KEY_NAME)
    if not api_key:
        log.error("Please define the %r environment variable", API_KEY_NAME)
        sys.exit(1)

    # Download actual generation per type.
    log.info("Processing [AGPT]")
    download_actual_generation_per_type_daily(
        api_key, COUNTRY_CODES, year_interval, target_dir / "AGPT"
    )

    return 0


if __name__ == "__main__":
    sys.exit(main())
entsoe-py
pycountry
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile
#
beautifulsoup4==4.9.3 # via entsoe-py
certifi==2020.11.8 # via requests
chardet==3.0.4 # via requests
entsoe-py==0.3.2 # via -r requirements.in
idna==2.10 # via requests
numpy==1.19.4 # via pandas
pandas==1.1.4 # via entsoe-py
pycountry==20.7.3 # via -r requirements.in
python-dateutil==2.8.1 # via pandas
pytz==2020.4 # via entsoe-py, pandas
requests==2.25.0 # via entsoe-py
six==1.15.0 # via python-dateutil
soupsieve==2.0.1 # via beautifulsoup4
urllib3==1.26.2 # via requests
[flake8]
# Recommend matching the black line length (default 88),
# rather than using the flake8 default of 79:
max-line-length = 88
extend-ignore =
# See https://github.com/PyCQA/pycodestyle/issues/373
E203,
enable-extensions=G # for flake8-logging-format
[tool:pytest]
addopts = --doctest-modules