Commit 281f62c0 authored by Pierre Dittgen

1st commit

.env
image: dbnomics/dbnomics-gitlab-ci:latest

variables:
  # Can be "download" or "convert".
  JOB: convert
  PROVIDER_SLUG: entsoe

before_script:
  # Display info about the environment.
  - date
  - locale
  - echo "Running job ${JOB} for provider ${PROVIDER_SLUG}"
  # Install fetcher dependencies.
  - '[ -f requirements.txt ] && pip3 install --requirement requirements.txt'
  # Run ssh-agent (inside the build environment).
  - eval $(ssh-agent -s)
  # Add the SSH key stored in the SSH_PRIVATE_KEY variable to the agent store.
  - ssh-add <(echo "$SSH_PRIVATE_KEY")
  # Add the SSH keys of remote Git servers, to disable host key checking questions.
  - mkdir -p ~/.ssh
  - ssh-keyscan -t rsa git.nomics.world >> ~/.ssh/known_hosts
  - git config --global push.default simple
  - git config --global user.email "${PROVIDER_SLUG}-fetcher@db.nomics.world"
  - git config --global user.name "${PROVIDER_SLUG} fetcher"

job:
  stage: build
  except:
    - pushes
  tags:
    - docker
  script:
    - set -x
    - |
      if [ "${JOB}" == "download" ]; then
        time git clone --quiet --depth=1 git@git.nomics.world:dbnomics-source-data/${PROVIDER_SLUG}-source-data.git
        cd ${PROVIDER_SLUG}-source-data
        time find -not -path "./.git/*" -not -name ".git" -delete
        cd ..
        time python3 download.py ${PROVIDER_SLUG}-source-data
        cd ${PROVIDER_SLUG}-source-data
        time git add -A
        time git commit -m "New download" --quiet || true
        time git push
        # Move the errors.json file, if present, to be kept as an artifact.
        [ -f errors.json ] && mv errors.json .. || true
      fi
    - |
      if [ "${JOB}" == "convert" ]; then
        time git clone --quiet --depth=1 https://git.nomics.world/dbnomics-source-data/${PROVIDER_SLUG}-source-data.git
        time git clone --quiet --depth=1 git@git.nomics.world:dbnomics-json-data/${PROVIDER_SLUG}-json-data.git
        cd ${PROVIDER_SLUG}-json-data
        # If not on master, checkout the corresponding branch on json-data.
        git checkout -B ${CI_COMMIT_REF_NAME}
        # Delete all existing files.
        time find -not -path "./.git/*" -not -name ".git" -delete
        cd ..
        time python3 convert.py ${PROVIDER_SLUG}-source-data ${PROVIDER_SLUG}-json-data
        cd ${PROVIDER_SLUG}-json-data
        # Commit the conversion result, ignoring absent datasets.
        time git add --ignore-removal .
        time git commit -m "New conversion..." -m "from source-data $(git -C ../${PROVIDER_SLUG}-source-data rev-parse HEAD)" --quiet || true
        time git push origin ${CI_COMMIT_REF_NAME}
        # Move the errors.json file, if present, to be kept as an artifact.
        [ -f errors.json ] && mv errors.json .. || true
      fi
  artifacts:
    paths:
      - errors.json
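
# Note: the JOB variable above selects which step runs ("download" or
# "convert"). Since the job is excluded on pushes, it is meant to be started
# through the GitLab pipeline trigger API; a hypothetical example (trigger
# token and project ID are placeholders):
#
#   curl -X POST \
#     -F token=<TRIGGER_TOKEN> \
#     -F ref=master \
#     -F "variables[JOB]=download" \
#     "https://git.nomics.world/api/v4/projects/<PROJECT_ID>/trigger/pipeline"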
# ENTSOE fetcher
Download and convert data from ENTSOE.
## Code quality
See https://git.nomics.world/dbnomics-fetchers/documentation/wikis/code-style
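
## Usage

A minimal local run, assuming an ENTSOE API token in the `WEB_SECURITY_TOKEN` environment variable (the directory names below are illustrative):

```sh
pip3 install --requirement requirements.txt

# Download source data (CSV files) from the ENTSOE API.
export WEB_SECURITY_TOKEN=<your-api-token>
mkdir -p entsoe-source-data
python3 download.py entsoe-source-data

# Convert the source data to the DBnomics data model (JSON files).
mkdir -p entsoe-json-data
python3 convert.py entsoe-source-data entsoe-json-data
```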
#! /usr/bin/env python3
# entsoe-fetcher -- Download and convert data from ENTSOE
# By: DBnomics team <contact@nomics.world>
#
# Copyright (C) 2020 Cepremap
# https://git.nomics.world/dbnomics-fetchers/entsoe-fetcher
#
# entsoe-fetcher is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# entsoe-fetcher is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Convert data from ENTSOE to DBnomics data model.
See https://git.nomics.world/dbnomics/dbnomics-data-model/.
Read source data from a source directory, write converted data to a target directory.
See also `.gitlab-ci.yml` in which data is committed
to a Git repository of converted data.
"""
import argparse
import json
import logging
import re
import sys
from collections import defaultdict
from operator import itemgetter
from pathlib import Path
from typing import Dict, List

import pandas as pd
import pycountry

PROVIDER_DATA = {
    "code": "ENTSOE",
    "name": "European Network of Transmission System Operators for Electricity",
    "region": "World",
    "terms_of_use": "https://transparency.entsoe.eu/content/static_content/Static%20content/terms%20and%20conditions/terms%20and%20conditions.html",
    "website": "https://transparency.entsoe.eu/",
}

log = logging.getLogger(__name__)

def compute_series_info(
    country_code: str, csv_files: List[Path], dim_acc: Dict[str, Dict]
) -> List[Dict]:
    """Compute the series info list for one country from its yearly CSV files."""
    country = pycountry.countries.get(alpha_2=country_code)
    dim_acc["country"][country_code] = country.name
    df_total = None
    for csv_file in csv_files:
        df = pd.read_csv(csv_file, index_col=1, header=[0, 1])
        df_total = df if df_total is None else df_total.append(df)
    # TODO: generate the series info entries and update dim_acc.
    # Return an empty list for now so the caller's `extend` does not fail on None.
    return []
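

# A minimal sketch of what the TODO above could return, assuming the DBnomics
# data model (one series per dimension combination, observations given as
# [period, value] rows after a ["PERIOD", "VALUE"] header). The series code
# scheme and the `_to_code` helper are illustrative assumptions, not part of
# this commit.
def _to_code(label: str) -> str:
    """Turn a human-readable label into a dimension value code (assumed scheme)."""
    return label.strip().replace(" ", "_").upper()


def compute_series_info_sketch(
    country_code: str, df_total: pd.DataFrame, dim_acc: Dict[str, Dict]
) -> List[Dict]:
    series_info_list = []
    # Columns of the source CSV form a (production type, indicator) MultiIndex.
    for type_label, indicator_label in df_total.columns:
        type_code = _to_code(type_label)
        indicator_code = _to_code(indicator_label)
        dim_acc["type"][type_code] = type_label
        dim_acc["indicator"][indicator_code] = indicator_label
        observations = [
            # Keep the "YYYY-MM-DD" part of the period; cast to float for JSON.
            [str(period)[:10], float(value)]
            for period, value in df_total[(type_label, indicator_label)].items()
            if pd.notna(value)
        ]
        series_info_list.append(
            {
                "code": f"{country_code}.{type_code}.{indicator_code}",
                "dimensions": [country_code, type_code, indicator_code],
                "observations": [["PERIOD", "VALUE"], *observations],
            }
        )
    return series_info_list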


def convert_agpt_dataset(source_dir: Path, target_dir: Path):
    """Convert the AGPT dataset from source CSV files to DBnomics JSON files."""
    target_dir.mkdir(exist_ok=True)
    dimension_code_list = ["country", "type", "indicator"]
    dim_acc = {dim: {} for dim in dimension_code_list}
    dataset_json_data = {
        "code": "AGPT",
        "name": "Actual generation per type, realised, daily",
        "dimensions_codes_order": dimension_code_list,
        "dimensions_labels": {dim: dim.title() for dim in dimension_code_list},
        "dimensions_values_labels": dim_acc,
    }

    # Build the country code list from file names (e.g. "AGPT_FR_2020.csv").
    csv_files_by_country_code = defaultdict(list)
    country_year_re = re.compile(r"^.*([A-Z]{2})_\d{4}$")
    for csv_file in sorted(source_dir.glob("*.csv")):
        m = country_year_re.match(csv_file.stem)
        if not m:
            continue
        csv_files_by_country_code[m.group(1)].append(csv_file)

    series_info_list = []
    for country_code, csv_files in sorted(csv_files_by_country_code.items()):
        series_info_list.extend(compute_series_info(country_code, csv_files, dim_acc))

    # dataset.json
    write_json_file(target_dir / "dataset.json", dataset_json_data)

    # series.jsonl
    with (target_dir / "series.jsonl").open("wt", encoding="utf-8") as fd:
        for series_info in sorted(series_info_list, key=itemgetter("code")):
            json.dump(series_info, fd, sort_keys=True, ensure_ascii=False)
            fd.write("\n")


def main():
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("source_dir", type=Path, help="path of source directory")
    parser.add_argument("target_dir", type=Path, help="path of target directory")
    parser.add_argument("--log", default="WARNING", help="level of logging messages")
    args = parser.parse_args()

    numeric_level = getattr(logging, args.log.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {args.log}")
    logging.basicConfig(
        format="%(levelname)s:%(name)s:%(asctime)s:%(message)s",
        level=numeric_level,
        stream=sys.stdout,  # Use stderr if script outputs data to stdout.
    )

    source_dir = args.source_dir
    if not source_dir.exists():
        parser.error(f"Source dir {str(source_dir)!r} not found")
    target_dir = args.target_dir
    if not target_dir.exists():
        parser.error(f"Target dir {str(target_dir)!r} not found")

    convert_agpt_dataset(source_dir / "AGPT", target_dir / "AGPT")

    # provider.json
    write_json_file(target_dir / "provider.json", PROVIDER_DATA)

    return 0


def write_json_file(file_path: Path, data):
    """Write data as JSON to file_path."""
    with file_path.open("w", encoding="utf-8") as json_fd:
        json.dump(data, json_fd, ensure_ascii=False, indent=2, sort_keys=True)


if __name__ == "__main__":
    sys.exit(main())
#! /usr/bin/env python3
# entsoe-fetcher -- Download and convert data from ENTSOE
# By: DBnomics team <contact@nomics.world>
#
# Copyright (C) 2020 Cepremap
# https://git.nomics.world/dbnomics-fetchers/entsoe-fetcher
#
# entsoe-fetcher is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# entsoe-fetcher is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Download data from ENTSOE, write it in a target directory.
See also `.gitlab-ci.yml` in which data is committed to a Git repository of source data.
"""
import argparse
import http.client
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import List
import pandas as pd
import entsoe
API_KEY_NAME = "WEB_SECURITY_TOKEN"
log = logging.getLogger(__name__)
COUNTRY_CODES = ["FR", "IT", "DE"]

def download_actual_generation_per_type_daily_per_country_year(
    client, country_code, year
):
    # Date interval: the whole civil year, in the CET/CEST time zone.
    start = pd.Timestamp(f"{year}01010000", tz="Europe/Brussels")
    end = pd.Timestamp(f"{year}12312359", tz="Europe/Brussels")
    # Query the ENTSOE API.
    df = client.query_generation(country_code, start=start, end=end)
    # Resample hourly data to daily data.
    return df.resample("D").sum()


def download_actual_generation_per_type_daily(
    api_key, country_code_list: List[str], year_interval: List[int], target_dir: Path
):
    target_dir.mkdir(exist_ok=True)
    client = entsoe.EntsoePandasClient(api_key=api_key)
    for country_code in country_code_list:
        for year in year_interval:
            log.info("Download data for %s (%d)", country_code, year)
            csv_filepath = target_dir / f"AGPT_{country_code}_{year}.csv"
            # Skip files already downloaded by a previous run.
            if not csv_filepath.exists():
                df = download_actual_generation_per_type_daily_per_country_year(
                    client, country_code, year
                )
                df.to_csv(csv_filepath, date_format="%Y-%m-%d")


def main():
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("target_dir", type=Path, help="path of target directory")
    parser.add_argument(
        "--start-year", type=int, help="start year if different from current year"
    )
    parser.add_argument(
        "--debug-http", action="store_true", help="display http.client debug messages"
    )
    parser.add_argument("--log", default="WARNING", help="level of logging messages")
    args = parser.parse_args()

    numeric_level = getattr(logging, args.log.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {args.log}")
    logging.basicConfig(
        format="%(levelname)s:%(name)s:%(asctime)s:%(message)s",
        level=numeric_level,
        stream=sys.stdout,  # Use stderr if script outputs data to stdout.
    )
    logging.getLogger("urllib3").setLevel(
        logging.DEBUG if args.debug_http else logging.WARNING
    )
    if args.debug_http:
        http.client.HTTPConnection.debuglevel = 1

    target_dir = args.target_dir
    if not target_dir.exists():
        parser.error(f"Target dir {str(target_dir)!r} not found")

    current_year = datetime.today().year
    start_year = (
        args.start_year
        if args.start_year and args.start_year < current_year
        else current_year
    )
    year_interval = list(range(start_year, current_year + 1))

    # Get the API token.
    api_key = os.environ.get(API_KEY_NAME)
    if not api_key:
        log.error("Please define the %r environment variable", API_KEY_NAME)
        sys.exit(1)

    # Download actual generation per type.
    log.info("Processing [AGPT]")
    download_actual_generation_per_type_daily(
        api_key, COUNTRY_CODES, year_interval, target_dir / "AGPT"
    )

    return 0


if __name__ == "__main__":
    sys.exit(main())
entsoe-py
pycountry
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile
#
beautifulsoup4==4.9.3 # via entsoe-py
certifi==2020.11.8 # via requests
chardet==3.0.4 # via requests
entsoe-py==0.3.2 # via -r requirements.in
idna==2.10 # via requests
numpy==1.19.4 # via pandas
pandas==1.1.4 # via entsoe-py
pycountry==20.7.3 # via -r requirements.in
python-dateutil==2.8.1 # via pandas
pytz==2020.4 # via entsoe-py, pandas
requests==2.25.0 # via entsoe-py
six==1.15.0 # via python-dateutil
soupsieve==2.0.1 # via beautifulsoup4
urllib3==1.26.2 # via requests
[flake8]
# Recommend matching the black line length (default 88),
# rather than using the flake8 default of 79:
max-line-length = 88
extend-ignore =
# See https://github.com/PyCQA/pycodestyle/issues/373
E203,
enable-extensions=G # for flake8-logging-format
[tool:pytest]
addopts = --doctest-modules