Commit 4ee38e42 authored by Christophe Benz's avatar Christophe Benz
Browse files

Merge branch 'handle_dataset_release' into 'master'

Handle dataset releases

See merge request !44
parents 01c69381 5002d6aa
Pipeline #186272 passed with stage
in 18 seconds
# Changelog
## next
Non-breaking changes:
- Handle releases metadata for each provider, defining a list of release codes for each dataset. ([#755](https://git.nomics.world/dbnomics-fetchers/management/-/issues/755))
- Validate releases in validation script.
## 0.13.13
Non-breaking changes:
......
......@@ -16,6 +16,7 @@
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging
from collections import OrderedDict
......
# dbnomics-data-model -- Define, validate and transform DBnomics data.
# By: Christophe Benz <christophe.benz@cepremap.org>
#
# Copyright (C) 2017-2020 Cepremap
# https://git.nomics.world/dbnomics/dbnomics-data-model
#
# dbnomics-data-model is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# dbnomics-data-model is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Types and functions manipulating releases metadata."""
import re
from typing import List, Optional, Tuple
from pydantic import BaseModel, Field, validator
from .exceptions import DBnomicsError
from .utils import find
LATEST_RELEASE = "latest"
RELEASE_CODE_PATTERN = r"[-0-9A-Za-z._]+"
RELEASE_CODE_RE = re.compile(RELEASE_CODE_PATTERN)
class NoReleaseError(DBnomicsError):
    """Raised when "latest" cannot be resolved because a dataset defines no release."""

    def __init__(self, dataset_code: str):
        # Keep the dataset code on the exception so callers can report it.
        self.dataset_code = dataset_code
        super().__init__(
            f"Could not resolve latest release for dataset code {dataset_code!r} "
            "because no release is defined for this dataset"
        )
class ReleaseCode(str):
    """Code of a release.

    A custom string type usable as a pydantic field type: pydantic calls the
    validators yielded by `__get_validators__` when parsing a model, and
    direct construction (`ReleaseCode("2020-04")`) validates via `__init__`.
    """

    def __init__(self, v):
        # str.__new__ has already built the string value; this only validates it,
        # raising TypeError/ValueError for invalid codes.
        ReleaseCode.validate(v)
        super().__init__()

    @classmethod
    def __get_validators__(cls):
        # pydantic custom-type hook: yield the validator(s) to run, in order.
        yield cls.validate

    @classmethod
    def validate(cls, v):
        if not isinstance(v, str):
            raise TypeError('string required')
        # fullmatch: the entire string must consist of allowed characters,
        # so empty strings and strings with spaces are rejected.
        if RELEASE_CODE_RE.fullmatch(v) is None:
            raise ValueError(
                f"Release code {v!r} does not conform to pattern {RELEASE_CODE_PATTERN!r}")
        return v
class DatasetRelease(BaseModel):
    """Release of a dataset."""

    # Release code (e.g. "2020-04"); format enforced by the ReleaseCode type.
    code: ReleaseCode

    @validator("code")
    def check_not_latest(cls, v):  # noqa
        # "latest" is reserved: Releases.resolve_release_code maps it to the
        # most recent actual release, so it must not name a real one.
        if v == LATEST_RELEASE:
            raise ValueError(
                f"Release code of a dataset must not be {LATEST_RELEASE!r}")
        return v
class DatasetReleasesItem(BaseModel):
    """Releases of a dataset, sharing a common dataset code prefix."""

    # Common prefix of the dataset codes, e.g. "WEO" for "WEO:2020-04".
    dataset_code_prefix: str
    releases: List[DatasetRelease]
    name: Optional[str] = None

    def find_latest_release_code(self) -> Optional[ReleaseCode]:
        """Find the code of the latest release of this dataset.

        The latest release is the last element of `releases`. Return None when
        the dataset defines no release, so that callers (e.g.
        Releases.resolve_release_code) can raise NoReleaseError instead of
        hitting an IndexError.
        """
        if not self.releases:
            return None
        return self.releases[-1].code

    def format_release(self, release_code: str) -> str:
        """Return the full dataset code for release_code, e.g. "WEO:2020-04"."""
        return f"{self.dataset_code_prefix}:{release_code}"
class Releases(BaseModel):
    """Releases of datasets."""

    # One item per dataset code prefix; loaded from a provider's releases.json.
    dataset_releases: List[DatasetReleasesItem] = Field(
        [], description='List of dataset releases'
    )

    def find_dataset_releases_item(self, dataset_code_prefix: str) -> Optional[DatasetReleasesItem]:
        """Find the dataset releases item corresponding to the given code prefix.

        Return None when no item matches.
        """
        return find(
            lambda item: item.dataset_code_prefix == dataset_code_prefix,
            self.dataset_releases,
        )

    def resolve_release_code(self, dataset_code: str) -> str:
        """Resolve the release code of a dataset.

        Some release codes are reserved, like "latest" that references an actual release code.
        If dataset_code references a reserved release code, replace it by the actual one.

        Raises NoReleaseError when "latest" is requested but no release is
        defined for the dataset.
        """
        dataset_code_prefix, release_code = parse_dataset_release(dataset_code)
        if release_code is None or release_code != LATEST_RELEASE:
            # Nothing to resolve: either no release part at all, or an
            # explicit (non-reserved) release code.
            return dataset_code
        dataset_releases_item = self.find_dataset_releases_item(dataset_code_prefix)
        if dataset_releases_item is None:
            raise NoReleaseError(dataset_code)
        latest_release_code = dataset_releases_item.find_latest_release_code()
        if latest_release_code is None:
            raise NoReleaseError(dataset_code)
        return dataset_releases_item.format_release(latest_release_code)
def parse_dataset_release(dataset_code: str) -> Tuple[str, Optional[str]]:
    """Split a dataset code into a (dataset_code_prefix, release_code) pair.

    The release code part is optional; when absent, the second element is None.

    >>> parse_dataset_release('foo')
    ('foo', None)
    >>> parse_dataset_release('foo:bar')
    ('foo', 'bar')
    >>> parse_dataset_release('foo:latest')
    ('foo', 'latest')
    >>> parse_dataset_release('foo:')
    Traceback (most recent call last):
        ...
    ValueError: Release code '' does not conform to pattern '[-0-9A-Za-z._]+'
    >>> parse_dataset_release('foo: ')
    Traceback (most recent call last):
        ...
    ValueError: Release code ' ' does not conform to pattern '[-0-9A-Za-z._]+'
    """
    prefix, sep, release_code = dataset_code.partition(":")
    if not sep:
        # No ":" separator: the whole code is the prefix, no release part.
        return dataset_code, None
    # ReleaseCode construction validates the code, raising ValueError if invalid.
    return prefix, ReleaseCode(release_code)
......@@ -20,10 +20,13 @@
import abc
import logging
from io import StringIO
from typing import Optional
from pydantic import ValidationError
from toolz import count, pipe
from .. import datasets
from ..releases import Releases
from ..exceptions import StorageError
from ..observations import iter_tsv_decoded_rows, iter_tsv_rows
......@@ -82,6 +85,27 @@ class AbstractStorage(abc.ABC):
def load_provider_json(self):
    """Return `provider.json` content.

    NOTE(review): the @abc.abstractmethod decorator for this method sits just
    outside this excerpt — confirm; subclasses provide the implementation.
    """
    pass
@abc.abstractmethod
def load_releases_json(self):
    """Return `releases.json` content. Since the file is optional, return `None` if not found."""
    # Concrete implementations: FileSystemStorage reads the file from disk,
    # GitStorage reads the blob from the git tree.
    pass
def load_releases(self) -> "Optional[Releases]":
    """Load and validate the provider's releases metadata.

    Return None when the storage has no `releases.json`.
    Raise StorageError (chained to the pydantic ValidationError) when the
    file content is invalid.
    """
    releases_json = self.load_releases_json()
    if releases_json is None:
        return None
    try:
        return Releases.parse_obj(releases_json)
    except ValidationError as exc:
        raise StorageError(
            f"Invalid releases for provider {self.provider_code}",
            provider_code=self.provider_code,
        ) from exc
@property
@abc.abstractmethod
def path(self):
......
......@@ -67,6 +67,9 @@ class FileSystemStorage(AbstractStorage):
raise StorageError("Could not load \"{}\"".format(filename), self.provider_code)
return provider_json
def load_releases_json(self):
    """Return `releases.json` content read from the storage directory, or `None` if not found."""
    return load_json_file(self, self.path / "releases.json")
@property
def path(self):
return self.storage_dir_path
......
......@@ -84,6 +84,9 @@ class GitStorage(AbstractStorage):
raise StorageError("Could not load \"{}\"".format(entry_name), self.provider_code)
return provider_json
def load_releases_json(self):
    """Return `releases.json` content read from the git tree, or `None` if not found."""
    return load_json_blob(self.repo, self.tree, "releases.json")
@property
def path(self):
return Path(self.repo.path)
......
# dbnomics-data-model -- Define, validate and transform DBnomics data.
# By: Christophe Benz <christophe.benz@cepremap.org>
#
# Copyright (C) 2017-2020 Cepremap
# https://git.nomics.world/dbnomics/dbnomics-data-model
#
# dbnomics-data-model is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# dbnomics-data-model is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Utility functions."""
from typing import Callable, Iterable, Optional, TypeVar
T = TypeVar("T")


def find(predicate: Callable[[T], bool], items: Iterable[T], default=None) -> Optional[T]:
    """Return the first element of ``items`` satisfying ``predicate``.

    When no element matches, return ``default``.

    >>> find(lambda item: item > 2, [1, 2, 3, 4])
    3
    >>> find(lambda item: item > 10, [1, 2, 3, 4])
    >>> find(lambda item: item > 10, [1, 2, 3, 4], default=42)
    42
    """
    # next() stops at the first match; the generator never scans past it.
    return next((item for item in items if predicate(item)), default)
......@@ -31,11 +31,17 @@ from collections import defaultdict
from pathlib import Path
import jsonschema.exceptions
from pydantic import ValidationError
from toolz import get, take
from dbnomics_data_model import storages, validators
from dbnomics_data_model.exceptions import StorageError
from dbnomics_data_model.observations import NOT_AVAILABLE, detect_period_format_strict, value_to_float
from dbnomics_data_model.observations import (
NOT_AVAILABLE,
detect_period_format_strict,
value_to_float,
)
from dbnomics_data_model.releases import Releases, parse_dataset_release, LATEST_RELEASE
log = logging.getLogger(__name__)
......@@ -121,6 +127,12 @@ def main():
errors_codes_counts[error['error_code']] += 1
print(format_error(error, output_format=args.format))
log.debug("Validating releases ...")
releases_errors = validate_releases(storage, ignore_errors=ignore_errors)
for error in releases_errors:
errors_codes_counts[error['error_code']] += 1
print(format_error(error, output_format=args.format))
log.debug("Validating datasets...")
nb_datasets = storage.get_nb_datasets()
for dataset_index, dataset_dir in enumerate(storage.iter_datasets_dirs(sort_by_dir_name=True), start=1):
......@@ -209,7 +221,9 @@ def format_error(error, output_format="text"):
for cause_error in cause:
sio.write(" - at path: {}\n".format(cause_error["path"]))
sio.write(" message: {}\n".format(cause_error["message"]))
sio.write(" value: {!r}\n".format(cause_error["value"]))
value = cause_error.get("value")
if value is not None:
sio.write(" value: {!r}\n".format(value))
return sio.getvalue()
......@@ -226,11 +240,21 @@ def build_jsonschema_error(errors, base_path=[]):
]
def category_tree_dataset_code_iter(category_tree):
def cause_from_validation_error(exc: "ValidationError"):
    """Build a list of cause dicts (path + message) from a pydantic ValidationError."""
    causes = []
    for error in exc.errors():
        causes.append({
            "path": error["loc"],
            "message": error["msg"],
        })
    return causes
def iter_category_tree_dataset_code(category_tree):
    """Yield all dataset codes from a category_tree instance.

    Nodes with a "children" key are traversed recursively; other nodes
    contribute their "code" value when it is non-empty.
    """
    for node in category_tree:
        if 'children' in node:
            yield from iter_category_tree_dataset_code(node['children'])
        elif node.get('code'):
            yield node['code']
......@@ -257,8 +281,8 @@ def validate_category_tree(storage, ignore_errors=[]):
})
if category_tree_json is not None:
dataset_codes_on_storage = set([dataset.dataset_code for dataset in storage.iter_datasets_dirs()])
datasets_codes_in_category_tree = set(list(category_tree_dataset_code_iter(category_tree_json)))
dataset_codes_on_storage = set(dataset.dataset_code for dataset in storage.iter_datasets_dirs())
datasets_codes_in_category_tree = set(iter_category_tree_dataset_code(category_tree_json))
# Check that all datasets referenced in category_tree.json are present on disk
error_code = "dataset-not-found-on-storage"
......@@ -305,6 +329,49 @@ def validate_category_tree(storage, ignore_errors=[]):
return errors
def validate_releases(storage, ignore_errors=[]):
    """Validate the provider's releases metadata (`releases.json`, optional).

    Return a list of error dicts, one per problem found. Recognized error
    codes: "invalid-releases" (malformed releases.json) and
    "dataset-not-found-on-storage" (a release references a dataset directory
    that does not exist). Error codes listed in ignore_errors are skipped.
    """
    provider_code = storage.provider_code
    errors = []
    # None until successfully loaded; stays None when loading is skipped or
    # fails, so the dataset-existence check below is safely bypassed.
    # (Previously `releases` was only assigned inside the first branch,
    # causing a NameError when that branch was skipped or raised.)
    releases = None

    error_code = "invalid-releases"
    if error_code not in ignore_errors:
        log.debug("Validating releases.json...")
        try:
            releases = storage.load_releases()
        except (ValidationError, StorageError) as exc:
            # AbstractStorage.load_releases wraps pydantic's ValidationError
            # in a StorageError; unwrap it to report the individual causes.
            cause = exc if isinstance(exc, ValidationError) else exc.__cause__
            error = {
                "error_code": error_code,
                "message": "Invalid releases",
                "provider_code": provider_code,
                "location": "releases.json",
            }
            if isinstance(cause, ValidationError):
                error["cause"] = cause_from_validation_error(cause)
            errors.append(error)

    # Check that all datasets referenced in releases exist on storage.
    error_code = "dataset-not-found-on-storage"
    if error_code not in ignore_errors and releases is not None:
        dataset_codes_on_storage = {
            dataset.dataset_code
            for dataset in storage.iter_datasets_dirs()
        }
        dataset_codes_in_releases = {
            dataset_releases_item.format_release(release.code)
            for dataset_releases_item in releases.dataset_releases
            for release in dataset_releases_item.releases
        }
        not_found_on_storage = dataset_codes_in_releases - dataset_codes_on_storage
        # sorted() makes the error order deterministic (set order is not).
        for dataset_code in sorted(not_found_on_storage):
            errors.append({
                "error_code": error_code,
                "message": f"Dataset {dataset_code!r} declared in releases.json is not found on storage",
                "provider_code": provider_code,
                "dataset_code": dataset_code,
                "location": "releases.json"
            })

    return errors
def validate_dataset(dataset_dir, ignore_errors=[]):
errors = []
provider_code = dataset_dir.storage.provider_code
......@@ -383,6 +450,25 @@ def validate_dataset(dataset_dir, ignore_errors=[]):
"location": dataset_code,
})
# Check that release codes are different from "latest"
error_code = "invalid-release-code"
if error_code not in ignore_errors:
is_invalid = False
try:
_, release_code = parse_dataset_release(dataset_code)
except ValueError:
is_invalid = True
else:
is_invalid = release_code == LATEST_RELEASE
if is_invalid:
errors.append({
"error_code": error_code,
"message": f"Dataset {dataset_code!r} has an invalid release code {LATEST_RELEASE!r}",
"provider_code": provider_code,
"dataset_code": dataset_code,
"location": f"{dataset_code}/dataset.json",
})
return (dataset_json, dataset_series, errors)
......
......@@ -24,6 +24,7 @@ install_requires =
backports-datetime-fromisoformat
dulwich
jsonschema >= 2.6
pydantic
python-dateutil
toolz >= 0.8.2
packages = dbnomics_data_model
......@@ -91,3 +92,6 @@ line_length = 88
[pycodestyle]
max_line_length = 88
[tool:pytest]
addopts = --doctest-modules
\ No newline at end of file
# Dummy provider 6
## What's included
- dataset releases
PERIOD VALUE
2001 3.76
2002 3.8
2003 4.7
PERIOD VALUE
2001 3.76
2002 3.8
2003 4.7
{
"code": "WEO:2020-04",
"name": "World Economic Outlook – Release of 2020, April",
"dimensions_codes_order": [
"freq",
"unit",
"country"
],
"dimensions_labels": {
"freq": "Frequency",
"unit": "Unit of measure",
"country": "Country"
},
"dimensions_values_labels": {
"freq": {
"A": "Annual",
"M": "Monthly"
},
"country": [
[
"FR",
"France"
],
[
"DE",
"Germany"
],
[
"SP",
"Spain"
],
[
"RE",
"Rest of Europe"
]
],
"unit": {
"KG": "Kilogramme",
"GG": "Tonne"
}
},
"doc_href": "https://en.wikipedia.org/wiki/Kilogram#SI_multiples",
"series": [
{
"code": "A.RE.KG",
"name": "Annual Rest of Europe Kilogram",
"dimensions": {
"freq": "A",
"country": "RE",
"unit": "KG"
}
},
{
"code": "A.SP.GG",
"name": "Annual Spain Gigagram",
"dimensions": {
"freq": "A",
"country": "SP",
"unit": "GG"
}
}
]
}
PERIOD VALUE
2001 3.76
2002 3.8
2003 4.7
PERIOD VALUE
2001 3.76
2002 3.8
2003 4.7
{
"code": "WEO:2020-10",
"name": "World Economic Outlook – Release of 2020, October",
"dimensions_codes_order": [
"freq",
"unit",
"country"
],
"dimensions_labels": {
"freq": "Frequency",
"unit": "Unit of measure",
"country": "Country"
},
"dimensions_values_labels": {
"freq": {
"A": "Annual",
"M": "Monthly"
},
"country": [
[
"FR",
"France"
],
[
"DE",
"Germany"
],
[
"SP",
"Spain"
],
[
"RE",
"Rest of Europe"
]
],
"unit": {
"KG": "Kilogramme",
"GG": "Tonne"
}
},
"doc_href": "https://en.wikipedia.org/wiki/Kilogram#SI_multiples",
"series": [
{
"code": "A.RE.KG",
"name": "Annual Rest of Europe Kilogram",
"dimensions": {
"freq": "A",
"country": "RE",
"unit": "KG"
}
},
{
"code": "A.SP.GG",
"name": "Annual Spain Gigagram",
"dimensions": {
"freq": "A"