Commit 50a76d2c authored by Christophe Benz's avatar Christophe Benz
Browse files

Add invalid-release-code checks to validate script

parent 3bff4b6c
Pipeline #185503 passed with stage
in 22 seconds
......@@ -33,7 +33,7 @@ from pathlib import Path
import jsonschema.exceptions
from toolz import get, take
from dbnomics_data_model import storages, validators
from dbnomics_data_model import datasets, storages, validators
from dbnomics_data_model.exceptions import StorageError
from dbnomics_data_model.observations import NOT_AVAILABLE, detect_period_format_strict, value_to_float
......@@ -121,7 +121,7 @@ def main():
for error in category_tree_errors:
errors_codes_counts[error['error_code']] += 1
print(format_error(error, output_format=args.format))
log.debug("Validating releases ...")
releases_errors = validate_releases(storage, ignore_errors=ignore_errors)
for error in releases_errors:
......@@ -312,12 +312,12 @@ def validate_category_tree(storage, ignore_errors=[]):
return errors
def iter_dataset_code_prefix_and_release_code(releases):
    """Yield all dataset code prefixes and release codes.

    Iterate over a parsed ``releases.json`` instance and yield one
    ``(dataset_code_prefix, release_code)`` tuple per declared release.

    Parameters:
        releases: dict loaded from ``releases.json``; expected to contain a
            ``"dataset_releases"`` list of entries, each with a
            ``"dataset_code_prefix"`` string and a ``"releases"`` list of
            dicts carrying a ``"code"`` key.

    Yields:
        tuple[str, str]: ``(dataset_code_prefix, release_code)`` pairs.

    Notes:
        Missing ``"dataset_releases"`` yields nothing; a malformed entry
        (missing ``"dataset_code_prefix"``, ``"releases"`` or ``"code"``)
        raises ``KeyError``, surfacing the schema problem to the caller.
    """
    for dataset_release in releases.get("dataset_releases", []):
        code_prefix = dataset_release["dataset_code_prefix"]
        for release in dataset_release["releases"]:
            yield (code_prefix, release["code"])
def validate_releases(storage, ignore_errors=[]):
......@@ -343,11 +343,29 @@ def validate_releases(storage, ignore_errors=[]):
})
dataset_codes_on_storage = set(dataset.dataset_code for dataset in storage.iter_datasets_dirs())
datasets_codes_in_releases = set(iter_releases_dataset_code(releases_json))
dataset_code_prefix_and_release_code = set(iter_dataset_code_prefix_and_release_code(releases_json))
# Check that release codes are different that "latest"
error_code = "invalid-release-code"
if error_code not in ignore_errors:
for dataset_code_prefix, release_code in dataset_code_prefix_and_release_code:
if release_code == datasets.LATEST_RELEASE:
dataset_code = datasets.format_dataset_release(dataset_code_prefix, release_code)
errors.append({
"error_code": error_code,
"message": f"Dataset {dataset_code!r} has an invalid release code {datasets.LATEST_RELEASE!r}",
"provider_code": provider_code,
"dataset_code": dataset_code,
"location": "releases.json"
})
# Check that all datasets referenced in releases.json are present on disk
error_code = "dataset-not-found-on-storage"
if error_code not in ignore_errors:
datasets_codes_in_releases = set(
datasets.format_dataset_release(dataset_code_prefix, release_code)
for dataset_code_prefix, release_code in dataset_code_prefix_and_release_code
)
not_found_on_storage = datasets_codes_in_releases - dataset_codes_on_storage
for dataset_code in not_found_on_storage:
errors.append({
......@@ -439,6 +457,19 @@ def validate_dataset(dataset_dir, ignore_errors=[]):
"location": dataset_code,
})
# Check that release codes are different that "latest"
error_code = "invalid-release-code"
if error_code not in ignore_errors:
_, release_code = datasets.parse_dataset_release(dataset_code)
if release_code == datasets.LATEST_RELEASE:
errors.append({
"error_code": error_code,
"message": f"Dataset {dataset_code!r} has an invalid release code {datasets.LATEST_RELEASE!r}",
"provider_code": provider_code,
"dataset_code": dataset_code,
"location": f"{dataset_code}/dataset.json",
})
return (dataset_json, dataset_series, errors)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment