Commit 631e92d8 authored by Pierre Dittgen's avatar Pierre Dittgen
Browse files

Get rid of *args* global variable

parent 25729c97
......@@ -39,12 +39,10 @@ from dbnomics_data_model.observations import (NOT_AVAILABLE,
detect_period_format_strict,
value_to_float)
args = None # Will be defined by main().
log = logging.getLogger(__name__)
def main():
global args
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('storage_dir', type=Path, help='path of directory '
'containing DBnomics series from a provider (in JSON and TSV formats)')
......@@ -100,23 +98,24 @@ def main():
print(format_error({
**error_from_storage_error(exc),
"location": ".",
}))
}, args.format))
return -1
errors_codes = defaultdict(int)
try:
log.debug("Validating provider...")
_, provider_errors = validate_provider(storage)
_, provider_errors = validate_provider(storage, ignore_errors=args.ignore_errors,
storage_dir_name=args.storage_dir.name)
for error in provider_errors:
errors_codes[error['error_code']] += 1
print(format_error(error))
print(format_error(error, output_format=args.format))
log.debug("Validating category tree...")
category_tree_errors = validate_category_tree(storage)
category_tree_errors = validate_category_tree(storage, ignore_errors=args.ignore_errors)
for error in category_tree_errors:
errors_codes[error['error_code']] += 1
print(format_error(error))
print(format_error(error, output_format=args.format))
log.debug("Validating datasets...")
nb_datasets = storage.get_nb_datasets()
......@@ -126,16 +125,17 @@ def main():
continue
log.debug("Validating dataset %s (%d/%d) (except its series)...", dataset_code, dataset_index, nb_datasets)
_, dataset_series, dataset_errors = validate_dataset(dataset_dir)
_, dataset_series, dataset_errors = validate_dataset(dataset_dir, ignore_errors=args.ignore_errors)
for error in dataset_errors:
errors_codes[error['error_code']] += 1
print(format_error(error))
print(format_error(error, output_format=args.format))
log.debug("Validating series of dataset %r...", dataset_code)
series_errors = validate_series(dataset_dir, dataset_series)
series_errors = validate_series(dataset_dir, dataset_series, ignore_errors=args.ignore_errors,
max_series=args.max_series, max_observations=args.max_observations)
for error in series_errors:
errors_codes[error['error_code']] += 1
print(format_error(error))
print(format_error(error, output_format=args.format))
except KeyboardInterrupt as e:
logging.exception(e)
......@@ -158,9 +158,8 @@ def error_from_storage_error(exc: StorageError):
}
def format_error(error):
global args
if args.format == "jsonl":
def format_error(error, output_format="text"):
if output_format == "jsonl":
return json.dumps(error, sort_keys=True)
sio = io.StringIO()
......@@ -222,7 +221,7 @@ def build_jsonschema_error(errors, base_path=[]):
]
def validate_category_tree(storage):
def validate_category_tree(storage, ignore_errors=[]):
provider_code = storage.provider_code
errors = []
......@@ -230,7 +229,7 @@ def validate_category_tree(storage):
category_tree_json = storage.load_category_tree_json()
if category_tree_json is not None:
error_code = "category-tree-schema"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
log.debug("Validating category_tree.json with JSON schema...")
category_tree_schema_errors = list(validators.category_tree_validator.iter_errors(category_tree_json))
if category_tree_schema_errors:
......@@ -245,7 +244,7 @@ def validate_category_tree(storage):
return errors
def validate_dataset(dataset_dir):
def validate_dataset(dataset_dir, ignore_errors=[]):
errors = []
provider_code = dataset_dir.storage.provider_code
dataset_code = dataset_dir.dataset_code
......@@ -262,7 +261,7 @@ def validate_dataset(dataset_dir):
# Dataset directory name MUST be the dataset code.
error_code = "invalid-dataset-directory-name"
if error_code not in args.ignore_errors and dataset_json["code"] != dataset_code:
if error_code not in ignore_errors and dataset_json["code"] != dataset_code:
errors.append({
"error_code": error_code,
"message": "Dataset code from dataset.json is different than the directory name",
......@@ -278,7 +277,7 @@ def validate_dataset(dataset_dir):
dataset_series = dataset_json.pop("series", None)
error_code = "dataset-schema"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
log.debug("Validating dataset.json with JSON schema (except 'series' property)...")
dataset_schema_errors = list(validators.dataset_validator.iter_errors(dataset_json))
if dataset_schema_errors:
......@@ -294,7 +293,7 @@ def validate_dataset(dataset_dir):
return (dataset_json, dataset_series, errors)
def validate_provider(storage):
def validate_provider(storage, ignore_errors=[], storage_dir_name=None):
"""Yield error dicts"""
errors = []
......@@ -311,7 +310,7 @@ def validate_provider(storage):
provider_code = provider_json.get('code')
error_code = "provider-schema"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
log.debug("Validating provider.json with schema...")
provider_schema_errors = list(validators.provider_validator.iter_errors(provider_json))
if provider_schema_errors:
......@@ -324,16 +323,17 @@ def validate_provider(storage):
})
error_code = "invalid-provider-directory-name"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
log.debug("Validating provider directory name...")
error = validate_provider_directory_name(error_code, provider_code)
error = validate_provider_directory_name(error_code, provider_code,
storage_dir_name=storage_dir_name)
if error is not None:
errors.append(error)
return (provider_json, errors)
def validate_provider_directory_name(error_code, provider_code=None):
def validate_provider_directory_name(error_code, provider_code=None, storage_dir_name=None):
global args
if provider_code is None:
log.debug("Skipped %r validator because provider_code could not be loaded", error_code)
......@@ -343,12 +343,12 @@ def validate_provider_directory_name(error_code, provider_code=None):
"{}-json-data".format(provider_code_lower),
"{}-json-data.git".format(provider_code_lower),
}
if args.storage_dir.name not in valid_directories_names:
if storage_dir_name not in valid_directories_names:
return {
"error_code": error_code,
"message": "Directory name is invalid",
"context": {
"directory_name": args.storage_dir.name,
"directory_name": storage_dir_name,
"valid_directories_names": valid_directories_names,
},
"provider_code": provider_code,
......@@ -357,7 +357,7 @@ def validate_provider_directory_name(error_code, provider_code=None):
return None
def validate_observations(provider_code, dataset_code, series_code, location, observations):
def validate_observations(provider_code, dataset_code, series_code, location, observations, ignore_errors=[]):
errors = []
header = observations[0]
header_nb_columns = len(header)
......@@ -369,7 +369,7 @@ def validate_observations(provider_code, dataset_code, series_code, location, ob
# The two first columns of the header MUST be named `PERIOD` and `VALUE`.
error_code = "invalid-observations-header"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
if get(0, header, default=None) != "PERIOD":
errors.append({
"error_code": error_code,
......@@ -404,7 +404,7 @@ def validate_observations(provider_code, dataset_code, series_code, location, ob
# The `PERIOD` column MUST be sorted in an ascending order.
error_code = "unordered-observations-periods"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
periods = [row[0] for row in rows]
if periods != sorted(periods):
errors.append({
......@@ -423,7 +423,7 @@ def validate_observations(provider_code, dataset_code, series_code, location, ob
if len(rows) == 0:
error_code = "no-observations"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
errors.append({
"error_code": error_code,
"message": "Time series has no observations",
......@@ -440,7 +440,7 @@ def validate_observations(provider_code, dataset_code, series_code, location, ob
# Each row MUST have the same number of columns than the header.
error_code = "invalid-observation-row-size"
if error_code not in args.ignore_errors and nb_columns != header_nb_columns:
if error_code not in ignore_errors and nb_columns != header_nb_columns:
errors.append({
"error_code": error_code,
"message": "Row has {} columns but header has {} columns".format(nb_columns, header_nb_columns),
......@@ -459,7 +459,7 @@ def validate_observations(provider_code, dataset_code, series_code, location, ob
error_code_invalid = "invalid-observation-period-format"
error_code_heterogeneous = "heterogeneous-observations-periods-formats"
if error_code_invalid not in args.ignore_errors and period_format is None:
if error_code_invalid not in ignore_errors and period_format is None:
errors.append({
"error_code": error_code_invalid,
"message": "Period format is invalid",
......@@ -472,7 +472,7 @@ def validate_observations(provider_code, dataset_code, series_code, location, ob
"location": location,
"line_number": line_number,
})
elif error_code_heterogeneous not in args.ignore_errors and \
elif error_code_heterogeneous not in ignore_errors and \
previous_period_format is not None and previous_period_format != period_format:
errors.append({
"error_code": error_code_heterogeneous,
......@@ -492,7 +492,7 @@ def validate_observations(provider_code, dataset_code, series_code, location, ob
# The values of the `VALUE` column MUST either be a float OR be `NA` meaning "not available".
error_code = "invalid-observation-value"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
value = observation[1]
value_float = value_to_float(value)
if value_float != NOT_AVAILABLE and not isinstance(value_float, float):
......@@ -512,7 +512,7 @@ def validate_observations(provider_code, dataset_code, series_code, location, ob
return errors
def validate_series(dataset_dir, dataset_series):
def validate_series(dataset_dir, dataset_series, ignore_errors=[], max_series=None, max_observations=None):
errors = []
provider_code = dataset_dir.storage.provider_code
dataset_code = dataset_dir.dataset_code
......@@ -531,7 +531,7 @@ def validate_series(dataset_dir, dataset_series):
if dataset_series == []:
error_code = "no-series"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
errors.append({
"error_code": error_code,
"message": "Series list is empty in dataset.json",
......@@ -544,7 +544,7 @@ def validate_series(dataset_dir, dataset_series):
series_jsonl_filepath = dataset_dir.path / 'series.jsonl'
if not series_jsonl_filepath.exists():
error_code = "no-series"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
errors.append({
"error_code": error_code,
"message": "No 'series' key in dataset.json but no series.jsonl file either",
......@@ -553,9 +553,9 @@ def validate_series(dataset_dir, dataset_series):
"location": "{}/{}".format(dataset_code, 'dataset.json'),
})
if args.max_series is not None:
log.debug("Validating %d series max", args.max_series)
series_iterator = take(args.max_series, series_iterator)
if max_series is not None:
log.debug("Validating %d series max", max_series)
series_iterator = take(max_series, series_iterator)
validated_series_codes = set()
validated_series_names = set()
......@@ -568,7 +568,7 @@ def validate_series(dataset_dir, dataset_series):
log.debug("Validating series %r...", series_id_str)
error_code = "invalid-series-code"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
series_schema_errors = list(validators.series_validator.iter_errors(series_json))
if series_schema_errors:
errors.append({
......@@ -582,7 +582,7 @@ def validate_series(dataset_dir, dataset_series):
})
error_code = "duplicated-series-code"
if error_code not in args.ignore_errors and series_code in validated_series_codes:
if error_code not in ignore_errors and series_code in validated_series_codes:
errors.append({
"error_code": error_code,
"message": "Series code already met before",
......@@ -593,7 +593,7 @@ def validate_series(dataset_dir, dataset_series):
})
error_code = "duplicate-series-name"
if error_code not in args.ignore_errors and series_name is not None and series_name in validated_series_names:
if error_code not in ignore_errors and series_name is not None and series_name in validated_series_names:
errors.append({
"error_code": error_code,
"message": "Series name already met before",
......@@ -615,7 +615,7 @@ def validate_series(dataset_dir, dataset_series):
series_jsonl_file_name = dataset_dir.get_series_jsonl_file_name(dataset_series=dataset_series)
if series_jsonl_file_name is None:
error_code = "unsupported-observations-in-dataset-json"
if error_code not in args.ignore_errors:
if error_code not in ignore_errors:
errors.append({
"error_code": error_code,
"message": "Having observations in 'series' property of dataset.json is not supported",
......@@ -629,7 +629,8 @@ def validate_series(dataset_dir, dataset_series):
log.debug("Validating observations (in JSON lines format) of series %r...", series_id_str)
location = str(series_file_path.relative_to(dataset_dir.storage.path))
observations_errors = validate_observations(
provider_code, dataset_code, series_code, location, observations)
provider_code, dataset_code, series_code, location, observations,
ignore_errors=ignore_errors)
errors.extend(observations_errors)
validated_series_codes.add(series_code)
......@@ -640,16 +641,17 @@ def validate_series(dataset_dir, dataset_series):
observations_iterator = dataset_dir.iter_observations(observations_series_codes)
if args.max_observations is not None:
log.debug("Validating %d observations max", args.max_observations)
observations_iterator = take(args.max_observations, observations_iterator)
if max_observations is not None:
log.debug("Validating %d observations max", max_observations)
observations_iterator = take(max_observations, observations_iterator)
for series_code, file_path, observations in observations_iterator:
series_id_str = dataset_dir.series_id_str(series_code)
log.debug("Validating observations of series %r...", series_id_str)
location = str(file_path.relative_to(dataset_dir.storage.path))
observations_errors = validate_observations(
provider_code, dataset_code, series_code, location, observations)
provider_code, dataset_code, series_code, location, observations,
ignore_errors=ignore_errors)
errors.extend(observations_errors)
return errors
......
Markdown is supported
0% — Attach a file by drag & drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment