Commit 92170282 authored by Christophe Benz

Reformat files with black

parent bb595639
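
Note: the exact command used for this reformatting is not recorded in the commit; the line length (120) comes from the `[tool.black]` section further down in this diff (black reads it from `pyproject.toml`). A minimal sketch of how the reformatting could be reproduced or verified from the repository root, assuming black is installed in the environment:

import subprocess

# Rewrite files in place; black picks up line-length = 120 from pyproject.toml.
subprocess.run(["black", "."], check=True)

# Or check without modifying anything: black exits non-zero if any file would change.
subprocess.run(["black", "--check", "--diff", "."], check=True)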
......@@ -32,10 +32,10 @@ from urllib.parse import urljoin
import pandas as pd
import requests
default_api_base_url = os.environ.get('API_URL') or 'https://api.db.nomics.world/v22/'
default_api_base_url = os.environ.get("API_URL") or "https://api.db.nomics.world/v22/"
default_max_nb_series = 50
default_editor_api_base_url = os.environ.get('EDITOR_API_URL') or 'https://editor.nomics.world/api/v1/'
default_editor_api_base_url = os.environ.get("EDITOR_API_URL") or "https://editor.nomics.world/api/v1/"
editor_apply_endpoint_nb_series_per_post = 100
log = logging.getLogger(__name__)
......@@ -46,14 +46,14 @@ class TooManySeries(Exception):
self.num_found = num_found
self.max_nb_series = max_nb_series
message = (
"DBnomics Web API found {num_found} series matching your request, " +
(
"DBnomics Web API found {num_found} series matching your request, "
+ (
"but you passed the argument 'max_nb_series={max_nb_series}'."
if max_nb_series is not None
else "but you did not pass any value for the 'max_nb_series' argument, "
"so a default value of {default_max_nb_series} was used."
) +
" Please give a higher value (at least max_nb_series={num_found}), and try again."
"so a default value of {default_max_nb_series} was used."
)
+ " Please give a higher value (at least max_nb_series={num_found}), and try again."
).format(
default_max_nb_series=default_max_nb_series,
max_nb_series=max_nb_series,
......@@ -62,9 +62,17 @@ class TooManySeries(Exception):
super().__init__(message)
def fetch_series(provider_code=None, dataset_code=None, series_code=None, dimensions=None, series_ids=None,
max_nb_series=None, api_base_url=None,
editor_api_base_url=default_editor_api_base_url, filters=None):
def fetch_series(
provider_code=None,
dataset_code=None,
series_code=None,
dimensions=None,
series_ids=None,
max_nb_series=None,
api_base_url=None,
editor_api_base_url=default_editor_api_base_url,
filters=None,
):
"""Download time series from DBnomics. Filter series by different ways according to the given parameters.
If not `None`, `dimensions` parameter must be a `dict` of dimensions (`list` of `str`), like so:
......@@ -112,7 +120,7 @@ def fetch_series(provider_code=None, dataset_code=None, series_code=None, dimens
global default_api_base_url
if api_base_url is None:
api_base_url = default_api_base_url
if not api_base_url.endswith('/'):
if not api_base_url.endswith("/"):
api_base_url += "/"
if dataset_code is None:
if isinstance(provider_code, list):
......@@ -131,49 +139,69 @@ def fetch_series(provider_code=None, dataset_code=None, series_code=None, dimens
if series_code is not None and not isinstance(series_code, str):
raise ValueError("`series_code` parameter must be a string")
if series_ids is not None and (
not isinstance(series_ids, list) or
any(not isinstance(series_id, str) for series_id in series_ids)
not isinstance(series_ids, list) or any(not isinstance(series_id, str) for series_id in series_ids)
):
raise ValueError("`series_ids` parameter must be a list of strings")
if api_base_url is not None and not isinstance(api_base_url, str):
raise ValueError("`api_base_url` parameter must be a string")
series_base_url = urljoin(api_base_url, 'series')
series_base_url = urljoin(api_base_url, "series")
if dimensions is None and series_code is None and series_ids is None:
if not provider_code or not dataset_code:
raise ValueError("When you don't use `dimensions`, you must specifiy `provider_code` and `dataset_code`.")
api_link = series_base_url + '/{}/{}?observations=1'.format(provider_code, dataset_code)
return fetch_series_by_api_link(api_link, filters=filters, max_nb_series=max_nb_series,
editor_api_base_url=editor_api_base_url)
api_link = series_base_url + "/{}/{}?observations=1".format(provider_code, dataset_code)
return fetch_series_by_api_link(
api_link,
filters=filters,
max_nb_series=max_nb_series,
editor_api_base_url=editor_api_base_url,
)
if dimensions is not None:
if not provider_code or not dataset_code:
raise ValueError("When you use `dimensions`, you must specifiy `provider_code` and `dataset_code`.")
api_link = series_base_url + \
'/{}/{}?observations=1&dimensions={}'.format(provider_code, dataset_code, json.dumps(dimensions))
return fetch_series_by_api_link(api_link, filters=filters, max_nb_series=max_nb_series,
editor_api_base_url=editor_api_base_url)
api_link = series_base_url + "/{}/{}?observations=1&dimensions={}".format(
provider_code, dataset_code, json.dumps(dimensions)
)
return fetch_series_by_api_link(
api_link,
filters=filters,
max_nb_series=max_nb_series,
editor_api_base_url=editor_api_base_url,
)
if series_code is not None:
if not provider_code or not dataset_code:
raise ValueError("When you use `series_code`, you must specifiy `provider_code` and `dataset_code`.")
api_link = series_base_url + '/{}/{}/{}?observations=1'.format(provider_code, dataset_code, series_code)
return fetch_series_by_api_link(api_link, filters=filters, max_nb_series=max_nb_series,
editor_api_base_url=editor_api_base_url)
api_link = series_base_url + "/{}/{}/{}?observations=1".format(provider_code, dataset_code, series_code)
return fetch_series_by_api_link(
api_link,
filters=filters,
max_nb_series=max_nb_series,
editor_api_base_url=editor_api_base_url,
)
if series_ids is not None:
if provider_code or dataset_code:
raise ValueError("When you use `series_ids`, you must not specifiy `provider_code` nor `dataset_code`.")
api_link = series_base_url + '?observations=1&series_ids={}'.format(','.join(series_ids))
return fetch_series_by_api_link(api_link, filters=filters, max_nb_series=max_nb_series,
editor_api_base_url=editor_api_base_url)
api_link = series_base_url + "?observations=1&series_ids={}".format(",".join(series_ids))
return fetch_series_by_api_link(
api_link,
filters=filters,
max_nb_series=max_nb_series,
editor_api_base_url=editor_api_base_url,
)
raise ValueError("Invalid combination of function arguments")
def fetch_series_by_api_link(api_link, max_nb_series=None,
editor_api_base_url=default_editor_api_base_url, filters=None):
def fetch_series_by_api_link(
api_link,
max_nb_series=None,
editor_api_base_url=default_editor_api_base_url,
filters=None,
):
"""Fetch series given an "API link" URL.
"API link" URLs can be found on DBnomics web site (https://db.nomics.world/) on dataset or series pages
......@@ -198,44 +226,64 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
# }
series_list = []
for series_infos in iter_series_infos(api_link, max_nb_series=max_nb_series):
complete_dataset_code = series_infos['series']['provider_code'] + \
'/' + series_infos['series']['dataset_code'] # ex 'AMECO/ZUTN'
complete_dataset_code = (
series_infos["series"]["provider_code"] + "/" + series_infos["series"]["dataset_code"]
) # ex 'AMECO/ZUTN'
if datasets_dimensions is None:
# Let's see if there's only one dataset returned by the API, or many datasets
datasets_dimensions = series_infos['datasets_dimensions'] if 'datasets_dimensions' in series_infos else {
# Only one dataset
complete_dataset_code: series_infos['dataset_dimensions']
}
series_list.append(series_infos['series'])
datasets_dimensions = (
series_infos["datasets_dimensions"]
if "datasets_dimensions" in series_infos
else {
# Only one dataset
complete_dataset_code: series_infos["dataset_dimensions"]
}
)
series_list.append(series_infos["series"])
# Store series dimensions information for future use
series_dims_by_dataset_code[complete_dataset_code][series_infos['series']
['series_code']] = series_infos['series']['dimensions']
series_dims_by_dataset_code[complete_dataset_code][series_infos["series"]["series_code"]] = series_infos[
"series"
]["dimensions"]
if len(series_list) == 0:
return pd.DataFrame()
common_columns = ["@frequency", "provider_code", "dataset_code", "dataset_name", "series_code", "series_name",
"original_period", "period", "original_value", "value"]
common_columns = [
"@frequency",
"provider_code",
"dataset_code",
"dataset_name",
"series_code",
"series_name",
"original_period",
"period",
"original_value",
"value",
]
# Flatten series received from the API to prepare DataFrame creation (rename some keys of the JSON result to match the DataFrame organization)
flat_series_list = []
for series in series_list:
flat_series = flatten_dbnomics_series(series)
# Add dimensions labels to flat_series
complete_dataset_code = flat_series['provider_code'] + '/' + flat_series['dataset_code'] # ex: "AMECO/ZUTN"
complete_dataset_code = flat_series["provider_code"] + "/" + flat_series["dataset_code"] # ex: "AMECO/ZUTN"
dataset_dimensions = datasets_dimensions[complete_dataset_code]
if 'dimensions_labels' in dataset_dimensions:
dataset_dimensions_labels = dataset_dimensions['dimensions_labels']
if "dimensions_labels" in dataset_dimensions:
dataset_dimensions_labels = dataset_dimensions["dimensions_labels"]
else:
dataset_dimensions_labels = {dim_code: "{} (label)".format(dim_code)
for dim_code in dataset_dimensions['dimensions_codes_order']}
dataset_dimensions_labels = {
dim_code: "{} (label)".format(dim_code) for dim_code in dataset_dimensions["dimensions_codes_order"]
}
# Add dimensions values labels to current series
if 'dimensions_values_labels' in dataset_dimensions:
for dimension_code in series['dimensions']:
if "dimensions_values_labels" in dataset_dimensions:
for dimension_code in series["dimensions"]:
dimension_label = dataset_dimensions_labels[dimension_code]
dimension_value_code = series_dims_by_dataset_code[complete_dataset_code][series['series_code']][dimension_code]
flat_series[dimension_label] = dict(dataset_dimensions['dimensions_values_labels']
[dimension_code])[dimension_value_code]
dimension_value_code = series_dims_by_dataset_code[complete_dataset_code][series["series_code"]][
dimension_code
]
flat_series[dimension_label] = dict(dataset_dimensions["dimensions_values_labels"][dimension_code])[
dimension_value_code
]
flat_series_list.append(flat_series)
# Only applies if filters are used.
......@@ -244,26 +292,27 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
common_columns.append("filtered")
filtered_series_list = [
{**series, "filtered": True}
for series in filter_series(series_list=series_list, filters=filters,
editor_api_base_url=editor_api_base_url)
for series in filter_series(
series_list=series_list,
filters=filters,
editor_api_base_url=editor_api_base_url,
)
]
flat_series_list = [
{**series, "filtered": False}
for series in flat_series_list
] + filtered_series_list
flat_series_list = [{**series, "filtered": False} for series in flat_series_list] + filtered_series_list
# Compute dimensions_labels_columns_names and dimensions_codes_columns_names
dimensions_labels_columns_names = []
dimensions_codes_columns_names = []
for complete_dataset_code in datasets_dimensions.keys():
for dimension_code in datasets_dimensions[complete_dataset_code]['dimensions_codes_order']:
for dimension_code in datasets_dimensions[complete_dataset_code]["dimensions_codes_order"]:
dimensions_codes_columns_names.append(dimension_code)
# We only add a dimension labels column if this information is present
if 'dimensions_labels' in dataset_dimensions and 'dimensions_values_labels' in dataset_dimensions:
if "dimensions_labels" in dataset_dimensions and "dimensions_values_labels" in dataset_dimensions:
dimensions_labels_columns_names.append(
datasets_dimensions[complete_dataset_code]['dimensions_labels'][dimension_code])
datasets_dimensions[complete_dataset_code]["dimensions_labels"][dimension_code]
)
else:
if 'dimensions_values_labels' in dataset_dimensions:
if "dimensions_values_labels" in dataset_dimensions:
# No dimensions labels but dimensions_values_labels -> we append " (label)" to the dimension code
dimensions_labels_columns_names.append("{} (label)".format(dimension_code))
# If there is neither dimensions_labels nor dimensions_values_labels, we do not add any column
......@@ -272,35 +321,32 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
ordered_columns_names = common_columns + dimensions_codes_columns_names + dimensions_labels_columns_names
# Build dataframe
dataframes = (
pd.DataFrame(data=series, columns=ordered_columns_names)
for series in flat_series_list
)
dataframes = (pd.DataFrame(data=series, columns=ordered_columns_names) for series in flat_series_list)
return pd.concat(objs=dataframes, sort=False)
def fetch_series_page(series_endpoint_url, offset):
series_page_url = '{}{}offset={}'.format(
series_page_url = "{}{}offset={}".format(
series_endpoint_url,
'&' if '?' in series_endpoint_url else '?',
"&" if "?" in series_endpoint_url else "?",
offset,
)
response = requests.get(series_page_url)
response_json = response.json()
if not response.ok:
message = response_json.get('message')
message = response_json.get("message")
raise ValueError("Could not fetch data from URL {!r} because: {}".format(series_page_url, message))
series_page = response_json.get('series')
series_page = response_json.get("series")
if series_page is not None:
assert series_page['offset'] == offset, (series_page['offset'], offset)
assert series_page["offset"] == offset, (series_page["offset"], offset)
return response_json
def filter_series(series_list, filters, editor_api_base_url=default_editor_api_base_url):
if not editor_api_base_url.endswith('/'):
if not editor_api_base_url.endswith("/"):
editor_api_base_url += "/"
apply_endpoint_url = urljoin(editor_api_base_url, "apply")
return list(iter_filtered_series(series_list, filters, apply_endpoint_url))
......@@ -360,19 +406,17 @@ def iter_series_infos(api_link, max_nb_series=None):
'series':
}
"""
def yield_series(series, response_json):
"""Handle the cases of one-dataset and multi-datasets answer from API"""
assert 'datasets' in response_json or 'dataset' in response_json
if 'datasets' in response_json:
assert "datasets" in response_json or "dataset" in response_json
if "datasets" in response_json:
# Multi-dataset answer
datasets_dimensions_dict = {'datasets_dimensions': response_json['datasets']}
datasets_dimensions_dict = {"datasets_dimensions": response_json["datasets"]}
else:
# Mono-dataset answer
datasets_dimensions_dict = {'dataset_dimensions': response_json['dataset']}
yield {
'series': series,
**datasets_dimensions_dict
}
datasets_dimensions_dict = {"dataset_dimensions": response_json["dataset"]}
yield {"series": series, **datasets_dimensions_dict}
total_nb_series = 0
......@@ -386,11 +430,11 @@ def iter_series_infos(api_link, max_nb_series=None):
series_page = response_json["series"]
num_found = series_page['num_found']
num_found = series_page["num_found"]
if max_nb_series is None and num_found > default_max_nb_series:
raise TooManySeries(num_found, max_nb_series)
page_nb_series = len(series_page['docs'])
page_nb_series = len(series_page["docs"])
total_nb_series += page_nb_series
# If the user asked for a maximum number of series
......@@ -401,16 +445,19 @@ def iter_series_infos(api_link, max_nb_series=None):
elif total_nb_series > max_nb_series:
# Do not return more series than the requested max_nb_series.
nb_remaining_series = page_nb_series - (total_nb_series - max_nb_series)
for series in series_page['docs'][:nb_remaining_series]:
for series in series_page["docs"][:nb_remaining_series]:
yield from yield_series(series, response_json)
break
# If the user didn't ask for a maximum number of series
for series in series_page['docs']:
for series in series_page["docs"]:
yield from yield_series(series, response_json)
# Stop if we downloaded all the series.
assert total_nb_series <= num_found, (total_nb_series, num_found) # Can't download more series than num_found.
assert total_nb_series <= num_found, (
total_nb_series,
num_found,
) # Can't download more series than num_found.
if total_nb_series == num_found:
break
......@@ -481,11 +528,12 @@ def normalize_value(series):
"original_value": value,
"value": [
# None will be replaced by np.NaN in DataFrame construction.
None if v == 'NA' else v
None if v == "NA" else v
for v in value
]
],
}
# UTILS
......
[tool.black]
line-length = 120
[isort]
[tool:isort]
known_first_party = dbnomics
known_third_party = pytest
skip = .git,.tox,dist,build,.eggs,__pycache__,*.egg-info,.venv
# From https://black.readthedocs.io/en/stable/the_black_code_style.html#how-black-wraps-lines
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
use_parentheses = True
line_length = 120
[pycodestyle]
max_line_length = 120
[pylint]
max_line_length = 120
[aliases]
test=pytest
[tool:pytest]
addopts = --doctest-modules
[flake8]
max-line-length = 120
exclude=.git,dist,build,.eggs,__pycache__,*.egg-info,.venv
extend-ignore =
# See https://github.com/PyCQA/pycodestyle/issues/373
E203,
......@@ -30,61 +30,49 @@ from setuptools import find_packages, setup
HERE = path.abspath(path.dirname(__file__))
# Get the long description from the README file
with codecs.open(path.join(HERE, 'README.md'), encoding='utf-8') as f:
with codecs.open(path.join(HERE, "README.md"), encoding="utf-8") as f:
LONG_DESCRIPTION = f.read()
setup(
name='DBnomics',
version='1.2.0',
description='DBnomics Python Client',
name="DBnomics",
version="1.2.0",
description="DBnomics Python Client",
long_description=LONG_DESCRIPTION,
long_description_content_type='text/markdown',
url='https://git.nomics.world/dbnomics/dbnomics-python-client',
author='DBnomics Team',
author_email='contact@nomics.world',
license='AGPLv3',
long_description_content_type="text/markdown",
url="https://git.nomics.world/dbnomics/dbnomics-python-client",
author="DBnomics Team",
author_email="contact@nomics.world",
license="AGPLv3",
# See https://pypi.python.org/pypi?%3Aaction=list_classifiers
classifiers=[
# How mature is this project? Common values are
# 3 - Alpha
# 4 - Beta
# 5 - Production/Stable
'Development Status :: 5 - Production/Stable',
"Development Status :: 5 - Production/Stable",
# Indicate who your project is intended for
'Intended Audience :: Developers',
'Topic :: Scientific/Engineering :: Information Analysis',
'Environment :: Web Environment',
'Operating System :: POSIX',
"Intended Audience :: Developers",
"Topic :: Scientific/Engineering :: Information Analysis",
"Environment :: Web Environment",
"Operating System :: POSIX",
# Pick your license as you wish (should match "license" above)
'License :: OSI Approved :: GNU Affero General Public License v3',
"License :: OSI Approved :: GNU Affero General Public License v3",
# Specify the Python versions you support here. In particular, ensure
# that you indicate whether you support Python 2, Python 3 or both.
'Programming Language :: Python :: 3',
"Programming Language :: Python :: 3",
],
# What does your project relate to?
keywords='economics time-series db.nomics',
keywords="economics time-series db.nomics",
packages=find_packages(),
install_requires=[
'requests >= 2.18.4',
'pandas >= 0.21',
"requests >= 2.18.4",
"pandas >= 0.21",
],
setup_requires=[
'pytest-runner',
"pytest-runner",
],
tests_require=[
'pytest',
'pytest-vcr',
"pytest",
"pytest-vcr",
],
)
......@@ -28,28 +28,30 @@ from dbnomics import default_api_base_url, fetch_series, fetch_series_by_api_lin
def test_fetch_series_with_filter_on_one_series_with_filter_parameter_error(caplog):
filters = [{"code": "interpolate", "parameters": {"foo": "bar"}}]
with caplog.at_level(logging.INFO):
df = fetch_series('AMECO', 'ZUTN', 'DEU.1.0.0.0.ZUTN', filters=filters)
df = fetch_series("AMECO", "ZUTN", "DEU.1.0.0.0.ZUTN", filters=filters)
assert all(df.filtered == False)
assert len(caplog.records) == 1
assert caplog.records[0].levelname == 'ERROR'
assert caplog.records[0].levelname == "ERROR"
assert "Error with filter parameters" in caplog.records[0].message
def test_fetch_series_with_filter_on_one_series_with_wrong_frequency(caplog):
filters = [{"code": "aggregate", "parameters": {"frequency": "annual"}}]
with caplog.at_level(logging.INFO):
df = fetch_series('AMECO', 'ZUTN', 'DEU.1.0.0.0.ZUTN', filters=filters)
df = fetch_series("AMECO", "ZUTN", "DEU.1.0.0.0.ZUTN", filters=filters)
assert all(df.filtered == False)
assert len(caplog.records) == 1
assert caplog.records[0].levelname == 'ERROR'
assert caplog.records[0].levelname == "ERROR"
assert "Annual is already the lowest frequency" in caplog.records[0].message
def test_fetch_series_with_filter_on_one_series_with_filter_error(caplog):
filters = [{"code": "foo", "parameters": {}}]
with caplog.at_level(logging.INFO):
df = fetch_series('AMECO', 'ZUTN', 'DEU.1.0.0.0.ZUTN', filters=filters)
df = fetch_series("AMECO", "ZUTN", "DEU.1.0.0.0.ZUTN", filters=filters)
assert all(df.filtered == False)
assert len(caplog.records) == 1
assert caplog.records[0].levelname == 'ERROR'
assert caplog.records[0].levelname == "ERROR"
assert "Filter not found" in caplog.records[0].message
......@@ -59,7 +61,7 @@ def test_fetch_series_with_filter_on_one_series_with_filter_error(caplog):
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_by_code():
# Thanks to the @pytest.mark.vcr decorator, this request's result will be read from the cassette YAML file (the one that matches the test function name)
df = fetch_series('AMECO', 'ZUTN', 'EA19.1.0.0.0.ZUTN')
df = fetch_series("AMECO", "ZUTN", "EA19.1.0.0.0.ZUTN")
provider_codes = df["provider_code"].unique()
assert len(provider_codes) == 1
......@@ -94,7 +96,7 @@ def test_fetch_series_by_code_mask():
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_by_code_mask_with_plus_in_dimension_code():
# Thanks to the @pytest.mark.vcr decorator, this request's result will be read from the cassette YAML file (the one that matches the test function name)
df = fetch_series('SCB', 'AKIAM', '"J+K"+"G+H".AM0301C1')
df = fetch_series("SCB", "AKIAM", '"J+K"+"G+H".AM0301C1')
provider_codes = df["provider_code"].unique()
assert len(provider_codes) == 1
......@@ -105,13 +107,13 @@ def test_fetch_series_by_code_mask_with_plus_in_dimension_code():
assert dataset_codes[0] == "AKIAM"
series_codes = df["series_code"].unique()
assert set(series_codes) == {'J+K.AM0301C1', 'G+H.AM0301C1'}, series_codes
assert set(series_codes) == {"J+K.AM0301C1", "G+H.AM0301C1"}, series_codes
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_by_id():
# Thanks to the @pytest.mark.vcr decorator, this request's result will be read from the cassette YAML file (the one that matches the test function name)
df = fetch_series('AMECO/ZUTN/EA19.1.0.0.0.ZUTN')
df = fetch_series("AMECO/ZUTN/EA19.1.0.0.0.ZUTN")
provider_codes = df["provider_code"].unique()
assert len(provider_codes) == 1
......@@ -129,10 +131,12 @@ def test_fetch_series_by_id():
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_by_ids_in_same_dataset():
# Thanks to the @pytest.mark.vcr decorator, this request's result will be read from the cassette YAML file (the one that matches the test function name)
df = fetch_series([
'AMECO/ZUTN/EA19.1.0.0.0.ZUTN',
'AMECO/ZUTN/DNK.1.0.0.0.ZUTN',
])
df = fetch_series(
[
"AMECO/ZUTN/EA19.1.0.0.0.ZUTN",
"AMECO/ZUTN/DNK.1.0.0.0.ZUTN",
]
)
provider_codes = df["provider_code"].unique()
assert len(provider_codes) == 1
......@@ -151,10 +155,12 @@ def test_fetch_series_by_ids_in_same_dataset():
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_by_ids_in_different_datasets():
# Thanks to the @pytest.mark.vcr decorator, this request's result will be read from the cassette YAML file (the one that matches the test function name)
df = fetch_series([
'AMECO/ZUTN/EA19.1.0.0.0.ZUTN',
'BIS/cbs/Q.S.5A.4B.F.B.A.A.LC1.A.1C',
])
df = fetch_series(
[
"AMECO/ZUTN/EA19.1.0.0.0.ZUTN",