Commit 2f701784 authored by Christophe Benz

Merge branch 'issue_482-Python_client-Add_dimensions_labels_to_dataframes' into 'master'

#482 python client - add dimensions labels to dataframes

Closes dbnomics-fetchers/management#482
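As an illustrative sketch (not part of the MR description; the 'geo' dimension code and its 'Country' label are assumptions), the new behavior should look roughly like:

    from dbnomics import fetch_series
    df = fetch_series('AMECO', 'ZUTN')
    # Besides the dimension code columns (e.g. 'geo'), the returned DataFrame now also
    # carries label columns (e.g. 'Country') whenever the API provides the labels.
    print(df[['geo', 'Country']].drop_duplicates())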

See merge request !4
parents 8e244d0b 6a5bbfed
Pipeline #87236 failed in 1 minute
@@ -9,7 +9,7 @@ Run tests:
stage: test
image: python:3.7-slim-stretch
before_script:
- pip install pytest
- pip install pytest pytest-vcr
- pip install --editable .
script:
- pytest
@@ -8,7 +8,7 @@ This project relies on [Python Pandas](https://pandas.pydata.org/).
A tutorial is available as a [Jupyter notebook](./index.ipynb).
The "Binder" tool allows you to run it interactively in your browser. Click on [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/git/https%3A%2F%2Fgit.nomics.world%2Fdbnomics%2Fdbnomics-python-client.git/2817d45bee4d5cb0e88d4ffd19bcbd4f6248fb9b?filepath=index.ipynb) then wait a couple of seconds. After loading a list of files should be displayed. Click on `index.ipynb` to open the tutorial notebook, where you'll be able to play with the DBnomics Python client.
The "Binder" tool allows you to run it interactively in your browser. Click on [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/git/https%3A%2F%2Fgit.nomics.world%2Fdbnomics%2Fdbnomics-python-client.git/master?filepath=index.ipynb) then wait a couple of seconds. After loading a list of files should be displayed. Click on `index.ipynb` to open the tutorial notebook, where you'll be able to play with the DBnomics Python client.
## Install
@@ -21,6 +21,7 @@
"""Access DBnomics time series from Python."""
from collections import defaultdict
import itertools
import json
import logging
@@ -183,8 +184,32 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
Example:
fetch_series(api_link="https://api.db.nomics.world/v22/series?provider_code=AMECO&dataset_code=ZUTN")
"""
# Call API via `iter_series`, store result in `series_list`.
series_list = list(iter_series(api_link, max_nb_series=max_nb_series))
# Call the API via `iter_series_infos`, store the series in `series_list`, and fill `datasets_dimensions` and `series_dims_by_dataset_code`.
datasets_dimensions = None
series_dims_by_dataset_code = defaultdict(dict)
# series_dims_by_dataset_code example:
# {
# 'WB/DB': {
# 'EA19.1.0.0.0.ZUTN': { 'freq':'a', 'geo':'ea19', 'unit':'percentage-of-active-population'},
# 'EA20.1.0.0.0.ZUTN': { 'freq':'a', 'geo':'ea20', 'unit':'percentage-of-active-population'},
# ...
# },
# ...
# }
series_list = []
for series_infos in iter_series_infos(api_link, max_nb_series=max_nb_series):
complete_dataset_code = series_infos['series']['provider_code'] + \
    '/' + series_infos['series']['dataset_code']  # e.g. 'AMECO/ZUTN'
if datasets_dimensions is None:
# Determine whether the API returned a single dataset or several
datasets_dimensions = series_infos['datasets_dimensions'] if 'datasets_dimensions' in series_infos else {
# Only one dataset
complete_dataset_code: series_infos['dataset_dimensions']
}
series_list.append(series_infos['series'])
# Store series dimensions information for future use
series_dims_by_dataset_code[complete_dataset_code][series_infos['series']['series_code']] = \
    series_infos['series']['dimensions']
if len(series_list) == 0:
return pd.DataFrame()
@@ -192,8 +217,25 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
common_columns = ["@frequency", "provider_code", "dataset_code", "dataset_name", "series_code", "series_name",
"original_period", "period", "original_value", "value"]
# Normalize series received from the API (rename some keys of JSON result to match DataFrame organization).
normalized_series_list = list(map(normalize_dbnomics_series, series_list))
# Flatten series received from the API (rename some keys of the JSON result to match the DataFrame organization)
flat_series_list = []
for series in series_list:
flat_series = flatten_dbnomics_series(series)
# Add dimensions labels to flat_series
complete_dataset_code = flat_series['provider_code'] + '/' + flat_series['dataset_code']  # e.g. "AMECO/ZUTN"
dataset_dimensions = datasets_dimensions[complete_dataset_code]
if 'dimensions_labels' in dataset_dimensions:
dataset_dimensions_labels = dataset_dimensions['dimensions_labels']
else:
dataset_dimensions_labels = {dim_code: "{} (label)".format(dim_code)
for dim_code in dataset_dimensions['dimensions_codes_order']}
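# e.g. for dimensions_codes_order ['freq', 'unit', 'geo'] this gives
# {'freq': 'freq (label)', 'unit': 'unit (label)', 'geo': 'geo (label)'}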
# Add dimension value labels to the current series
if 'dimensions_values_labels' in dataset_dimensions:
for dimension_code, dimension_label in dataset_dimensions_labels.items():
dimension_value_code = series_dims_by_dataset_code[complete_dataset_code][series['series_code']][dimension_code]
# dict() also accepts a list of [code, label] pairs, which is how the API may encode dimensions_values_labels
flat_series[dimension_label] = dict(
    dataset_dimensions['dimensions_values_labels'][dimension_code])[dimension_value_code]
flat_series_list.append(flat_series)
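# Illustrative shape of a flat series after this loop (values assumed, not taken from a real API answer):
# {
#     'provider_code': 'AMECO', 'dataset_code': 'ZUTN', 'series_code': 'EA19.1.0.0.0.ZUTN',
#     'freq': 'a', 'Frequency': 'Annually',  # dimension code key + its label key
#     'period': [...], 'value': [...], ...
# }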
# Only applies if filters are used.
if filters:
@@ -204,30 +246,34 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
for series in filter_series(series_list=series_list, filters=filters,
editor_api_base_url=editor_api_base_url)
]
normalized_series_list = [
flat_series_list = [
{**series, "filtered": False}
for series in normalized_series_list
for series in flat_series_list
] + filtered_series_list
# `normalized_series_list` is a list of dicts like [{"code": "A.B.C", "a_key": 9}, {"code": "X.Y.Z", "other_key": 42}]
# Each series can have different keys so we want to do the union of all the keys of all the series. {"code", "a_key", "other_key"}
# In the DataFrame the different columns will be sparse (there will be `NaN` values when a series does not have a specific key).
# code | a_key | other_key
# ----- | ----- | ---------
# A.B.C | 9 | NaN
# X.Y.Z | NaN | 42
def union_sets(sets):
return set.union(*sets)
all_columns = union_sets([set(series.keys()) for series in normalized_series_list])
dimension_columns = sorted(all_columns - set(common_columns))
# Compute dimensions_labels_columns_names and dimensions_codes_columns_names
dimensions_labels_columns_names = []
dimensions_codes_columns_names = []
for complete_dataset_code in datasets_dimensions.keys():
    dataset_dimensions = datasets_dimensions[complete_dataset_code]
    for dimension_code in dataset_dimensions['dimensions_codes_order']:
        dimensions_codes_columns_names.append(dimension_code)
        # We only add a dimension labels column if this information is present
        if 'dimensions_labels' in dataset_dimensions and 'dimensions_values_labels' in dataset_dimensions:
            dimensions_labels_columns_names.append(dataset_dimensions['dimensions_labels'][dimension_code])
        elif 'dimensions_values_labels' in dataset_dimensions:
            # No dimensions_labels, but dimensions_values_labels is present -> append " (label)" to the dimension code
            dimensions_labels_columns_names.append("{} (label)".format(dimension_code))
        # If neither dimensions_labels nor dimensions_values_labels is present, we do not add any label column
# In the DataFrame we want to display the dimension columns on the right, so we reorder them.
ordered_columns = common_columns + dimension_columns
ordered_columns_names = common_columns + dimensions_codes_columns_names + dimensions_labels_columns_names
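# Illustrative only (assuming a single dataset AMECO/ZUTN with dimensions_codes_order
# ['freq', 'unit', 'geo'] and hypothetical labels 'Frequency', 'Unit', 'Country'):
# ordered_columns_names == common_columns + ['freq', 'unit', 'geo'] + ['Frequency', 'Unit', 'Country']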
# Build dataframe
dataframes = (
pd.DataFrame(data=series, columns=ordered_columns)
for series in normalized_series_list
pd.DataFrame(data=series, columns=ordered_columns_names)
for series in flat_series_list
)
return pd.concat(objs=dataframes, sort=False)
@@ -285,10 +331,48 @@ def iter_filtered_series(series_list, filters, apply_endpoint_url):
continue
for dbnomics_series, filter_result in zip(series_group, filter_results):
yield normalize_editor_series(series=filter_result["series"], dbnomics_series=dbnomics_series)
yield flatten_editor_series(series=filter_result["series"], dbnomics_series=dbnomics_series)
def iter_series_infos(api_link, max_nb_series=None):
"""Iterate through series.docs returned by API
Returns dicts of dataset(s) dimensions and series.
The answer can have a key 'dataset_dimensions' if only one dataset is returned by API, or 'datasets_dimensions' if
more than one dataset is returned.
- datasets_dimensions or dataset_dimensions don't change between calls
- series is the current series
Example:
{
'datasets_dimensions': {
"AMECO/ZUTN": {
"code": "ZUTN",
"converted_at": "2019-05-08T02:51:04Z",
"dimensions_codes_order": ["freq", "unit", "geo" ...],
...
},
"CEPII/CHELEM-TRADE-GTAP": {
"code": "CHELEM-TRADE-GTAP",
"converted_at": "2019-01-29T15:53:30Z",
"dimensions_codes_order": ["exporter", "importer", "secgroup", ...],
...
},
'series':
}
"""
def yield_series(series, response_json):
"""Handle the cases of one-dataset and multi-datasets answer from API"""
assert 'datasets' in response_json or 'dataset' in response_json
if 'datasets' in response_json:
# Multi-dataset answer
datasets_dimensions_dict = {'datasets_dimensions': response_json['datasets']}
else:
# Single-dataset answer
datasets_dimensions_dict = {'dataset_dimensions': response_json['dataset']}
yield {
'series': series,
**datasets_dimensions_dict
}
def iter_series(api_link, max_nb_series=None):
total_nb_series = 0
while True:
@@ -308,17 +392,21 @@ def iter_series(api_link, max_nb_series=None):
page_nb_series = len(series_page['docs'])
total_nb_series += page_nb_series
# Stop if we have enough series.
# If the user asked for a maximum number of series
if max_nb_series is not None:
if total_nb_series == max_nb_series:
# Stop if we have enough series.
break
elif total_nb_series > max_nb_series:
# Do not yield more series than the requested max_nb_series.
nb_remaining_series = page_nb_series - (total_nb_series - max_nb_series)
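# Illustrative arithmetic (values assumed): with max_nb_series=1500, page_nb_series=1000
# and total_nb_series=2000, nb_remaining_series = 1000 - (2000 - 1500) = 500,
# so only the first 500 series of this page are yielded.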
yield from series_page['docs'][:nb_remaining_series]
for series in series_page['docs'][:nb_remaining_series]:
yield from yield_series(series, response_json)
break
yield from series_page['docs']
# Either no maximum was requested or it has not been reached yet: yield the whole page
for series in series_page['docs']:
yield from yield_series(series, response_json)
# Stop if we downloaded all the series.
assert total_nb_series <= num_found, (total_nb_series, num_found) # Can't download more series than num_found.
@@ -326,7 +414,7 @@
break
def normalize_dbnomics_series(series):
def flatten_dbnomics_series(series):
"""Adapt DBnomics series attributes to ease DataFrame construction.
Rename some dict attributes and flatten others.
@@ -352,7 +440,7 @@ def normalize_dbnomics_series(series):
return series
def normalize_editor_series(series, dbnomics_series):
def flatten_editor_series(series, dbnomics_series):
"""Adapt Time Series Editor series attributes to ease DataFrame construction."""
series = normalize_period(series)
series = normalize_value(series)
@@ -85,5 +85,6 @@ setup(
],
tests_require=[
'pytest',
'pytest-vcr',
],
)
cassettes/test_fetch_series_without_dimensions_labels_nor_dimensions_values_labels.yaml:
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.21.0
method: GET
uri: https://api.db.nomics.world/v22/series?observations=1&series_ids=simu/lated/series2&offset=0
response:
body:
string: |
{ "_meta": {
"args": {
"align_periods": false,
"complete_missing_periods": false,
"facets": false,
"format": "json",
"limit": 1000,
"metadata": true,
"observations": true,
"offset": 0,
"series_ids": [
[
"BLS",
"is",
"ISU00000000000000"
]
]
},
"version": "22.0.0"
},
"datasets": {
"BLS/is": {
"code": "is",
"converted_at": "2019-04-12T16:59:00Z",
"created_at": "2018-01-17T12:12:26Z",
"description": "Occupational injury and illness counts and incidence rates are annual measures of the level and frequency of work-related injuries and illnesses. The rate equals the form of the number of injuries and illnesses per 100 full-time employees.",
"dimensions_codes_order": [
"area",
"case_type",
"data_type",
"industry",
"supersector"
],
"indexed_at": "2019-04-13T07:51:04.095Z",
"json_data_commit_ref": "0ace11084beae68061408e3a2cf9c02f6060c5cf",
"name": "Occupational Injuries and Illnesses: Industry Data (2014 forward)",
"nb_series": 809690,
"provider_code": "BLS",
"provider_name": "U.S. Bureau of Labor Statistics"
}
},
"errors": null,
"providers": {
"BLS": {
"code": "BLS",
"converted_at": "2019-09-07T09:00:40Z",
"created_at": "2017-11-15T16:25:33Z",
"indexed_at": "2019-09-07T09:02:20.155Z",
"json_data_commit_ref": "857b0b8cc3dfdc476c79fbda3f22f7eaec216a81",
"name": "U.S. Bureau of Labor Statistics",
"region": "US",
"slug": "bls",
"website": "https://www.bls.gov/"
}
},
"series": {
"docs": [
{
"@frequency": "annual",
"dataset_code": "is",
"dataset_name": "Occupational Injuries and Illnesses: Industry Data (2014 forward)",
"dimensions": {
"area": "000",
"case_type": "0",
"data_type": "0",
"industry": "000000",
"supersector": "000"
},
"indexed_at": "2019-04-13T07:51:04.095Z",
"period": [
"2014",
"2015",
"2016",
"2017"
],
"period_start_day": [
"2014-01-01",
"2015-01-01",
"2016-01-01",
"2017-01-01"
],
"provider_code": "BLS",
"series_code": "ISU00000000000000",
"series_name": "All ownerships, All U.S. – RSE of Total recordable cases – Rate of illness cases per 10,000 full-time workers (Size class 0) – All workers – All",
"value": [
1.6,
1.7,
1.6,
1.5
]
}
],
"limit": 1000,
"num_found": 1,
"offset": 0
}
}
headers:
Access-Control-Allow-Origin:
- '*'
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Tue, 10 Sep 2019 10:45:46 GMT
Server:
- nginx/1.10.3
Transfer-Encoding:
- chunked
content-length:
- '76323'
status:
code: 200
message: OK
version: 1
cassettes/test_fetch_series_without_dimensions_values_labels.yaml:
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.21.0
method: GET
uri: https://api.db.nomics.world/v22/series?observations=1&series_ids=simu/lated/series1&offset=0
response:
body:
string: |
{ "_meta": {
"args": {
"align_periods": false,
"complete_missing_periods": false,
"facets": false,
"format": "json",
"limit": 1000,
"metadata": true,
"observations": true,
"offset": 0,
"series_ids": [
[
"BLS",
"is",
"ISU00000000000000"
]
]
},
"version": "22.0.0"
},
"datasets": {
"BLS/is": {
"code": "is",
"converted_at": "2019-04-12T16:59:00Z",
"created_at": "2018-01-17T12:12:26Z",
"description": "Occupational injury and illness counts and incidence rates are annual measures of the level and frequency of work-related injuries and illnesses. The rate equals the form of the number of injuries and illnesses per 100 full-time employees.",
"dimensions_codes_order": [
"area",
"case_type",
"data_type",
"industry",
"supersector"
],
"dimensions_labels": {
"area": "Area",
"case_type": "Case Type",
"data_type": "Data Type",
"industry": "Industry",
"supersector": "Supersector"
},
"indexed_at": "2019-04-13T07:51:04.095Z",
"json_data_commit_ref": "0ace11084beae68061408e3a2cf9c02f6060c5cf",
"name": "Occupational Injuries and Illnesses: Industry Data (2014 forward)",
"nb_series": 809690,
"provider_code": "BLS",
"provider_name": "U.S. Bureau of Labor Statistics"
}
},
"errors": null,
"providers": {
"BLS": {
"code": "BLS",
"converted_at": "2019-09-07T09:00:40Z",
"created_at": "2017-11-15T16:25:33Z",
"indexed_at": "2019-09-07T09:02:20.155Z",
"json_data_commit_ref": "857b0b8cc3dfdc476c79fbda3f22f7eaec216a81",
"name": "U.S. Bureau of Labor Statistics",
"region": "US",
"slug": "bls",
"website": "https://www.bls.gov/"
}
},
"series": {
"docs": [
{
"@frequency": "annual",
"dataset_code": "is",
"dataset_name": "Occupational Injuries and Illnesses: Industry Data (2014 forward)",
"dimensions": {
"area": "000",
"case_type": "0",
"data_type": "0",
"industry": "000000",
"supersector": "000"
},
"indexed_at": "2019-04-13T07:51:04.095Z",
"period": [
"2014",
"2015",
"2016",
"2017"
],
"period_start_day": [
"2014-01-01",
"2015-01-01",
"2016-01-01",
"2017-01-01"
],
"provider_code": "BLS",
"series_code": "ISU00000000000000",
"series_name": "All ownerships, All U.S. – RSE of Total recordable cases – Rate of illness cases per 10,000 full-time workers (Size class 0) – All workers – All",
"value": [
1.6,
1.7,
1.6,
1.5
]
}
],
"limit": 1000,
"num_found": 1,
"offset": 0
}
}
headers:
Access-Control-Allow-Origin:
- '*'
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Tue, 10 Sep 2019 10:45:46 GMT
Server:
- nginx/1.10.3
Transfer-Encoding:
- chunked
content-length:
- '76323'
status:
code: 200
message: OK
version: 1
@@ -19,6 +19,7 @@
import logging
import pytest
import pandas as pd
@@ -228,3 +229,41 @@ def test_fetch_series_with_filter_on_one_series_with_wrong_frequency(caplog):
assert len(caplog.records) == 1
assert caplog.records[0].levelname == 'ERROR'
assert "Annual is already the lowest frequency" in caplog.records[0].message
# --- The tests below use VCR.py (https://vcrpy.readthedocs.io/en/latest/usage.html) to load fixtures ---
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_without_dimensions_labels():
df = fetch_series("WB", "DB", dimensions={
"country": ["FR", "IT", "ES"],
"indicator": ["IC.REG.COST.PC.FE.ZS.DRFN"],
}) # Thanks to @pytest.mark.vcr decorator, this request result will be read from cassettes/test_fetch_series_without_dimensions_labels.yaml file
# Check that all expected columns are present
expected_columns = {'indicator', 'country', 'indicator (label)', 'country (label)'}
assert expected_columns & set(df.columns) == expected_columns, set(df.columns)
# Check dimensions and dimensions_values_labels
df_line = df.iloc[30]
assert df_line['country'] == 'FR'
assert df_line['country (label)'] == 'France'
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_without_dimensions_values_labels():
# Thanks to the @pytest.mark.vcr decorator, this request's result is read from the cassettes/test_fetch_series_without_dimensions_values_labels.yaml file
df = fetch_series('simu/lated/series1')
# In the absence of dimensions_values_labels, no dimension label columns should be added
assert 'Data Type' not in df.columns
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_without_dimensions_labels_nor_dimensions_values_labels():
# Thanks to the @pytest.mark.vcr decorator, this request's result is read from the cassettes/test_fetch_series_without_dimensions_labels_nor_dimensions_values_labels.yaml file
df = fetch_series('simu/lated/series2')
# The dimension label columns shouldn't exist either
assert 'Data Type' not in df.columns