Commit 839db1b7 authored by Bruno Duyé's avatar Bruno Duyé

--no commit message

--no commit message
parent 91bf6ff1
Pipeline #86982 passed with stage
in 40 seconds
......@@ -9,7 +9,7 @@ Run tests:
stage: test
image: python:3.7-slim-stretch
before_script:
- pip install pytest
- pip install pytest pytest-vcr
- pip install --editable .
script:
- pytest
......
......@@ -21,6 +21,7 @@
"""Access DBnomics time series from Python."""
from collections import defaultdict
import itertools
import json
import logging
......@@ -28,6 +29,8 @@ import os
import urllib.parse
from urllib.parse import urljoin
import vcr
import pandas as pd
import requests
......@@ -40,6 +43,13 @@ editor_apply_endpoint_nb_series_per_post = 100
# Module-level logger for this client library.
log = logging.getLogger(__name__)
# VCR instance used to record/replay HTTP exchanges as cassette files.
# record_mode='once': record a cassette on first run, replay it afterwards.
# decode_compressed_response=True stores responses uncompressed so the
# cassette YAML stays human-readable and diffable.
my_vcr = vcr.VCR(
    cassette_library_dir='fixtures/cassettes',
    record_mode='once',
    decode_compressed_response=True,
)
class TooManySeries(Exception):
def __init__(self, num_found, max_nb_series):
self.num_found = num_found
......@@ -185,18 +195,30 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
"""
# Call API via `iter_series`, add dimensions labels and store result in `series_list`. Fill `datasets_dimensions`
datasets_dimensions = None
series_dims_by_dataset_code = {} # TODO doc
series_dims_by_dataset_code = defaultdict(dict)
# series_dims_by_dataset_code example:
# {
# 'WB/DB': {
# 'EA19.1.0.0.0.ZUTN': { 'freq':'a', 'geo':'ea19', 'unit':'percentage-of-active-population'},
# 'EA20.1.0.0.0.ZUTN': { 'freq':'a', 'geo':'ea20', 'unit':'percentage-of-active-population'},
# ...
# },
# ...
# }
series_list = []
for series_infos in iter_series(api_link, max_nb_series=max_nb_series):
complete_dataset_code = series_infos['series']['provider_code'] + \
'/' + series_infos['series']['dataset_code'] # ex 'AMECO/ZUTN'
if datasets_dimensions is None:
# Let see if there's only one dataset returned by API, or many datasets
datasets_dimensions = series_infos['datasets_dimensions'] if 'datasets_dimensions' in series_infos else {
# Only one dataset
complete_dataset_code: series_infos['dataset_dimensions']
}
series_list.append(series_infos['series'])
# Store series dimensions information for future use
series_dims_by_dataset_code[complete_dataset_code] = series_infos['series']['dimensions']
series_dims_by_dataset_code[complete_dataset_code][series_infos['series']
['series_code']] = series_infos['series']['dimensions']
if len(series_list) == 0:
return pd.DataFrame()
......@@ -211,10 +233,17 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
# Add dimensions labels to flat_series
complete_dataset_code = flat_series['provider_code'] + '/' + flat_series['dataset_code'] # ex: "AMECO/ZUTN"
dataset_dimensions = datasets_dimensions[complete_dataset_code]
for dimension_code, dimension_label in dataset_dimensions['dimensions_labels'].items():
dimension_value_code = series_dims_by_dataset_code[complete_dataset_code][dimension_code]
flat_series[dimension_label] = dict(dataset_dimensions['dimensions_values_labels']
[dimension_code])[dimension_value_code]
if 'dimensions_labels' in dataset_dimensions:
dataset_dimensions_labels = dataset_dimensions['dimensions_labels']
else:
dataset_dimensions_labels = {dim_code: "{} (label)".format(dim_code)
for dim_code in dataset_dimensions['dimensions_codes_order']}
# Add dimensions values labels to current series
if 'dimensions_values_labels' in dataset_dimensions:
for dimension_code, dimension_label in dataset_dimensions_labels.items():
dimension_value_code = series_dims_by_dataset_code[complete_dataset_code][series['series_code']][dimension_code]
flat_series[dimension_label] = dict(dataset_dimensions['dimensions_values_labels']
[dimension_code])[dimension_value_code]
flat_series_list.append(flat_series)
# Only applies if filters are used.
......@@ -231,32 +260,28 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
for series in flat_series_list
] + filtered_series_list
# `flat_series_list` is a list of dicts like [{"code": "A.B.C", "a_key": 9}, {"code": "X.Y.Z", "other_key": 42}]
# Each series can have different keys so we want to do the union of all the keys of all the series. {"code", "a_key", "other_key"}
# In the DataFrame the different columns will be sparse (there will be `NaN` values when a series does not have a specific key).
# code | a_key | other_key
# ----- | ----- | ---------
# A.B.C | 9 | NaN
# X.Y.Z | NaN | 42
def union_sets(sets):
    """Return the union of an iterable of sets.

    Uses ``set().union(*sets)`` rather than ``set.union(*sets)`` so that an
    empty iterable yields an empty set instead of raising ``TypeError``
    (the unbound ``set.union`` needs at least one positional argument).
    """
    return set().union(*sets)
all_columns = union_sets([set(series.keys()) for series in flat_series_list])
# Compute dimensions_labels_columns and dimensions_codes_columns
dimensions_labels_columns = []
dimensions_codes_columns = []
# Compute dimensions_labels_columns_names and dimensions_codes_columns_names
dimensions_labels_columns_names = []
dimensions_codes_columns_names = []
for complete_dataset_code in datasets_dimensions.keys():
for dimension_code in datasets_dimensions[complete_dataset_code]['dimensions_codes_order']:
dimensions_codes_columns.append(dimension_code)
dimensions_labels_columns.append(
datasets_dimensions[complete_dataset_code]['dimensions_labels'][dimension_code])
dimensions_codes_columns_names.append(dimension_code)
# We only add dimensions labels column if this information is present
if 'dimensions_labels' in dataset_dimensions and 'dimensions_values_labels' in dataset_dimensions:
dimensions_labels_columns_names.append(
datasets_dimensions[complete_dataset_code]['dimensions_labels'][dimension_code])
else:
if 'dimensions_values_labels' in dataset_dimensions:
# No dimensions labels but dimensions_values_labels -> we add " (label)" to the end of dimension code
dimensions_labels_columns_names.append("{} (label)".format(dimension_code))
# In the case there's no dimension_label nor dimensions_values_labels, we do not add any column
# In the DataFrame we want to display the dimension columns at the right so we reorder them.
ordered_columns = common_columns + dimensions_codes_columns + dimensions_labels_columns
ordered_columns_names = common_columns + dimensions_codes_columns_names + dimensions_labels_columns_names
# Build dataframe
dataframes = (
pd.DataFrame(data=series, columns=ordered_columns)
pd.DataFrame(data=series, columns=ordered_columns_names)
for series in flat_series_list
)
return pd.concat(objs=dataframes, sort=False)
......@@ -360,6 +385,7 @@ def iter_series(api_link, max_nb_series=None):
total_nb_series = 0
while True:
# with vcr.use_cassette('tests/cassettes/test_fetch_series_without_dimensions_values_labels.yaml'):
response_json = fetch_series_page(api_link, offset=total_nb_series)
errors = response_json.get("errors")
......@@ -484,3 +510,4 @@ def grouper(n, iterable):
def without_keys(d, keys):
    """Return a shallow copy of dict *d* omitting every key listed in *keys*."""
    excluded = set(keys)
    return {key: value for key, value in d.items() if key not in excluded}
......@@ -85,5 +85,6 @@ setup(
],
tests_require=[
'pytest',
'pytest-vcr',
],
)
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.21.0
method: GET
uri: https://api.db.nomics.world/v22/series?observations=1&series_ids=simu/lated/series2&offset=0
response:
body:
string: |
{ "_meta": {
"args": {
"align_periods": false,
"complete_missing_periods": false,
"facets": false,
"format": "json",
"limit": 1000,
"metadata": true,
"observations": true,
"offset": 0,
"series_ids": [
[
"BLS",
"is",
"ISU00000000000000"
]
]
},
"version": "22.0.0"
},
"datasets": {
"BLS/is": {
"code": "is",
"converted_at": "2019-04-12T16:59:00Z",
"created_at": "2018-01-17T12:12:26Z",
"description": "Occupational injury and illness counts and incidence rates are annual measures of the level and frequency of work-related injuries and illnesses. The rate equals the form of the number of injuries and illnesses per 100 full-time employees.",
"dimensions_codes_order": [
"area",
"case_type",
"data_type",
"industry",
"supersector"
],
"indexed_at": "2019-04-13T07:51:04.095Z",
"json_data_commit_ref": "0ace11084beae68061408e3a2cf9c02f6060c5cf",
"name": "Occupational Injuries and Illnesses: Industry Data (2014 forward)",
"nb_series": 809690,
"provider_code": "BLS",
"provider_name": "U.S. Bureau of Labor Statistics"
}
},
"errors": null,
"providers": {
"BLS": {
"code": "BLS",
"converted_at": "2019-09-07T09:00:40Z",
"created_at": "2017-11-15T16:25:33Z",
"indexed_at": "2019-09-07T09:02:20.155Z",
"json_data_commit_ref": "857b0b8cc3dfdc476c79fbda3f22f7eaec216a81",
"name": "U.S. Bureau of Labor Statistics",
"region": "US",
"slug": "bls",
"website": "https://www.bls.gov/"
}
},
"series": {
"docs": [
{
"@frequency": "annual",
"dataset_code": "is",
"dataset_name": "Occupational Injuries and Illnesses: Industry Data (2014 forward)",
"dimensions": {
"area": "000",
"case_type": "0",
"data_type": "0",
"industry": "000000",
"supersector": "000"
},
"indexed_at": "2019-04-13T07:51:04.095Z",
"period": [
"2014",
"2015",
"2016",
"2017"
],
"period_start_day": [
"2014-01-01",
"2015-01-01",
"2016-01-01",
"2017-01-01"
],
"provider_code": "BLS",
"series_code": "ISU00000000000000",
"series_name": "All ownerships, All U.S. – RSE of Total recordable cases – Rate of illness cases per 10,000 full-time workers (Size class 0) – All workers – All",
"value": [
1.6,
1.7,
1.6,
1.5
]
}
],
"limit": 1000,
"num_found": 1,
"offset": 0
}
}
headers:
Access-Control-Allow-Origin:
- '*'
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Tue, 10 Sep 2019 10:45:46 GMT
Server:
- nginx/1.10.3
Transfer-Encoding:
- chunked
content-length:
- '76323'
status:
code: 200
message: OK
version: 1
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.21.0
method: GET
uri: https://api.db.nomics.world/v22/series?observations=1&series_ids=simu/lated/series1&offset=0
response:
body:
string: |
{ "_meta": {
"args": {
"align_periods": false,
"complete_missing_periods": false,
"facets": false,
"format": "json",
"limit": 1000,
"metadata": true,
"observations": true,
"offset": 0,
"series_ids": [
[
"BLS",
"is",
"ISU00000000000000"
]
]
},
"version": "22.0.0"
},
"datasets": {
"BLS/is": {
"code": "is",
"converted_at": "2019-04-12T16:59:00Z",
"created_at": "2018-01-17T12:12:26Z",
"description": "Occupational injury and illness counts and incidence rates are annual measures of the level and frequency of work-related injuries and illnesses. The rate equals the form of the number of injuries and illnesses per 100 full-time employees.",
"dimensions_codes_order": [
"area",
"case_type",
"data_type",
"industry",
"supersector"
],
"dimensions_labels": {
"area": "Area",
"case_type": "Case Type",
"data_type": "Data Type",
"industry": "Industry",
"supersector": "Supersector"
},
"indexed_at": "2019-04-13T07:51:04.095Z",
"json_data_commit_ref": "0ace11084beae68061408e3a2cf9c02f6060c5cf",
"name": "Occupational Injuries and Illnesses: Industry Data (2014 forward)",
"nb_series": 809690,
"provider_code": "BLS",
"provider_name": "U.S. Bureau of Labor Statistics"
}
},
"errors": null,
"providers": {
"BLS": {
"code": "BLS",
"converted_at": "2019-09-07T09:00:40Z",
"created_at": "2017-11-15T16:25:33Z",
"indexed_at": "2019-09-07T09:02:20.155Z",
"json_data_commit_ref": "857b0b8cc3dfdc476c79fbda3f22f7eaec216a81",
"name": "U.S. Bureau of Labor Statistics",
"region": "US",
"slug": "bls",
"website": "https://www.bls.gov/"
}
},
"series": {
"docs": [
{
"@frequency": "annual",
"dataset_code": "is",
"dataset_name": "Occupational Injuries and Illnesses: Industry Data (2014 forward)",
"dimensions": {
"area": "000",
"case_type": "0",
"data_type": "0",
"industry": "000000",
"supersector": "000"
},
"indexed_at": "2019-04-13T07:51:04.095Z",
"period": [
"2014",
"2015",
"2016",
"2017"
],
"period_start_day": [
"2014-01-01",
"2015-01-01",
"2016-01-01",
"2017-01-01"
],
"provider_code": "BLS",
"series_code": "ISU00000000000000",
"series_name": "All ownerships, All U.S. – RSE of Total recordable cases – Rate of illness cases per 10,000 full-time workers (Size class 0) – All workers – All",
"value": [
1.6,
1.7,
1.6,
1.5
]
}
],
"limit": 1000,
"num_found": 1,
"offset": 0
}
}
headers:
Access-Control-Allow-Origin:
- '*'
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Tue, 10 Sep 2019 10:45:46 GMT
Server:
- nginx/1.10.3
Transfer-Encoding:
- chunked
content-length:
- '76323'
status:
code: 200
message: OK
version: 1
......@@ -19,6 +19,7 @@
import logging
import pytest
import pandas as pd
......@@ -228,3 +229,41 @@ def test_fetch_series_with_filter_on_one_series_with_wrong_frequency(caplog):
assert len(caplog.records) == 1
assert caplog.records[0].levelname == 'ERROR'
assert "Annual is already the lowest frequency" in caplog.records[0].message
# --- Tests below use VCR.py (https://vcrpy.readthedocs.io/en/latest/usage.html) to load fixtures ---
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_without_dimensions_labels():
    """Dimension code and label columns are present and correctly filled.

    The HTTP exchange is replayed by pytest-vcr from
    cassettes/test_fetch_series_without_dimensions_labels.yaml.
    """
    df = fetch_series("WB", "DB", dimensions={
        "country": ["FR", "IT", "ES"],
        "indicator": ["IC.REG.COST.PC.FE.ZS.DRFN"],
    })
    # Every expected column must appear in the resulting dataframe.
    expected_columns = {'indicator', 'country', 'indicator (label)', 'country (label)'}
    missing_columns = expected_columns - set(df.columns)
    assert not missing_columns, set(df.columns)
    # Spot-check one row: dimension code and its matching label.
    row = df.iloc[30]
    assert row['country'] == 'FR'
    assert row['country (label)'] == 'France'
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_without_dimensions_values_labels():
    """No dimension-label column is added when dimensions_values_labels is absent.

    The HTTP exchange is replayed by pytest-vcr from
    cassettes/test_fetch_series_without_dimensions_values_labels.yaml.
    """
    df = fetch_series('simu/lated/series1')
    # 'Data Type' is the dimensions_labels entry for the 'data_type' dimension;
    # without dimensions_values_labels it must not become a column.
    assert 'Data Type' not in list(df.columns)
@pytest.mark.vcr(decode_compressed_response=True)
def test_fetch_series_without_dimensions_labels_nor_dimensions_values_labels():
    """No label column is added when the dataset has neither labels structure.

    The HTTP exchange is replayed by pytest-vcr from
    cassettes/test_fetch_series_without_dimensions_labels_nor_dimensions_values_labels.yaml.
    """
    df = fetch_series('simu/lated/series2')
    # The 'Data Type' label column must not exist.
    assert 'Data Type' not in list(df.columns)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment