Commit d0c4d925 authored by Bruno Duyé, committed by Bruno Duyé

Fix #482: Python client - Add dimensions labels to dataframes

parent 8e244d0b
@@ -183,8 +183,18 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
Example:
fetch_series(api_link="https://api.db.nomics.world/v22/series?provider_code=AMECO&dataset_code=ZUTN")
"""
# Call API via `iter_series`, store result in `series_list`.
series_list = list(iter_series(api_link, max_nb_series=max_nb_series))
    # Call API via `iter_series`, add dimension labels, and store the result in `series_list`.
    # Also fill `datasets_dimensions` from the first yielded item.
    datasets_dimensions = None
    # Maps "provider_code/dataset_code/series_code" to that series' dimensions
    # ({dimension_code: dimension_value_code}). Keyed by the full series id because
    # dimension values differ between series of the same dataset.
    series_dims_by_series_id = {}
series_list = []
for series_infos in iter_series(api_link, max_nb_series=max_nb_series):
if datasets_dimensions is None:
datasets_dimensions = series_infos['datasets_dimensions']
series_infos = series_infos['series']
series_list.append(series_infos)
        complete_dataset_code = series_infos['provider_code'] + '/' + series_infos['dataset_code']  # e.g. 'AMECO/ZUTN'
        # Store each series' dimensions for later use, keyed by the full series id.
        complete_series_id = complete_dataset_code + '/' + series_infos['series_code']
        series_dims_by_series_id[complete_series_id] = series_infos['dimensions']
if len(series_list) == 0:
return pd.DataFrame()
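To make the bookkeeping above concrete, here is a minimal sketch of the two structures filled by this loop. All dataset, series, and dimension values are illustrative, modeled on the AMECO/ZUTN example from the docstring:

```python
# Illustrative contents only; real values come from the DBnomics API.
datasets_dimensions = {
    "AMECO/ZUTN": {
        "code": "ZUTN",
        "dimensions_codes_order": ["freq", "unit", "geo"],
        "dimensions_labels": {"freq": "Frequency", "unit": "Unit", "geo": "Country"},
        "dimensions_values_labels": {
            "freq": {"a": "Annual"},
            "unit": {"percentage-of-active-population": "Percentage of active population"},
            "geo": {"deu": "Germany", "fra": "France"},
        },
    },
}
series_dims_by_series_id = {
    "AMECO/ZUTN/DEU.1.0.0.0.ZUTN": {"freq": "a", "unit": "percentage-of-active-population", "geo": "deu"},
    "AMECO/ZUTN/FRA.1.0.0.0.ZUTN": {"freq": "a", "unit": "percentage-of-active-population", "geo": "fra"},
}
```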
@@ -192,8 +202,17 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
common_columns = ["@frequency", "provider_code", "dataset_code", "dataset_name", "series_code", "series_name",
"original_period", "period", "original_value", "value"]
# Normalize series received from the API (rename some keys of JSON result to match DataFrame organization).
normalized_series_list = list(map(normalize_dbnomics_series, series_list))
flat_series_list = []
for series in series_list:
# Flatten series received from the API (rename some keys of JSON result to match DataFrame organization).
flat_series = flatten_dbnomics_series(series)
        # Add dimension labels to flat_series
        complete_dataset_code = flat_series['provider_code'] + '/' + flat_series['dataset_code']  # e.g. "AMECO/ZUTN"
        dataset_dimensions = datasets_dimensions[complete_dataset_code]
        series_dims = series_dims_by_series_id[complete_dataset_code + '/' + flat_series['series_code']]
        for dimension_code, dimension_label in dataset_dimensions['dimensions_labels'].items():
            dimension_value_code = series_dims[dimension_code]
            flat_series[dimension_label] = dataset_dimensions['dimensions_values_labels'][dimension_code][dimension_value_code]
flat_series_list.append(flat_series)
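A self-contained sketch of the label resolution performed above, using the hypothetical AMECO/ZUTN metadata from the previous example:

```python
# Hypothetical metadata for one dataset and the dimensions of one of its series.
dimensions_labels = {"geo": "Country"}
dimensions_values_labels = {"geo": {"fra": "France", "deu": "Germany"}}
series_dimensions = {"geo": "fra"}

flat_series = {}
for dimension_code, dimension_label in dimensions_labels.items():
    dimension_value_code = series_dimensions[dimension_code]
    flat_series[dimension_label] = dimensions_values_labels[dimension_code][dimension_value_code]

assert flat_series == {"Country": "France"}  # the label column and its value label
```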
# Only applies if filters are used.
if filters:
@@ -204,12 +223,12 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
for series in filter_series(series_list=series_list, filters=filters,
editor_api_base_url=editor_api_base_url)
]
normalized_series_list = [
flat_series_list = [
{**series, "filtered": False}
for series in normalized_series_list
for series in flat_series_list
] + filtered_series_list
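    # Assumption (the start of this hunk is collapsed above): series coming from the
    # filter endpoint are tagged "filtered": True, so rows from original and filtered
    # series can be told apart in the final DataFrame.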
# `normalized_series_list` is a list of dicts like [{"code": "A.B.C", "a_key": 9}, {"code": "X.Y.Z", "other_key": 42}]
# `flat_series_list` is a list of dicts like [{"code": "A.B.C", "a_key": 9}, {"code": "X.Y.Z", "other_key": 42}]
    # Each series can have different keys, so we take the union of the keys of all series: {"code", "a_key", "other_key"}
# In the DataFrame the different columns will be sparse (there will be `NaN` values when a series does not have a specific key).
# code | a_key | other_key
@@ -218,16 +237,24 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
    # A.B.C | 9     | NaN
    # X.Y.Z | NaN   | 42
def union_sets(sets):
return set.union(*sets)
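    # e.g. union_sets([{"code", "a_key"}, {"code", "other_key"}]) == {"code", "a_key", "other_key"}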
all_columns = union_sets([set(series.keys()) for series in normalized_series_list])
all_columns = union_sets([set(series.keys()) for series in flat_series_list])
    dimension_codes_columns = sorted(all_columns - set(common_columns))  # TODO: use the dataset's dimensions?
dimension_columns = sorted(all_columns - set(common_columns))
    # Collect the dimension label columns from every dataset, deduplicated because
    # two datasets may use the same label.
    dimensions_labels_columns = set()
    for complete_dataset_code in datasets_dimensions.keys():
        for dimension_code in datasets_dimensions[complete_dataset_code]['dimensions_codes_order']:
            dimensions_labels_columns.add(
                datasets_dimensions[complete_dataset_code]['dimensions_labels'][dimension_code])
    dimensions_labels_columns = sorted(dimensions_labels_columns)
    # In the DataFrame we want to display the dimension columns on the right, so we reorder them.
ordered_columns = common_columns + dimension_columns
ordered_columns = common_columns + dimension_codes_columns + dimensions_labels_columns
# Build dataframe
dataframes = (
pd.DataFrame(data=series, columns=ordered_columns)
for series in normalized_series_list
for series in flat_series_list
)
return pd.concat(objs=dataframes, sort=False)
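For reference, a usage sketch of the whole function, assuming the package exposes `fetch_series_by_api_link` at the top level; it requires network access to the DBnomics API, and the exact columns depend on the dataset's metadata:

```python
from dbnomics import fetch_series_by_api_link

# Example taken from the docstring above; needs network access.
df = fetch_series_by_api_link(
    "https://api.db.nomics.world/v22/series?provider_code=AMECO&dataset_code=ZUTN")
# Dimension code columns (e.g. "geo") should now be followed by dimension label
# columns (e.g. "Country") at the right of the DataFrame.
print(df.columns.tolist())
```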
@@ -285,10 +312,31 @@ def iter_filtered_series(series_list, filters, apply_endpoint_url):
continue
for dbnomics_series, filter_result in zip(series_group, filter_results):
yield normalize_editor_series(series=filter_result["series"], dbnomics_series=dbnomics_series)
yield flatten_editor_series(series=filter_result["series"], dbnomics_series=dbnomics_series)
def iter_series(api_link, max_nb_series=None):
# """Iterate through series.docs returned by API"""
# Returns dicts of datasets dimensions and series.
# - dataset_dimensions don't change between calls
# - series is the current series
# Example: like:
# {
# 'dataset_dimensions': {
# "AMECO/ZUTN": {
# "code": "ZUTN",
# "converted_at": "2019-05-08T02:51:04Z",
# "dimensions_codes_order": ["freq", "unit", "geo" ...],
# ...
# },
# "CEPII/CHELEM-TRADE-GTAP": {
# "code": "CHELEM-TRADE-GTAP",
# "converted_at": "2019-01-29T15:53:30Z",
# "dimensions_codes_order": ["exporter", "importer", "secgroup", ...],
# ...
# },
# 'series':
# }
total_nb_series = 0
while True:
@@ -308,17 +356,27 @@ def iter_series(api_link, max_nb_series=None):
page_nb_series = len(series_page['docs'])
total_nb_series += page_nb_series
# Stop if we have enough series.
        # If the user asked for a maximum number of series
if max_nb_series is not None:
if total_nb_series == max_nb_series:
# Stop if we have enough series.
break
elif total_nb_series > max_nb_series:
                # Do not yield more series than the requested max_nb_series.
nb_remaining_series = page_nb_series - (total_nb_series - max_nb_series)
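                # Worked example: with pages of 100 series and max_nb_series=250, the third
                # page brings total_nb_series to 300, so nb_remaining_series = 100 - (300 - 250) = 50
                # and only the first 50 series of that page are yielded.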
yield from series_page['docs'][:nb_remaining_series]
for series in series_page['docs'][:nb_remaining_series]:
yield {
'datasets_dimensions': response_json['datasets'],
'series': series
}
break
yield from series_page['docs']
        # Yield every series of the page (no maximum asked, or maximum not reached yet).
for series in series_page['docs']:
yield {
'datasets_dimensions': response_json['datasets'],
'series': series
}
# Stop if we downloaded all the series.
assert total_nb_series <= num_found, (total_nb_series, num_found) # Can't download more series than num_found.
@@ -326,7 +384,7 @@ def iter_series(api_link, max_nb_series=None):
break
def normalize_dbnomics_series(series):
def flatten_dbnomics_series(series):
"""Adapt DBnomics series attributes to ease DataFrame construction.
    Rename some dict attributes and flatten others.
@@ -352,7 +410,7 @@ def normalize_dbnomics_series(series):
return series
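The body of `flatten_dbnomics_series` is collapsed in this diff. As a rough, assumption-laden sketch, inferred from `common_columns` and the dimension-code columns used above rather than from the hidden implementation, the reshaping it describes might look like this:

```python
# Illustrative only: rename "period"/"value" to "original_period"/"original_value"
# and flatten the "dimensions" dict into top-level keys.
series = {"period": ["2018"], "value": [3.4], "dimensions": {"geo": "deu", "freq": "a"}}
flat = {k: v for k, v in series.items() if k != "dimensions"}
flat["original_period"] = flat.pop("period")
flat["original_value"] = flat.pop("value")
flat.update(series["dimensions"])
# -> {"original_period": ["2018"], "original_value": [3.4], "geo": "deu", "freq": "a"}
```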
def normalize_editor_series(series, dbnomics_series):
def flatten_editor_series(series, dbnomics_series):
"""Adapt Time Series Editor series attributes to ease DataFrame construction."""
series = normalize_period(series)
series = normalize_value(series)