Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
D
dbnomics-python-client
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
2
Merge Requests
2
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
dbnomics
dbnomics-python-client
Commits
d0c4d925
Commit
d0c4d925
authored
Sep 04, 2019
by
Bruno Duyé
Committed by
Bruno Duyé
Sep 04, 2019
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix #482: Python client - Add dimensions labels to dataframes
parent
8e244d0b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
75 additions
and
17 deletions
+75
-17
dbnomics/__init__.py
dbnomics/__init__.py
+75
-17
No files found.
dbnomics/__init__.py
View file @
d0c4d925
...
...
@@ -183,8 +183,18 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
Example:
fetch_series(api_link="https://api.db.nomics.world/v22/series?provider_code=AMECO&dataset_code=ZUTN")
"""
# Call API via `iter_series`, store result in `series_list`.
series_list
=
list
(
iter_series
(
api_link
,
max_nb_series
=
max_nb_series
))
# Call API via `iter_series`, add dimensions labels and store result in `series_list`. Fill `datasets_dimensions`
datasets_dimensions
=
None
series_dims_by_dataset_code
=
{}
# TODO doc
series_list
=
[]
for
series_infos
in
iter_series
(
api_link
,
max_nb_series
=
max_nb_series
):
if
datasets_dimensions
is
None
:
datasets_dimensions
=
series_infos
[
'datasets_dimensions'
]
series_infos
=
series_infos
[
'series'
]
series_list
.
append
(
series_infos
)
complete_dataset_code
=
series_infos
[
'provider_code'
]
+
'/'
+
series_infos
[
'dataset_code'
]
# ex 'AMECO/ZUTN'
# Store series dimensions information for future use
series_dims_by_dataset_code
[
complete_dataset_code
]
=
series_infos
[
'dimensions'
]
if
len
(
series_list
)
==
0
:
return
pd
.
DataFrame
()
...
...
@@ -192,8 +202,17 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
common_columns
=
[
"@frequency"
,
"provider_code"
,
"dataset_code"
,
"dataset_name"
,
"series_code"
,
"series_name"
,
"original_period"
,
"period"
,
"original_value"
,
"value"
]
# Normalize series received from the API (rename some keys of JSON result to match DataFrame organization).
normalized_series_list
=
list
(
map
(
normalize_dbnomics_series
,
series_list
))
flat_series_list
=
[]
for
series
in
series_list
:
# Flatten series received from the API (rename some keys of JSON result to match DataFrame organization).
flat_series
=
flatten_dbnomics_series
(
series
)
# Add dimensions labels to flat_series
complete_dataset_code
=
flat_series
[
'provider_code'
]
+
'/'
+
flat_series
[
'dataset_code'
]
# ex: "AMECO/ZUTN"
dataset_dimensions
=
datasets_dimensions
[
complete_dataset_code
]
for
dimension_code
,
dimension_label
in
dataset_dimensions
[
'dimensions_labels'
].
items
():
dimension_value_code
=
series_dims_by_dataset_code
[
complete_dataset_code
][
dimension_code
]
flat_series
[
dimension_label
]
=
dataset_dimensions
[
'dimensions_values_labels'
][
dimension_code
][
dimension_value_code
]
flat_series_list
.
append
(
flat_series
)
# Only applies if filters are used.
if
filters
:
...
...
@@ -204,12 +223,12 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
for
series
in
filter_series
(
series_list
=
series_list
,
filters
=
filters
,
editor_api_base_url
=
editor_api_base_url
)
]
normalized
_series_list
=
[
flat
_series_list
=
[
{
**
series
,
"filtered"
:
False
}
for
series
in
normalized
_series_list
for
series
in
flat
_series_list
]
+
filtered_series_list
# `
normalized
_series_list` is a list of dicts like [{"code": "A.B.C", "a_key": 9}, {"code": "X.Y.Z", "other_key": 42}]
# `
flat
_series_list` is a list of dicts like [{"code": "A.B.C", "a_key": 9}, {"code": "X.Y.Z", "other_key": 42}]
# Each series can have different keys so we want to do the union of all the keys of all the series. {"code", "a_key", "other_key"}
# In the DataFrame the different columns will be sparse (there will be `NaN` values when a series does not have a specific key).
# code | a_key | other_key
...
...
@@ -218,16 +237,24 @@ def fetch_series_by_api_link(api_link, max_nb_series=None,
# X.Y.Z | NaN | 42
def
union_sets
(
sets
):
return
set
.
union
(
*
sets
)
all_columns
=
union_sets
([
set
(
series
.
keys
())
for
series
in
normalized_series_list
])
all_columns
=
union_sets
([
set
(
series
.
keys
())
for
series
in
flat_series_list
])
dimension_codes_columns
=
sorted
(
all_columns
-
set
(
common_columns
))
# TODO: use dataset's dimensions ?
dimension_columns
=
sorted
(
all_columns
-
set
(
common_columns
))
# Get dimensions labels and store them in dimensions_labels_columns
dimensions_labels_columns
=
[]
for
complete_dataset_code
in
datasets_dimensions
.
keys
():
for
dimension_code
in
datasets_dimensions
[
complete_dataset_code
][
'dimensions_codes_order'
]:
dimensions_labels_columns
.
append
(
datasets_dimensions
[
complete_dataset_code
][
'dimensions_labels'
][
dimension_code
])
dimensions_labels_columns
=
sorted
(
dimensions_labels_columns
)
# In the DataFrame we want to display the dimension columns at the right so we reorder them.
ordered_columns
=
common_columns
+
dimension_columns
ordered_columns
=
common_columns
+
dimension_co
des_columns
+
dimensions_labels_co
lumns
# Build dataframe
dataframes
=
(
pd
.
DataFrame
(
data
=
series
,
columns
=
ordered_columns
)
for
series
in
normalized
_series_list
for
series
in
flat
_series_list
)
return
pd
.
concat
(
objs
=
dataframes
,
sort
=
False
)
...
...
@@ -285,10 +312,31 @@ def iter_filtered_series(series_list, filters, apply_endpoint_url):
continue
for
dbnomics_series
,
filter_result
in
zip
(
series_group
,
filter_results
):
yield
normalize
_editor_series
(
series
=
filter_result
[
"series"
],
dbnomics_series
=
dbnomics_series
)
yield
flatten
_editor_series
(
series
=
filter_result
[
"series"
],
dbnomics_series
=
dbnomics_series
)
def
iter_series
(
api_link
,
max_nb_series
=
None
):
# """Iterate through series.docs returned by API"""
# Returns dicts of datasets dimensions and series.
# - datasets_dimensions doesn't change between calls
# - series is the current series
# Example:
# {
# 'datasets_dimensions': {
# "AMECO/ZUTN": {
# "code": "ZUTN",
# "converted_at": "2019-05-08T02:51:04Z",
# "dimensions_codes_order": ["freq", "unit", "geo" ...],
# ...
# },
# "CEPII/CHELEM-TRADE-GTAP": {
# "code": "CHELEM-TRADE-GTAP",
# "converted_at": "2019-01-29T15:53:30Z",
# "dimensions_codes_order": ["exporter", "importer", "secgroup", ...],
# ...
# },
# 'series':
# }
total_nb_series
=
0
while
True
:
...
...
@@ -308,17 +356,27 @@ def iter_series(api_link, max_nb_series=None):
page_nb_series
=
len
(
series_page
[
'docs'
])
total_nb_series
+=
page_nb_series
#
Stop if we have enough series.
#
If user asked for a maximum number of series
if
max_nb_series
is
not
None
:
if
total_nb_series
==
max_nb_series
:
# Stop if we have enough series.
break
elif
total_nb_series
>
max_nb_series
:
# Do not yield more series than the requested max_nb_series.
nb_remaining_series
=
page_nb_series
-
(
total_nb_series
-
max_nb_series
)
yield
from
series_page
[
'docs'
][:
nb_remaining_series
]
for
series
in
series_page
[
'docs'
][:
nb_remaining_series
]:
yield
{
'datasets_dimensions'
:
response_json
[
'datasets'
],
'series'
:
series
}
break
yield
from
series_page
[
'docs'
]
# If the user didn't ask for a maximum number of series
for
series
in
series_page
[
'docs'
]:
yield
{
'datasets_dimensions'
:
response_json
[
'datasets'
],
'series'
:
series
}
# Stop if we downloaded all the series.
assert
total_nb_series
<=
num_found
,
(
total_nb_series
,
num_found
)
# Can't download more series than num_found.
...
...
@@ -326,7 +384,7 @@ def iter_series(api_link, max_nb_series=None):
break
def
normalize
_dbnomics_series
(
series
):
def
flatten
_dbnomics_series
(
series
):
"""Adapt DBnomics series attributes to ease DataFrame construction.
Rename some dict attributes, flatten other ones
...
...
@@ -352,7 +410,7 @@ def normalize_dbnomics_series(series):
return
series
def
normalize
_editor_series
(
series
,
dbnomics_series
):
def
flatten
_editor_series
(
series
,
dbnomics_series
):
"""Adapt Time Series Editor series attributes to ease DataFrame construction."""
series
=
normalize_period
(
series
)
series
=
normalize_value
(
series
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment