From 2102b8cb11e73adbfd8724560a8d628d4c9e25e2 Mon Sep 17 00:00:00 2001 From: s915 <s915.stem@gmail.com> Date: Tue, 14 Apr 2020 00:17:05 +0200 Subject: [PATCH] New function 'rdb_series()'. --- NAMESPACE | 1 + R/dot_rdb.R | 4 +- R/rdb_datasets.R | 24 ++++- R/rdb_dimensions.R | 10 +- R/rdb_providers.R | 2 +- R/rdb_series.R | 244 ++++++++++++++++++++++++++++++++++++++++++ R/zzz.R | 3 +- man/rdb_datasets.Rd | 7 +- man/rdb_dimensions.Rd | 2 +- man/rdb_providers.Rd | 2 +- man/rdb_series.Rd | 99 +++++++++++++++++ 11 files changed, 383 insertions(+), 15 deletions(-) create mode 100644 R/rdb_series.R create mode 100644 man/rdb_series.Rd diff --git a/NAMESPACE b/NAMESPACE index 62f5dda..64bf2c0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(rdb_datasets) export(rdb_dimensions) export(rdb_last_updates) export(rdb_providers) +export(rdb_series) import(curl) import(data.table) import(jsonlite) diff --git a/R/dot_rdb.R b/R/dot_rdb.R index bb162a9..1358993 100644 --- a/R/dot_rdb.R +++ b/R/dot_rdb.R @@ -142,7 +142,9 @@ DBdata <- lapply(sequence, function(i) { # Modifying link - tmp_api_link <- paste0(api_link, sep, "offset=", i * limit) + tmp_api_link <- paste0( + api_link, sep, "offset=", format(i * limit, scientific = FALSE) + ) # Fetching data DBlist <- get_data(tmp_api_link, use_readLines, curl_config) diff --git a/R/rdb_datasets.R b/R/rdb_datasets.R index e83632f..bd34ccf 100644 --- a/R/rdb_datasets.R +++ b/R/rdb_datasets.R @@ -27,6 +27,7 @@ #' @param simplify Logical (default \code{FALSE}). If \code{TRUE}, when the #' datasets are requested for only one provider then a \code{data.table} #' is returned, not a list of \code{data.table}s. +#' @param ... Additionals arguments. #' @return A named list of \code{data.table}s or a \code{data.table}. #' @examples #' \dontrun{ @@ -49,14 +50,15 @@ #' ) #' } #' @seealso \code{\link{rdb_providers}}, \code{\link{rdb_last_updates}}, -#' \code{\link{rdb_dimensions}} +#' \code{\link{rdb_dimensions}}, \code{\link{rdb_series}} #' @author Sebastien Galais #' @export rdb_datasets <- function( provider_code = NULL, use_readLines = getOption("rdbnomics.use_readLines"), curl_config = getOption("rdbnomics.curl_config"), - simplify = FALSE + simplify = FALSE, + ... ) { # TO CHECK THE DATASETS # parse_web <- sapply(provider_code, function(x) { @@ -66,6 +68,16 @@ rdb_datasets <- function( # gsub("\".*", "", y) # }, simplify = FALSE) + # Additionals arguments + progress_bar = TRUE + if (length(list(...)) > 0) { + tmp_progress_bar = list(...)$progress_bar + if (!is.null(tmp_progress_bar)) { + progress_bar <- tmp_progress_bar + } + } + check_argument(progress_bar, "logical") + # All providers if (is.null(provider_code)) { provider_code <- rdb_providers( @@ -89,7 +101,7 @@ rdb_datasets <- function( authorized_version(api_version) # Fetching all datasets - if (getOption("rdbnomics.progress_bar_datasets")) { + if (getOption("rdbnomics.progress_bar_datasets") & progress_bar) { pb <- utils::txtProgressBar(min = 0, max = length(provider_code), style = 3) } @@ -105,7 +117,7 @@ rdb_datasets <- function( tmp <- tmp[, .(code, name)] tmp <- unique(tmp) - if (getOption("rdbnomics.progress_bar_datasets")) { + if (getOption("rdbnomics.progress_bar_datasets") & progress_bar) { utils::setTxtProgressBar(pb, i) } @@ -115,7 +127,9 @@ rdb_datasets <- function( }) }, simplify = FALSE) - if (getOption("rdbnomics.progress_bar_datasets")) { close(pb) } + if (getOption("rdbnomics.progress_bar_datasets") & progress_bar) { + close(pb) + } datasets <- stats::setNames(datasets, provider_code) datasets <- Filter(Negate(is.null), datasets) diff --git a/R/rdb_dimensions.R b/R/rdb_dimensions.R index 3329743..385347d 100644 --- a/R/rdb_dimensions.R +++ b/R/rdb_dimensions.R @@ -60,7 +60,7 @@ #' ) #' } #' @seealso \code{\link{rdb_providers}}, \code{\link{rdb_last_updates}}, -#' \code{\link{rdb_datasets}} +#' \code{\link{rdb_datasets}}, \code{\link{rdb_series}} #' @author Sebastien Galais #' @export rdb_dimensions <- function( @@ -73,8 +73,12 @@ rdb_dimensions <- function( # Additionals arguments progress_bar = TRUE if (length(list(...)) > 0) { - progress_bar = list(...)$progress_bar + tmp_progress_bar = list(...)$progress_bar + if (!is.null(tmp_progress_bar)) { + progress_bar <- tmp_progress_bar + } } + check_argument(progress_bar, "logical") # All providers if (is.null(provider_code) & !is.null(dataset_code)) { @@ -96,7 +100,7 @@ rdb_dimensions <- function( dataset_code <- rdb_datasets( provider_code = provider_code, use_readLines = use_readLines, curl_config = curl_config, - simplify = FALSE + simplify = FALSE, progress_bar = FALSE ) dataset_code <- sapply(dataset_code, `[[`, "code", simplify = FALSE) } else { diff --git a/R/rdb_providers.R b/R/rdb_providers.R index c5d173a..fc4bff4 100644 --- a/R/rdb_providers.R +++ b/R/rdb_providers.R @@ -35,7 +35,7 @@ #' rdb_providers(curl_config = list(proxy = "<proxy>", proxyport = <port>)) #' } #' @seealso \code{\link{rdb_last_updates}}, \code{\link{rdb_datasets}}, -#' \code{\link{rdb_dimensions}} +#' \code{\link{rdb_dimensions}}, \code{\link{rdb_series}} #' @author Sebastien Galais #' @export rdb_providers <- function( diff --git a/R/rdb_series.R b/R/rdb_series.R new file mode 100644 index 0000000..4732a3b --- /dev/null +++ b/R/rdb_series.R @@ -0,0 +1,244 @@ +#' Download list of series for datasets of DBnomics providers. +#' +#' \code{rdb_series} downloads the list of series for +#' available datasets of a selection of providers from +#' \href{https://db.nomics.world/}{DBnomics}. \cr +#' /!\ We warn the user that this function can be (very) long to execute. We remind +#' that DBnomics requests data from 63 providers to retrieve 21675 datasets for +#' a total of approximately 720 millions series. +#' +#' By default, the function returns a nested named list of \code{data.table}s +#' containing the dimensions of datasets for providers from +#' \href{https://db.nomics.world/}{DBnomics}. +#' +#' @param provider_code Character string (default \code{NULL}). DBnomics code +#' of one or multiple providers. If \code{NULL}, the providers are firstly +#' dowloaded with the function \code{\link{rdb_providers}} and then the +#' datasets are requested. +#' @param dataset_code Character string (default \code{NULL}). DBnomics code +#' of one or multiple datasets of a provider. If \code{NULL}, the datasets +#' codes are dowloaded with the function \code{\link{rdb_datasets}} and then +#' the dimensions are requested. +#' @param use_readLines Logical (default \code{FALSE}). If \code{TRUE}, then +#' the data are requested and read with the base function \code{readLines} i.e. +#' through the default R internet connection. This can be used to get round the +#' error \code{Could not resolve host: api.db.nomics.world}. +#' @param curl_config Named list (default \code{NULL}). If not +#' \code{NULL}, it is used to configure a proxy connection. This +#' configuration is passed to the function \code{curl_fetch_memory} of the package +#' \pkg{curl}. A temporary \code{curl_handle} object is created internally +#' with arguments equal to the provided list in \code{curl_config}.\cr +#' For \code{curl_fetch_memory} arguments see \code{\link[curl]{curl_fetch}}. +#' For available curl options see \code{\link[curl]{curl_options}}, +#' \code{names(curl_options())} and +#' \href{https://curl.haxx.se/libcurl/c/curl_easy_setopt.html}{libcurl}. +#' @param simplify Logical (default \code{FALSE}). If \code{TRUE}, when the +#' dimensions are requested for only one provider and one dataset then a +#' named list of \code{data.table}s is returned, not a nested named list of +#' \code{data.table}s. +#' @param verbose Logical (default \code{FALSE}). Show number of series per +#' datasets and providers. +#' @param ... Additionals arguments. +#' @return A nested named list of \code{data.table}s or a named list of +#' \code{data.table}s. +#' @examples +#' \dontrun{ +#' rdb_series(provider_code = "IMF", dataset_code = "WEO") +#' +#' rdb_series(provider_code = "IMF", dataset_code = "WEO", simplify = TRUE) +#' +#' rdb_series(provider_code = "IMF", verbose = TRUE) +#' +#' options(rdbnomics.progress_bar_series = TRUE) +#' rdb_series(provider_code = "IMF", dataset_code = "WEO") +#' options(rdbnomics.progress_bar_series = FALSE) +#' +#' rdb_series( +#' provider_code = "IMF", dataset_code = "WEO", +#' use_readLines = TRUE +#' ) +#' +#' rdb_series( +#' provider_code = "IMF", dataset_code = "WEO", +#' curl_config = list(proxy = "<proxy>", proxyport = <port>) +#' ) +#' } +#' @seealso \code{\link{rdb_providers}}, \code{\link{rdb_last_updates}}, +#' \code{\link{rdb_datasets}}, \code{\link{rdb_dimensions}} +#' @author Sebastien Galais +#' @export +rdb_series <- function( + provider_code = NULL, dataset_code = NULL, + use_readLines = getOption("rdbnomics.use_readLines"), + curl_config = getOption("rdbnomics.curl_config"), + simplify = FALSE, verbose = FALSE, + ... +) { + # Additionals arguments + progress_bar = TRUE + if (length(list(...)) > 0) { + tmp_progress_bar = list(...)$progress_bar + if (!is.null(tmp_progress_bar)) { + progress_bar <- tmp_progress_bar + } + } + check_argument(progress_bar, "logical") + + only_first_two = FALSE + if (length(list(...)) > 0) { + tmp_only_first_two = list(...)$only_first_two + if (!is.null(tmp_only_first_two)) { + only_first_two <- tmp_only_first_two + } + } + check_argument(only_first_two, "logical") + + # All providers + if (is.null(provider_code)) { + provider_code <- rdb_providers( + code = TRUE, + use_readLines = use_readLines, curl_config = curl_config + ) + } + check_argument(provider_code, "character", len = FALSE) + + if (is.null(dataset_code)) { + dataset_code <- rdb_datasets( + provider_code = provider_code, + use_readLines = use_readLines, curl_config = curl_config, + simplify = FALSE, progress_bar = FALSE + ) + dataset_code <- sapply(dataset_code, `[[`, "code", simplify = FALSE) + } else { + check_argument(dataset_code, "character", len = FALSE) + dataset_code <- list(dataset_code) + dataset_code <- stats::setNames(dataset_code, provider_code) + } + + # Checking arguments + check_argument(use_readLines, "logical") + check_argument(simplify, "logical") + + # Setting API url + api_base_url <- getOption("rdbnomics.api_base_url") + check_argument(api_base_url, "character") + + # Setting API version + api_version <- getOption("rdbnomics.api_version") + check_argument(api_version, c("numeric", "integer")) + authorized_version(api_version) + + # Fetching all datasets + series <- sapply(provider_code, function(pc) { + if (getOption("rdbnomics.progress_bar_series") & progress_bar) { + pb <- utils::txtProgressBar( + min = 0, max = length(dataset_code[[pc]]), style = 3 + ) + } + + tmp_ser <- sapply(seq_along(dataset_code[[pc]]), function(i) { + tryCatch({ + dc <- dataset_code[[pc]][i] + + api_link <- paste0(api_base_url, "/v", api_version, "/series/", pc, "/", dc) + DBlist <- get_data(api_link, use_readLines, curl_config) + + limit <- DBlist$series$limit + num_found <- DBlist$series$num_found + + if (verbose) { + if (getOption("rdbnomics.progress_bar_series") & progress_bar) { + cat("\n") + } + cat( + paste0( + "The dataset '", dc, "' from provider '", pc, "' contains ", + num_found, " series." + ), + "\n" + ) + } + + DBdata <- list( + data.table::data.table( + series_code = DBlist$series$docs$series_code, + series_name = DBlist$series$docs$series_name + ) + ) + + if (num_found > limit) { + DBdata0 <- DBdata + rm(DBdata) + + sequence <- seq(1, floor(num_found / limit), 1) + + # Modifying link + if (grepl("offset=", api_link)) { + api_link <- gsub("\\&offset=[0-9]+", "", api_link) + api_link <- gsub("\\?offset=[0-9]+", "", api_link) + } + sep <- ifelse(grepl("\\?", api_link), "&", "?") + + if (only_first_two) { + sequence <- utils::head(sequence, 1) + } + + DBdata <- lapply(sequence, function(j) { + # Modifying link + tmp_api_link <- paste0( + api_link, sep, "offset=", format(j * limit, scientific = FALSE) + ) + # Fetching data + DBlist <- get_data(tmp_api_link, use_readLines, curl_config) + + # Extracting data + data.table::data.table( + series_code = DBlist$series$docs$series_code, + series_name = DBlist$series$docs$series_name + ) + }) + + DBdata <- append(DBdata, DBdata0, 0) + rm(DBdata0) + } + + DBdata <- rbindlist(DBdata, use.names = TRUE, fill = TRUE) + + if (getOption("rdbnomics.progress_bar_series") & progress_bar) { + utils::setTxtProgressBar(pb, i) + } + + DBdata + }, error = function(e) { + NULL + }) + }, simplify = FALSE) + + if (getOption("rdbnomics.progress_bar_series") & progress_bar) { + close(pb) + } + + tmp_ser <- stats::setNames(tmp_ser, dataset_code[[pc]]) + Filter(Negate(is.null), tmp_ser) + }, simplify = FALSE) + series <- Filter(Negate(is.null), series) + # We remove the empty lists, the empty data.tables, etc. + series <- check_dimensions(series, 2) + + if (length(series) <= 0) { + warning( + "Error when fetching the series.", + call. = FALSE + ) + return(NULL) + } + + if (simplify) { + len <- sapply(series, length) + if (length(series) == 1 & len[1] == 1) { + return(series[[1]][[1]]) + } + } + + series +} diff --git a/R/zzz.R b/R/zzz.R index f8441a0..344c066 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -40,7 +40,8 @@ rdbnomics.progress_bar = TRUE, rdbnomics.translate_codes = TRUE, rdbnomics.progress_bar_datasets = FALSE, - rdbnomics.progress_bar_dimensions = FALSE + rdbnomics.progress_bar_dimensions = FALSE, + rdbnomics.progress_bar_series = FALSE ) opts <- append( opts, diff --git a/man/rdb_datasets.Rd b/man/rdb_datasets.Rd index dbba6a1..3a55a07 100644 --- a/man/rdb_datasets.Rd +++ b/man/rdb_datasets.Rd @@ -8,7 +8,8 @@ rdb_datasets( provider_code = NULL, use_readLines = getOption("rdbnomics.use_readLines"), curl_config = getOption("rdbnomics.curl_config"), - simplify = FALSE + simplify = FALSE, + ... ) } \arguments{ @@ -35,6 +36,8 @@ For available curl options see \code{\link[curl]{curl_options}}, \item{simplify}{Logical (default \code{FALSE}). If \code{TRUE}, when the datasets are requested for only one provider then a \code{data.table} is returned, not a list of \code{data.table}s.} + +\item{...}{Additionals arguments.} } \value{ A named list of \code{data.table}s or a \code{data.table}. @@ -71,7 +74,7 @@ rdb_datasets( } \seealso{ \code{\link{rdb_providers}}, \code{\link{rdb_last_updates}}, -\code{\link{rdb_dimensions}} +\code{\link{rdb_dimensions}}, \code{\link{rdb_series}} } \author{ Sebastien Galais diff --git a/man/rdb_dimensions.Rd b/man/rdb_dimensions.Rd index babe7d8..538be4c 100644 --- a/man/rdb_dimensions.Rd +++ b/man/rdb_dimensions.Rd @@ -86,7 +86,7 @@ rdb_dimensions( } \seealso{ \code{\link{rdb_providers}}, \code{\link{rdb_last_updates}}, -\code{\link{rdb_datasets}} +\code{\link{rdb_datasets}}, \code{\link{rdb_series}} } \author{ Sebastien Galais diff --git a/man/rdb_providers.Rd b/man/rdb_providers.Rd index dc4c198..35d2e8c 100644 --- a/man/rdb_providers.Rd +++ b/man/rdb_providers.Rd @@ -55,7 +55,7 @@ rdb_providers(curl_config = list(proxy = "<proxy>", proxyport = <port>)) } \seealso{ \code{\link{rdb_last_updates}}, \code{\link{rdb_datasets}}, -\code{\link{rdb_dimensions}} +\code{\link{rdb_dimensions}}, \code{\link{rdb_series}} } \author{ Sebastien Galais diff --git a/man/rdb_series.Rd b/man/rdb_series.Rd new file mode 100644 index 0000000..3a4d8fa --- /dev/null +++ b/man/rdb_series.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rdb_series.R +\name{rdb_series} +\alias{rdb_series} +\title{Download list of series for datasets of DBnomics providers.} +\usage{ +rdb_series( + provider_code = NULL, + dataset_code = NULL, + use_readLines = getOption("rdbnomics.use_readLines"), + curl_config = getOption("rdbnomics.curl_config"), + simplify = FALSE, + verbose = FALSE, + ... +) +} +\arguments{ +\item{provider_code}{Character string (default \code{NULL}). DBnomics code +of one or multiple providers. If \code{NULL}, the providers are firstly +dowloaded with the function \code{\link{rdb_providers}} and then the +datasets are requested.} + +\item{dataset_code}{Character string (default \code{NULL}). DBnomics code +of one or multiple datasets of a provider. If \code{NULL}, the datasets +codes are dowloaded with the function \code{\link{rdb_datasets}} and then +the dimensions are requested.} + +\item{use_readLines}{Logical (default \code{FALSE}). If \code{TRUE}, then +the data are requested and read with the base function \code{readLines} i.e. +through the default R internet connection. This can be used to get round the +error \code{Could not resolve host: api.db.nomics.world}.} + +\item{curl_config}{Named list (default \code{NULL}). If not +\code{NULL}, it is used to configure a proxy connection. This +configuration is passed to the function \code{curl_fetch_memory} of the package +\pkg{curl}. A temporary \code{curl_handle} object is created internally +with arguments equal to the provided list in \code{curl_config}.\cr +For \code{curl_fetch_memory} arguments see \code{\link[curl]{curl_fetch}}. +For available curl options see \code{\link[curl]{curl_options}}, +\code{names(curl_options())} and +\href{https://curl.haxx.se/libcurl/c/curl_easy_setopt.html}{libcurl}.} + +\item{simplify}{Logical (default \code{FALSE}). If \code{TRUE}, when the +dimensions are requested for only one provider and one dataset then a +named list of \code{data.table}s is returned, not a nested named list of +\code{data.table}s.} + +\item{verbose}{Logical (default \code{FALSE}). Show number of series per +datasets and providers.} + +\item{...}{Additionals arguments.} +} +\value{ +A nested named list of \code{data.table}s or a named list of +\code{data.table}s. +} +\description{ +\code{rdb_series} downloads the list of series for +available datasets of a selection of providers from +\href{https://db.nomics.world/}{DBnomics}. \cr +/!\ We warn the user that this function can be (very) long to execute. We remind +that DBnomics requests data from 63 providers to retrieve 21675 datasets for +a total of approximately 720 millions series. +} +\details{ +By default, the function returns a nested named list of \code{data.table}s +containing the dimensions of datasets for providers from +\href{https://db.nomics.world/}{DBnomics}. +} +\examples{ +\dontrun{ +rdb_series(provider_code = "IMF", dataset_code = "WEO") + +rdb_series(provider_code = "IMF", dataset_code = "WEO", simplify = TRUE) + +rdb_series(provider_code = "IMF", verbose = TRUE) + +options(rdbnomics.progress_bar_series = TRUE) +rdb_series(provider_code = "IMF", dataset_code = "WEO") +options(rdbnomics.progress_bar_series = FALSE) + +rdb_series( + provider_code = "IMF", dataset_code = "WEO", + use_readLines = TRUE +) + +rdb_series( + provider_code = "IMF", dataset_code = "WEO", + curl_config = list(proxy = "<proxy>", proxyport = <port>) +) +} +} +\seealso{ +\code{\link{rdb_providers}}, \code{\link{rdb_last_updates}}, +\code{\link{rdb_datasets}}, \code{\link{rdb_dimensions}} +} +\author{ +Sebastien Galais +} -- GitLab