...
 
+.vscode/
+__pycache__
+.mypy_cache/
......@@ -43,7 +43,7 @@ job:
 cd ${PROVIDER_SLUG}-source-data
 time find -not -path "./.git/*" -not -name ".git" -delete
 cd ..
-time python3 --log info download.py ${PROVIDER_SLUG}-source-data
+time python3 download.py --log info ${PROVIDER_SLUG}-source-data
 cd ${PROVIDER_SLUG}-source-data
 time git add -A
 time git commit -m "New download" --quiet || true
......
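Note: the old command handed --log info to the Python interpreter itself, which rejects that option; flags the script parses must come after the script name. A minimal sketch of the CLI wiring the fixed command assumes (hypothetical, the real download.py may differ):

    # Hypothetical sketch of the argument handling the fixed command assumes;
    # the actual download.py may wire this differently.
    import argparse
    import logging

    parser = argparse.ArgumentParser(description='Download Bundesbank source data')
    parser.add_argument('target_dir', help='directory receiving the downloaded files')
    parser.add_argument('--log', default='warning',
                        help='logging level name: debug, info, warning, error')
    args = parser.parse_args()

    # getLevelName() maps a level name such as 'INFO' to its numeric value
    logging.basicConfig(level=logging.getLevelName(args.log.upper()))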
......@@ -2,19 +2,31 @@
"""
Common information for download and convert script
"""
from collections import OrderedDict
from pathlib import Path
# Topics extracted from
# https://www.bundesbank.de/en/statistics/time-series-databases/-/help-on-the-time-series-databases-750894#tar-9
TOPICS = {
"BANKEN": "Banks and other financial corporations",
"KONJUNKTUR": "Economic activity and price",
"UNTERNEHMEN": "Enterprises and households",
"WECHSELKURSE": "Exchange rates",
"AUSSENWIRTSCHAFT": "External sector",
"FINANZSTAB": "Financial stability",
"GESAMT": "Macroeconomic accounting systems",
"GELD": "Money and capital markets",
"FINANZEN": "Public finances",
"INDIKATOR": "Sets of indicators",
}
import bs4
TOPICS_HTML_PAGE_NAME = 'help_on_time_series_databases.html'
def extract_table_rows_from_html_table(table_elt):
"""Transform a beautiful soup table element into a row (<tr>) list,
each row containing cell (<td>) stripped text
"""
rows = []
for tr_elt in table_elt.find_all('tr'):
row = [td_elt.text.strip() for td_elt in tr_elt.find_all('td')]
if row:
rows.append(row)
return rows
def extract_topics_dict_from_html_help_page(html_filepath: Path):
"""Read help HTML page, find topics table and return it as a dict"""
with html_filepath.open('rt', encoding='utf-8') as fd:
soup = bs4.BeautifulSoup(fd, 'lxml')
for table_elt in soup.find_all('table'):
rows = extract_table_rows_from_html_table(table_elt)
if not rows or not rows[0] or not rows[0][0] or rows[0][0] != 'Topic':
continue
return dict([(tup[1], tup[0]) for tup in rows[1:]])
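Replacing the hard-coded TOPICS mapping with a scrape of the help page means new Bundesbank topics no longer require a code change. A usage sketch of the new helpers, fed a miniature inline table instead of the real page; the first column is the topic label and the second the code, which is why the comprehension swaps tup[1] and tup[0] to rebuild the old code-to-label mapping:

    # Sketch: exercising the new helpers on a miniature version of the
    # help page's topics table.
    import bs4

    html = """
    <table>
      <tr><td>Topic</td><td>Code</td></tr>
      <tr><td>Banks and other financial corporations</td><td>BANKEN</td></tr>
      <tr><td>Exchange rates</td><td>WECHSELKURSE</td></tr>
    </table>
    """
    soup = bs4.BeautifulSoup(html, 'lxml')
    rows = [[td.text.strip() for td in tr.find_all('td')]
            for tr in soup.find('table').find_all('tr')]
    topics_dict = {code: label for label, code in rows[1:]}
    print(topics_dict['BANKEN'])  # Banks and other financial corporations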
......@@ -27,16 +27,16 @@
 import argparse
 import io
 import logging
+from pathlib import Path
 import re
 import sys
 import zipfile
 
-import buba_common as bc
-from pathlib import Path
 import requests
 
+import buba_common as bc
+
+HELP_ON_TIMESERIES_URL = 'https://www.bundesbank.de/en/statistics/time-series-databases/-/help-on-the-time-series-databases-750894'
+
 DATASETS_URL = 'https://www.bundesbank.de/cae/servlet/StatisticDownload?its_fileFormat=Archive&mode=its'
 TOPICS_URL = 'https://www.bundesbank.de/cae/servlet/StatisticDownload?its_fileFormat=Archive&mode=its&tree='
 STRUCTURE_URL = 'https://www.bundesbank.de/cae/servlet/StatisticDownload?metaDSI='
......@@ -48,6 +48,14 @@ DATE_ERASE_STR = 'IIIIIIIIIIII'
 log = logging.getLogger(__name__)
 
 
+def download_html(url, file_path: Path, encoding='utf-8'):
+    """Download html file and save it to file"""
+    req = requests.get(url)
+    req.raise_for_status()
+    with file_path.open('wt', encoding=encoding) as fd:
+        fd.write(req.text)
+
+
 def download_main_zip(url, target_dir: Path):
     """ Downloads datasets zip archive (~200Mb) and extracts it """
......@@ -85,12 +93,11 @@ def download_topics(topics_url, topics_codes, target_dir: Path):
     log.info('Downloading topics...')
     for code in sorted(topics_codes):
-        log.info('Downloading topic [{}]...'.format(code))
+        log.info('Downloading topic [{}]'.format(code))
         topic_dir = topics_main_dir / code
         r = requests.get(topics_url + code)
         if not r.ok:
             log.error("A download error occurred, skipping topic [%s]...", code)
-            log.error("Is your topic list up-to-date? (see buba-common.py)")
             continue
 
         with zipfile.ZipFile(io.BytesIO(r.content)) as zip_data:
             zip_data.extractall(str(topic_dir))
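The topic archives never touch disk as zip files: io.BytesIO wraps the response body so zipfile can seek in it like a regular file. A self-contained round trip of that pattern, built on an in-memory zip rather than a downloaded one:

    # Sketch: the in-memory zip pattern from download_topics, demonstrated
    # with a zip constructed in memory instead of fetched over HTTP.
    import io
    import zipfile

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w') as zf:
        zf.writestr('BANKEN/series.csv', 'date,value\n2019-12,1.0\n')

    with zipfile.ZipFile(io.BytesIO(buf.getvalue())) as zip_data:
        zip_data.extractall('topic_dir_demo')  # same call as in the diff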
......@@ -114,7 +121,7 @@ def download_structures(structure_url, ds_codes, target_dir: Path, cache=True):
         metaDSI = code if code != 'BBSA1' else 'BBSAP'
 
         # Really download
-        log.info('Downloading structure [{}]...'.format(code))
+        log.info('Downloading structure [{}]'.format(code))
         r = requests.get(structure_url + metaDSI)
         if b'Error 404' not in r.content:
             with structure_filepath.open('wb') as fd:
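The b'Error 404' check guards against a soft 404: judging by this code, the server can answer an unknown metaDSI with an error page under a successful HTTP status, which raise_for_status() would not catch. A hypothetical helper naming that check (the MR keeps it inline):

    # Hypothetical helper; names and structure are illustrative only.
    import requests

    def fetch_structure(structure_url, metaDSI):
        r = requests.get(structure_url + metaDSI)
        r.raise_for_status()           # genuine HTTP errors
        if b'Error 404' in r.content:  # error page served with status 200
            return None
        return r.content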
......@@ -150,11 +157,16 @@ def main():
     # Download main zip to get all the datasets
     download_main_zip(DATASETS_URL, target_dir)
 
+    # Downloads HTML page that contains topics reference table
+    topics_html_filepath = target_dir / bc.TOPICS_HTML_PAGE_NAME
+    download_html(HELP_ON_TIMESERIES_URL, topics_html_filepath)
+
+    # Extract topics_dict
+    topics_dict = bc.extract_topics_dict_from_html_help_page(topics_html_filepath)
+
     # Downloads by topics to be able to sort datasets by topic
     # TODO: find a better way to associate datasets with topics
     # without downloading so much useless data
     # Note: some datasets appear in more than one topic
-    download_topics(TOPICS_URL, bc.TOPICS.keys(), target_dir)
+    download_topics(TOPICS_URL, topics_dict.keys(), target_dir)
 
     # Downloads structure files
     # dataset codes are computed from directory names
......
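Taken together, main() now derives the topic list at run time: fetch the help page, parse its topics table, then download one archive per topic code. A condensed sketch of that sequence, assuming download.py's functions and constants are importable as a module (an assumption, not shown in the diff):

    # Sketch of the new main() flow; assumes download.py is importable
    # and its main() is guarded by __main__.
    from pathlib import Path

    import buba_common as bc
    from download import (DATASETS_URL, HELP_ON_TIMESERIES_URL, TOPICS_URL,
                          download_html, download_main_zip, download_topics)

    def run(target_dir: Path):
        download_main_zip(DATASETS_URL, target_dir)                   # ~200Mb archive
        topics_html = target_dir / bc.TOPICS_HTML_PAGE_NAME
        download_html(HELP_ON_TIMESERIES_URL, topics_html)
        topics_dict = bc.extract_topics_dict_from_html_help_page(topics_html)
        download_topics(TOPICS_URL, topics_dict.keys(), target_dir)   # one zip per topic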
+bs4==0.0.1
 cryptography==2.8
+lxml==4.4.2
-requests[security]==2.22.0
\ No newline at end of file
+requests[security]==2.22.0