@@ -4,14 +4,17 @@
 """
-from collections import OrderedDict
-TOPICS = OrderedDict([
-    ('GESAMT', 'Macroeconomic accounting systems'),
-    ('GELD', 'Money and capital markets'),
-    ('BANKEN', 'Banks and other financial institutions'),
-    ('UNTERNEHMEN', 'Enterprises and households'),
-    ('FINANZEN', 'Public finances'),
-    ('AUSSENWIRTSCHAFT', 'External sector'),
-    ('EURORAUM', 'Euro-area aggregates'),
-    ('IWF', 'IMF related data'),
-    ('FINANZSTAB', 'Financial stability'),
-])
+# Topics extracted from
+# https://www.bundesbank.de/en/statistics/time-series-databases/-/help-on-the-time-series-databases-750894#tar-9
+TOPICS = {
+    "BANKEN": "Banks and other financial corporations",
+    "KONJUNKTUR": "Economic activity and price",
+    "UNTERNEHMEN": "Enterprises and households",
+    "WECHSELKURSE": "Exchange rates",
+    "AUSSENWIRTSCHAFT": "External sector",
+    "FINANZSTAB": "Financial stability",
+    "GESAMT": "Macroeconomic accounting systems",
+    "GELD": "Money and capital markets",
+    "FINANZEN": "Public finances",
+    "INDIKATOR": "Sets of indicators",
+}
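Dropping OrderedDict here is behaviour-preserving: since Python 3.7 the built-in dict keeps insertion order, and the fetcher below iterates the topic codes through sorted() anyway, so ordering no longer depends on the container. A minimal, self-contained sketch of that iteration (the two entries are just a subset of the mapping above):

# Sketch: sorted() yields the topic codes alphabetically, independent of the
# dict's insertion order.
TOPICS = {
    "GELD": "Money and capital markets",
    "BANKEN": "Banks and other financial corporations",
}
for code in sorted(TOPICS):
    print(code, "->", TOPICS[code])  # BANKEN first, then GELD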
@@ -383,7 +383,7 @@ def generate_dataset(ds_code, source_dir: Path, structure_file: Path, ds_dir: Pa
 def browse_topics(topics_dir: Path, dataset_info_dict):
     """ Yields all topics along with datasets ids """
-    for topic_id in bc.TOPICS:
+    for topic_id in sorted(bc.TOPICS):
         topic_dir = topics_dir / topic_id
         if not topic_dir.exists():
             log.warning('Topic directory [{}] not found!'.format(str(topic_dir)))
@@ -45,13 +45,13 @@ STRUCTURE_URL = 'https://www.bundesbank.de/cae/servlet/StatisticDownload?metaDSI
 PREPARED_RE = re.compile(r'<(?P<tagname>prepared)>[^<]*</(?P=tagname)>', re.IGNORECASE)
 DATE_ERASE_STR = 'IIIIIIIIIIII'
-LOG = logging.getLogger(__name__)
+log = logging.getLogger(__name__)
 def download_main_zip(url, target_dir: Path):
     """ Downloads datasets zip archive (~200Mb) and extracts it """
-    LOG.info('Downloading main zip archive...')
+    log.info('Downloading main zip archive...')
     r = requests.get(url)
     with zipfile.ZipFile(io.BytesIO(r.content)) as zip_data:
         zip_data.extractall(str(target_dir))
@@ -81,14 +81,17 @@ def erase_prepared_dates(file_paths):
 def download_topics(topics_url, topics_codes, target_dir: Path):
     """ Downloads each topic information in a separate folder under topics/ folder """
     topics_main_dir = target_dir / 'topics'
-    if not topics_main_dir.exists():
-        topics_main_dir.mkdir()
+    topics_main_dir.mkdir(exist_ok=True)
-    LOG.info('Downloading topics...')
-    for code in topics_codes:
-        LOG.info('Downloading topic [{}]...'.format(code))
+    log.info('Downloading topics...')
+    for code in sorted(topics_codes):
+        log.info('Downloading topic [{}]...'.format(code))
         topic_dir = topics_main_dir / code
         r = requests.get(topics_url + code)
+        if not r.ok:
+            log.error("A download error occurred, skipping topic [%s]...", code)
+            log.error("Is your topic list up-to-date? (see buba-common.py)")
+            continue
         with zipfile.ZipFile(io.BytesIO(r.content)) as zip_data:
             zip_data.extractall(str(topic_dir))
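The new r.ok guard is what lets a stale or unknown topic code fail soft instead of crashing later on a non-zip response body. A stripped-down sketch of the same pattern outside the fetcher (the function name and URL are purely illustrative):

import io
import zipfile

import requests

def fetch_and_extract(url, target_dir):
    """Illustrative helper: download a zip archive and extract it, skipping on HTTP errors."""
    r = requests.get(url)
    if not r.ok:  # requests sets .ok to False for any 4xx/5xx status code
        return False  # the caller can log the failure and move on to the next code
    with zipfile.ZipFile(io.BytesIO(r.content)) as zip_data:
        zip_data.extractall(str(target_dir))
    return True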
@@ -96,23 +99,22 @@ def download_topics(topics_url, topics_codes, target_dir: Path):
 def download_structures(structure_url, ds_codes, target_dir: Path, cache=True):
     """ Downloads each structure information structures folder """
     structure_dir = target_dir / 'structures'
-    if not structure_dir.exists():
-        structure_dir.mkdir()
+    structure_dir.mkdir(exist_ok=True)
-    LOG.info('Downloading structures...')
+    log.info('Downloading structures...')
     for code in ds_codes:
         structure_filepath = structure_dir / '{}.xml'.format(code)
         # Cache handling
         if cache and structure_filepath.exists():
-            LOG.debug('already downloaded.')
+            log.debug('already downloaded.')
             continue
         # Special case for BBSA1 dataset
         metaDSI = code if code != 'BBSA1' else 'BBSAP'
         # Really download
-        LOG.info('Downloading structure [{}]...'.format(code))
+        log.info('Downloading structure [{}]...'.format(code))
         r = requests.get(structure_url + metaDSI)
         if b'Error 404' not in r.content:
             with structure_filepath.open('wb') as fd:
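The cache flag above effectively turns the structures folder into an on-disk cache keyed by dataset code: one XML file per code, and an existing file short-circuits the download. A sketch of that guard in isolation (the helper name is illustrative):

from pathlib import Path

def needs_download(structure_dir: Path, code: str, cache: bool = True) -> bool:
    # One '<code>.xml' file per dataset; an existing file means the structure
    # was already fetched and can be reused.
    structure_filepath = structure_dir / '{}.xml'.format(code)
    return not (cache and structure_filepath.exists())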
@@ -122,19 +124,28 @@ def download_structures(structure_url, ds_codes, target_dir: Path, cache=True):
 def main():
     parser = argparse.ArgumentParser(description=__doc__,
                                      formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('target_dir', type=Path, help='path of target directory')
     parser.add_argument(
-        '--verbose',
-        type=bool,
-        help='verbose mode'
+        'target_dir', type=Path,
+        help='path of target directory for downloaded data',
     )
+    parser.add_argument('--log', default='WARNING',
+                        help='level of logging messages')
+    parser.add_argument('--all-datasets', default=False, action='store_true',
+                        help='force download all datasets from all categories')
     args = parser.parse_args()
-    logging.basicConfig(level=(logging.DEBUG if args.verbose else logging.INFO),
-                        format='%(levelname)s: %(message)s')
+    numeric_level = getattr(logging, args.log.upper(), None)
+    if not isinstance(numeric_level, int):
+        raise ValueError('Invalid log level: {}'.format(args.log))
+    logging.basicConfig(
+        format="%(levelname)s:%(name)s:%(asctime)s:%(message)s",
+        level=numeric_level,
+        stream=sys.stdout,  # Use stderr if script outputs data to stdout.
+    )
     target_dir = args.target_dir
-    assert target_dir.is_dir()
+    if not target_dir.exists():
+        parser.error("Target dir {!r} not found".format(str(target_dir)))
     # Download main zip to get all the datasets
     download_main_zip(DATASETS_URL, target_dir)
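For reference, the level lookup added in main() works because logging exposes its level names as module attributes; a short sketch of what getattr() returns for valid and invalid names, plus an illustrative invocation (the script name download.py is hypothetical):

# python download.py ./data --log DEBUG --all-datasets  (hypothetical script name)
import logging

for name in ("debug", "INFO", "warning", "nope"):
    level = getattr(logging, name.upper(), None)
    print(name, "->", level)  # 10, 20, 30, then None, which main() rejects with ValueError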