Commit e0e8135e authored by Bruno Duyé's avatar Bruno Duyé

Category tree + download script

parent 87fe8d45
[
{
"children": [
{
"name": "Section 1 Domestic Product and Income",
"url": "https://www.bea.gov/national/Release/XLS/Survey/Section1All_xls.xlsx"
},
{
"name": "Section 2 Personal Income and Outlays",
"url": "https://www.bea.gov/national/Release/XLS/Survey/Section2All_xls.xlsx"
},
{
"name": "Section 3 Government Current Receipts and Expenditures",
"url": "https://www.bea.gov/national/Release/XLS/Survey/Section3All_xls.xlsx"
},
{
"name": "Section 4 Foreign Transactions",
"url": "https://www.bea.gov/national/Release/XLS/Survey/Section4All_xls.xlsx"
},
{
"name": "Section 5 Saving and Investment",
"url": "https://www.bea.gov/national/Release/XLS/Survey/Section5All_xls.xlsx"
},
{
"name": "Section 6 Income and Employment by Industry",
"url": "https://www.bea.gov/national/Release/XLS/Survey/Section6All_xls.xlsx"
},
{
"name": "Section 7 Supplemental Tables",
"url": "https://www.bea.gov/national/Release/XLS/Survey/Section7All_xls.xlsx"
}
],
"name": "GDP and Personal Income"
},
{
"children": [
{
"name": "Section 0 Real Inventories and Sales",
"url": "https://www.bea.gov/national/Release/XLS/Underlying/Section0All_xls.xlsx"
},
{
"name": "Section 2 Personal Consumption Expenditures",
"url": "https://www.bea.gov/national/Release/XLS/Underlying/Section2All_xls.xlsx"
},
{
"name": "Section 3 Government Current Receipts and Expenditures",
"url": "https://www.bea.gov/national/Release/XLS/Underlying/Section3All_xls.xlsx"
},
{
"name": "Section 4 Foreign Transactions",
"url": "https://www.bea.gov/national/Release/XLS/Underlying/Section4All_xls.xlsx"
},
{
"name": "Section 5 Gross Private Domestic Investment and Capital Transfers",
"url": "https://www.bea.gov/national/Release/XLS/Underlying/Section5All_xls.xlsx"
},
{
"name": "Section 7 Motor Vehicle Output",
"url": "https://www.bea.gov/national/Release/XLS/Underlying/Section7All_xls.xlsx"
},
{
"name": "Section 9 Other Tables",
"url": "https://www.bea.gov/national/Release/XLS/Underlying/Section9All_xls.xlsx"
}
],
"name": "Underlying Detail"
},
{
"children": [
{
"name": "Section 1 International style aggregates",
"url": "https://www.bea.gov/national/Release/XLS/ProtoSNA/Section1All_xls.xlsx"
}
],
"name": "International Style Aggregates"
}
]
\ No newline at end of file
This directory contains tools created to generate category_tree.
Steps:
- using a browser, go to https://www.bea.gov/iTable/iTable.cfm?ReqID=19&step=4#reqid=19&step=4&isuri=1&1921=flatfile
- save the HTML file
- launch generate_tree.py with the saved file as argument
- the script outputs `../category_tree.json`, which is then used by the download and convert scripts
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# bea-fetcher -- Fetch series from http://www.bea.gov
# By: Bruno Duyé <bruno.duye@cepremap.org>
#
# Copyright (C) 2017 Cepremap
# https://git.nomics.world/dbnomics-fetchers/bea-fetcher
#
# This is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Generate ../category_tree.json from html file
Usage:
{self_filename} <source_html> [options]
"""
import json
import os
import sys
import lxml.html
from docopt import docopt
from requests import Session, exceptions
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import Retry
from slugify import slugify
# Base URL against which the links found in the HTML page are made absolute.
BASE_URL = 'https://www.bea.gov/'
def main():
    """Read the HTML file given on the command line and write the category tree.

    The tree extracted by generate_category_tree() is written to
    ``../category_tree.json``, relative to the directory of this script.
    """
    usage = __doc__.format(self_filename=os.path.basename(__file__))
    arguments = docopt(usage)

    with open(arguments['<source_html>'], "r") as html_file:
        html = html_file.read()
    category_tree = generate_category_tree(html)

    # The output path is anchored on the script location, not on the working directory
    script_dir = os.path.realpath(os.path.dirname(__file__))
    output_path = os.path.join(script_dir, '../category_tree.json')
    write_json_file(output_path, category_tree)
    print("\n../category_tree.json file written.")
def generate_category_tree(html):
    """Build the category tree from the HTML content of the saved page.

    Every ``<div class="TableListMemo">`` found below ``<div id="wraper">``
    is treated as a category title (the 'Flat Files' one is skipped). The
    sibling elements following a title, up to the next title, are expected
    to be ``tableList_row_0`` rows each holding an ``<a>`` link. Only links
    whose text starts with 'Section' are kept.

    Progress is printed on stdout: one line per category, one per kept link.

    Args:
        html: content of the HTML page, as a string.

    Returns:
        A list of dicts ``{'name': category_name, 'children': links}`` where
        ``links`` is a list of ``{'name': link_text, 'url': absolute_url}``.

    Raises:
        ValueError: if an element following a category title has an
            unexpected class, or contains no link.
    """
    xml_tree = lxml.html.document_fromstring(html)
    xml_tree.make_links_absolute(BASE_URL)
    category_tree = []
    # NOTE(review): 'wraper' is presumably the page's own spelling of the id -- do not "fix" it
    for category_node in xml_tree.findall(".//div[@id='wraper']//div[@class='TableListMemo']"):
        category_name = category_node.text_content().strip()
        if category_name == 'Flat Files':
            continue
        print("* {}".format(category_name))
        subtree = []
        # Walk the following siblings until the next category title (or the last element)
        row = category_node.getnext()
        while row is not None:
            # .get() tolerates elements without a class attribute
            row_class = row.get('class')
            if row_class == 'TableListMemo':
                break
            # Explicit checks instead of assert: they must survive `python -O`
            if row_class != 'tableList_row_0':
                raise ValueError(
                    "Unexpected element class {!r} in category {!r}".format(row_class, category_name))
            link = row.find('./a')
            if link is None:
                raise ValueError("No link found in a row of category {!r}".format(category_name))
            link_name = link.text_content().strip()
            if link_name.startswith('Section'):
                subtree.append({'name': link_name, 'url': link.attrib['href']})
                print(link.text_content())
            row = row.getnext()
        category_tree.append({'name': category_name, 'children': subtree})
    return category_tree
def write_json_file(file_path, data):
    """Serialize *data* as pretty-printed JSON into *file_path*.

    The file is written in UTF-8, non-ASCII characters are kept verbatim,
    the indent is 2 spaces and keys are sorted. The document is terminated
    by a newline.

    Args:
        file_path: path of the file to create or overwrite.
        data: JSON-serializable object.
    """
    with open(file_path, 'w', encoding='utf-8') as file_:
        json.dump(data, file_, ensure_ascii=False, indent=2, sort_keys=True)
        # json.dump() does not end the document with a newline
        file_.write('\n')
if __name__ == '__main__':
    # Run the script and hand main()'s return value to the interpreter as exit status
    exit_status = main()
    sys.exit(exit_status)
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# bea-fetcher -- Fetch series from http://www.bea.gov
# By: Bruno Duyé <bruno.duye@cepremap.org>
#
# Copyright (C) 2017 Cepremap
# https://git.nomics.world/dbnomics-fetchers/bea-fetcher
#
# This is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Download BEA series from http://www.bea.gov
Usage:
{self_filename} <target_dir> [options]
Options:
--debug show debug output
"""
import json
import logging
import os
import sys
from docopt import docopt
from requests import Session, exceptions
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import Retry
from slugify import slugify
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def main():
    """Download every file listed in category_tree.json below <target_dir>.

    ``category_tree.json`` is read from the current working directory. One
    sub-directory per category (named after the slugified category name) is
    created in <target_dir>, and each file is saved in it under the last
    path component of its URL.

    Raises:
        requests.exceptions.HTTPError: if the server answers a download
            request with an error status.
    """
    # Parse command line arguments
    args = docopt(__doc__.format(self_filename=os.path.basename(__file__)))
    target_dir = args['<target_dir>']
    debug_mode = args['--debug']
    logging.basicConfig(level=(logging.DEBUG if debug_mode else logging.INFO), format='%(message)s')

    requests_session = Session()
    # http://www.coglib.com/~icordasc/blog/2014/12/retries-in-requests.html
    # backoff_factor=2 will make sleep for 2 * (2 ^ (retry_number - 1)), ie 0, 2, 4, 8, 16, 32 ...
    retry_adapter = HTTPAdapter(max_retries=Retry(total=50, backoff_factor=2, status_forcelist=[500, 503, 504]))
    # The URLs of the category tree are https ones: the adapter must be mounted on
    # both schemes, otherwise the retry policy is never applied.
    for scheme in ('http://', 'https://'):
        requests_session.mount(scheme, retry_adapter)

    # The tree is written as UTF-8 JSON; read it the same way and close the file
    with open('category_tree.json', encoding='utf-8') as tree_file:
        category_tree = json.load(tree_file)

    for category in category_tree:
        category_name = category['name']
        print("* {}".format(category_name))
        category_dir = os.path.join(target_dir, slugify(category_name))
        # Tolerate a missing <target_dir> and re-runs over an existing directory
        os.makedirs(category_dir, exist_ok=True)
        for link in category['children']:
            url = link['url']
            print(url)
            response = requests_session.get(url)
            # Do not save an HTTP error page under a spreadsheet file name
            response.raise_for_status()
            filename = os.path.basename(url)
            with open(os.path.join(category_dir, filename), 'wb') as _f:
                _f.write(response.content)
if __name__ == '__main__':
    # Run the script and hand main()'s return value to the interpreter as exit status
    exit_status = main()
    sys.exit(exit_status)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment