...
 
Commits (15)
This diff is collapsed.
This diff is collapsed.
......@@ -32,17 +32,19 @@ Options:
"""
import logging
import os
import sys
import time
from pathlib import Path
import ujson as json
import yaml
from docopt import docopt
from requests import Session, exceptions
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import Retry
import ine_fetcher_common
import ujson as json
from dbnomics_json_errors import ErrorsArtifact
DOWNLOAD_TABLE_URL = "http://servicios.ine.es/wstempus/js/en/DATOS_TABLA/{}?tip=AM"
......@@ -58,21 +60,41 @@ def main():
requests_session.mount('http://', HTTPAdapter(max_retries=Retry(total=50,
backoff_factor=2, status_forcelist=[500, 502, 503, 504])))
# Parse command line arguments
args = docopt(__doc__.format(self_filename=os.path.basename(__file__)))
target_dir = args['<target_dir>']
args = docopt(__doc__.format(self_filename=Path(__file__).parent))
target_dir = Path(args['<target_dir>'])
debug_mode = args['--debug']
logging.basicConfig(level=(logging.DEBUG if debug_mode else logging.INFO), format='%(message)s')
for category_dict in ine_fetcher_common.CATEGORY_TREE:
# Load category tree from category_tree.yml
category_tree = None
with Path('category_tree.yml').open() as f:
category_tree = yaml.load(f.read(), Loader=yaml.BaseLoader)
# Init error artifact
errors_artifact = ErrorsArtifact()
# Download
nb_expected_datasets = 0
for category_dict in category_tree:
log.info("* {!r}".format(category_dict['name']))
for subcategory in category_dict['children']:
for table_dict in subcategory['children']:
table_code = table_dict['code']
log.debug(table_code)
for table_dict, path in ine_fetcher_common.yield_final_nodes(category_dict['children'], path=[category_dict['name']]):
nb_expected_datasets += 1
table_code = table_dict['code']
log.debug(table_code)
try:
json_answer = download_table(table_code)
if json_answer is not None:
with open(os.path.join(target_dir, table_code + '.json'), "wb") as _file:
_file.write(json_answer)
except:
error = "Exception while downloading dataset '{}' (code: {}, path: {})".format(
table_dict['name'], table_code, ' -> '.join(path))
log.exception(error)
errors_artifact.add_dataset_error(table_code, error)
continue
if json_answer is not None:
with (target_dir / (table_code + '.json')).open("wb") as _file:
_file.write(json_answer)
# Write errors.json file
errors_artifact.write_json_file(target_dir, nb_expected_datasets=nb_expected_datasets)
def download_table(table_code):
......
......@@ -21,135 +21,19 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Hard-coded category tree, expressed as a list of top-level category dicts.
# Every non-leaf node carries 'name', 'code' and a 'children' list (one of them
# also has a 'doc_href'); the innermost nodes (tables) carry 'code', 'name' and
# 'doc_href' and have no 'children' key.
CATEGORY_TREE = [
# http://www.ine.es/dyngs/IOE/en/clasificaciones.htm
{
'name': 'Economy/Economics accounts',
'code': '14',
'children': [
{
'name': "Quarterly Spanish National Accounts",
'code': '30024',
'doc_href': 'http://www.ine.es/dyngs/INEbase/en/operacion.htm?c=Estadistica_C&cid=1254736164439&menu=ultiDatos&idp=1254735576581',
'children': [
{
'code': '9130',
'name': 'GDP mp Supply (Current prices)',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=9130&L=1',
},
{
'code': '9148',
'name': 'GDP mp Supply (Chain-linked volume indices)',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=9148&L=1',
},
{
'code': '9374',
'name': 'GDP mp Demand (Current prices)',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=9374&L=1',
},
{
'code': '9375',
'name': 'GDP mp Demand (Chain-linked volume indices)',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=9375&L=1',
},
{
'code': '9376',
'name': 'GDP mp Income (Current prices)',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=9376&L=1',
},
{
'code': '9377',
'name': 'Employment by branches of activity',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=9377&L=1',
},
]
},
# {
# 'name': 'Annual Spanish National Accounts',
# 'children': [] # TODO => see xls_files git branch
# },
]
},
{
'name': 'Industry, energy and construction/Industrial price index',
'code': '30051',
'children': [
{
'name': 'Industrial production indices',
'code': '4349',
'children': [
{
'code': '27068',
'name': 'National group indices',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=27068&L=1',
},
{
'code': '3284',
'name': 'National indices and by Autonomous Community: general and by economic destination of the goods',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=3284&L=1',
},
{
'code': '3280',
'name': 'National indices and by section Autonomous Community',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=3280&L=1',
},
],
}
]
},
{
'name': 'Industry, energy and construction/Export and import price of industrial products',
'code': '30071',
'children': [
{
'name': 'Export price index export of industrial products',
# NOTE(review): same 'code' value as the enclosing category ('30071') - confirm this is intended.
'code': '30071',
'children': [
{
'code': '3285',
'name': 'National indices: general and by economic destination of the goods',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=3285&L=1',
},
{
'code': '3286',
'name': 'National section indices',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=3286&L=1',
},
]
},
{
'name': 'Industrial price indices',
# NOTE(review): same 'code' value as the 'Industrial price index' top-level category ('30051') - confirm this is intended.
'code': '30051',
'children': [
{
'code': '3289',
'name': 'National indices: general and by economic destination of the goods',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=3289&L=1',
},
{
'code': '3293',
'name': 'National section indices',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=3293&L=1',
},
],
},
{
'name': 'Import price index import of industrial products',
'code': '1254736148943',
'children': [
{
'code': '3295',
'name': 'National indices: general and by economic destination of the goods',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=3295&L=1',
},
{
'code': '3296',
'name': 'National section indices',
'doc_href': 'http://www.ine.es/jaxiT3/Tabla.htm?t=3296&L=1',
},
],
},
]
},
]
def yield_final_nodes(tree, path=None):
    """Recursively walk a category tree and yield its final nodes with their path.

    Args:
        tree: Either a list of nodes or a single node dict. A node dict must
            have a 'name' key; a node with a 'children' key is treated as an
            intermediate node, any other node as a final node.
        path: List of the names of the ancestors of `tree`. Defaults to a
            fresh empty list on every call.

    Yields:
        Tuples `(node, path)` where `node` is a final node dict and `path` is
        the list of its ancestors' names (the final node's own name is not
        included).

    Raises:
        AssertionError: if a node is neither a list nor a dict.
        KeyError: if a node dict has no 'name' key.
    """
    if path is None:
        # A fresh list per call: the path is yielded by reference for final
        # nodes, so a shared default list could be corrupted by a caller.
        path = []

    if isinstance(tree, list):
        # A list has no name of its own: its items share the current path.
        for subtree in tree:
            yield from yield_final_nodes(subtree, path=path)
        return

    assert isinstance(tree, dict), "tree node must be a list or a dict"
    # Build a new list so that sibling branches never share a mutated path.
    newpath = path.copy()
    newpath.append(tree['name'])
    if 'children' in tree:
        for subtree in tree['children']:
            yield from yield_final_nodes(subtree, path=newpath)
    else:
        # final node
        yield tree, path
......@@ -4,3 +4,5 @@ ujson
humanize
toolz
python-slugify
PyYAML
git+https://git.nomics.world/dbnomics/dbnomics-json-errors.git@0.1.1#egg=dbnomics_json_errors