Commit 4e788a9a authored by Christophe Benz

Use fetcher toolbox for downloader

parent 963b3e9f
@@ -21,87 +21,95 @@
 """Downloader for Meti Japan provider."""
 import argparse
-import http.client
+import asyncio
 import logging
 import shutil
-import sys
 import zipfile
 from pathlib import Path
 
 import requests
 
-log = logging.getLogger(__name__)
+from dbnomics_fetcher_toolbox.arguments import add_arguments_for_download
+from dbnomics_fetcher_toolbox.logging_utils import setup_logging
+from dbnomics_fetcher_toolbox.resources import Resource, process_resources
+from dbnomics_fetcher_toolbox.status import load_events, open_status_writer
 
+logger = logging.getLogger(__name__)
 
-def download_binary_file(url, file_path: Path):
-    """Download url into a binary file."""
-    log.info("Downloading %s from %s ... ", str(file_path), url)
-    req = requests.get(url, stream=True)
-    with file_path.open("wb") as fout:
-        req.raw.decode_content = True
-        shutil.copyfileobj(req.raw, fout)
-    log.info("-> done.")
 
+async def main():
+    """Download and extract zip files into folders."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    add_arguments_for_download(parser)
+    args = parser.parse_args()
+    setup_logging(args)
 
-def download_and_extract(url, target_dir: Path, dir_name):
-    """Download a zip archive and extract it into a folder."""
-    zip_filepath = target_dir / "{}.zip".format(dir_name)
-    download_binary_file(url, zip_filepath)
-    # Extract all CSV files
-    csv_dir = target_dir / dir_name
-    csv_dir.mkdir(exist_ok=True)
-    with zipfile.ZipFile(zip_filepath) as zip_archive:
-        zip_archive.extractall(csv_dir)
-    log.info("Zip [%s] extracted.", str(zip_filepath))
-    # And remove the zip archive
-    zip_filepath.unlink()
+    resources = [
+        MetiResource(
+            id="b2015_g1e",
+            url="http://www.meti.go.jp/english/statistics/tyo/iip/csv/b2015_g1e.zip",
+            dir=args.target_dir / "b2015_g1e",
+        ),
+        MetiResource(
+            id="b2010_ke",
+            url="http://www.meti.go.jp/english/statistics/tyo/sanzi/csv/b2010_ke.zip",
+            dir=args.target_dir / "b2010_ke",
+        ),
+    ]
 
+    events = load_events(args.target_dir)
+    with open_status_writer(args) as append_event:
+        await process_resources(
+            resources=resources,
+            args=args,
+            process_resource=process_resource,
+            on_event=append_event,
+            events=events,
+        )
 
-def main():
-    """Download and extract zip files into folders."""
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("target_dir", type=Path, help="path of target directory")
-    parser.add_argument(
-        "--debug-http", action="store_true", help="display http.client debug messages"
-    )
-    parser.add_argument("--log", default="INFO", help="level of logging messages")
-    args = parser.parse_args()
 
-    numeric_level = getattr(logging, args.log.upper(), None)
-    if not isinstance(numeric_level, int):
-        raise ValueError("Invalid log level: {}".format(args.log))
-    logging.basicConfig(
-        format="%(levelname)s:%(name)s:%(asctime)s:%(message)s",
-        level=numeric_level,
-        stream=sys.stdout,  # Use stderr if script outputs data to stdout.
-    )
-    logging.getLogger("urllib3").setLevel(
-        logging.DEBUG if args.debug_http else logging.WARNING
-    )
-    if args.debug_http:
-        http.client.HTTPConnection.debuglevel = 1
+class MetiResource(Resource):
+    """A resource for the Meti data provider."""
 
-    target_dir = args.target_dir
-    if not target_dir.exists():
-        parser.error("Target dir {!r} not found".format(str(target_dir)))
+    dir: Path
+    url: str
 
-    download_and_extract(
-        "http://www.meti.go.jp/english/statistics/tyo/iip/csv/b2015_g1e.zip",
-        target_dir,
-        "b2015_g1e",
-    )
-    download_and_extract(
-        "http://www.meti.go.jp/english/statistics/tyo/sanzi/csv/b2010_ke.zip",
-        target_dir,
-        "b2010_ke",
-    )
+    def create_context(self):
+        """Create the directory of the resource."""
+        self.dir.mkdir(exist_ok=True)
+
+    def delete(self):
+        """Delete the directory of the resource."""
+        shutil.rmtree(self.dir)
+
+
+def process_resource(resource: MetiResource):
+    """Download a zip archive and extract it into a folder."""
+    zip_filepath = resource.dir / "{}.zip".format(resource.id)
+    download_binary_file(resource.url, zip_filepath)
+    with zipfile.ZipFile(zip_filepath) as zip_archive:
+        zip_archive.extractall(resource.dir)
+    logger.info("Zip [%s] extracted.", str(zip_filepath))
+    # And remove the zip archive
+    zip_filepath.unlink()
+
+
+def download_binary_file(url, file_path: Path):
+    """Download url into a binary file."""
+    logger.info("Downloading %s from %s ... ", str(file_path), url)
+    req = requests.get(url, stream=True)
+    with file_path.open("wb") as fout:
+        req.raw.decode_content = True
+        shutil.copyfileobj(req.raw, fout)
+    logger.info("-> done.")
 
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
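
A minimal sketch, not part of the commit, showing how the new process_resource function can be exercised on a single resource without going through the CLI or the status machinery. The id and url are taken verbatim from the resource list in the diff; the module name "download" and the target directory are hypothetical placeholders:

    from pathlib import Path

    # Hypothetical import path for the downloader module shown in the diff.
    from download import MetiResource, process_resource

    # Build a single resource; the directory is a hypothetical placeholder
    # relative to the current working directory.
    resource = MetiResource(
        id="b2015_g1e",
        url="http://www.meti.go.jp/english/statistics/tyo/iip/csv/b2015_g1e.zip",
        dir=Path("b2015_g1e"),
    )
    resource.create_context()   # creates the resource directory (mkdir)
    process_resource(resource)  # downloads the zip, extracts it, removes the archive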
+dbnomics-fetcher-toolbox <= 0.1.0
 requests
\ No newline at end of file
@@ -4,8 +4,31 @@
 #
 # pip-compile
 #
+aiodns==2.0.0             # via aiohttp
+aiohttp[speedups]==3.6.2  # via dbnomics-fetcher-toolbox
+async-timeout==3.0.1      # via aiohttp
+attrs==19.3.0             # via aiohttp
+brotlipy==0.7.0           # via aiohttp
+cchardet==2.1.5           # via aiohttp
 certifi==2019.11.28       # via requests
-chardet==3.0.4            # via requests
-idna==2.8                 # via requests
+cffi==1.13.2              # via brotlipy, pycares
+chardet==3.0.4            # via aiohttp, requests
+contexttimer==0.3.3       # via dbnomics-fetcher-toolbox
+daiquiri==1.6.1           # via dbnomics-fetcher-toolbox
+dbnomics-fetcher-toolbox==0.0.4
+humanfriendly==4.18       # via dbnomics-fetcher-toolbox
+idna==2.8                 # via requests, yarl
+jsonlines==1.2.0          # via dbnomics-fetcher-toolbox
+lxml==4.4.2               # via dbnomics-fetcher-toolbox
+multidict==4.7.3          # via aiohttp, yarl
+orjson==2.1.3             # via dbnomics-fetcher-toolbox
+pycares==3.1.0            # via aiodns
+pycparser==2.19           # via cffi
+pydantic==1.3             # via dbnomics-fetcher-toolbox
 requests==2.22.0
+six==1.13.0               # via jsonlines
+toolz==0.10.0             # via dbnomics-fetcher-toolbox
+typing-extensions==3.7.4.1  # via dbnomics-fetcher-toolbox
+ujson==1.35               # via dbnomics-fetcher-toolbox
 urllib3==1.25.7           # via requests
+yarl==1.4.2               # via aiohttp
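
One detail worth noting: the compiled file pins dbnomics-fetcher-toolbox==0.0.4, which satisfies the <= 0.1.0 constraint declared in the input requirements above. A quick check with the packaging library, as a sketch rather than anything from the commit:

    from packaging.specifiers import SpecifierSet

    # The constraint from the input requirements file and the version
    # pinned by pip-compile in the compiled requirements file.
    print(SpecifierSet("<=0.1.0").contains("0.0.4"))  # prints: True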