...
 
Commits (2)
......@@ -35,6 +35,15 @@ LOG = logging.getLogger('inept')
# Root of the INE website; every other URL in this section is built from it.
BASE_URL = 'https://www.ine.pt'

# Entry page of the portal, assembled from the site root plus its query string.
MAIN_URL = (
    BASE_URL
    + '/xportal/xmain?xpid=INE&xpgid=ine_cnacionais2010&perfil=220675163'
    + '&INST=220616736&contexto=am'
)

# Browser-like request headers (the User-Agent is a desktop Chrome 56 string)
# intended to be sent along with the HTTP requests issued against the site.
# NOTE(review): presumably the server treats the default client headers
# differently -- confirm before trimming any of these.
HEADERS = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8'
    ),
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}
INTERESTING_CODES = ['A.1.1', 'A.1.2.1', 'A.1.2.2', 'A.1.2.3', 'A.1.2.4', 'A.1.2.5',
'A.1.3.1', 'A.1.3.2', 'A.1.3.3', 'A.1.3.4',
......@@ -63,7 +72,7 @@ def download_html_and_crop(url, file_path, xpath_expr, cache=True):
if cache and os.path.exists(file_path):
LOG.debug('-> cached.')
return
req = requests.get(url)
req = requests.get(url, headers=HEADERS)
soup = bs4.BeautifulSoup(req.text, "html.parser")
for elt in soup.select(xpath_expr):
......