Skip to content
Snippets Groups Projects
Commit 0d7d7079 authored by Pierre Dittgen
Browse files

Extract updated date from <pubDate> in RSS

parent 72a75b44
No related branches found
No related tags found
1 merge request!2Draft: Read previous datetime from env
......@@ -25,12 +25,14 @@
User and password are given through environment variables USER_NAME and PASSWORD
"""
import argparse
import datetime
import io
import logging
import os
import re
import sys
from pathlib import Path
from typing import Any, Dict
import requests
from lxml import etree
......@@ -132,25 +134,48 @@ def check_updated_categories(
"""
TITLE_RE = re.compile(
"^(?P<date>[0-9]{4}-[0-9]{2}-[0-9]{2}):( New table:)? (?P<theme_code>[0-9]{2})[0-9]{3}[ -]"
"^([0-9]{4}-[0-9]{2}-[0-9]{2}):( New table:)? (?P<theme_code>[0-9]{2})[0-9]{3}[ -]"
)
ITEM_TAG = "item"
TITLE_TAG = "title"
PUBDATE_TAG = "pubDate"
buff = io.BytesIO(rss_xml_content)
item_nb = 0
codes = set()
in_entry = False
entry_info: Dict[str, Any] = {}
for evt, elt in etree.iterparse(
buff, tag=(ITEM_TAG, TITLE_TAG), events=("start", "end"), huge_tree=True
buff,
tag=(ITEM_TAG, PUBDATE_TAG, TITLE_TAG),
events=("start", "end"),
huge_tree=True,
):
if elt.tag == ITEM_TAG:
in_entry = evt == "start"
if in_entry:
if evt == "start":
item_nb += 1
entry_info = {}
else:
entry_pub_date = entry_info.get("pub_date")
entry_theme_code = entry_info.get("theme_code")
# TO BE CONTINUED
if entry_pub_date and entry_theme_code:
if (
entry_theme_code in observed_categories
and entry_pub_date >= ref_date
):
codes.add(entry_theme_code)
in_entry = evt == "start"
continue
if elt.tag == PUBDATE_TAG and in_entry and evt == "end":
entry_info["pub_date"] = datetime.datetime.strptime(
elt.text.strip(), "%a, %d %b %Y %H:%M:%S %z"
)
if elt.tag == TITLE_TAG and in_entry and evt == "end":
# Do we have to consider this entry title?
......@@ -158,10 +183,7 @@ def check_updated_categories(
if not m:
continue
entry_date = m.group("date")
entry_theme_code = m.group("theme_code")
if entry_theme_code in observed_categories and entry_date >= ref_date:
codes.add(entry_theme_code)
entry_info["theme_code"] = m.group("theme_code")
if item_nb == 0:
log.warning("New datasets RSS contains no entries (?)")
......@@ -284,14 +306,15 @@ def main():
updated_categories = check_updated_categories(
u.content, CATEGORIES, args.from_datetime
)
log.info("%d categories to update", len(updated_categories))
log.info(
"%d categories to update: %r", len(updated_categories), updated_categories
)
for cat_id in updated_categories:
log.info("Downloading category %s datasets", cat_id)
# Use a folder by category
# Delete if exists
cat_dir = target_dir / str(cat_id)
cat_dir.mkdir(exist_ok=True)
......@@ -328,8 +351,8 @@ def main():
)
def datetime_with_timezone(s: str) -> datetime.datetime:
    """Parse an ISO 8601 string into a timezone-aware datetime.

    NOTE(review): presumably used as an argparse ``type=`` converter for the
    ``from_datetime`` option -- confirm against the parser setup.

    Args:
        s: Date/time text in a format accepted by
            ``datetime.datetime.fromisoformat``, including a UTC offset
            (e.g. ``"2021-03-04T05:06:07+00:00"``).

    Returns:
        The parsed, timezone-aware ``datetime.datetime``.

    Raises:
        ValueError: If ``s`` is not valid ISO 8601 text, or if it parses to a
            naive datetime (no timezone information).
    """
    # The file does ``import datetime`` (the module), so the class must be
    # referenced as ``datetime.datetime``.
    d = datetime.datetime.fromisoformat(s)
    # Reject naive values: ordering comparisons between naive and aware
    # datetimes raise TypeError.
    if d.tzinfo is None:
        raise ValueError(f"Datetime must be provided with a timezone. Received {s!r}")
    return d
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment