Skip to content
Snippets Groups Projects
Commit 0d7d7079 authored by Pierre Dittgen
Browse files

Extract updated date from <pubDate> in RSS

parent 72a75b44
No related branches found
No related tags found
1 merge request: !2 "Draft: Read previous datetime from env"
...@@ -25,12 +25,14 @@ ...@@ -25,12 +25,14 @@
User and password are given through environment variables USER_NAME and PASSWORD User and password are given through environment variables USER_NAME and PASSWORD
""" """
import argparse import argparse
import datetime
import io import io
import logging import logging
import os import os
import re import re
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any, Dict
import requests import requests
from lxml import etree from lxml import etree
...@@ -132,25 +134,48 @@ def check_updated_categories( ...@@ -132,25 +134,48 @@ def check_updated_categories(
""" """
TITLE_RE = re.compile( TITLE_RE = re.compile(
"^(?P<date>[0-9]{4}-[0-9]{2}-[0-9]{2}):( New table:)? (?P<theme_code>[0-9]{2})[0-9]{3}[ -]" "^([0-9]{4}-[0-9]{2}-[0-9]{2}):( New table:)? (?P<theme_code>[0-9]{2})[0-9]{3}[ -]"
) )
ITEM_TAG = "item" ITEM_TAG = "item"
TITLE_TAG = "title" TITLE_TAG = "title"
PUBDATE_TAG = "pubDate"
buff = io.BytesIO(rss_xml_content) buff = io.BytesIO(rss_xml_content)
item_nb = 0 item_nb = 0
codes = set() codes = set()
in_entry = False in_entry = False
entry_info: Dict[str, Any] = {}
for evt, elt in etree.iterparse( for evt, elt in etree.iterparse(
buff, tag=(ITEM_TAG, TITLE_TAG), events=("start", "end"), huge_tree=True buff,
tag=(ITEM_TAG, PUBDATE_TAG, TITLE_TAG),
events=("start", "end"),
huge_tree=True,
): ):
if elt.tag == ITEM_TAG: if elt.tag == ITEM_TAG:
in_entry = evt == "start" if evt == "start":
if in_entry:
item_nb += 1 item_nb += 1
entry_info = {}
else:
entry_pub_date = entry_info.get("pub_date")
entry_theme_code = entry_info.get("theme_code")
# TO BE CONTINUED
if entry_pub_date and entry_theme_code:
if (
entry_theme_code in observed_categories
and entry_pub_date >= ref_date
):
codes.add(entry_theme_code)
in_entry = evt == "start"
continue continue
if elt.tag == PUBDATE_TAG and in_entry and evt == "end":
entry_info["pub_date"] = datetime.datetime.strptime(
elt.text.strip(), "%a, %d %b %Y %H:%M:%S %z"
)
if elt.tag == TITLE_TAG and in_entry and evt == "end": if elt.tag == TITLE_TAG and in_entry and evt == "end":
# Do we have to consider this entry title? # Do we have to consider this entry title?
...@@ -158,10 +183,7 @@ def check_updated_categories( ...@@ -158,10 +183,7 @@ def check_updated_categories(
if not m: if not m:
continue continue
entry_date = m.group("date") entry_info["theme_code"] = m.group("theme_code")
entry_theme_code = m.group("theme_code")
if entry_theme_code in observed_categories and entry_date >= ref_date:
codes.add(entry_theme_code)
if item_nb == 0: if item_nb == 0:
log.warning("New datasets RSS contains no entries (?)") log.warning("New datasets RSS contains no entries (?)")
...@@ -284,14 +306,15 @@ def main(): ...@@ -284,14 +306,15 @@ def main():
updated_categories = check_updated_categories( updated_categories = check_updated_categories(
u.content, CATEGORIES, args.from_datetime u.content, CATEGORIES, args.from_datetime
) )
log.info("%d categories to update", len(updated_categories)) log.info(
"%d categories to update: %r", len(updated_categories), updated_categories
)
for cat_id in updated_categories: for cat_id in updated_categories:
log.info("Downloading category %s datasets", cat_id) log.info("Downloading category %s datasets", cat_id)
# Use a folder by category # Use a folder by category
# Delete if exists
cat_dir = target_dir / str(cat_id) cat_dir = target_dir / str(cat_id)
cat_dir.mkdir(exist_ok=True) cat_dir.mkdir(exist_ok=True)
...@@ -328,8 +351,8 @@ def main(): ...@@ -328,8 +351,8 @@ def main():
) )
def datetime_with_timezone(s: str) -> datetime: def datetime_with_timezone(s: str) -> datetime.datetime:
d = datetime.fromisoformat(s) d = datetime.datetime.fromisoformat(s)
if d.tzinfo is None: if d.tzinfo is None:
raise ValueError(f"Datetime must be provided with a timezone. Received {s!r}") raise ValueError(f"Datetime must be provided with a timezone. Received {s!r}")
return d return d
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment