Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
destatis-fetcher
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dbnomics-fetchers
destatis-fetcher
Commits
0d7d7079
Commit
0d7d7079
authored
4 years ago
by
Pierre Dittgen
Browse files
Options
Downloads
Patches
Plain Diff
Extract updated date from <pubDate> in RSS
parent
72a75b44
No related branches found
No related tags found
1 merge request
!2
Draft: Read previous datetime from env
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
download.py
+35
-12
35 additions, 12 deletions
download.py
with
35 additions
and
12 deletions
download.py
+
35
−
12
View file @
0d7d7079
...
...
@@ -25,12 +25,14 @@
User and password are given through environment variables USER_NAME and PASSWORD
"""
import
argparse
import
datetime
import
io
import
logging
import
os
import
re
import
sys
from
pathlib
import
Path
from
typing
import
Any
,
Dict
import
requests
from
lxml
import
etree
...
...
@@ -132,25 +134,48 @@ def check_updated_categories(
"""
TITLE_RE
=
re
.
compile
(
"
^(
?P<date>
[0-9]{4}-[0-9]{2}-[0-9]{2}):( New table:)? (?P<theme_code>[0-9]{2})[0-9]{3}[ -]
"
"
^([0-9]{4}-[0-9]{2}-[0-9]{2}):( New table:)? (?P<theme_code>[0-9]{2})[0-9]{3}[ -]
"
)
ITEM_TAG
=
"
item
"
TITLE_TAG
=
"
title
"
PUBDATE_TAG
=
"
pubDate
"
buff
=
io
.
BytesIO
(
rss_xml_content
)
item_nb
=
0
codes
=
set
()
in_entry
=
False
entry_info
:
Dict
[
str
,
Any
]
=
{}
for
evt
,
elt
in
etree
.
iterparse
(
buff
,
tag
=
(
ITEM_TAG
,
TITLE_TAG
),
events
=
(
"
start
"
,
"
end
"
),
huge_tree
=
True
buff
,
tag
=
(
ITEM_TAG
,
PUBDATE_TAG
,
TITLE_TAG
),
events
=
(
"
start
"
,
"
end
"
),
huge_tree
=
True
,
):
if
elt
.
tag
==
ITEM_TAG
:
in_entry
=
evt
==
"
start
"
if
in_entry
:
if
evt
==
"
start
"
:
item_nb
+=
1
entry_info
=
{}
else
:
entry_pub_date
=
entry_info
.
get
(
"
pub_date
"
)
entry_theme_code
=
entry_info
.
get
(
"
theme_code
"
)
# TO BE CONTINUED
if
entry_pub_date
and
entry_theme_code
:
if
(
entry_theme_code
in
observed_categories
and
entry_pub_date
>=
ref_date
):
codes
.
add
(
entry_theme_code
)
in_entry
=
evt
==
"
start
"
continue
if
elt
.
tag
==
PUBDATE_TAG
and
in_entry
and
evt
==
"
end
"
:
entry_info
[
"
pub_date
"
]
=
datetime
.
datetime
.
strptime
(
elt
.
text
.
strip
(),
"
%a, %d %b %Y %H:%M:%S %z
"
)
if
elt
.
tag
==
TITLE_TAG
and
in_entry
and
evt
==
"
end
"
:
# Do we have to consider this entry title?
...
...
@@ -158,10 +183,7 @@ def check_updated_categories(
if
not
m
:
continue
entry_date
=
m
.
group
(
"
date
"
)
entry_theme_code
=
m
.
group
(
"
theme_code
"
)
if
entry_theme_code
in
observed_categories
and
entry_date
>=
ref_date
:
codes
.
add
(
entry_theme_code
)
entry_info
[
"
theme_code
"
]
=
m
.
group
(
"
theme_code
"
)
if
item_nb
==
0
:
log
.
warning
(
"
New datasets RSS contains no entries (?)
"
)
...
...
@@ -284,14 +306,15 @@ def main():
updated_categories
=
check_updated_categories
(
u
.
content
,
CATEGORIES
,
args
.
from_datetime
)
log
.
info
(
"
%d categories to update
"
,
len
(
updated_categories
))
log
.
info
(
"
%d categories to update: %r
"
,
len
(
updated_categories
),
updated_categories
)
for
cat_id
in
updated_categories
:
log
.
info
(
"
Downloading category %s datasets
"
,
cat_id
)
# Use a folder by category
# Delete if exists
cat_dir
=
target_dir
/
str
(
cat_id
)
cat_dir
.
mkdir
(
exist_ok
=
True
)
...
...
@@ -328,8 +351,8 @@ def main():
)
def
datetime_with_timezone
(
s
:
str
)
->
datetime
:
d
=
datetime
.
fromisoformat
(
s
)
def
datetime_with_timezone
(
s
:
str
)
->
datetime
.
datetime
:
d
=
datetime
.
datetime
.
fromisoformat
(
s
)
if
d
.
tzinfo
is
None
:
raise
ValueError
(
f
"
Datetime must be provided with a timezone. Received
{
s
!r}
"
)
return
d
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment