...
 
Commits (2)
......@@ -63,7 +63,7 @@ DATACUBE_URL_TPL = 'https://www-genesis.destatis.de/genesisWS/web/ExportService_
+ '&sachschluessel2=&sachmerkmal3=&sachschluessel3=&stand=&sprache=en'
def download_url_if_needed(url, file_path: Path, xml_content=False, cache=True):
def download_url_if_needed(url, file_path: Path, xml_content=False, cache=True, post_process_func=None):
""" Download url into file_path and xml indent if it's XML content """
log.debug("Downloading %s... ", file_path.stem)
......@@ -91,6 +91,9 @@ def download_url_if_needed(url, file_path: Path, xml_content=False, cache=True):
except ValueError:
log.warning("Can't indent %s", file_path)
if post_process_func:
content = post_process_func(content)
with file_path.open(mode='wt') as fout:
fout.write(content)
log.debug('-> done.')
......@@ -100,6 +103,15 @@ def index_of(value, elts):
"""
Retrieves the first index of value in elts
Returns -1 if not found
>>> index_of('a', ['a', 'b', 'c'])
0
>>> index_of('b', ['a', 'b', 'c'])
1
>>> index_of('c', ['a', 'b', 'c'])
2
>>> index_of('d', ['a', 'b', 'c'])
-1
"""
for i, elt in enumerate(elts):
if elt == value:
......@@ -172,6 +184,28 @@ def get_last_commit_date(repo_dir: Path):
return date.strftime('%Y-%m-%d')
def untimestamp_content(content):
    """Swap the export date and time for fixed placeholders.

    The text is scanned line by line (split on '\\n'). The first line that,
    ignoring surrounding whitespace, starts with '<quaderDaten>* ' and ends
    with 'angestossen.' has each ' am <dd.mm.yyyy> um <hh:MM:ss> ' stamp
    replaced by the literal ' am dd.mm.yyyy um hh:MM:ss '. All other lines,
    including any later line of the same shape, are returned untouched.
    This keeps two downloads of the same data identical, avoiding false commits.

    >>> untimestamp_content('foobar\\n <quaderDaten>* * Der Benutzer GPLA6VMEU8 der Benutzergruppe GP1152 hat am 26.09.2019 um 03:45:53 diesen Export angestossen.\\nbaz')
    'foobar\\n <quaderDaten>* * Der Benutzer GPLA6VMEU8 der Benutzergruppe GP1152 hat am dd.mm.yyyy um hh:MM:ss diesen Export angestossen.\\nbaz'
    """
    rows = content.split('\n')
    for pos, row in enumerate(rows):
        has_marker = row.lstrip().startswith('<quaderDaten>* ')
        if has_marker and row.rstrip().endswith('angestossen.'):
            # Only the first marker line is rewritten: patch it in place
            # and stop scanning, leaving the remaining rows as they are.
            rows[pos] = re.sub(
                r' am \d{2}\.\d{2}\.\d{4} um \d{2}:\d{2}:\d{2} ',
                ' am dd.mm.yyyy um hh:MM:ss ',
                row,
            )
            break
    return '\n'.join(rows)
def main():
""" Downloads data from destatis website """
parser = argparse.ArgumentParser(description=__doc__,
......@@ -257,7 +291,8 @@ def main():
log.info('Downloading datacube %s data...', dc_code)
datacube_url = DATACUBE_URL_TPL.format(**tpl_dict)
datacube_filepath = cat_dir / '{}.xml'.format(dc_code)
download_url_if_needed(datacube_url, datacube_filepath, xml_content=True)
download_url_if_needed(datacube_url, datacube_filepath, xml_content=True,
post_process_func=untimestamp_content)
if __name__ == '__main__':
......