dbnomics-fetchers / eurostat-fetcher / Commits / 0ff0ac25

Commit 0ff0ac25, authored 4 years ago by Christophe Benz

Remove incremental mode for convert

Parent: 282c2a14
Changes: 1 changed file

convert.py: 6 additions, 68 deletions
@@ -28,12 +28,9 @@ import argparse
 import logging
 import os
 import re
-import shutil
-import subprocess
 import sys
 import time
 from collections import OrderedDict
-from io import StringIO
 from pathlib import Path
 
 import humanize
@@ -375,12 +372,6 @@ def main():
         default=datasets_from_env,
         help="convert only the given datasets (datasets codes, space separated)",
     )
-    parser.add_argument(
-        "--full",
-        action="store_true",
-        default=os.getenv(FULL_ENV_VAR),
-        help="convert all datasets; default behavior is to convert what changed since last commit",
-    )
     parser.add_argument("--log", default="INFO", help="level of logging messages")
     parser.add_argument("--resume", action="store_true", help="do not process already written datasets")
     parser.add_argument("--start-from", metavar="DATASET_CODE", help="start indexing from dataset code")
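
After this hunk, --full is gone from the command line; the options that remain visible here are --datasets, --log, --resume, and --start-from. A minimal sketch of the surviving parser, reconstructed only from the lines kept in this hunk (nargs="+" for --datasets is an assumption, and options defined outside this hunk, such as source and target directories, are omitted):

import argparse

# Sketch of the parser as it stands after this commit, from the hunk above.
# nargs="+" is assumed from the "space separated" help text; the real default
# (datasets_from_env) is defined elsewhere and replaced by None here.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--datasets",
    nargs="+",
    default=None,
    help="convert only the given datasets (datasets codes, space separated)",
)
parser.add_argument("--log", default="INFO", help="level of logging messages")
parser.add_argument("--resume", action="store_true", help="do not process already written datasets")
parser.add_argument("--start-from", metavar="DATASET_CODE", help="start indexing from dataset code")

args = parser.parse_args(["--log", "DEBUG", "--resume"])
print(args)  # Namespace(datasets=None, log='DEBUG', resume=True, start_from=None)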
@@ -396,40 +387,9 @@ def main():
         raise ValueError("Invalid log level: {}".format(args.log))
     logging.basicConfig(format="%(levelname)s:%(message)s", level=numeric_level)
 
-    if args.datasets:
-        args.full = True
-
-    # Ask Git which datasets directories were modified in latest commit in source-data repository.
-    if not args.full:
-        try:
-            output = subprocess.check_output(
-                ["git", "diff", "--name-status", "HEAD^", datasets_dir_name],
-                cwd=str(args.source_dir),
-                universal_newlines=True,
-            )
-        except subprocess.CalledProcessError:
-            args.full = True
-        else:
-            modified_datasets_codes = set()
-            deleted_datasets_codes = set()
-            for line in StringIO(output):
-                action, file_path = line.strip().split()
-                try:
-                    dataset_code = Path(file_path).parent.relative_to(datasets_dir_name).name
-                except ValueError:
-                    continue
-                if action in {"A", "M"}:
-                    modified_datasets_codes.add(dataset_code)
-                else:
-                    assert action == "D", action
-                    deleted_datasets_codes.add(dataset_code)
-            log.info(
-                "%d datasets were modified and %d were deleted by last download",
-                len(modified_datasets_codes),
-                len(deleted_datasets_codes),
-            )
-
     log.info("Command-line arguments: %r", args)
-    log.info("Mode: %s", "full" if args.full else "incremental")
+
+    write_json_file(args.target_dir / "provider.json", provider_json)
 
     # Parse "table_of_contents", abbreviated "toc".
     toc_element = etree.parse(str(args.source_dir / "table_of_contents.xml")).getroot()
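
For reference, the logic deleted in this hunk asked Git which files the latest commit of the source-data repository touched, then bucketed the affected dataset directories into "modified" and "deleted" sets. A self-contained sketch of that technique, repackaged as a function (the datasets_dir_name default is a placeholder; in the real script it comes from elsewhere):

import subprocess
from io import StringIO
from pathlib import Path

def detect_changed_datasets(source_dir, datasets_dir_name="data"):
    """Sketch of the removed incremental detection: list files touched by
    the latest commit and derive dataset codes from their parent directory."""
    output = subprocess.check_output(
        ["git", "diff", "--name-status", "HEAD^", datasets_dir_name],
        cwd=str(source_dir),
        universal_newlines=True,
    )
    modified, deleted = set(), set()
    for line in StringIO(output):
        # Each line looks like "M\tdata/nama_10_gdp/file.xml".
        action, file_path = line.strip().split()
        try:
            code = Path(file_path).parent.relative_to(datasets_dir_name).name
        except ValueError:
            continue  # file outside the datasets directory
        if action in {"A", "M"}:
            modified.add(code)
        else:
            assert action == "D", action
            deleted.add(code)
    return modified, deleted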
@@ -439,6 +399,9 @@ def main():
     toc_dataset_json_stub_by_code = {}
     category_tree_json = toc_to_category_tree(toc_element, toc_dataset_json_stub_by_code)
+    if category_tree_json:
+        write_json_file(args.target_dir / "category_tree.json", category_tree_json)
+
 
     # Build list of datasets codes to convert
     datasets_codes_to_convert = set()
     for dataset_code in sorted(toc_dataset_json_stub_by_code):
@@ -448,12 +411,6 @@ def main():
             dataset_code,
         )
         continue
-        if not args.full and dataset_code not in modified_datasets_codes:
-            log.debug(
-                "Skipping dataset %r because it was not modified by last download (due to incremental mode)",
-                dataset_code,
-            )
-            continue
         if args.start_from is not None and dataset_code < args.start_from:
             log.debug("Skipping dataset %r because of --start-from option", dataset_code)
             continue
@@ -474,7 +431,7 @@ def main():
             )
             continue
         dataset_dir = args.target_dir / dataset_code
-        if args.resume and (dataset_dir / "dataset.json").is_file():
+        if args.resume and dataset_dir.is_dir():
             log.debug(
                 "Skipping dataset %r because it already exists (due to --resume option)",
                 dataset_code,
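
The one-line change in this hunk relaxes the --resume skip test: previously a dataset was skipped only once its dataset.json had been written; now the mere existence of the dataset directory suffices. A minimal illustration of the two predicates (the paths and dataset code are hypothetical):

from pathlib import Path

dataset_dir = Path("target") / "nama_10_gdp"  # hypothetical dataset code

# Before this commit: skip only if conversion finished (dataset.json written).
skip_before = (dataset_dir / "dataset.json").is_file()

# After this commit: skip as soon as the directory exists, even if partial.
skip_after = dataset_dir.is_dir()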
@@ -484,20 +441,6 @@ def main():
     log.info("Converting %d datasets...", len(datasets_codes_to_convert))
 
-    # Remove directories of datasets to be converted before converting.
-    if not args.resume:
-        datasets_codes_to_delete = datasets_codes_to_convert
-        if not args.full:
-            datasets_codes_to_delete = datasets_codes_to_delete.union(deleted_datasets_codes)
-        log.info(
-            "Removing directories of deleted datasets and datasets to be converted: %r",
-            datasets_codes_to_delete,
-        )
-        for dataset_code in datasets_codes_to_delete:
-            dataset_dir = args.target_dir / dataset_code
-            if dataset_dir.is_dir():
-                shutil.rmtree(str(dataset_dir))
-
 
     # Convert SDMX files. Side-effect: write files for each dataset.
     converted_datasets_codes = set()
     for index, dataset_code in enumerate(sorted(datasets_codes_to_convert), start=1):
@@ -524,11 +467,6 @@ def main():
             converted_datasets_codes.add(dataset_code)
 
-    write_json_file(args.target_dir / "provider.json", provider_json)
-
-    if category_tree_json:
-        write_json_file(args.target_dir / "category_tree.json", category_tree_json)
-
     return 0