dbnomics-fetchers / eurostat-fetcher · Commits

Commit a97ab709, authored 7 years ago by Christophe Benz
Find existing dataset tree in packs instead of commit
Parent: 60183894
No related branches, tags, or merge requests found.
Pipeline #1183 canceled
Showing 1 changed file with 30 additions and 29 deletions.

eurostat_to_dbnomics.py (+30 −29)
@@ -25,7 +25,7 @@
 import argparse
-from collections import OrderedDict
+from collections import deque, OrderedDict
 import hashlib
 import io
 import itertools
@@ -313,9 +313,6 @@ def main():
     repo = Repo(args.target_dir)
-    if args.keep_packs:
-        assert b'HEAD' in repo.get_refs()
     # Write provider.json, category, datasets, series and observations in repo.
     provider_tree = Tree()
@@ -373,25 +370,26 @@ def main():
     return 0
 
+def tail(n, iterable):
+    "Return an iterator over the last n items"
+    # tail(3, 'ABCDEFG') --> E F G
+    return iter(deque(iterable, maxlen=n))
+
 def add_dataset_to_category(repo, dataset_code, category_tree):
     # Keep dataset tree of current commit for the next commit, to avoid deleting the dataset.
-    dataset_tree = find_dataset_tree(repo, dataset_code, tree=repo[repo[repo.head()].tree])
+    # Don't find in HEAD commit because the script could have failed before committing.
+    if not repo.object_store.packs:
+        return None
+    dataset_tree = None
+    for pack in repo.object_store.packs:
+        if os.path.basename(pack._basename) != "pack-{}".format(dataset_code):
+            continue
+        # Because the last yield in iter_git_objects_in_sdmx_file is dataset_tree.
+        dataset_tree = next(tail(1, pack.iterobjects()))
+        assert isinstance(dataset_tree, Tree) and b"dataset.json" in dataset_tree
+    assert dataset_tree is not None
     category_tree.add(dataset_code.encode('utf-8'), git_tree_filemode, dataset_tree.id)
 
-def find_dataset_tree(repo, dataset_code, tree):
-    """Find the Git tree corresponding to a dataset."""
-    if b"dataset.json" in tree and tree[b"dataset.json"][0] == git_blob_filemode:
-        dataset_json = json.loads(repo[tree[b"dataset.json"][1]].data.decode('utf-8'))
-        return tree \
-            if dataset_json["code"] == dataset_code \
-            else None
-    for tree_entry in tree.iteritems():
-        if tree_entry.mode == git_tree_filemode:
-            found_tree = find_dataset_tree(repo, dataset_code, tree=repo[tree_entry.sha])
-            if found_tree is not None:
-                return found_tree
-    return None
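A minimal standalone sketch of the pack-based lookup this hunk introduces, assuming a dulwich repository in which each dataset was written to a pack named pack-<dataset code> whose last object is the dataset tree, as the diff's comments state; the helper name, repository path and dataset code below are illustrative, not taken from the script:

# Standalone sketch (not part of the commit): look up a dataset tree in packs.
# Assumes each dataset has a pack named "pack-<dataset_code>" whose last
# object is the dataset's Git tree.
import os
from collections import deque

from dulwich.objects import Tree
from dulwich.repo import Repo


def tail(n, iterable):
    "Return an iterator over the last n items"
    return iter(deque(iterable, maxlen=n))


def find_dataset_tree_in_packs(repo, dataset_code):
    """Return the Tree packed for `dataset_code`, or None if no pack matches."""
    for pack in repo.object_store.packs:
        if os.path.basename(pack._basename) != "pack-{}".format(dataset_code):
            continue
        last_object = next(tail(1, pack.iterobjects()))
        if isinstance(last_object, Tree) and b"dataset.json" in last_object:
            return last_object
    return None


repo = Repo("eurostat-data")  # hypothetical target directory
tree = find_dataset_tree_in_packs(repo, "nama_10_gdp")  # hypothetical dataset code
print(tree.id if tree is not None else "no pack found")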
@@ -401,16 +399,18 @@ def write_dataset_pack(repo, sdmx_file_path, category_tree):
     dataset_code = os.path.basename(sdmx_file_path[:-len(sdmx_file_extension)])
     pack_file_name = "pack-{}.pack".format(dataset_code)  # Must be named like so to be recognized as packs by dulwich.
     pack_file_path = os.path.abspath(os.path.join(args.target_dir, "objects", "pack", pack_file_name))
-    if args.datasets_codes is not None and dataset_code not in args.datasets_codes:
-        add_dataset_to_category(repo, dataset_code, category_tree)
-    elif args.keep_packs and os.path.isfile(pack_file_path):
-        log.info("Git pack file %s already exists: skipping pack generation", pack_file_path)
-        add_dataset_to_category(repo, dataset_code, category_tree)
-    else:
-        pack_start_time = time.time()
-        write_pack(pack_file_path, git_objects)
-        pack_time = time.time() - pack_start_time
-        log.info("Git pack file %s written, took %s seconds", pack_file_path, pack_time)
+    if args.datasets_codes is None or dataset_code in args.datasets_codes:
+        if args.keep_packs and os.path.isfile(pack_file_path):
+            log.info("Git pack file %s already exists: skipping pack generation", pack_file_path)
+            add_dataset_to_category(repo, dataset_code, category_tree)
+        else:
+            pack_start_time = time.time()
+            write_pack(pack_file_path, git_objects)
+            pack_time = time.time() - pack_start_time
+            log.info("Git pack file %s written, took %s seconds", pack_file_path, pack_time)
+    # In order not to delete this dataset in the new commit, add it to category_tree.
+    add_dataset_to_category(repo, dataset_code, category_tree)
 
 # Dulwich functions sightly modified to accept a generator and iterate it only one time.
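For reference, the category_tree.add(...) call that these code paths rely on takes an entry name, a Git filemode and the SHA of the object being attached. A small illustration with dulwich, assuming filemode values equal to Git's regular-file and directory modes; the constant values and the sample dataset code are assumptions, not read from the script:

# Illustration only: attach a dataset tree to a parent tree with dulwich.
import stat

from dulwich.objects import Blob, Tree

git_blob_filemode = 0o100644      # assumed: Git's regular-file mode
git_tree_filemode = stat.S_IFDIR  # assumed: 0o040000, a sub-tree entry

dataset_json = Blob.from_string(b'{"code": "nama_10_gdp"}')  # hypothetical content
dataset_tree = Tree()
dataset_tree.add(b"dataset.json", git_blob_filemode, dataset_json.id)

category_tree = Tree()
# Same call shape as in the diff: entry name, filemode, SHA of the sub-tree.
category_tree.add(b"nama_10_gdp", git_tree_filemode, dataset_tree.id)
print(category_tree.items())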
@@ -480,11 +480,12 @@ def write_pack(pack_file_path, objects, deltify=None, delta_window_size=None):
         deltify=deltify,
     )
-    # Overwrite the dummy number of Git objects written during `write_pack_objects`.
     with open(pack_file_path, "r+b") as f:
+        # Overwrite the dummy number of Git objects written during `write_pack_objects`.
         f.seek(8)
         f.write(struct.pack(b'>L', num_objects))
+        # Recompute the pack SHA-1 checksum.
         f.seek(0)
         BUF_SIZE = 65536 * 1024
         sha1 = hashlib.sha1()
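These comments describe the fix-up applied after dulwich's write_pack_objects: a Git pack file starts with the 4-byte magic "PACK", a 4-byte version and a 4-byte big-endian object count, and ends with a 20-byte SHA-1 of everything before it, so patching the count at offset 8 also means recomputing that trailing checksum. A sketch of that fix-up in isolation; the function name and the chunked-read details are illustrative, not the script's:

# Sketch only: patch a pack's object count and refresh its trailing checksum.
import hashlib
import os
import struct

BUF_SIZE = 65536 * 1024


def patch_pack_object_count(pack_file_path, num_objects):
    with open(pack_file_path, "r+b") as f:
        # The object count is a big-endian 32-bit integer at offset 8
        # (after the 4-byte "PACK" magic and the 4-byte version).
        f.seek(8)
        f.write(struct.pack(b'>L', num_objects))
        # Recompute the SHA-1 of everything except the last 20 bytes...
        payload_size = os.fstat(f.fileno()).st_size - 20
        f.seek(0)
        sha1 = hashlib.sha1()
        remaining = payload_size
        while remaining > 0:
            chunk = f.read(min(BUF_SIZE, remaining))
            if not chunk:
                break
            sha1.update(chunk)
            remaining -= len(chunk)
        # ...and overwrite the trailing checksum with the new digest.
        f.seek(payload_size)
        f.write(sha1.digest())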