Skip to content
Snippets Groups Projects
Commit 80e781cf authored by Christophe Benz
Browse files

Dedupe observations TSV blobs directly to spare RAM

parent ad73429a
No related branches found
No related tags found
No related merge requests found
......@@ -67,7 +67,9 @@ toc_nsmap = {
}
def iter_git_objects_in_sdmx_file(sdmx_file_path, provider_tree, dataset_pair_by_dataset_code, dataset_json_stub):
def iter_git_objects_in_sdmx_file(sdmx_file_path, data_package_tree, dataset_pair_by_dataset_code, dataset_json_stub):
yielded_git_object_ids = set()
dsd_file_path = "{}.dsd.xml".format(sdmx_file_path[:-len(sdmx_file_extension)])
with open(dsd_file_path) as dsd_file:
dsd_tree = etree.parse(dsd_file)
......@@ -230,9 +232,13 @@ def iter_git_objects_in_sdmx_file(sdmx_file_path, provider_tree, dataset_pair_by
])
observations_tsv_blob = Blob.from_string(observations_tsv_str.encode('utf-8'))
yield observations_tsv_blob
observations_tsv_blob_id = observations_tsv_blob.id
# Some TSV observations files are exactly the same, and Git packs don't tolerate duplicate objects.
if observations_tsv_blob_id not in yielded_git_object_ids:
yielded_git_object_ids.add(observations_tsv_blob_id)
yield observations_tsv_blob
dataset_tree.add("{}.tsv".format(series_code).encode('utf-8'),
git_blob_filemode, observations_tsv_blob.id)
git_blob_filemode, observations_tsv_blob_id)
# From https://stackoverflow.com/questions/12160418/why-is-lxml-etree-iterparse-eating-up-all-my-memory
# It's safe to call clear() here because no descendants will be accessed
......@@ -524,7 +530,7 @@ def write_pack(pack_file_path, objects, deltify=None, delta_window_size=None):
write_pack_header(f, 42) # Temporary value, overwritten after Git objects are written.
entries, num_objects = write_pack_objects(
f,
iter_deduped(objects),
objects,
delta_window_size=delta_window_size,
deltify=deltify,
)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment