Skip to content
Snippets Groups Projects
Commit 72337249 authored by Christophe Benz
Browse files

Create temporary commits in a branch for datasets, and compress Git objects between commits.

Then merge the trees of all those commits in the branch into a single tree in master.
parent 9c29a896
No related branches found
No related tags found
No related merge requests found
Pipeline #1133 failed with stage
in 1 minute and 23 seconds
......@@ -26,7 +26,7 @@
import argparse
from collections import OrderedDict
from itertools import count, islice
import itertools
import json
import logging
import os
......@@ -42,6 +42,7 @@ from slugify import slugify
from dbnomics_data_model import validators
current_conversion_branch_name = "current-conversion"
log = logging.getLogger(__name__)
provider_code = 'Eurostat'
provider_json = dict(
......@@ -56,7 +57,7 @@ required_observations_tsv_column_names = ["TIME_PERIOD", "OBS_VALUE"]
sdmx_file_extension = ".sdmx.xml"
def convert_sdmx_file(repo, sdmx_file_path, provider_tree):
def convert_sdmx_file(repo, sdmx_file_path, previous_commit_id):
log.info("Converting %s", sdmx_file_path)
with open(sdmx_file_path) as sdmx_file:
sdmx_tree = etree.parse(sdmx_file)
......@@ -75,7 +76,7 @@ def convert_sdmx_file(repo, sdmx_file_path, provider_tree):
dataset_element = sdmx_tree.find("./data:DataSet", namespaces=sdmx_nsmap)
assert dataset_element is not None, sdmx_file_path
dataset_tree = repo.TreeBuilder()
dataset_tree_builder = repo.TreeBuilder()
dataset_json = {
"attributes_labels": {}, # Will be filled by every series.
......@@ -146,7 +147,7 @@ def convert_sdmx_file(repo, sdmx_file_path, provider_tree):
dimension_code, {})[dimension_value_code] = dimension_value_label
validators.validate_series(series_json)
dataset_tree.insert(
dataset_tree_builder.insert(
"{}.json".format(series_code),
repo.create_blob(json.dumps(series_json, ensure_ascii=False, indent=2, sort_keys=True)),
pygit2.GIT_FILEMODE_BLOB,
......@@ -214,7 +215,7 @@ def convert_sdmx_file(repo, sdmx_file_path, provider_tree):
)
for observation in observations_tsv
])
dataset_tree.insert(
dataset_tree_builder.insert(
"{}.tsv".format(series_code),
repo.create_blob(observations_tsv_str),
pygit2.GIT_FILEMODE_BLOB,
......@@ -223,21 +224,39 @@ def convert_sdmx_file(repo, sdmx_file_path, provider_tree):
# Write dataset.json
validators.validate_dataset(dataset_json)
dataset_tree.insert(
dataset_tree_builder.insert(
"dataset.json",
repo.create_blob(json.dumps(dataset_json, ensure_ascii=False, indent=2, sort_keys=True)),
pygit2.GIT_FILEMODE_BLOB,
)
log.info("Writing dataset tree...")
dataset_tree_id = dataset_tree.write()
dataset_tree_id = dataset_tree_builder.write()
log.info("Dataset tree ID: %s", dataset_tree_id)
provider_tree.insert(
provider_tree_builder = repo.TreeBuilder()
provider_tree_builder.insert(
dataset_code,
dataset_tree_id,
pygit2.GIT_FILEMODE_TREE,
)
log.info("Writing provider tree...")
provider_tree_id = provider_tree_builder.write()
log.info("Provider tree ID: %s", provider_tree_id)
log.info("Committing dataset in provider tree...")
author = pygit2.Signature(provider_code, "eurostat@db.nomics.world")
dataset_commit_id = repo.create_commit(
"refs/heads/{}".format(current_conversion_branch_name),
author,
author,
dataset_code,
provider_tree_id,
[previous_commit_id],
)
log.info("Commit ID: %s", dataset_commit_id)
return dataset_commit_id
def main():
script_name = os.path.basename(sys.argv[0])
......@@ -261,47 +280,98 @@ def main():
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
repo = pygit2.Repository(args.target_dir)
provider_tree = repo.TreeBuilder()
assert not repo.head_is_unborn, repo
master_commit = repo.head.get_object()
# Write provider.json
provider_tree_builder = repo.TreeBuilder()
validators.validate_provider(provider_json)
provider_tree.insert(
provider_tree_builder.insert(
"provider.json",
repo.create_blob(json.dumps(provider_json, ensure_ascii=False, indent=2, sort_keys=True)),
pygit2.GIT_FILEMODE_BLOB,
)
log.info("Writing provider tree...")
provider_tree_id = provider_tree_builder.write()
log.info("Provider tree ID: %s", provider_tree_id)
log.info("Committing provider...")
author = pygit2.Signature(provider_code, "eurostat@db.nomics.world")
current_conversion_provider_commit_id = repo.create_commit(
"refs/heads/{}".format(current_conversion_branch_name),
author,
author,
provider_code,
provider_tree_id,
[master_commit.hex],
)
log.info("Commit ID: %s", current_conversion_provider_commit_id)
# Iterate over datasets in source files.
if os.path.isdir(args.source_dir_or_sdmx_file):
data_dir_path = os.path.join(args.source_dir_or_sdmx_file, "data")
previous_commit_id = current_conversion_provider_commit_id
for dirpath, dirnames, filenames in os.walk(data_dir_path):
for filename in filenames:
if not filename.endswith(sdmx_file_extension):
continue
sdmx_file_path = os.path.abspath(os.path.join(dirpath, filename))
convert_sdmx_file(repo, sdmx_file_path, provider_tree)
previous_commit_id = convert_sdmx_file(
repo=repo,
sdmx_file_path=sdmx_file_path,
previous_commit_id=previous_commit_id,
)
log.info("Compressing Git objects...")
subprocess.run(["git", "gc"], cwd=args.target_dir, check=True)
else:
assert os.path.isfile(args.source_dir_or_sdmx_file) and \
args.source_dir_or_sdmx_file.endswith(sdmx_file_extension), args.source_dir_or_sdmx_file
convert_sdmx_file(repo, args.source_dir_or_sdmx_file, provider_tree)
convert_sdmx_file(
repo=repo,
sdmx_file_path=args.source_dir_or_sdmx_file,
previous_commit_id=current_conversion_provider_commit_id,
)
log.info("Writing provider tree...")
provider_tree_id = provider_tree.write()
log.info("Provider tree ID: %s", provider_tree_id)
# Merge trees step
log.info("Merge trees of commits in `current-conversion` branch into a single tree in `master`.")
def commit_with_parents(commit):
    """Return `commit` and all of its ancestors, ancestors first.

    The history is walked through each commit's `parents` attribute. The
    result lists every reachable commit exactly once, each one after all of
    its parents, and ends with `commit` itself. For a linear history this
    is simply the chain from the oldest commit to `commit`.

    The walk is iterative on purpose: one commit is created per dataset, so
    a recursive walk would hit Python's recursion limit on long chains.
    Commits are identified by their `id` attribute so that an ancestor
    reachable through several paths is expanded only once.
    """
    ordered = []
    seen_ids = set()
    # Each stack item is (commit, parents_done): a commit is pushed a first
    # time to expand its parents, then a second time to be emitted after them.
    stack = [(commit, False)]
    while stack:
        node, parents_done = stack.pop()
        if parents_done:
            ordered.append(node)
            continue
        if node.id in seen_ids:
            continue
        seen_ids.add(node.id)
        stack.append((node, True))
        # Push parents in reverse so the first parent is walked first.
        stack.extend((parent, False) for parent in reversed(list(node.parents)))
    return ordered
current_conversion_head_commit = repo.revparse_single(current_conversion_branch_name)
current_conversion_commits = commit_with_parents(current_conversion_head_commit)
log.info("Creating commit...")
author = pygit2.Signature(script_name, "eurostat@db.nomics.world")
commit_id = repo.create_commit(
"HEAD",
provider_tree_builder = repo.TreeBuilder()
for commit in current_conversion_commits:
for tree_entry in commit.tree:
provider_tree_builder.insert(tree_entry.name, tree_entry.hex, tree_entry.filemode)
provider_tree_id = provider_tree_builder.write()
repo.create_commit(
'refs/heads/master',
author,
author,
"New conversion",
provider_tree_id,
[] if repo.head_is_unborn else [repo.head.get_object().hex],
[master_commit.hex],
)
log.info("Commit ID: %s", commit_id)
current_conversion_branch = repo.branches.get(current_conversion_branch_name)
assert current_conversion_branch is not None
current_conversion_branch.delete()
log.info("Compressing Git objects and pruning unreachable objects...")
subprocess.run(["git", "gc", "--aggressive", "--prune=now"], cwd=args.target_dir, check=True)
return 0
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment