Commit e741b44e authored by Christophe Benz

Update to data model 0.7

parent fbd58ae0
Pipeline #2020 passed with stage in 14 seconds
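The diff below reorganizes the target repository for data model 0.7: dataset directories move to the top level, per-series directories are replaced by one TSV file per series, the series list is embedded in dataset.json, and the category hierarchy moves into categories_tree.json. A sketch of the resulting layout, inferred from this diff (directory and file names illustrative):

    target_dir/
        provider.json
        datapackage.json         # declares "data_model_version": "0.7.3"
        categories_tree.json     # one root category named after the Excel file
        <dataset-code>/          # one directory per Excel sheet
            dataset.json         # includes the embedded "series" list
            <series-code>.tsv    # one "<period>\t<observation>" line per column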
@@ -33,18 +33,19 @@ import sys
from slugify import slugify
import xlrd
from dbnomics_data_model import validators
EXCEL_FILE_NAME_WITHOUT_EXTENSION = "LA PROTECTION SOCIALE EN FRANCE ET EN EUROPE - De 1959 à 2015"
EXCEL_FILE_NAME = "{}.xls".format(EXCEL_FILE_NAME_WITHOUT_EXTENSION)
PROVIDER_CODE = "DREES"
datapackage_json = {
"dbnomics": {
"data_model_version": "0.7.3"
}
}
PROVIDER = {
"code": PROVIDER_CODE,
"code": "DREES",
"name": "Direction de la recherche, des études, de l’évaluation et des statistiques",
"region": "France",
"slug": slugify(PROVIDER_CODE),
"terms_of_use": "http://drees.solidarites-sante.gouv.fr/etudes-et-statistiques/article/mentions-legales",
"website": "http://solidarites-sante.gouv.fr/ministere/organisation/directions/article/"
"drees-direction-de-la-recherche-des-etudes-de-l-evaluation-et-des-statistiques",
@@ -63,88 +64,56 @@ def main():
'target_dir',
help='path of target directory containing datasets & series in DB.nomics JSON and TSV formats',
)
parser.add_argument(
'--rewrite-history',
action='store_true',
help='erase and rewrite history, writing a commit to json data repo for each commit of source data repo',
)
parser.add_argument(
'--push',
action='store_true',
help='push commits to remote Git server',
)
args = parser.parse_args()
assert os.path.exists(args.source_dir)
if not os.path.exists(args.target_dir):
subprocess.check_call(['git', 'clone', TARGET_REPOSITORY_URL, args.target_dir])
# Remove all tracked files.
subprocess.check_call(
['git', 'rm', '--ignore-unmatch', '-r', '.'],
cwd=args.target_dir,
)
write_json_file(os.path.join(args.target_dir, 'provider.json'), PROVIDER)
excel_file_path = os.path.join(args.source_dir, EXCEL_FILE_NAME)
book = xlrd.open_workbook(excel_file_path)
# book_release_date_str = book.props['created'] # TODO Store the creation date
validators.validate_provider(PROVIDER)
write_json_file(os.path.join(args.target_dir, 'provider.json'), PROVIDER)
# The Excel file contains 10 sheets that should be handled as different datasets.
sheets = book.sheets()
# We consider the Excel file name to be a category.
category_json = {
first_category = {
"name": EXCEL_FILE_NAME_WITHOUT_EXTENSION,
"code": slugify(EXCEL_FILE_NAME_WITHOUT_EXTENSION),
}
category_dir_path = os.path.join(args.target_dir, EXCEL_FILE_NAME_WITHOUT_EXTENSION)
os.mkdir(category_dir_path)
dataset_codes = map(slugify, book.sheet_names())
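# Note: in Python 3, map() returns a one-shot iterator; if dataset_codes were
# iterated more than once, the second pass would see it exhausted. Wrap it in
# list() if it ever needs to be reused.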
for dataset_code, sheet in zip(dataset_codes, sheets):
write_dataset(category_dir_path, dataset_code, sheet)
validators.validate_category(category_json)
write_json_file(os.path.join(category_dir_path, 'category.json'), category_json)
def iter_first_category_children():
for dataset_code, sheet in zip(dataset_codes, sheets):
dataset_dir_path = os.path.join(args.target_dir, dataset_code)
os.mkdir(dataset_dir_path)
dataset_json = get_and_write_dataset_json(dataset_dir_path, dataset_code, sheet)
dataset = {"code": dataset_json["code"]}
dataset_name = dataset_json.get("name")
if dataset_name is not None:
dataset["name"] = dataset_name
yield dataset
subprocess.check_call(
['git', 'add', '--all'],
cwd=args.target_dir,
)
# Note: Ignore error occurring when there is nothing to commit.
subprocess.call(
['git', 'commit', '-m', 'New fetch'],
cwd=args.target_dir,
)
if args.push:
# Note: Ignore error occurring when there is nothing to push.
if args.rewrite_history:
subprocess.call(
['git', 'push', '-f', 'origin', 'master'],
cwd=args.target_dir,
)
else:
subprocess.call(
['git', 'push', 'origin', 'master'],
cwd=args.target_dir,
)
first_category["children"] = list(iter_first_category_children())
categories_tree_json = [first_category]
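# Illustrative shape of categories_tree.json after this change (child codes are
# slugified sheet names; values shown are assumptions):
# [{"code": "la-protection-sociale-en-france-et-en-europe-de-1959-a-2015",
#   "name": "LA PROTECTION SOCIALE EN FRANCE ET EN EUROPE - De 1959 à 2015",
#   "children": [{"code": "<sheet-slug>", "name": "<sheet title>"}, ...]}]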
write_json_file(os.path.join(args.target_dir, 'datapackage.json'), datapackage_json)
write_json_file(os.path.join(args.target_dir, 'categories_tree.json'), categories_tree_json)
return 0
def write_dataset(category_dir_path, dataset_code, sheet):
def get_and_write_dataset_json(dataset_dir_path, dataset_code, sheet):
"""Return dataset_json.
Side-effect: write dataset.json
"""
dataset_json = {"code": dataset_code}
top_left_cell_value = sheet.cell_value(0, 0)
if top_left_cell_value:
dataset_json["name"] = str(top_left_cell_value).strip()
dataset_dir_path = os.path.join(category_dir_path, dataset_code)
os.mkdir(dataset_dir_path)
# Extract time vector
first_row_values = sheet.row_values(0)
time_vector_values = first_row_values[1:]
@@ -152,23 +121,32 @@ def write_dataset(category_dir_path, dataset_code, sheet):
time_vector_values[-1] = time_vector_values[-1][:-2] # Remove "SD"
time_vector_values = list(map(int, time_vector_values))
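# Illustrative: a header row like ["", 1959.0, ..., "2015 SD"] (exact cell values
# assumed) becomes [1959, ..., 2015] once the trailing "SD" suffix is stripped above.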
# Extract series JSON and observation vectors
for row_index in range(1, sheet.nrows):
row = sheet.row_values(row_index)
if all(cell == '' for cell in row):
# An empty line means that there are "series notes" below, like:
# Source : DREES ...
# Pour en savoir plus : ...
break
write_series(dataset_dir_path, row_index, row, time_vector_values)
def iter_series_json():
"""Extract series JSON and observation vectors.
Side-effect: write TSV files in dataset directory.
"""
for row_index in range(1, sheet.nrows):
row = sheet.row_values(row_index)
if all(cell == '' for cell in row):
# An empty line means that there are "series notes" below, like:
# Source : DREES ...
# Pour en savoir plus : ...
break
yield get_series_json_and_write_tsv(dataset_dir_path, row_index, row, time_vector_values)
# TODO Convert dataset notes at the bottom of the Excel sheet.
validators.validate_dataset(dataset_json)
dataset_json["series"] = list(iter_series_json())
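# Resulting dataset.json shape (values illustrative; the first data row is
# row_index 1, hence series codes starting at "row-2-..."):
# {"code": "<sheet-slug>", "name": "<sheet title>",
#  "series": [{"code": "row-2-<series-slug>", "name": "..."}, ...]}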
write_json_file(os.path.join(dataset_dir_path, 'dataset.json'), dataset_json)
return dataset_json
def write_series(dataset_dir_path, row_index, row, time_vector):
def get_series_json_and_write_tsv(dataset_dir_path, row_index, row, time_vector):
"""Return series_json.
Side-effect: write TSV file in dataset directory.
"""
observations_vector = row[1:]
# validate_observations_vector = uniform_sequence(check_not_none)
# observations_vector = verified_value(validate_observations_vector(observations_vector))
@@ -177,20 +155,12 @@ def write_series(dataset_dir_path, row_index, row, time_vector):
assert series_name, row
series_code = "row-{}-{}".format(row_index + 1, slugify(series_name))
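# Illustrative: row_index 1 with the hypothetical name "Prestations sociales"
# yields series_code == "row-2-prestations-sociales".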
series_dir_path = os.path.join(dataset_dir_path, series_code)
os.mkdir(series_dir_path)
series_json = {
"code": series_code,
"name": series_name,
}
validators.validate_series(series_json)
write_json_file(os.path.join(series_dir_path, 'series.json'), series_json)
with open(os.path.join(series_dir_path, 'observations.tsv'), 'w') as observations_file:
with open(os.path.join(dataset_dir_path, '{}.tsv'.format(series_code)), 'w') as observations_file:
for period, observation in zip(time_vector, observations_vector):
observations_file.write("{}\t{}\n".format(period, observation))
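# Each series TSV holds one "<period>\t<observation>" line per column of the
# sheet row, e.g. "1959\t12.3" (values illustrative).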
return {"code": series_code, "name": series_name}
def write_json_file(file_path, data):
with open(file_path, 'w') as file_:
......