...
 
Commits (2)
#! /usr/bin/env python3
# drees-fetcher -- Fetch series from DREES
# By: Christophe Benz <christophe.benz@cepremap.org>
#
# Copyright (C) 2017 Cepremap
# https://git.nomics.world/dbnomics-fetchers/drees-fetcher
#
# This is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
......@@ -22,7 +22,7 @@
"""
Fetch series from DREES (Direction de la recherche, des études, de l’évaluation et des statistiques).
See http://www.data.drees.sante.gouv.fr/ReportFolders/reportFolders.aspx?IF_ActivePath=P,304,305
"""
......@@ -35,7 +35,7 @@ import sys
import requests
# Diff residue repaired: the stale pre-rename assignment
# ("dress-source-data.git") that appeared above the renamed one has been
# dropped; only the post-rename value is kept.
TARGET_REPOSITORY_URL = "git@git.nomics.world:dbnomics-source-data/drees-source-data.git"

# Name of the Excel workbook downloaded from the DREES site (see EXCEL_URL).
EXCEL_FILE_NAME = "LA PROTECTION SOCIALE EN FRANCE ET EN EUROPE - De 1959 à 2015.xls"
# Direct download URL for that workbook on the DREES data portal.
EXCEL_URL = "http://www.data.drees.sante.gouv.fr/TableViewer/document.aspx?ReportId=3012"
......@@ -44,7 +44,7 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'target_dir',
help='path of target directory containing DRESS series in source format',
help='path of target directory containing DREES series in source format',
)
parser.add_argument(
'--debug',
......
#! /usr/bin/env python3
# drees-fetcher -- Fetch series from DREES
# By: Christophe Benz <christophe.benz@cepremap.org>
#
# Copyright (C) 2017 Cepremap
# https://git.nomics.world/dbnomics-fetchers/drees-fetcher
#
# This is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
......@@ -21,7 +21,7 @@
# along with this program. If not, see <http:>www.gnu.org/licenses/>.
"""Convert series from DREES macro economic database to DB.nomics JSON and TSV formats."""
import argparse
......@@ -33,33 +33,31 @@ import sys
from slugify import slugify
import xlrd
from dbnomics_converters.base import check_converter, check_not_none, pipe, to_float, to_int, uniform_sequence, verified_value
from dbnomics_converters.categories import validate_category
from dbnomics_converters.datasets import validate_dataset
from dbnomics_converters.providers import validate_provider
from dbnomics_converters.series import validate_series
from dbnomics_data_model import validators
# NOTE(review): this span was diff residue showing both the old and the new
# PROVIDER definition interleaved. Reconstructed here is the post-commit
# version: "code"/"name"/"slug" keys replace the old "name"/"long_name"/
# "categories" keys, and the repository URL uses the renamed "drees" name.

# Base name of the source Excel workbook; also used as the category name.
EXCEL_FILE_NAME_WITHOUT_EXTENSION = "LA PROTECTION SOCIALE EN FRANCE ET EN EUROPE - De 1959 à 2015"
EXCEL_FILE_NAME = "{}.xls".format(EXCEL_FILE_NAME_WITHOUT_EXTENSION)

PROVIDER_CODE = "DREES"

# Provider metadata written to provider.json (validated by
# validators.validate_provider in main).
PROVIDER = {
    "code": PROVIDER_CODE,
    "name": "Direction de la recherche, des études, de l’évaluation et des statistiques",
    "region": "France",
    "slug": slugify(PROVIDER_CODE),
    "terms_of_use": "http://drees.solidarites-sante.gouv.fr/etudes-et-statistiques/article/mentions-legales",
    "website": "http://solidarites-sante.gouv.fr/ministere/organisation/directions/article/"
               "drees-direction-de-la-recherche-des-etudes-de-l-evaluation-et-des-statistiques",
}

TARGET_REPOSITORY_URL = "git@git.nomics.world:dbnomics-json-data/drees-json-data.git"
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'source_dir',
help='path of source directory containing DRESS series in source format',
help='path of source directory containing DREES series in source format',
)
parser.add_argument(
'target_dir',
......@@ -91,8 +89,8 @@ def main():
book = xlrd.open_workbook(excel_file_path)
# book_release_date_str = book.props['created'] # TODO Store the creation date
provider_json = verified_value(validate_provider(PROVIDER, format='json'))
write_json_file(os.path.join(args.target_dir, 'provider.json'), provider_json)
validators.validate_provider(PROVIDER)
write_json_file(os.path.join(args.target_dir, 'provider.json'), PROVIDER)
# The Excel file contains 10 sheets that should be handled as different data set.
sheets = book.sheets()
......@@ -100,18 +98,16 @@ def main():
# We consider the Excel file name to be a category.
category_json = {
"name": EXCEL_FILE_NAME_WITHOUT_EXTENSION,
"category_code": slugify(EXCEL_FILE_NAME_WITHOUT_EXTENSION),
"datasets": [],
"code": slugify(EXCEL_FILE_NAME_WITHOUT_EXTENSION),
}
category_dir_path = os.path.join(args.target_dir, EXCEL_FILE_NAME_WITHOUT_EXTENSION)
os.mkdir(category_dir_path)
dataset_codes = map(slugify, book.sheet_names())
for dataset_code, sheet in zip(dataset_codes, sheets):
dataset_dir_name = write_dataset(category_dir_path, dataset_code, sheet)
category_json["datasets"].append(dataset_dir_name)
write_dataset(category_dir_path, dataset_code, sheet)
category_json = verified_value(validate_category(category_json, format='json'))
validators.validate_category(category_json)
write_json_file(os.path.join(category_dir_path, 'category.json'), category_json)
subprocess.check_call(
......@@ -140,10 +136,7 @@ def main():
def write_dataset(category_dir_path, dataset_code, sheet):
dataset_json = {
"dataset_code": dataset_code,
"series": [],
}
dataset_json = {"code": dataset_code}
top_left_cell_value = sheet.cell_value(0, 0)
if top_left_cell_value:
......@@ -153,52 +146,51 @@ def write_dataset(category_dir_path, dataset_code, sheet):
os.mkdir(dataset_dir_path)
# Extract time vector
to_time_vector = uniform_sequence(pipe(to_int, check_not_none))
first_row_values = sheet.row_values(0)
time_vector_values = first_row_values[1:]
assert time_vector_values[-1].endswith("SD"), time_vector_values[-1]
time_vector_values[-1] = time_vector_values[-1][:-2] # Remove "SD"
time_vector = verified_value(to_time_vector(time_vector_values))
time_vector_values = list(map(int, time_vector_values))
# Extract series JSON and observation vectors
for row_index in range(1, sheet.nrows):
row = sheet.row_values(row_index)
series_dir_name = write_series(dataset_dir_path, row_index, row, time_vector)
dataset_json["series"].append(series_dir_name)
if all(cell == '' for cell in row):
# An empty line means that there are "series notes" below, like:
# Source : DREES ...
# Pour en savoir plus : ...
break
write_series(dataset_dir_path, row_index, row, time_vector_values)
# TODO Convert dataset notes at the bottom of the Excel sheet.
dataset_json = verified_value(validate_dataset(dataset_json, format='json'))
validators.validate_dataset(dataset_json)
write_json_file(os.path.join(dataset_dir_path, 'dataset.json'), dataset_json)
return dataset_code
def write_series(dataset_dir_path, row_index, row, time_vector):
    """Write one series (series.json + observations.tsv) extracted from an Excel row.

    NOTE(review): this function was diff residue showing both the pre- and
    post-commit versions interleaved (``series_key`` vs ``series_code``,
    live vs commented-out observation validation). Reconstructed here is the
    post-commit version.

    Parameters:
        dataset_dir_path: directory of the enclosing dataset; the series
            directory is created inside it.
        row_index: 0-based index of the row in the sheet (the series code
            uses the 1-based row number).
        row: list of cell values; row[0] is the series name, the rest are
            observations aligned with time_vector.
        time_vector: sequence of periods (one per observation column).
    """
    observations_vector = row[1:]
    # Observation validation disabled in the post-commit version:
    # validate_observations_vector = uniform_sequence(check_not_none)
    # observations_vector = verified_value(validate_observations_vector(observations_vector))
    series_name = row[0].strip()
    assert series_name, row
    # Code embeds the 1-based Excel row number so it stays unique per sheet.
    series_code = "row-{}-{}".format(row_index + 1, slugify(series_name))
    series_dir_path = os.path.join(dataset_dir_path, series_code)
    os.mkdir(series_dir_path)
    series_json = {
        "code": series_code,
        "name": series_name,
    }
    validators.validate_series(series_json)
    write_json_file(os.path.join(series_dir_path, 'series.json'), series_json)
    # One "period<TAB>observation" line per column.
    with open(os.path.join(series_dir_path, 'observations.tsv'), 'w') as observations_file:
        for period, observation in zip(time_vector, observations_vector):
            observations_file.write("{}\t{}\n".format(period, observation))
    # presumably the return value is now unused by write_dataset — kept for
    # backward compatibility; TODO confirm against the post-commit caller.
    return series_code
def write_json_file(file_path, data):
with open(file_path, 'w') as file_:
......