Skip to content
Snippets Groups Projects
Commit 4b0b83f5 authored by Constance de Quatrebarbes's avatar Constance de Quatrebarbes
Browse files

ADDING LEVEL1: category.json README.md + LEVEL2: dataset_dir for single files

parent 8c0281d1
No related branches found
No related tags found
1 merge request!1Implement download script and more
......@@ -31,6 +31,7 @@ import re
import sys
from collections import defaultdict
from slugify import slugify
import xlrd
from docopt import docopt
......@@ -334,6 +335,9 @@ TREE = [
}
]
TARGET_REPOSITORY_URL = "git@git.nomics.world:dbnomics-json-data/dares-json-data.git"
def get_root_categories():
'''from TREE declared as CONSTANT in file create CAT 0'''
categories_d = {cat["code"]: cat["slug"] for cat in TREE}
......@@ -342,14 +346,16 @@ def get_root_categories():
def build_root_categories(dest_dir):
    '''
    Build the provider level of the tree:
    create one directory per level-0 category under <dest_dir>,
    then write provider.json and README.md there.
    '''
    for category_name in get_root_categories():
        os.makedirs(os.path.join(dest_dir, category_name))
    create_provider_json(dest_dir)
    create_provider_md(dest_dir)
def build_datasets():
def create_provider_json(dest_dir):
'''Create provider.json'''
provider_json_data = PROVIDER
......@@ -373,30 +379,65 @@ def create_provider_md(dest_dir):
f.write(msg_part+"\n"+msg_part2)
return
# def create_tree(dest_dir):
# '''Create CATEGORIES level 0 inside <dest_dir>'''
# for cat in TREE:
# cat_dir = os.path.join(dest_dir, cat["name"])
# os.makedirs(cat_dir)
# # if there is only one file, use the category and each sheet is a dataset
# if cat["file_nb"] == 1:
# create_categorie_json(cat_dir, cat)
# # singular case: category 18 is a category and each sheet is a dataset
# elif cat["category_code"] == 18:
# create_categorie_json(cat_dir, cat)
# else:
# # create_categorie_json(cat_dir, cat)
# #if multiple file each file is a categorie
# for i, file in enumerate(cat["files"]):
# sub_cat["category_code"] = "%i.%i" %(cat["category_code"],i+1
# dir_file = "%i.%i %s" %(cat["category_code"],i,re.sub("\.xls?", "", file["slug"])
# sub_cat = os.path.join(cat_dir, dir_file)
# os.makedirs(sub_cat)
# # create_categorie_json(sub_cat, cat)
def write_category_to_json(category_json_data, dest_dir):
    '''Validate the category data and write it to <dest_dir>/category.json.'''
    validated = verified_value(validate_category(category_json_data, format='json'))
    serialized = json.dumps(validated, sort_keys=True,
                            indent=4, separators=(',', ': '), ensure_ascii=False)
    json_path = os.path.join(dest_dir, "category.json")
    with open(json_path, "w") as json_file:
        json_file.write(serialized)
    return
def write_category_to_md(category_json_data, dest_dir):
    '''Write <dest_dir>/README.md for a category: title, metadata link, dataset list.'''
    lines = [
        '# Category %s %s\n' % (category_json_data["category_code"], category_json_data["name"]),
        'Metadata: [category.json](category.json)\n\n',
        '## Datasets\n',
    ]
    for dataset_slug in category_json_data["datasets"]:
        # Link text is the slug with dashes turned back into spaces;
        # the link target is the slug (the dataset directory name).
        lines.append("- [%s](%s)" % (dataset_slug.replace("-", " "), dataset_slug))
    readme_path = os.path.join(dest_dir, "README.md")
    with open(readme_path, "w") as readme_file:
        readme_file.write("\n".join(lines))
    return
def load_dataset(source_dir, dest_dir):
    '''Build the category/dataset directory tree from the Excel sources.

    For each category of TREE that carries a "datasets" list of Excel
    file names: open each workbook from <source_dir>, derive one dataset
    per sheet (slugified sheet name), write category.json and README.md
    into the category directory under <dest_dir>, and create one
    sub-directory per dataset.
    '''
    for category in TREE:
        # Some TREE entries carry no "datasets" key; there is nothing
        # to build for them (the original code swallowed this with a
        # broad KeyError handler — test explicitly instead).
        if "datasets" not in category:
            continue
        for dataset in category["datasets"]:
            # `dataset` is an Excel file name relative to <source_dir>.
            excel_file_path = os.path.join(source_dir, dataset)
            book = xlrd.open_workbook(excel_file_path)
            # The Excel file contains sheets that should be handled
            # as different datasets.
            # BUG FIX: the original tested dataset["code"] but `dataset`
            # is a file-name string — indexing it with "code" raises
            # TypeError. The code lives on the category (cf. the %i
            # formatting of category["code"] below).
            if category["code"] in [1, 10, 13]:
                # Skip the first sheet ("To Read" / "Synthèse"):
                # it is documentation, not data.
                datasets = sorted([slugify(n) for n in book.sheet_names()[1:]])
            else:
                # Each sheet is a dataset.
                datasets = sorted([slugify(n) for n in book.sheet_names()])
            cat_dir_name = "%i %s" % (category["code"], category["slug"])
            category_dir_path = os.path.join(dest_dir, cat_dir_name)
            # Write category.json and README.md into the category directory.
            category_json = {
                "name": category["slug"],
                "category_code": str(category["code"]),
                "datasets": datasets,
            }
            write_category_to_json(category_json, category_dir_path)
            write_category_to_md(category_json, category_dir_path)
            for dataset_slug in datasets:
                # exist_ok covers the last category, whose two datasets
                # share a directory (was try/except FileExistsError).
                os.makedirs(os.path.join(category_dir_path, dataset_slug),
                            exist_ok=True)
# break
def main():
args = docopt(__doc__)
source_dir = os.path.abspath(args["<source_dir>"])
......@@ -406,7 +447,7 @@ def main():
#0. Create the provider : provider.json + README.md + build root_dir
build_root_categories(dest_dir)
#1. Build the directories following architecture from categories to datasets
load_dataset(source_dir, dest_dir)
# create_tree(dest_dir)
# for excel_f in os.listdir(source_dir):
# excel_file_path = os.path.join(args["<source_dir>"], excel_f)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment