dbnomics-fetchers / dares-fetcher / Commits

Commit 4b0b83f5
authored 7 years ago by Constance de Quatrebarbes

ADDING LEVEL1: category.json README.md + LEVEL2: dataset_dir for single files

Parent: 8c0281d1
No related branches or tags found.
1 merge request: !1 Implement download script and more
Showing 1 changed file: dares_to_dbnomics.py, with 66 additions and 25 deletions.
@@ -31,6 +31,7 @@ import re
import sys
from collections import defaultdict
from slugify import slugify
import xlrd
from docopt import docopt
@@ -334,6 +335,9 @@ TREE = [
    }
]

TARGET_REPOSITORY_URL = "git@git.nomics.world:dbnomics-json-data/dares-json-data.git"


def get_root_categories():
    '''Create CAT 0 from TREE, declared as a CONSTANT in this file.'''
    categories_d = {cat["code"]: cat["slug"] for cat in TREE}
@@ -342,14 +346,16 @@ def get_root_categories():
def build_root_categories(dest_dir):
    '''
    get category 0
    build the categories of category 0
    '''
    for cat in get_root_categories():
        os.makedirs(os.path.join(dest_dir, cat))
    create_provider_json(dest_dir)
    create_provider_md(dest_dir)
    return


def create_provider_json(dest_dir):
    '''Create provider.json'''
    provider_json_data = PROVIDER
@@ -373,30 +379,65 @@ def create_provider_md(dest_dir):
        f.write(msg_part + "\n" + msg_part2)
    return


# def create_tree(dest_dir):
#     '''Create CATEGORIES level 0 inside <dest_dir>'''
#     for cat in TREE:
#         cat_dir = os.path.join(dest_dir, cat["name"])
#         os.makedirs(cat_dir)
#         # if there is only one file, use the categorie and each sheet is a dataset
#         if cat["file_nb"] == 1:
#             create_categorie_json(cat_dir, cat)
#         # singular case: categorie 18 is a categorie and each sheet is a dataset
#         elif cat["category_code"] == 18:
#             create_categorie_json(cat_dir, cat)
#         else:
#             # create_categorie_json(cat_dir, cat)
#             # if there are multiple files, each file is a categorie
#             for i, file in enumerate(cat["files"]):
#                 sub_cat["category_code"] = "%i.%i" % (cat["category_code"], i+1)
#                 dir_file = "%i.%i %s" % (cat["category_code"], i, re.sub("\.xls?", "", file["slug"]))
#                 sub_cat = os.path.join(cat_dir, dir_file)
#                 os.makedirs(sub_cat)
#                 # create_categorie_json(sub_cat, cat)
def write_category_to_json(category_json_data, dest_dir):
    category_json_data = verified_value(validate_category(category_json_data, format='json'))
    with open(os.path.join(dest_dir, "category.json"), "w") as f:
        jdata = json.dumps(category_json_data, sort_keys=True, indent=4,
                           separators=(',', ': '), ensure_ascii=False)
        f.write(jdata)
    return
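
# Illustrative output of write_category_to_json, assuming a category dict
# shaped like the one built in load_dataset below (the values here are
# hypothetical, and the validate_category/verified_value normalization is
# ignored). With sort_keys=True, indent=4 and the separators above,
# json.dumps would emit:
#
#     {
#         "category_code": "1",
#         "datasets": [
#             "donnees-brutes",
#             "donnees-cvs"
#         ],
#         "name": "emploi-interimaire"
#     }
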
def write_category_to_md(category_json_data, dest_dir):
    f_datasets = ["- [%s](%s)" % (n.replace("-", " "), n)
                  for n in category_json_data["datasets"]]
    msg = [
        '# Category %s %s\n' % (category_json_data["category_code"],
                                category_json_data["name"]),
        'Metadata: [category.json](category.json)\n\n',
        '## Datasets\n',
    ]
    msg.extend(f_datasets)
    msg_part = ("\n").join(msg)
    with open(os.path.join(dest_dir, "README.md"), "w") as f:
        f.write(msg_part)
    return
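
# With the same hypothetical category dict, write_category_to_md would
# produce a README.md along these lines (reconstructed from the format
# strings above; the embedded "\n" markers plus the "\n" join account for
# the blank lines):
#
#     # Category 1 emploi-interimaire
#
#     Metadata: [category.json](category.json)
#
#
#     ## Datasets
#
#     - [donnees brutes](donnees-brutes)
#     - [donnees cvs](donnees-cvs)
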
def load_dataset(source_dir, dest_dir):
    for category in TREE:
        try:
            for dataset in category["datasets"]:
                excel_file_path = os.path.join(source_dir, dataset)
                book = xlrd.open_workbook(excel_file_path)
                # The Excel file contains sheets that should be handled
                # as different datasets.
                sheets = book.sheets()
                # for these categories,
                if category["code"] in [1, 10, 13]:
                    # skip the first sheet ("To Read" or "Synthèse")
                    datasets = sorted([slugify(n) for n in book.sheet_names()[1:]])
                else:
                    # each sheet is a dataset
                    datasets = sorted([slugify(n) for n in book.sheet_names()])
                cat_dir_name = "%i %s" % (category["code"], category["slug"])
                # write category.json into the category directory
                category_dir_path = os.path.join(dest_dir, cat_dir_name)
                datasets_dir_path = [os.path.join(category_dir_path, d)
                                     for d in datasets]
                category_json = {
                    "name": category["slug"],
                    "category_code": str(category["code"]),
                    "datasets": datasets,
                }
                write_category_to_json(category_json, category_dir_path)
                write_category_to_md(category_json, category_dir_path)
                for dataset_dir in datasets_dir_path:
                    try:
                        os.makedirs(dataset_dir)
                    # exception: the last category has two datasets...
                    except FileExistsError:
                        pass
        except KeyError:
            pass
        # break
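
# For orientation (hypothetical names, not taken from the repository):
# given a TREE entry with code 1 and slug "emploi-interimaire" whose
# workbook has sheets "Synthèse", "Données brutes" and "Données CVS",
# load_dataset would skip the first sheet (code 1 is in [1, 10, 13]) and
# create a layout like:
#
#     <dest_dir>/
#         1 emploi-interimaire/
#             category.json
#             README.md
#             donnees-brutes/
#             donnees-cvs/
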
def main():
    args = docopt(__doc__)
    source_dir = os.path.abspath(args["<source_dir>"])
@@ -406,7 +447,7 @@ def main():
    # 0. Create the provider: provider.json + README.md + build root_dir
    build_root_categories(dest_dir)
    # 1. Build the directories following the architecture from categories to datasets
    load_dataset(source_dir, dest_dir)
    # create_tree(dest_dir)
    # for excel_f in os.listdir(source_dir):
    #     excel_file_path = os.path.join(args["<source_dir>"], excel_f)
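
# The script is presumably invoked along these lines; the actual docopt
# usage string is defined at the top of dares_to_dbnomics.py, outside this
# diff, so both argument names are assumptions inferred from
# args["<source_dir>"] and dest_dir above:
#
#     python dares_to_dbnomics.py <source_dir> <dest_dir>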