dbnomics-fetchers / google-fetcher · Commits

Commit fa5c6d51
authored Oct 29, 2020 by opentable fetcher
adding new dimensions, refactor code

parent a96e2516
Pipeline #195049 passed with stages in 7 minutes and 13 seconds
Changes: 1 · Pipelines: 21
Showing 1 changed file with 137 additions and 127 deletions

convert.py (+137 −127) · view file @ fa5c6d51
...
...
@@ -29,12 +29,13 @@ See also `.gitlab-ci.yml` in which data is committed to a Git repository of conv
 """
 import argparse
+import csv
 import json
 import logging
 import sys
 from pathlib import Path
 from datetime import datetime
 import pandas as pd
+import unicodedata
 from dbnomics_json_errors import ErrorsArtifact
...
...
@@ -53,12 +54,12 @@ ds_info = {
 }

 places = {
-    'RETREC': 'Retail and recreation',
-    'GRPH': 'Grocery and pharmacy',
-    'PARKS': 'Parks',
-    'TRANSIT': 'Transit stations',
-    'WORK': 'Workplaces',
-    'RESID': 'Residential'
+    'retrec': 'Retail and recreation',
+    'grph': 'Grocery and pharmacy',
+    'parks': 'Parks',
+    'transit': 'Transit stations',
+    'work': 'Workplaces',
+    'resid': 'Residential'
 }
...
...
@@ -94,11 +95,6 @@ def main():
     dataset_code = 'GMR'

-    # read with explicit type and missing value handling (the latter is needed for country code 'NA', Namibia)
-    df = pd.read_csv(str(source_dir) + '/Global_Mobility_Report.csv', na_filter=True,
-                     keep_default_na=False, na_values=[''], low_memory=False)
-    # it may be useful to test for column counts - and raise an error if column numbers change

     # Creates dataset_dir if not already done.
     dataset_dir = target_dir / dataset_code
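Note: the removed read_csv options matter because pandas' default NA handling would otherwise swallow Namibia's ISO code. A minimal sketch (with an invented two-row CSV) of the behaviour the comment describes:

import io
import pandas as pd

# With keep_default_na=False plus na_values=[''], only empty cells become
# missing values, so the literal string 'NA' (Namibia) survives intact.
csv_text = "country_region_code,value\nNA,1\n,2\n"
df = pd.read_csv(io.StringIO(csv_text), keep_default_na=False, na_values=[''])
print(df['country_region_code'].tolist())  # ['NA', nan]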
...
...
@@ -108,146 +104,160 @@ def main():
     # provider.json
     write_json_file(target_dir / 'provider.json', provider_json)

-    # Gets dimension values from data
-    dimension_list = compute_dimension_list(df)
+    with open(source_dir / "Global_Mobility_Report.csv") as fin, \
+            open(dataset_dir / "series.jsonl", "w") as fout:
+        other_dimensions = write_series_jsonl(fin, fout)

     #write_json_file(dataset_dir / 'dataset.json', dataset_json)
-    generate_dataset_json(dataset_dir / 'dataset.json', ds_info, dimension_list)
+    generate_dataset_json(dataset_dir / 'dataset.json', ds_info, other_dimensions)

-    # series.jsonl
-    write_series_jsonl(dataset_dir / 'series.jsonl', df, dimension_list)
     #print("Program ends at: ", datetime.now().strftime("%H:%M:%S"))
     return 0


-def generate_dataset_json(filepath: Path, ds_info, dimension_list):
+def generate_dataset_json(filepath: Path, ds_info, other_dimensions):
     """Generates dataset.json file from dataset info and dimension list"""
-    dim_v_l = {t[0]: t[2] for t in dimension_list}
+    other_dimensions.update({"place": places, "freq": {"D": "Daily"}})
     data = {
         'code': ds_info['code'],
         'name': ds_info['name'],
-        'dimensions_codes_order': [t[0] for t in dimension_list],
-        'dimensions_labels': {t[0]: t[1] for t in dimension_list},
-        'dimensions_values_labels': dim_v_l,
+        'dimensions_codes_order': ["country", "region_1", "region_2",
+                                   "metro_area", "place", "freq"],
+        'dimensions_labels': {"country": "Country", "region_1": "Region 1",
+                              "region_2": "Region 2", "metro_area": "Metropolitan area",
+                              "place": "Place", "freq": "Frequency"},
+        'dimensions_values_labels': other_dimensions
     }
     write_json_file(filepath, data)
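For orientation, a runnable sketch of the dataset.json shape the new generate_dataset_json emits; the country and region values here are invented placeholders, and places is trimmed to a single entry:

import json

other_dimensions = {"country": {"FR": "France"}, "region_1": {"all": "ALL"},
                    "region_2": {"all": "ALL"}, "metro_area": {"all": "ALL"}}
# Mirrors the update() above: 'place' and 'freq' are folded in before writing.
other_dimensions.update({"place": {"parks": "Parks"}, "freq": {"D": "Daily"}})
print(json.dumps({
    'code': 'GMR',
    'dimensions_codes_order': ["country", "region_1", "region_2",
                               "metro_area", "place", "freq"],
    'dimensions_values_labels': other_dimensions,
}, ensure_ascii=False, indent=2, sort_keys=True))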
-def compute_dimension_list(df):
-    """Extracts dimension dicts from dimension codes"""
-    freq_dict = {'D': 'Daily'}
-    code = df[['country_region_code', 'sub_region_1', 'sub_region_2']].fillna('').apply(
-        lambda x: "_".join(filter(None, x)), axis=1)
-    label = df[['country_region', 'sub_region_1', 'sub_region_2']].fillna('').apply(
-        lambda x: "_".join(filter(None, x)), axis=1)
-    area_dict = dict(zip(label_to_code(code), label_to_code(label)))
-    return [('FREQ', 'Frequency', freq_dict),
-            ('places', 'Places', places),
-            ('area', 'Area', area_dict)]


 def write_json_file(file_path: Path, data):
     """Writes data the JSON way to file_path"""
     with file_path.open('w', encoding='utf-8') as json_fd:
         json.dump(data, json_fd, ensure_ascii=False, indent=2, sort_keys=True)


-def write_series_jsonl(series_filepath: Path, df, dimension_list):
+def parse_line(line):
+    metadata = {
+        "country_code": line[0],
+        "country": line[1],
+        "region_1": line[2],
+        "region_2": line[3],
+        "metro_area": line[4],
+    }
+    data = line[7:]
+    return (metadata, data)


+def make_observations(data, index):
+    observations = [("PERIOD", "VALUE")]
+    for d in data:
+        v = d[index + 1]
+        if len(v) == 0:
+            obs = (d[0], 'NA')
+        else:
+            obs = (d[0], float(v))
+        observations.append(obs)
+    return observations


+def write_series(fout, metadata, data):
+    code_elements = [metadata["country_code"]]
+    name_elements = [metadata["country"]]
+    region_1 = metadata["region_1"]
+    region_1_code = get_code(region_1)
+    if len(region_1) > 0:
+        code_elements.append(region_1_code)
+        name_elements.append(region_1)
+    region_2 = metadata["region_2"]
+    region_2_code = get_code(region_2)
+    if len(region_2) > 0:
+        code_elements.append(region_2_code)
+        name_elements.append(region_2)
+    metro_area = metadata["metro_area"]
+    metro_area_code = get_code(metro_area).replace('_metropolitan_area', '')
+    if len(metro_area) > 0:
+        code_elements.append(metro_area_code)
+        name_elements.append(metro_area)
+    code_stub = '.'.join(code_elements)
+    name_stub = ' - '.join(name_elements)
+    for i, v in enumerate(['retrec', 'grph', 'parks', 'transit', 'work', 'resid']):
+        ts_data = {
+            'code': '.'.join([code_stub, v]),
+            'name': ' - '.join([name_stub, places[v]]),
+            'dimensions': {
+                'country': metadata["country_code"],
+                'region_1': region_1_code,
+                'region_2': region_2_code,
+                'metro_area': metro_area_code,
+                'place': v.lower(),
+                'freq': 'D'
+            },
+            'observations': make_observations(data, i)
+        }
+        json_str = json.dumps(ts_data, ensure_ascii=False, sort_keys=True)
+        fout.write(json_str + '\n')


+def get_code(s):
+    if len(s) == 0:
+        return "all"
+    else:
+        return (unicodedata.normalize('NFD', s).encode('ascii', 'ignore').lower()
+                .decode('ascii', 'ignore').replace(' ', '_').replace('/', '_')
+                .replace("_metropolitan_area", ""))
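A quick check of the normalization trick in get_code: NFD decomposition splits an accented character into its base letter plus a combining mark, and encoding to ASCII with errors='ignore' then drops the mark. A standalone sketch with an invented label:

import unicodedata

s = "São Paulo"
# 'ã' decomposes to 'a' + U+0303; the ASCII round-trip discards U+0303.
slug = (unicodedata.normalize('NFD', s).encode('ascii', 'ignore').lower()
        .decode('ascii', 'ignore').replace(' ', '_').replace('/', '_'))
print(slug)  # sao_paulo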
+def write_series_jsonl(fin, fout):
     """Write series list to series.jsonl file"""
-    rename_places = {'retail_and_recreation_percent_change_from_baseline': 'RETREC',
-                     'grocery_and_pharmacy_percent_change_from_baseline': 'GRPH',
-                     'parks_percent_change_from_baseline': 'PARKS',
-                     'transit_stations_percent_change_from_baseline': 'TRANSIT',
-                     'workplaces_percent_change_from_baseline': 'WORK',
-                     'residential_percent_change_from_baseline': 'RESID'}
+    country_dict = {}
+    other_dimensions = {"country": {}, "region_1": {}, "region_2": {}, "metro_area": {}}

-    df = df.rename(columns=rename_places)
-    code = df[['country_region_code', 'sub_region_1', 'sub_region_2']].fillna('').apply(
-        lambda x: "_".join(filter(None, x)), axis=1)
-    df['ref_area_code'] = label_to_code(code)
-    modified_df = df.drop(columns=['country_region_code', 'country_region', 'sub_region_1',
-                                   'sub_region_2', 'metro_area', 'iso_3166_2_code',
-                                   'census_fips_code'])
-    pivoted_df = pd.pivot_table(modified_df,
-                                values=['RETREC', 'GRPH', 'PARKS', 'TRANSIT', 'WORK', 'RESID'],
-                                columns='date', index=['ref_area_code'])
-    dim_v_l = {t[0]: t[2] for t in dimension_list}
-    # Write series.jsonl
-    with series_filepath.open('wt', encoding='utf-8') as fd:
-        for area in places:
-            for i in range(len(pivoted_df[area])):
-                ts_dimension = {'area': pivoted_df[area].index[i], 'places': area}
-                name = ' - '.join(create_name_label(ts_dimension, dim_v_l))
-                ts_dimension.update({'FREQ': 'D'})
-                period_list = pivoted_df[area].columns.to_list()
-                value_list = (pivoted_df[area].iloc[i]).fillna('NA').to_list()
-                obs_list = [('PERIOD', 'VALUE')] + list(zip(period_list, value_list))
-                ts_data = {
-                    'code': pivoted_df[area].index[i] + '.' + area,
-                    'name': name,
-                    'dimensions': ts_dimension,
-                    'observations': obs_list,
-                }
-                json_str = json.dumps(ts_data, ensure_ascii=False, sort_keys=True)
-                fd.write(json_str + '\n')


-def create_name_label(ts_dimensions: dict, dict_dim: dict):
-    L = []
-    for i in ts_dimensions:
-        tmp = ts_dimensions[i]
-        L.append(dict_dim[i][tmp])
-    return L


-SYMBOLS = {' ': '_', '.': '-', '-': '_', '/': '_'}


-def after_slugify(str_value):
-    """Replace symbols in the expression after calling slugify"""
-    for k in SYMBOLS:
-        str_value = str_value.replace(k, SYMBOLS[k])
-    return str_value


-def label_to_code(df_value: pd.Series):
-    """Replace capital letters with lowercase ones"""
-    result = df_value
-    result.str.encode('ascii', 'ignore')
-    result.str.decode('ascii')
-    #result = result.apply(lambda x: slugify(x))
-    result = result.apply(lambda x: after_slugify(x))
-    return result


-def create_dimenson_dict(df_code: pd.Series, df_label: pd.Series):
-    """This function uses Series to build the dictionary, which abstracts away
-    the orientation within the dataframe"""
-    if df_code.empty and df_label.empty:
-        raise ValueError('Please provide either a non-empty list of codes or a non-empty list of labels.')
-    if df_code.empty:
-        df_code = label_to_code(df_label)
-    if df_label.empty:
-        df_label = df_code
-    df_dict = dict(zip(df_code, df_label))
-    return df_dict

+    data = [[]]
+    csvreader = csv.reader(fin, delimiter=',', quotechar='"')
+    for lineno, line in enumerate(csvreader):
+        if lineno == 0:
+            continue
+        elif lineno == 1:
+            (metadata, data[0]) = parse_line(line)
+            country_code = metadata["country_code"]
+            if country_code not in other_dimensions["country"]:
+                other_dimensions["country"][country_code] = metadata["country"]
+            for d in ["region_1", "region_2", "metro_area"]:
+                label = metadata[d]
+                if len(label) > 0:
+                    code = get_code(label)
+                    other_dimensions[d][code] = label
+                else:
+                    other_dimensions[d]["all"] = "ALL"
+            period_counter = 0
+        else:
+            (new_metadata, new_data) = parse_line(line)
+            if new_data[0] > data[period_counter][0]:
+                data.append(new_data)
+                period_counter += 1
+            else:
+                write_series(fout, metadata, data)
+                metadata = new_metadata
+                country_code = metadata["country_code"]
+                if country_code not in other_dimensions["country"]:
+                    other_dimensions["country"][country_code] = metadata["country"]
+                for d in ["region_1", "region_2", "metro_area"]:
+                    label = metadata[d]
+                    if len(label) > 0:
+                        code = get_code(label)
+                        if code not in other_dimensions[d]:
+                            other_dimensions[d][code] = label
+                    elif "all" not in other_dimensions:
+                        other_dimensions[d]["all"] = "ALL"
+                data = [new_data]
+                period_counter = 0
+    return other_dimensions


 if __name__ == '__main__':
     sys.exit(main())
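The refactor trades the pandas pivot for a single streaming pass that leans on the source CSV's ordering: rows for one area arrive consecutively with increasing dates, so a date that fails to increase marks the start of the next area's block. A self-contained sketch of that grouping idea, with invented rows reduced to (date, value) pairs:

rows = [("2020-02-15", "1"), ("2020-02-16", "2"),   # first area, dates increasing
        ("2020-02-15", "5"), ("2020-02-16", "6")]   # date resets: next area begins
groups, current = [], [rows[0]]
for row in rows[1:]:
    if row[0] > current[-1][0]:   # date still increasing: same area
        current.append(row)
    else:                         # date reset: flush the finished block
        groups.append(current)
        current = [row]
groups.append(current)            # flush the trailing block after the scan
print([len(g) for g in groups])   # [2, 2]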