Commit fa5c6d51 authored by opentable fetcher

Add new dimensions, refactor code

parent a96e2516
Pipeline #195049 passed with stages in 7 minutes and 13 seconds
@@ -29,12 +29,13 @@ See also `.gitlab-ci.yml` in which data is committed to a Git repository of conv
"""
import argparse
import csv
import json
import logging
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd
import unicodedata
from dbnomics_json_errors import ErrorsArtifact
@@ -53,12 +54,12 @@ ds_info = {
}
places = {
-    'RETREC':'Retail and recreation',
-    'GRPH':'Grocery and pharmacy',
-    'PARKS':'Parks',
-    'TRANSIT':'Transit stations',
-    'WORK':'Workplaces',
-    'RESID':'Residential'
+    'retrec': 'Retail and recreation',
+    'grph': 'Grocery and pharmacy',
+    'parks': 'Parks',
+    'transit': 'Transit stations',
+    'work': 'Workplaces',
+    'resid': 'Residential'
}
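# These lowercase keys become the 'place' dimension codes and the series-code
# suffixes (e.g. 'FR.retrec'); see write_series below.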
@@ -94,11 +95,6 @@ def main():
    dataset_code = 'GMR'
-    # read with explicit type and missing-value handling (the latter is needed for country code 'NA', Namibia)
-    df = pd.read_csv(str(source_dir) + '/Global_Mobility_Report.csv', na_filter=True, keep_default_na=False, na_values=[''], low_memory=False)
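-    # (with pandas defaults, the literal string 'NA' is read as NaN, which would drop
-    #  Namibia's country code; na_values=[''] treats only empty cells as missing)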
-    # it may be useful to test the column count - and raise an error if the number of columns changes
    # Creates dataset_dir if not already done.
    dataset_dir = target_dir / dataset_code
@@ -108,146 +104,160 @@ def main():
    # provider.json
    write_json_file(target_dir / 'provider.json', provider_json)
-    # Gets dimension values from data
-    dimension_list = compute_dimension_list(df)
+    with open(source_dir / "Global_Mobility_Report.csv") as fin, \
+            open(dataset_dir / "series.jsonl", "w") as fout:
+        other_dimensions = write_series_jsonl(fin, fout)
    #write_json_file(dataset_dir / 'dataset.json', dataset_json)
-    generate_dataset_json( dataset_dir / 'dataset.json', ds_info, dimension_list)
+    generate_dataset_json(dataset_dir / 'dataset.json', ds_info, other_dimensions)
-    # series.jsonl
-    write_series_jsonl(dataset_dir / 'series.jsonl', df, dimension_list)
    #print("Program ends at: ", datetime.now().strftime("%H:%M:%S"))
    return 0
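# main() thus writes <target_dir>/provider.json, <target_dir>/GMR/dataset.json
# and <target_dir>/GMR/series.jsonl.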
-def generate_dataset_json(filepath: Path, ds_info, dimension_list):
+def generate_dataset_json(filepath: Path, ds_info, other_dimensions):
    """Generates dataset.json file from dataset info and dimension list"""
-    dim_v_l = {t[0]: t[2] for t in dimension_list}
+    other_dimensions.update({"place": places, "freq": {"D": "Daily"}})
    data = {
        'code': ds_info['code'],
        'name': ds_info['name'],
-        'dimensions_codes_order': [t[0] for t in dimension_list],
-        'dimensions_labels': {t[0]: t[1] for t in dimension_list},
-        'dimensions_values_labels': dim_v_l,
+        'dimensions_codes_order': ["country", "region_1", "region_2", "metro_area", "place", "freq"],
+        'dimensions_labels': {"country": "Country",
+                              "region_1": "Region 1",
+                              "region_2": "Region 2",
+                              "metro_area": "Metropolitan area",
+                              "place": "Place",
+                              "freq": "Frequency"},
+        'dimensions_values_labels': other_dimensions
    }
    write_json_file(filepath, data)
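+# dataset.json then contains, for example:
+#   "dimensions_values_labels": {"country": {"FR": "France", ...},
+#                                "freq": {"D": "Daily"},
+#                                "place": {"grph": "Grocery and pharmacy", ...}, ...}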
-def compute_dimension_list(df):
-    """Extracts dimension dicts from dimension codes"""
-    freq_dict = {
-        'D': 'Daily'
-    }
-    code = df[['country_region_code','sub_region_1','sub_region_2']].fillna('').apply(lambda x: "_".join(filter(None,x)), axis=1)
-    label = df[['country_region','sub_region_1','sub_region_2']].fillna('').apply(lambda x: "_".join(filter(None,x)), axis=1)
-    area_dict = dict(zip(label_to_code(code), label_to_code(label)))
-    return [
-        ('FREQ', 'Frequency', freq_dict),
-        ('places', 'Places', places),
-        ('area', 'Area', area_dict)
-    ]
def write_json_file(file_path: Path, data):
    """Writes data the JSON way to file_path"""
    with file_path.open('w', encoding='utf-8') as json_fd:
        json.dump(data, json_fd, ensure_ascii=False, indent=2, sort_keys=True)
-def write_series_jsonl(series_filepath: Path, df, dimension_list):
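+# Columns of Global_Mobility_Report.csv: 0 country_region_code, 1 country_region,
+# 2 sub_region_1, 3 sub_region_2, 4 metro_area, 5 iso_3166_2_code, 6 census_fips_code,
+# 7 date, 8-13 the six *_percent_change_from_baseline value columns.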
+def parse_line(line):
+    metadata = {
+        "country_code": line[0],
+        "country": line[1],
+        "region_1": line[2],
+        "region_2": line[3],
+        "metro_area": line[4],
+    }
+    data = line[7:]
+    return metadata, data
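+# e.g. parse_line(['FR', 'France', '', '', '', '', '', '2020-02-15', '-2', ...])
+#      -> ({'country_code': 'FR', 'country': 'France', ...}, ['2020-02-15', '-2', ...])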
+def make_observations(data, index):
+    observations = [("PERIOD", "VALUE")]
+    for d in data:
+        v = d[index + 1]
+        if len(v) == 0:
+            obs = (d[0], 'NA')
+        else:
+            obs = (d[0], float(v))
+        observations.append(obs)
+    return observations
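+# e.g. make_observations([['2020-02-15', '-2'], ['2020-02-16', '']], 0)
+#      -> [('PERIOD', 'VALUE'), ('2020-02-15', -2.0), ('2020-02-16', 'NA')]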
+def write_series(fout, metadata, data):
+    code_elements = [metadata["country_code"]]
+    name_elements = [metadata["country"]]
+    region_1 = metadata["region_1"]
+    region_1_code = get_code(region_1)
+    if len(region_1) > 0:
+        code_elements.append(region_1_code)
+        name_elements.append(region_1)
+    region_2 = metadata["region_2"]
+    region_2_code = get_code(region_2)
+    if len(region_2) > 0:
+        code_elements.append(region_2_code)
+        name_elements.append(region_2)
+    metro_area = metadata["metro_area"]
+    # get_code already strips the '_metropolitan_area' suffix
+    metro_area_code = get_code(metro_area)
+    if len(metro_area) > 0:
+        code_elements.append(metro_area_code)
+        name_elements.append(metro_area)
+    code_stub = '.'.join(code_elements)
+    name_stub = ' - '.join(name_elements)
+    for i, v in enumerate(['retrec', 'grph', 'parks', 'transit', 'work', 'resid']):
+        ts_data = {
+            'code': '.'.join([code_stub, v]),
+            'name': ' - '.join([name_stub, places[v]]),
+            'dimensions': {
+                'country': metadata["country_code"],
+                'region_1': region_1_code,
+                'region_2': region_2_code,
+                'metro_area': metro_area_code,
+                'place': v,
+                'freq': 'D'
+            },
+            'observations': make_observations(data, i)
+        }
+        json_str = json.dumps(ts_data, ensure_ascii=False, sort_keys=True)
+        fout.write(json_str + '\n')
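+# e.g. a France / Île-de-France row yields code 'FR.ile-de-france.retrec'
+# and name 'France - Île-de-France - Retail and recreation'.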
+def get_code(s):
+    if len(s) == 0:
+        return "all"
+    else:
+        return (unicodedata.normalize('NFD', s).encode('ascii', 'ignore').
+                lower().decode('ascii', 'ignore').replace(' ', '_').
+                replace('/', '_').replace("_metropolitan_area", ""))
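+# e.g. get_code('') -> 'all'; get_code('São Paulo') -> 'sao_paulo';
+#      get_code('Kuala Lumpur Metropolitan Area') -> 'kuala_lumpur'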
+def write_series_jsonl(fin, fout):
    """Write series list to series.jsonl file"""
-    rename_places = {
-        'retail_and_recreation_percent_change_from_baseline':'RETREC',
-        'grocery_and_pharmacy_percent_change_from_baseline':'GRPH',
-        'parks_percent_change_from_baseline':'PARKS',
-        'transit_stations_percent_change_from_baseline':'TRANSIT',
-        'workplaces_percent_change_from_baseline':'WORK',
-        'residential_percent_change_from_baseline':'RESID'
+    other_dimensions = {
+        "country": {},
+        "region_1": {},
+        "region_2": {},
+        "metro_area": {}
    }
-    df = df.rename(columns=rename_places)
-    code = df[['country_region_code','sub_region_1','sub_region_2']].fillna('').apply(lambda x: "_".join(filter(None,x)), axis=1)
-    df['ref_area_code'] = label_to_code(code)
-    modified_df = df.drop(columns=['country_region_code','country_region','sub_region_1','sub_region_2','metro_area','iso_3166_2_code','census_fips_code'])
-    pivoted_df = pd.pivot_table(modified_df, values=['RETREC','GRPH','PARKS','TRANSIT','WORK','RESID'], columns='date', index=['ref_area_code'])
-    dim_v_l = {t[0]: t[2] for t in dimension_list}
-    # Write series.jsonl
-    with series_filepath.open('wt', encoding='utf-8') as fd:
-        for area in places:
-            for i in range(len(pivoted_df[area])):
-                ts_dimension = {'area': pivoted_df[area].index[i], 'places': area}
-                name = ' - '.join(create_name_label(ts_dimension, dim_v_l))
-                ts_dimension.update({'FREQ': 'D'})
-                period_list = pivoted_df[area].columns.to_list()
-                value_list = (pivoted_df[area].iloc[i]).fillna('NA').to_list()
-                obs_list = [('PERIOD', 'VALUE')] + list(zip(period_list, value_list))
-                ts_data = {
-                    'code': pivoted_df[area].index[i] + '.' + area,
-                    'name': name,
-                    'dimensions': ts_dimension,
-                    'observations': obs_list,
-                }
-                json_str = json.dumps(ts_data, ensure_ascii=False, sort_keys=True)
-                fd.write(json_str + '\n')
-def create_name_label(ts_dimensions: dict, dict_dim: dict):
-    L = []
-    for i in ts_dimensions:
-        tmp = ts_dimensions[i]
-        L.append(dict_dim[i][tmp])
-    return L
-SYMBOLS = {' ': '_', '.': '-', '-': '_', '/': '_'}
-def after_slugify(str_value):
-    """Replace symbols in the expression after calling slugify"""
-    for k in SYMBOLS:
-        str_value = str_value.replace(k, SYMBOLS[k])
-    return str_value
-def label_to_code(df_value: pd.Series):
-    """Replace capital letters with lowercase letters"""
-    result = df_value
-    result.str.encode('ascii', 'ignore')
-    result.str.decode('ascii')
-    #result = result.apply(lambda x: slugify(x))
-    result = result.apply(lambda x: after_slugify(x))
-    return result
-def create_dimenson_dict(df_code: pd.Series, df_label: pd.Series):
-    """This function uses Series to build the dictionary, which abstracts away the orientation of the dataframe"""
-    if df_code.empty and df_label.empty:
-        raise ValueError('Please provide either a non-empty list of codes or a non-empty list of labels.')
-    if df_code.empty:
-        df_code = label_to_code(df_label)
-    if df_label.empty:
-        df_label = df_code
-    df_dict = dict(zip(df_code, df_label))
-    return df_dict
+    data = [[]]
+    csvreader = csv.reader(fin, delimiter=',', quotechar='"')
+    for lineno, line in enumerate(csvreader):
+        if lineno == 0:
+            # skip the CSV header
+            continue
+        elif lineno == 1:
+            (metadata, data[0]) = parse_line(line)
+            country_code = metadata["country_code"]
+            if country_code not in other_dimensions["country"]:
+                other_dimensions["country"][country_code] = metadata["country"]
+            for d in ["region_1", "region_2", "metro_area"]:
+                label = metadata[d]
+                if len(label) > 0:
+                    code = get_code(label)
+                    other_dimensions[d][code] = label
+                else:
+                    other_dimensions[d]["all"] = "ALL"
+            period_counter = 0
+        else:
+            (new_metadata, new_data) = parse_line(line)
+            if new_data[0] > data[period_counter][0]:
+                # same series: the date is still increasing
+                data.append(new_data)
+                period_counter += 1
+            else:
+                # the date went back to the start: a new series begins
+                write_series(fout, metadata, data)
+                metadata = new_metadata
+                country_code = metadata["country_code"]
+                if country_code not in other_dimensions["country"]:
+                    other_dimensions["country"][country_code] = metadata["country"]
+                for d in ["region_1", "region_2", "metro_area"]:
+                    label = metadata[d]
+                    if len(label) > 0:
+                        code = get_code(label)
+                        if code not in other_dimensions[d]:
+                            other_dimensions[d][code] = label
+                    elif "all" not in other_dimensions[d]:
+                        other_dimensions[d]["all"] = "ALL"
+                data = [new_data]
+                period_counter = 0
+    if data[0]:
+        # flush the last accumulated series, which the loop never writes
+        write_series(fout, metadata, data)
+    return other_dimensions
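+# Each line of series.jsonl is one JSON document, e.g.:
+# {"code": "FR.retrec", "dimensions": {"country": "FR", "freq": "D", ...},
+#  "name": "France - Retail and recreation",
+#  "observations": [["PERIOD", "VALUE"], ["2020-02-15", -2.0], ...]}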
if __name__ == '__main__':
    sys.exit(main())