Commit ee126198 authored by Pierre Dittgen

add 5 datasets

parent 9e2a1ff0
Pipeline #191948 passed in 32 seconds
@@ -29,7 +29,9 @@ import json
import logging
import re
import sys
from operator import itemgetter
from pathlib import Path
from typing import Any, Dict, List, Tuple
import bs4
import xlrd
@@ -112,6 +114,52 @@ def iter_child_categories(
return total_children
def iter_2_1_datasets(dataset_code_prefix, book, sheet_index, sheet):
"""Yield dataset_json_content and series_json_l content."""
# Catégories
if sheet_index == 0:
yield load_dataset_category_code_2_file_1_sheet_1(
book, dataset_code_prefix + "categories", sheet
)
# Catégories ABC
elif sheet_index == 1:
yield load_dataset_category_code_2_file_1_sheet_2_and_3(
book, dataset_code_prefix + "categories_ABC", sheet
)
# Catégorie A
elif sheet_index == 2:
yield load_dataset_category_code_2_file_1_sheet_2_and_3(
book, dataset_code_prefix + "categorie_A", sheet
)
# Ancienneté et durée
elif sheet_index == 6:
dataset_name_prefix = "Demandeurs d'emploi inscrits en fin de mois à Pôle emploi en catégories A, B, C : "
yield load_dataset_category_code_2_file_1_sheet_6(
book,
dataset_code_prefix + "anciennete",
dataset_name_prefix + "ancienneté d'inscription des demandeurs d'emploi",
sheet,
set(range(14)),
)
yield load_dataset_category_code_2_file_1_sheet_6(
book,
dataset_code_prefix + "duree",
dataset_name_prefix + "durée moyenne d'inscription des sortants",
sheet,
set(range(15, 20)),
)
# Flux ABC
elif sheet_index == 7:
yield load_dataset_category_code_2_file_1_sheet_7(
book, dataset_code_prefix + "flux_ABC", sheet
)
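# A minimal usage sketch (assumptions: `xls_file_path` points at a 2.1 XLS file
# and the "2-1-cvs-" prefix is illustrative, not a value used by this script):
# open the workbook with xlrd and feed every sheet to `iter_2_1_datasets`;
# sheets the dispatcher does not know about simply yield nothing.
def demo_iter_2_1_datasets(xls_file_path: Path):
    """Print the code and series count of every dataset found in a 2.1 workbook."""
    book = xlrd.open_workbook(str(xls_file_path))
    for sheet_index in range(book.nsheets):
        sheet = book.sheet_by_index(sheet_index)
        for dataset_json, series_info_list in iter_2_1_datasets(
            "2-1-cvs-", book, sheet_index, sheet
        ):
            print(dataset_json["code"], len(series_info_list), "series")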
def iter_datasets_for_categories_tree(
xls_file_path: Path, target_dir: Path, category_code
):
@@ -140,23 +188,26 @@ def iter_datasets_for_categories_tree(
for sheet_index, (sheet_name, sheet) in enumerate(zip(sheet_names, sheets)):
# `Demandeurs d’emploi en {month} {year} - séries nationales CVS-CJO`
if category_code == "2.1" and sheet_index == 0:
data_type_cell = sheet.cell(1, 1).value.strip()
dataset_code = "{}-{}-{}".format(
slugify(category_code), data_type_cell, slugify(sheet_name)
)
dataset_dir = target_dir / dataset_code
dataset_dir.mkdir(exist_ok=True)
dataset_json = load_dataset_category_code_2_file_1_sheet_1(
book, dataset_code, dataset_dir, sheet, sheet_name
)
write_json_file(dataset_dir / "dataset.json", dataset_json)
if category_code == "2.1":
dataset = {"code": dataset_json["code"]}
dataset_name = dataset_json.get("name")
if dataset_name is not None:
dataset["name"] = dataset_name
yield dataset
# Compute dataset code prefix
data_type = sheet.cell(1, 1).value.strip()
dataset_code_prefix = f"{slugify(category_code)}-{data_type}-"
for dataset_json, series_info_list in iter_2_1_datasets(
dataset_code_prefix, book, sheet_index, sheet
):
dataset_code = dataset_json["code"]
# Create dataset dir from dataset code
dataset_dir = target_dir / dataset_code
dataset_dir.mkdir(exist_ok=True)
# Write dataset.json and series.jsonl files
write_json_file(dataset_dir / "dataset.json", dataset_json)
write_series_jsonl_file(dataset_dir, series_info_list)
yield {"code": dataset_json["code"], "name": dataset_json["name"]}
# `L'emploi intérimaire au mois de {month} {year}`
if category_code == "8.1" and sheet_index == 0:
@@ -177,58 +228,94 @@ def iter_datasets_for_categories_tree(
yield dataset
def load_dataset_category_code_2_file_1_sheet_1(
book, dataset_code, dataset_dir: Path, sheet, sheet_name
):
"""Process `Demandeurs d’emploi en {month} {year} - séries nationales CVS-CJO` dataset."""
def extract_name_and_unit(sheet):
"""Extract name and unit info from 2.1 sheet."""
return (sheet.cell(0, 1).value.strip(), sheet.cell(2, 1).value.strip())
name = sheet.cell(0, 1).value.strip()
if not name:
name = sheet_name
unit = sheet.cell(2, 1).value.strip()
dataset_json = {
def iter_series_observations_by_column(book, sheet, column_idx, first_data_row=10):
"""Iterate on observations found in a sheet column."""
def norm_obs_value(obs_value):
try:
return float(obs_value)
except:
return "NA"
for row_idx in range(first_data_row, sheet.nrows):
row_values = sheet.row_values(row_idx)
period = datetime.datetime(
*xlrd.xldate.xldate_as_tuple(row_values[0], book.datemode)
).strftime("%Y-%m")
value = norm_obs_value(row_values[column_idx + 1])
yield period, value
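# A minimal sketch of the date handling above (the serial value 43831.0 is an
# illustrative assumption): xlrd turns an Excel serial date plus the workbook's
# `datemode` into a (year, month, day, hour, minute, second) tuple, which is
# then formatted as a monthly period.
def demo_serial_to_period(serial=43831.0, datemode=0):
    """Return "2020-01" for serial 43831.0 in the 1900 date system (datemode=0)."""
    return datetime.datetime(
        *xlrd.xldate.xldate_as_tuple(serial, datemode)
    ).strftime("%Y-%m")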
def prepare_dataset_json_dict(
dataset_code: str,
dataset_name: str,
compact_dimensions: List[Tuple[str, str, Dict]],
):
"""Prepare dataset_json dict from code, name and dimensions info."""
return {
"code": dataset_code,
"dimensions_codes_order": [
"region",
"category",
"unit",
"frequency",
"adjustment",
],
"dimensions_labels": {
"adjustment": "Adjustment",
"frequency": "Frequency",
"region": "Region",
"category": "Category",
"unit": "Unit",
},
"dimensions_values_labels": {
"frequency": {"M": "Mensuel"},
"unit": {unit.lower(): unit.capitalize()},
},
"name": name,
"name": dataset_name,
"dimensions_codes_order": [dim[0] for dim in compact_dimensions],
"dimensions_labels": {dim[0]: dim[1] for dim in compact_dimensions},
"dimensions_values_labels": {dim[0]: dim[2] for dim in compact_dimensions},
}
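# Usage sketch for `prepare_dataset_json_dict` (codes and labels below are
# illustrative, not taken from the source files): two compact dimensions expand
# into the three `dimensions_*` entries of dataset.json.
def demo_prepare_dataset_json_dict():
    dataset_json = prepare_dataset_json_dict(
        "2-1-demo",
        "Demo dataset",
        [
            ("frequency", "Frequency", {"M": "Mensuel"}),
            ("region", "Region", {"fr": "France"}),
        ],
    )
    assert dataset_json["dimensions_codes_order"] == ["frequency", "region"]
    assert dataset_json["dimensions_values_labels"]["region"] == {"fr": "France"}
    return dataset_json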
def build_series_info(
dimensions_codes_order, series_dimensions_codes, observation_iterator
):
"""Compute series information."""
return {
"code": "_".join(series_dimensions_codes),
"dimensions": dict(zip(dimensions_codes_order, series_dimensions_codes)),
"observations": [("PERIOD", "VALUE")] + list(observation_iterator),
}
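# Usage sketch for `build_series_info` (dimension codes are illustrative): the
# series code is the underscore-join of the per-dimension codes, and the
# observations list is prefixed with a ("PERIOD", "VALUE") header row.
def demo_build_series_info():
    series_info = build_series_info(
        ["region", "frequency"],
        ["fr", "M"],
        iter([("2020-01", 3500.0)]),
    )
    assert series_info["code"] == "fr_M"
    assert series_info["dimensions"] == {"region": "fr", "frequency": "M"}
    assert series_info["observations"] == [("PERIOD", "VALUE"), ("2020-01", 3500.0)]
    return series_info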
def load_dataset_category_code_2_file_1_sheet_1(book, dataset_code, sheet):
"""Process 'Catégories' sheet."""
dataset_name, unit = extract_name_and_unit(sheet)
compact_dimensions = [
("region", "Region", {}),
("category", "Category", {}),
("unit", "Unit", {unit.lower(): unit.capitalize()}),
("frequency", "Frequency", {"M": "Mensuel"}),
("adjustment", "Adjustment", {"CVS-CJO": "CVS-CJO"}),
]
dataset_json = prepare_dataset_json_dict(
dataset_code, dataset_name, compact_dimensions
)
categories = {}
regions = {}
series_info_map = {}
series_info_list = []
# Define headers from the XLS header rows, storing codes in a list to preserve order
for column_index, (category_label1, category_label2) in enumerate(
for column_index, (category_line_1, category_line_2) in enumerate(
zip(sheet.row_values(8)[1:], sheet.row_values(9)[1:])
):
category_label2_parts = category_label2.split("(France)")
if len(category_label2_parts) == 1:
# 'France métropolitaine sauf mention contraire'
region_label = "France métropolitaine"
region_code = "fx"
# Compute region and category dimensions
if "(France)" in category_line_2:
region_code, region_label = "fr", "France"
category_label = norm_space(
category_line_2[: category_line_2.find("(France)")]
)
else:
region_label = "France"
region_code = "fr"
category_label = norm_space(category_label2_parts[0])
category_code = slugify(category_label1)
categories[category_code] = "{} – {}".format(category_label1, category_label)
region_code, region_label = "fx", "France métropolitaine"
category_label = category_line_2
category_code = slugify(category_line_1)
# Accumulate dimension code, label
categories[category_code] = "{} – {}".format(category_line_1, category_label)
regions[region_code] = region_label
series_dimensions_codes = [
region_code,
category_code,
@@ -236,38 +323,230 @@ def load_dataset_category_code_2_file_1_sheet_1(
"M",
"CVS-CJO",
]
series_code = "_".join(series_dimensions_codes)
series_dimensions = dict(
zip(dataset_json["dimensions_codes_order"], series_dimensions_codes)
series_info_list.append(
build_series_info(
dataset_json["dimensions_codes_order"],
series_dimensions_codes,
iter_series_observations_by_column(book, sheet, column_index),
)
)
series_observations = []
for line_index in range(10, sheet.nrows):
row_values = sheet.row_values(line_index)
# First cell defines period.
period = datetime.datetime(
*xlrd.xldate.xldate_as_tuple(row_values[0], book.datemode)
).strftime("%Y-%m")
value = row_values[column_index + 1]
series_observations.append((period, value))
series_info_map[series_code] = {
"code": series_code,
"dimensions": series_dimensions,
"observations": [("PERIOD", "VALUE")] + series_observations,
}
dataset_json["dimensions_values_labels"].update(
{"category": categories, "region": regions}
)
# Write series.jsonl
series_jsonl_filepath = dataset_dir / "series.jsonl"
with series_jsonl_filepath.open("wt", encoding="utf-8") as fd:
for series_code, series_info in sorted(series_info_map.items()):
json.dump(series_info, fd, ensure_ascii=False, sort_keys=True)
fd.write("\n")
return dataset_json, series_info_list
return dataset_json
def load_dataset_category_code_2_file_1_sheet_2_and_3(book, dataset_code, sheet):
"""Process 'Catégories ABC' and 'Catégorie A' sheets."""
dataset_name, unit = extract_name_and_unit(sheet)
compact_dimensions = [
("region", "Region", {"fx": "France métropolitaine"}),
("sex", "Sex", {}),
("age", "Age", {}),
("unit", "Unit", {unit.lower(): unit.capitalize()}),
("frequency", "Frequency", {"M": "Mensuel"}),
("adjustment", "Adjustment", {"CVS-CJO": "CVS-CJO"}),
]
dataset_json = prepare_dataset_json_dict(
dataset_code, dataset_name, compact_dimensions
)
sex_dim = {}
age_dim = {}
series_info_list = []
current_sex = None
# Define headers from the XLS header rows, storing codes in a list to preserve order
for column_index, (category_line_1, category_line_2) in enumerate(
zip(sheet.row_values(8)[1:], sheet.row_values(9)[1:])
):
# First category header line gives sex (hommes, femmes, ensemble)
if category_line_1:
current_sex = (slugify(category_line_1), category_line_1)
sex_dim[current_sex[0]] = current_sex[1]
# Second category header line gives age (Moins de 25 ans, 25 à 49 ans, 50 ans ou plus, Ensemble)
age = (slugify(category_line_2), category_line_2)
age_dim[age[0]] = age[1]
# Keep codes aligned with dimensions_codes_order: region, sex, age, unit, frequency, adjustment
series_dimensions_codes = [
"fx",
current_sex[0],
age[0],
"milliers",
"M",
"CVS-CJO",
]
series_info_list.append(
build_series_info(
dataset_json["dimensions_codes_order"],
series_dimensions_codes,
iter_series_observations_by_column(book, sheet, column_index),
)
)
dataset_json["dimensions_values_labels"].update({"sex": sex_dim, "age": age_dim})
return dataset_json, series_info_list
def load_dataset_category_code_2_file_1_sheet_6(
book, dataset_code, dataset_name, sheet, column_range
):
"""Process `Ancienneté et durée` sheet, resulting in two different datasets."""
compact_dimensions = [
("region", "Region", {"fr": "France", "fx": "France métropolitaine"}),
("indicator", "Indicator", {}),
(
"unit",
"Unit",
{"eff": "effectif en milliers", "percent": "Part en %", "j": "jours"},
),
("frequency", "Frequency", {"M": "Mensuel"}),
("adjustment", "Adjustment", {"CVS-CJO": "CVS-CJO"}),
]
dataset_json = prepare_dataset_json_dict(
dataset_code, dataset_name, compact_dimensions
)
indicator_dim = {}
series_info_list = []
current_category = None
# Define headers from the XLS header rows, storing codes in a list to preserve order
for column_index, (category, sub_category) in enumerate(
zip(sheet.row_values(8)[1:], sheet.row_values(9)[1:])
):
# Filter columns to process
if column_index not in column_range:
continue
# indicator dimension
if category:
current_category = category
indicator_chunks = [current_category]
if sub_category:
indicator_chunks.append(sub_category)
indicator = " - ".join(indicator_chunks)
indicator_code = slugify(indicator)
indicator_dim[indicator_code] = indicator
# region dimension
region_code = "fr" if "France" in indicator else "fx"
# unit dimension
unit_code = (
"percent"
if column_index == 9  # column K (index 9 after dropping column A)
else "j"
if column_index == 10  # column L (index 10 after dropping column A)
else "eff"
)
series_dimensions_codes = [
region_code,
indicator_code,
unit_code,
"M",
"CVS-CJO",
]
series_info_list.append(
build_series_info(
dataset_json["dimensions_codes_order"],
series_dimensions_codes,
iter_series_observations_by_column(book, sheet, column_index),
)
)
dataset_json["dimensions_values_labels"].update({"indicator": indicator_dim})
return dataset_json, series_info_list
def load_dataset_category_code_2_file_1_sheet_7(book, dataset_code, sheet):
"""Process 'Flux ABC' sheet."""
dataset_name, unit = extract_name_and_unit(sheet)
unit_code = unit.lower()
compact_dimensions = [
("region", "Region", {"fr": "France", "fx": "France métropolitaine"}),
("flow", "Flow", {}),
("indicator", "Indicator", {}),
("unit", "Unit", {unit_code: unit.capitalize()}),
("frequency", "Frequency", {"M": "Mensuel"}),
("adjustment", "Adjustment", {"CVS-CJO": "CVS-CJO"}),
]
dataset_json = prepare_dataset_json_dict(
dataset_code, dataset_name, compact_dimensions
)
flow_dim = {}
indicator_dim = {}
series_info_list = []
current_flow = None
current_indicator = None
# Define headers from the XLS header rows, storing codes in a list to preserve order
for column_index, (categ_0, categ_1, categ_2) in enumerate(
zip(*[sheet.row_values(row_id)[1:] for row_id in (7, 8, 9)])
):
# Empty column
if not categ_1 and not categ_2:
continue
# flow dimension ('Entrées', 'Sorties', 'Équation comptable')
if categ_0:
current_flow = categ_0
flow_code = slugify(current_flow)
flow_dim[flow_code] = current_flow
# indicator dimension is formed from categ_1 and categ_2
if categ_1:
current_indicator = categ_1
indicator_chunks = [current_indicator]
if categ_2:
indicator_chunks.append(categ_2)
indicator = " - ".join(indicator_chunks)
indicator_code = slugify(indicator)
indicator_dim[indicator_code] = indicator
# region code
region_code = "fr" if "France" in indicator else "fx"
series_dimensions_codes = [
region_code,
flow_code,
indicator_code,
unit_code,
"M",
"CVS-CJO",
]
series_info_list.append(
build_series_info(
dataset_json["dimensions_codes_order"],
series_dimensions_codes,
iter_series_observations_by_column(book, sheet, column_index),
)
)
dataset_json["dimensions_values_labels"].update(
{"flow": flow_dim, "indicator": indicator_dim}
)
return dataset_json, series_info_list
def load_dataset_category_code_8_file_1_sheet_1(
@@ -339,20 +618,26 @@ def load_dataset_category_code_8_file_1_sheet_1(
series_info["observations"] = [("PERIOD", "VALUE", "STATUS")] + obs
# Write series.jsonl
series_jsonl_filepath = dataset_dir / "series.jsonl"
with series_jsonl_filepath.open("wt", encoding="utf-8") as fd:
json.dump(series_info, fd, ensure_ascii=False, sort_keys=True)
fd.write("\n")
write_series_jsonl_file(dataset_dir, [series_info])
return dataset_json
def write_json_file(file_path: Path, data):
"""Write JSON file."""
with file_path.open("wt", encoding="utf-8") as fd:
json.dump(data, fd, ensure_ascii=False, indent=2, sort_keys=True)
def write_series_jsonl_file(target_dir: Path, series_info_list: List[Dict[str, Any]]):
"""Write series.jsonl."""
jsonl_file = target_dir / "series.jsonl"
with jsonl_file.open("wt", encoding="utf-8") as fd:
for series_info in sorted(series_info_list, key=itemgetter("code")):
json.dump(series_info, fd, ensure_ascii=False, sort_keys=True)
fd.write("\n")
log = logging.getLogger(__name__)