Commit 1ad895b0 authored by Bruno Duyé's avatar Bruno Duyé
Browse files

WIP: get datasets descriptions from html

parent 4a23be15
......@@ -33,7 +33,6 @@ target_dir: path of target directory containing datasets & series in DBnomics fo
"""
from collections import defaultdict, OrderedDict
import csv
import json
import logging
......@@ -41,18 +40,19 @@ import os
import re
import shutil
import sys
from collections import OrderedDict, defaultdict
import lxml.html
import xlrd
from docopt import docopt
from lxml.cssselect import CSSSelector
from slugify import slugify
import xlrd
from dbnomics_converters.base import assert_no_error, to_float
from dbnomics_converters.categories import validate_category
from dbnomics_converters.datasets import validate_dataset
from dbnomics_converters.providers import validate_provider
from dbnomics_converters.series import validate_series
from dbnomics_converters.categories import validate_category
log = logging.getLogger(__name__)
......@@ -109,7 +109,12 @@ CATEGORIES = [
['Reporter', ('Reporter_Description', 'Reporter_Code')],
['Partner', ('Partner_Description', 'Partner_Code')],
['Source', ('Source_Description', None)],
])
]),
description_html_css_selectors=(
'annual.html',
# CSS selectors for, in order: the description, the update information
['p.paranormaltext:nth-child(3)', '.centerCol > p:nth-child(2)']
)
),
dict(
type='dataset',
......@@ -122,7 +127,12 @@ CATEGORIES = [
['Flow', ('Flow_Description', None)],
['Partner', ('Partner_description', 'Partner_code')],
['Indicator', ('Indicator_description', 'Indicator_code')]
])
]),
description_html_css_selectors=(
'annual.html',
# CSS selectors for, in order: the description, the update information
['p.paranormaltext:nth-child(7)', 'div.centerCol > p.paranormaltext:nth-child(6)']
)
),
dict(
type='dataset',
......@@ -135,7 +145,12 @@ CATEGORIES = [
['Flow', ('Flow_Description', None)],
['Partner', ('Partner_description', 'Partner_code')],
['Unit', ('Unit', None)]
])
]),
description_html_css_selectors=(
'annual.html',
# CSS selectors for, in order: the description, the update information
['p.paranormaltext:nth-child(11)', 'p.paranormaltext:nth-child(11)']
)
),
]
# ),
......@@ -441,6 +456,7 @@ def create_dataset_and_series_from_csv(dataset, dataset_path):
dataset_json_data = {
'name': dataset['name'],
'code': dataset['code'],
'description': get_dataset_description_from_html(dataset, source_dir),
'dimensions_values_labels': {
dimension_code: {
dimension_value_code: dimensions_values_labels[dimension_code][dimension_value_code]
......@@ -618,6 +634,35 @@ def write_series_tsv_file(series_dir_path, observations, header):
file_.write("\t".join(observation) + "\n")
def get_dataset_description_from_html(dataset, source_dir):
    """Build the description of a dataset from an HTML page stored in source_dir.

    The 'description_html_css_selectors' entry of dataset is read as a pair:
    item 0 is the HTML file name (relative to source_dir), item 1 is the list of
    CSS selectors handed to get_text_from_css_selectors().

    Parameters:
        dataset (dict): dataset definition holding 'description_html_css_selectors'
        source_dir (string): directory containing the HTML file

    Returns whatever get_text_from_css_selectors() returns for that page.
    Raises AssertionError if the HTML file does not exist.
    """
    html_spec = dataset['description_html_css_selectors']
    html_file_name = html_spec[0]
    selectors = html_spec[1]
    html_path = os.path.join(source_dir, html_file_name)
    assert os.path.isfile(html_path), "Source html file not found: {!r}".format(html_path)
    # NOTE(review): the file is decoded with the platform default encoding -- confirm
    # this matches the encoding of the downloaded pages.
    with open(html_path) as html_file:
        page_source = html_file.read()
    return get_text_from_css_selectors(page_source, selectors)
def get_text_from_css_selectors(html, css_selectors, join_string="\n"):
    """Return the texts matched by css_selectors in html, joined by join_string.

    Every selector must match exactly one element, and that element must have a
    non-empty text; otherwise an AssertionError is raised.

    Parameters:
        html (string): HTML source to search
        css_selectors (list of strings): CSS selectors, applied in the given order
        join_string (string): separator inserted between the matched texts

    Returns:
        string: the matched texts, in selector order, joined by join_string
    """
    html_element = lxml.html.document_fromstring(html)
    text_parts = []
    for css_selector in css_selectors:
        elements = html_element.cssselect(css_selector)
        # The check fails for zero matches as well as for several, so report the count.
        assert len(elements) == 1, "Expected exactly one element matching {!r} in html, found {}".format(
            css_selector, len(elements))
        # NOTE(review): `.text` holds only the text located before the element's first
        # child tag; text nested in or following child tags is not included -- confirm
        # this is sufficient for the targeted paragraphs.
        text = elements[0].text
        assert text, "Empty text from css selector {!r}".format(css_selector)
        text_parts.append(text)
    # Honor the caller-supplied separator (it used to be ignored in favor of "\n").
    return join_string.join(text_parts)
def write_json_file(file_path, data):
with open(file_path, 'w') as file_:
json.dump(data, file_, ensure_ascii=False, indent=2, sort_keys=True)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment