Use topics extracted from BUBA website

29d433d2 · Pierre Dittgen · 053e5aac · 29d433d2
Commit 29d433d2 authored 5 years ago by Pierre Dittgen
--- a/convert.py
+++ b/convert.py
@@ -51,6 +51,7 @@ DAY_RE = re.compile(r'^([12]\d{3})-([01]\d)-([0-3]\d)$')
 # Helps normalize space
 NORM_SPACES_RE = re.compile(r'\s+')

+
 def compute_series_observations(periods, observations, obs_addon_cols, obs_addon_values):
    """Computes series observations"""

@@ -383,10 +384,11 @@ def generate_dataset(ds_code, source_dir: Path, structure_file: Path, ds_dir: Pa
    write_json_file(ds_dir / 'dataset.json', dataset_info)


-def browse_topics(topics_dir: Path, dataset_info_dict):
+def browse_topics(topics_dir: Path, dataset_info_dict, topics_dict):
    """ Yields all topics along with datasets ids """

-    for topic_id in sorted(bc.TOPICS):
+    # Sort topics by their label (the dict value) rather than by topic id
+    for topic_id, _ in sorted(topics_dict.items(), key=itemgetter(1)):
        topic_dir = topics_dir / topic_id
        if not topic_dir.exists():
            log.warning('Topic directory [{}] not found!'.format(str(topic_dir)))
@@ -396,7 +398,7 @@ def browse_topics(topics_dir: Path, dataset_info_dict):
        yield topic_dir.name, sorted(datasets, key=lambda elt: elt['name'])


-def compute_category_data(topics_dir: Path, dataset_info_dict):
+def compute_category_data(topics_dir: Path, dataset_info_dict, topics_dict):
    """ Compute category_tree.json data ready to be serialized in category_tree.json """

    categ_data = []
@@ -406,9 +408,9 @@ def compute_category_data(topics_dir: Path, dataset_info_dict):

    # Then distribute datasets along topics
    # Some datasets belong to more than a topic
-    for topic_id, datasets in browse_topics(topics_dir, dataset_info_dict):
+    for topic_id, datasets in browse_topics(topics_dir, dataset_info_dict, topics_dict):
        categ_data.append({
-            'name': bc.TOPICS[topic_id],
+            'name': topics_dict.get(topic_id, topic_id),
            'children': datasets
        })
        # Removes dataset ids that belong to the category
@@ -454,6 +456,10 @@ def main():
    if not target_dir.exists():
        parser.error("Target dir {!r} not found".format(str(target_dir)))

+    # Read the topics dict from the 'help on time series' HTML page
+    help_html_filepath = source_dir / bc.TOPICS_HTML_PAGE_NAME
+    topics_dict = bc.extract_topics_dict_from_html_help_page(help_html_filepath)
+
    # dataset structure info
    structures_dir = source_dir / 'structures'

@@ -476,7 +482,9 @@ def main():

    # Category_tree.json
    if not args.skip_category_tree:
-        write_json_file(target_dir / 'category_tree.json', compute_category_data(source_dir / 'topics', dataset_info_dict))
+        category_json_data = compute_category_data(source_dir / 'topics',
+                                                   dataset_info_dict, topics_dict)
+        write_json_file(target_dir / 'category_tree.json', category_json_data)


 def write_json_file(file_path: Path, data):