Commit 9db1c370 authored by Emmanuel Raviart
Browse files

Add facets for terms & organizations. Remove nodes & groups.

parent e7e81c72
......@@ -76,12 +76,12 @@ npx babel-node --extensions ".ts" src/scripts/repair_adisp_ddis.ts --source=../p
### Indexing DDI files
```bash
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=fr --path=adisp --title="Archives de données issues de la statistique publique (ADISP)" ../public_data/adisp-ddi-repaired/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=fr --path=cdsp --title="SciencesPo Centre de données socio-politiques (CDSP)" ../public_data/cdsp-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=fr --path=ined --title="Institut national d'études démographiques (INED)" ../public_data/ined-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=fr --path=ined/gpgsurvey --title="Enquête Générations et Genre du projet international Generations and Gender Programme (GGP)" ../public_data/ined-gpgsurvey-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=en --path=ukdataservice --title="UK Data Service" ../public_data/ukdataservice-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=no --path=nsddata --title="Norwegian Centre for Research Data (NSD)" ../public_data/nsddata-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=fr --path=adisp ../public_data/adisp-ddi-repaired/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=fr --path=cdsp ../public_data/cdsp-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=fr --path=ined ../public_data/ined-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=fr --path=ined/gpgsurvey ../public_data/ined-gpgsurvey-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=en --path=ukdataservice ../public_data/ukdataservice-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=no --path=nsddata ../public_data/nsddata-ddi/
```
### Extracting words from CodeBooks for autocompletion
......
import type { Organization } from "./data"
import type { CodeBook, SerStmt, Var } from "./raw_types/code_books"
/**
 * Raw form of an organization entry as consumed by `iterOrganizations`:
 * either a bare string (used as the name), or an object whose `#text` is
 * read as the name and whose `@abbr` is read as the acronym. Both object
 * properties are optional.
 */
type CodeBookPartialOrganization =
| string
| { "#text"?: string; "@abbr"?: string }
export function getSeries(codeBook: CodeBook): SerStmt | undefined {
const series = codeBook.stdyDscr.citation.serStmt
return series?.serName === undefined ? undefined : series
......@@ -17,15 +22,98 @@ export function getStudyDescription(codeBook: CodeBook): string | undefined {
return undefined
}
/**
 * Iterate over the depositors of a CodeBook.
 *
 * Reads `stdyDscr.citation.distStmt.depositr` (a missing `distStmt` is
 * tolerated) and lets `iterOrganizations` normalize the raw value.
 *
 * @param codeBook CodeBook to read the depositors from.
 * @returns A generator of normalized organizations.
 */
export function* iterDepositors(
  codeBook: CodeBook,
): Generator<Organization, void, void> {
  const { distStmt } = codeBook.stdyDscr.citation
  const rawDepositors = distStmt?.depositr
  yield* iterOrganizations(rawDepositors)
}
/**
 * Iterate over the distributors of a CodeBook.
 *
 * Reads `stdyDscr.citation.distStmt.distrbtr` (a missing `distStmt` is
 * tolerated) and lets `iterOrganizations` normalize the raw value.
 *
 * @param codeBook CodeBook to read the distributors from.
 * @returns A generator of normalized organizations.
 */
export function* iterDistributors(
  codeBook: CodeBook,
): Generator<Organization, void, void> {
  const { distStmt } = codeBook.stdyDscr.citation
  const rawDistributors = distStmt?.distrbtr
  yield* iterOrganizations(rawDistributors)
}
/**
 * Normalize raw organization entries into `Organization` objects.
 *
 * The input may be absent, a single entry or an array of entries. Each
 * entry is either a string (its trimmed value becomes the name) or an
 * object whose `#text` is the name and whose `@abbr` is the acronym.
 * Entries that end up without a name are skipped.
 *
 * @param organizations Raw entry, array of raw entries, or nothing.
 * @returns A generator yielding one `Organization` per usable entry.
 */
function* iterOrganizations(
  organizations:
    | CodeBookPartialOrganization
    | CodeBookPartialOrganization[]
    | undefined,
): Generator<Organization, void, void> {
  if (organizations == null) {
    return
  }

  // Treat a lone entry as a one-element list.
  const entries = Array.isArray(organizations)
    ? organizations
    : [organizations]

  for (const entry of entries) {
    if (typeof entry === "string") {
      // Blank strings trim down to "" and are dropped.
      const name = entry.trim()
      if (name) {
        yield { name }
      }
      continue
    }

    const rawAcronym = entry["@abbr"]
    const trimmedAcronym =
      rawAcronym === undefined ? undefined : rawAcronym.trim()
    const rawName = entry["#text"]
    const trimmedName = rawName === undefined ? undefined : rawName.trim()

    // Without a usable name, the acronym stands in for it.
    const name = trimmedName || trimmedAcronym
    if (!name) {
      continue
    }

    // An acronym that is empty or merely repeats the name carries no
    // information; it is emitted as `undefined`.
    const acronym =
      trimmedAcronym && trimmedAcronym !== name ? trimmedAcronym : undefined
    yield {
      acronym,
      name,
    }
  }
}
/**
 * Iterate over the producers of a CodeBook.
 *
 * Reads `stdyDscr.citation.prodStmt.producer` and lets
 * `iterOrganizations` normalize the raw value.
 *
 * NOTE(review): `prodStmt` is dereferenced without optional chaining,
 * so it is presumably always present on a CodeBook; confirm against
 * the CodeBook type.
 *
 * @param codeBook CodeBook to read the producers from.
 * @returns A generator of normalized organizations.
 */
export function* iterProducers(
  codeBook: CodeBook,
): Generator<Organization, void, void> {
  const { prodStmt } = codeBook.stdyDscr.citation
  const rawProducers = prodStmt.producer
  yield* iterOrganizations(rawProducers)
}
/**
 * Iterate over the keyword labels of a study.
 *
 * Keywords are read from `stdyDscr.stdyInfo.subject.keyword`; a missing
 * `subject` or `keyword` yields nothing. A keyword may be a number
 * (converted to its decimal string), a string, or an object whose
 * `#text` holds the label.
 *
 * Each keyword is yielded exactly once, and keywords without a label
 * (empty string or missing `#text`) are skipped, so every yielded value
 * is a non-empty string.
 *
 * @param codeBook CodeBook to read the keywords from.
 * @returns A generator of non-empty keyword labels.
 */
export function* iterStudyKeywords(
  codeBook: CodeBook,
): Generator<string, void, void> {
  for (const keyword of codeBook.stdyDscr.stdyInfo.subject?.keyword ?? []) {
    const label =
      typeof keyword === "number"
        ? keyword.toString()
        : typeof keyword === "string"
        ? keyword
        : keyword["#text"]
    // Only emit keywords that actually carry text.
    if (label) {
      yield label
    }
  }
}
/**
 * Iterate over the topic-class labels of a study.
 *
 * Topic classes are read from `stdyDscr.stdyInfo.subject.topcClas`,
 * which may be absent, a single entry or an array of entries. An entry
 * is either a string or an object whose `#text` holds the label.
 * Entries without a label are skipped.
 *
 * @param codeBook CodeBook to read the topic classes from.
 * @returns A generator of non-empty topic-class labels.
 */
export function* iterStudyTopicsClass(
  codeBook: CodeBook,
): Generator<string, void, void> {
  const rawTopics = codeBook.stdyDscr.stdyInfo.subject?.topcClas ?? []
  // Treat a lone entry as a one-element list.
  const topics = Array.isArray(rawTopics) ? rawTopics : [rawTopics]
  for (const topic of topics) {
    const label = typeof topic === "string" ? topic : topic["#text"]
    if (!label) {
      continue
    }
    yield label
  }
}
......
<script lang="ts">
// View of a group node: the markup below renders tab headers, a search
// form, the list of the node's children and a pagination control.
import { stores } from "@sapper/app"
import Pagination from "./Pagination.svelte"
import SearchForm from "./SearchForm.svelte"
import type { NodeGroup } from "../data"
import { NodeType } from "../data"
import { localize } from "../stores"
// Group node to display (component prop).
export let node: NodeGroup
// Sapper page store; provides the current path and query parameters.
const { page } = stores()
// Localization function, called as `_(...)` in the markup.
$: _ = $localize
// The non-null assertions assume the node was supplied with its children
// and their total count. NOTE(review): confirm the loader always sets both.
$: children = node.children!
$: childrenCount = node.childrenCount!
$: query = $page.query
$: tab = "studies" // or "series" or "variables"
// Current search term, taken from the `q` query parameter.
$: term = query.q
</script>
<!-- <h1 class="heading">{_(node.type)}{_("colon")} <i>{node.title}</i></h1> -->
<div class="text-neutral-700">
<span
class="portal {tab === 'studies' ? '~neutral active' : ''}"
on:click={() => (tab = 'studies')}>{_('Studies')}
(COUNT)</span>
<span
class="portal {tab === 'series' ? '~neutral active' : ''}"
on:click={() => (tab = 'series')}>{_('Series')}
(COUNT)</span>
<a
class="portal {tab === 'variables' ? '~neutral active' : ''}"
href="variables"
on:click={() => (tab = 'variables')}>{_('Variables')}
(COUNT)</a>
</div>
<div class="mx-auto">
<SearchForm placeholder="niveau de vie…" searchPath={$page.path} {term} />
<ul>
{#each children as child}
<li>
<a class="button ~info !low" href={child.path}>
{#if child.type === NodeType.CodeBook}
{child.title}
({_(child.type)}
{_('version')}
{child.version})
{:else}{child.title}{/if}
</a>
</li>
{/each}
</ul>
<Pagination
count={childrenCount}
currentPageCount={children.length}
limit={20}
queryParams={query}
url={$page.path} />
</div>
<script lang="ts">
// Renders an organization: its plain name when it has no acronym,
// otherwise the acronym in an <abbr> whose title is the full name.
import type { Organization } from "../data"
// Organization to display (component prop).
export let organization: Organization
</script>
{#if organization.acronym == null}{organization.name}{:else}<abbr
title={organization.name}>{organization.acronym}</abbr
>{/if}
......@@ -2,8 +2,15 @@
import JsonTree from "svelte-json-tree"
import type { Study } from "../data"
import { iterStudyKeywords, iterVariables } from "../accessors"
import {
iterDepositors,
iterDistributors,
iterProducers,
iterStudyKeywords,
iterVariables,
} from "../accessors"
import { localize } from "../stores"
import OrganizationName from "./OrganizationName.svelte"
export let study: Study
......@@ -11,10 +18,10 @@
// CodeBook Fields
$: codeBook = study.codeBook!
$: depositors = [...iterDepositors(codeBook)]
$: distributors = [...iterDistributors(codeBook)]
$: keywords = [...iterStudyKeywords(codeBook)]
$: producer = codeBook.stdyDscr?.citation?.prodStmt.producer
$: distributor = codeBook.stdyDscr?.citation?.distStmt?.distrbtr
$: depositor = codeBook.stdyDscr?.citation?.distStmt?.depositr
$: producers = [...iterProducers(codeBook)]
$: variables = [...iterVariables(codeBook)]
$: tab = "description"
</script>
......@@ -36,22 +43,30 @@
<div class="my-2 px-2 overflow-hidden w-1/4">
<hr class="sep h-8" />
<ul class="list-inside list-disc text-sm">
{#if producer != null}
{#if producers.length > 0}
<li>
{_("Producer")}{_("colon")}
<abbr title={producer["#text"]}>{producer["@abbr"]}</abbr>
{#each producers as producer, index}
{#if index > 0}, {/if}<OrganizationName organization={producer} />
{/each}
</li>
{/if}
{#if distributor != null}
{#if distributors.length > 0}
<li>
{_("Distributor")}{_("colon")}
<abbr title={distributor["#text"]}>{distributor["@abbr"]}</abbr>
{#each distributors as distributor, index}
{#if index > 0}, {/if}<OrganizationName
organization={distributor}
/>
{/each}
</li>
{/if}
{#if depositor != null}
{#if depositors.length > 0}
<li>
{_("Depositor")}{_("colon")}
<abbr title={depositor["#text"]}>{depositor["@abbr"]}</abbr>
{#each depositors as depositor, index}
{#if index > 0}, {/if}<OrganizationName organization={depositor} />
{/each}
</li>
{/if}
</ul>
......
import assert from "assert"
import dedent from "dedent-js"
import {
Language,
NodeType,
postgreSqlConfigurationNameByLanguage,
} from "./data"
import { Language, postgreSqlConfigurationNameByLanguage } from "./data"
import { db, versionNumber } from "./database"
import { indexGroup } from "./indexers"
export async function configure(): Promise<void> {
await configureDatabase()
......@@ -34,13 +29,6 @@ export async function configure(): Promise<void> {
},
)
}
await indexGroup({
path: "",
language: Language.En,
title: "Root",
type: NodeType.Group,
})
}
async function configureDatabase(): Promise<void> {
......@@ -76,40 +64,6 @@ async function configureDatabase(): Promise<void> {
// Types
// Enum: field
try {
await db.none(
dedent`
CREATE TYPE field AS ENUM (
'Title',
'Variable'
)
`,
)
} catch (e) {
// 42710: type "field" already exists
if (e.code !== "42710") {
throw e
}
}
// Enum: node_type
try {
await db.none(
dedent`
CREATE TYPE node_type AS ENUM (
'CodeBook',
'Group'
)
`,
)
} catch (e) {
// 42710: type "node_type" already exists
if (e.code !== "42710") {
throw e
}
}
// Tables
// Table: languages
......@@ -122,24 +76,12 @@ async function configureDatabase(): Promise<void> {
`,
)
// Table: nodes
// Table: organizations
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS nodes (
language text NOT NULL REFERENCES languages(code) ON DELETE CASCADE,
path text NOT NULL PRIMARY KEY,
title text NOT NULL,
type node_type NOT NULL
)
`,
)
// Table: nodes_text_search
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS nodes_text_search (
path text NOT NULL PRIMARY KEY REFERENCES nodes(path) ON DELETE CASCADE,
text_search tsvector
CREATE TABLE IF NOT EXISTS organizations (
acronym text,
name text NOT NULL PRIMARY KEY
)
`,
)
......@@ -186,6 +128,43 @@ async function configureDatabase(): Promise<void> {
`,
)
// Table: terms
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS terms (
label text NOT NULL,
language text NOT NULL REFERENCES languages(code) ON DELETE CASCADE,
PRIMARY KEY (language, label)
)
`,
)
// Table: study_organization_associations
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS study_organization_associations (
organization_name text NOT NULL REFERENCES organizations(name) ON DELETE CASCADE,
relation text NOT NULL,
study_path text NOT NULL REFERENCES studies(path) ON DELETE CASCADE,
PRIMARY KEY (study_path, relation, organization_name)
)
`,
)
// Table: study_term_associations
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS study_term_associations (
language text NOT NULL REFERENCES languages(code) ON DELETE CASCADE,
relation text NOT NULL,
study_path text NOT NULL REFERENCES studies(path) ON DELETE CASCADE,
term_label text NOT NULL,
PRIMARY KEY (study_path, relation, language, term_label),
FOREIGN KEY (language, term_label) REFERENCES terms(language, label) ON DELETE CASCADE
)
`,
)
// Table: users
await db.none(
dedent`
......@@ -226,25 +205,25 @@ async function configureDatabase(): Promise<void> {
await db.none(
dedent`
CREATE INDEX IF NOT EXISTS nodes_text_search_idx
ON nodes_text_search
CREATE INDEX IF NOT EXISTS series_idx
ON series
USING GIN (text_search)
`,
)
await db.none(
dedent`
CREATE INDEX IF NOT EXISTS series_idx
ON series
CREATE INDEX IF NOT EXISTS studies_idx
ON studies
USING GIN (text_search)
`,
)
await db.none(
dedent`
CREATE INDEX IF NOT EXISTS studies_idx
ON studies
USING GIN (text_search)
CREATE INDEX IF NOT EXISTS terms_trigrams_idx
ON terms
USING GIST (label gist_trgm_ops)
`,
)
......@@ -266,21 +245,14 @@ async function configureDatabase(): Promise<void> {
// Add comments once every table and column exists.
for (const command of [
"COMMENT ON TABLE languages IS 'languages used by nodes & CodeBooks'",
"COMMENT ON TABLE languages IS 'languages used by CodeBooks'",
"COMMENT ON COLUMN languages.code IS '2-letters ISO code of language'",
"COMMENT ON COLUMN languages.postgresql_configuration_name IS 'name of" +
" PostgreSQL configuration for language'",
"COMMENT ON TABLE nodes IS 'nodes (of the tree of nodes)'",
"COMMENT ON COLUMN nodes.language IS 'language used for node'",
'COMMENT ON COLUMN nodes.path IS \'"/"-separated concatenation of' +
" the segments of the node and its ancestors'",
"COMMENT ON COLUMN nodes.title IS 'name of node'",
"COMMENT ON COLUMN nodes.type IS 'type of node'",
"COMMENT ON TABLE nodes_text_search IS 'full text search vectors for nodes'",
"COMMENT ON COLUMN nodes_text_search.path IS 'path of node'",
"COMMENT ON COLUMN nodes_text_search.text_search IS 'full text search vector of node'",
"COMMENT ON TABLE organizations IS 'organizations (providers, distributors, etc)'",
"COMMENT ON COLUMN organizations.acronym IS 'acronym (or abbreviation) of organization'",
"COMMENT ON COLUMN organizations.name IS 'name of organization'",
"COMMENT ON TABLE series IS 'series of studies'",
"COMMENT ON COLUMN series.description IS 'description of series'",
......@@ -303,6 +275,25 @@ async function configureDatabase(): Promise<void> {
"COMMENT ON COLUMN studies.title IS 'name of study'",
"COMMENT ON COLUMN studies.version IS 'version of CodeBook used'",
"COMMENT ON TABLE study_organization_associations IS 'relations between a study and" +
" its organizations (providers, distributors, etc)'",
"COMMENT ON COLUMN study_organization_associations.organization_name IS 'name of organization'",
"COMMENT ON COLUMN study_organization_associations.relation IS 'type of association between" +
" study and organization'",
"COMMENT ON COLUMN study_organization_associations.study_path IS 'path of study'",
"COMMENT ON TABLE study_term_associations IS 'relations between a study and" +
" its terms (keywords, topics classes, etc)'",
"COMMENT ON COLUMN study_term_associations.language IS 'language of study and of term'",
"COMMENT ON COLUMN study_term_associations.relation IS 'type of association between" +
" study and term'",
"COMMENT ON COLUMN study_term_associations.study_path IS 'path of study'",
"COMMENT ON COLUMN study_term_associations.term_label IS 'text of term'",
"COMMENT ON TABLE terms IS 'terms extracted from thesauri and used for topics, etc'",
"COMMENT ON COLUMN terms.label IS 'text of term'",
"COMMENT ON COLUMN terms.language IS 'language of term'",
"COMMENT ON TABLE variables IS 'variables of studies (aka of CodeBooks)'",
"COMMENT ON COLUMN variables.id IS 'ID of variable in node'",
'COMMENT ON COLUMN variables.path IS \'"/"-separated concatenation of' +
......@@ -313,7 +304,7 @@ async function configureDatabase(): Promise<void> {
"COMMENT ON TABLE version IS 'version of database'",
"COMMENT ON COLUMN version.number IS 'version number of database schema'",
"COMMENT ON TABLE words IS 'words used in text columns, for autocompletion'",
"COMMENT ON TABLE words IS 'words extracted from text columns, for autocompletion'",
"COMMENT ON COLUMN words.label IS 'text of word'",
"COMMENT ON COLUMN words.language IS 'language of word'",
]) {
......
......@@ -5,15 +5,15 @@ export enum CodeBookVersion {
Version_1_3 = "1.3",
}
/**
 * Kinds of field a `NodeSearchResult` can refer to (see its `field`
 * property).
 */
export enum Field {
Title = "Title",
Variable = "Variable",
}
/**
 * Names of the available facets.
 *
 * Every member is declared exactly once, in alphabetical order, with a
 * lower-case string value; `allFacets` is built from these values.
 */
export enum Facet {
  Depositors = "depositors",
  Distributors = "distributors",
  Keywords = "keywords",
  Producers = "producers",
  Series = "series",
  Studies = "studies",
  Topics = "topics",
  Variables = "variables",
}
export const allFacets = Object.values(Facet)
......@@ -53,39 +53,13 @@ export enum Language {
Fr = "fr",
}
/** Any node: either a CodeBook node or a group node. */
export type Node = NodeCodeBook | NodeGroup
/**
 * Properties shared by every kind of node.
 *
 * - `language`: language used for the node.
 * - `path`: "/"-separated concatenation of the segments of the node and
 *   its ancestors.
 * - `title`: name of the node.
 * - `type`: type of the node.
 */
export interface NodeBase {
language: Language
path: string
title: string
type: NodeType
}
/**
 * Node of type `NodeType.CodeBook`, optionally carrying the CodeBook
 * itself, together with the version of that CodeBook.
 */
export interface NodeCodeBook extends NodeBase {
codeBook?: CodeBook
file_path?: string // Required on server but removed for clients.
type: NodeType.CodeBook
version: CodeBookVersion
}
/**
 * Node of type `NodeType.Group`, i.e. a node that contains other nodes.
 *
 * All child-related properties are optional.
 * NOTE(review): presumably `children` holds one page of child nodes and
 * `childrenCount` their total number; confirm against the code that
 * builds group nodes.
 */
export interface NodeGroup extends NodeBase {
  children?: Node[]
  childrenAutocompletion?: NodeSearchResult[]
  childrenCount?: number
  nextChildrenUrl?: string
  type: NodeType.Group
}

/**
 * Organization attached to a study.
 *
 * - `acronym`: acronym (or abbreviation) of the organization, when it has
 *   one.
 * - `name`: name of the organization.
 */
export interface Organization {
  acronym?: string
  name: string
}
/**
 * Search hit on a node: the path of the node, the kind of field that
 * matched and an optional rank.
 * NOTE(review): `rank` is presumably the full-text search ranking score;
 * confirm against the query that produces it.
 */
export interface NodeSearchResult {
field: Field
path: string
rank?: number
}
/** Kinds of node: a CodeBook node or a group node. */
export enum NodeType {
  CodeBook = "CodeBook",
  Group = "Group",
}

/**
 * Organization together with a count.
 * NOTE(review): `count` is presumably the number of studies associated
 * with the organization; confirm against the code that produces it.
 */
export interface OrganizationWithCount extends Organization {
  count: number
}
export interface RetrievalError {
......@@ -119,10 +93,21 @@ export interface Study {
version: CodeBookVersion
}
/**
 * Kinds of relation between a study and an organization.
 * NOTE(review): these values are presumably what is stored in the
 * `relation` column of `study_organization_associations`; confirm
 * against the indexing code.
 */
export enum StudyOrganizationRelation {
Depositor = "depositor",
Distributor = "distributor",
Producer = "producer",
}
/**
 * Study returned by a search, with an optional rank.
 * NOTE(review): `rank` is presumably the full-text search ranking score;
 * confirm against the query that produces it.
 */
export interface StudySearchResult extends Study {
rank?: number
}
/**
 * Kinds of relation between a study and a term.
 * NOTE(review): these values are presumably what is stored in the
 * `relation` column of `study_term_associations`; confirm against the
 * indexing code.
 */
export enum StudyTermRelation {
Keyword = "keyword",
TopicClass = "topic class",
}
export interface Variable {
id: string
path: string
......@@ -146,11 +131,3 @@ export const postgreSqlConfigurationNameByLanguage: {
[Language.En]: "english",
[Language.Fr]: "french",
}
/**
 * Exhaustiveness guard for nodes.
 *
 * The parameter is typed `never`, so a call only type-checks once every
 * kind of node has been handled; reaching it at runtime always throws.
 *
 * @param node Value that should be impossible at this point.
 * @throws Error Always, with the unexpected value appended to the message.
 */
export function assertNeverNode(node: never): never {
  const message = "Unexpected node: " + node
  throw new Error(message)
}
/**
 * Exhaustiveness guard for node types.
 *
 * The parameter is typed `never`, so a call only type-checks once every
 * node type has been handled; reaching it at runtime always throws.
 *
 * @param type Value that should be impossible at this point.
 * @throws Error Always, with the unexpected value appended to the message.
 */
export function assertNeverNodeType(type: never): never {
  const message = "Unexpected node type: " + type
  throw new Error(message)
}
......@@ -3,16 +3,7 @@ import xmlParser from "fast-xml-parser"
import fs from "fs-extra"
import path from "path"
import {
Field,
Follow,
JsonData,
Language,
NodeType,
Series,
Study,
Variable,
} from "./data"
import { Follow, JsonData, Series, Study, Variable } from "./data"
import config from "./config"
import { db } from "./database"