Commit f64f3d9c authored by Emmanuel Raviart
Browse files

Add API for full text search and words autocompletion.

parent c7cce909
Pipeline #209037 failed with stage
in 2 minutes and 31 seconds
......@@ -84,6 +84,12 @@ npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=
npx babel-node --extensions ".ts" -- src/scripts/index_code_books.ts --language=no --path=nsddata --title=\"Norwegian Centre for Research Data \(NSD\)\" ../public_data/nsddata-ddi/
```
### Extracting words from CodeBooks for autocompletion
```bash
npx babel-node --extensions ".ts" -- src/scripts/index_words.ts
```
## Development
### Extracting TypeScript Raw Types from DDI files
......
import assert from "assert"
import dedent from "dedent-js"
import { Language, NodeType } from "./data"
import {
Language,
NodeType,
postgreSqlConfigurationNameByLanguage,
} from "./data"
import { db, versionNumber } from "./database"
import { indexGroup } from "./indexers"
export async function configure(): Promise<void> {
await configureDatabase()
for (const language of Object.values(Language)) {
await db.none(
dedent`
INSERT INTO languages (
code,
postgresql_configuration_name
)
VALUES (
$<code>,
$<postgreSqlConfigurationName>
)
ON CONFLICT (code)
DO UPDATE SET
postgresql_configuration_name = $<postgreSqlConfigurationName>
`,
{
code: language,
postgreSqlConfigurationName:
postgreSqlConfigurationNameByLanguage[language],
},
)
}
await indexGroup({
path: "",
language: Language.En,
......@@ -54,8 +81,8 @@ async function configureDatabase(): Promise<void> {
await db.none(
dedent`
CREATE TYPE field AS ENUM (
'Question',
'Title'
'Title',
'Variable'
)
`,
)
......@@ -66,23 +93,6 @@ async function configureDatabase(): Promise<void> {
}
}
// Enum: language
try {
await db.none(
dedent`
CREATE TYPE language AS ENUM (
'en',
'fr'
)
`,
)
} catch (e) {
// 42710: type "language" already exists
if (e.code !== "42710") {
throw e
}
}
// Enum: node_type
try {
await db.none(
......@@ -102,11 +112,21 @@ async function configureDatabase(): Promise<void> {
// Tables
// Table: languages
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS languages (
code text NOT NULL PRIMARY KEY,
postgresql_configuration_name text
)
`,
)
// Table: nodes
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS nodes (
language language NOT NULL,
language text NOT NULL REFERENCES languages(code) ON DELETE CASCADE,
path text NOT NULL PRIMARY KEY,
title text NOT NULL,
type node_type NOT NULL
......@@ -137,6 +157,16 @@ async function configureDatabase(): Promise<void> {
`,
)
// Table: nodes_text_search
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS nodes_text_search (
path text NOT NULL PRIMARY KEY REFERENCES nodes(path) ON DELETE CASCADE,
text_search tsvector
)
`,
)
// Table: sessions
// Cf node_modules/connect-pg-simple/table.sql
await db.none(
......@@ -159,6 +189,30 @@ async function configureDatabase(): Promise<void> {
`,
)
// Table: variables
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS variables (
id text NOT NULL,
path text NOT NULL REFERENCES nodes(path) ON DELETE CASCADE,
text_search tsvector,
title text NOT NULL,
PRIMARY KEY (path, id)
)
`,
)
// Table: words
await db.none(
dedent`
CREATE TABLE IF NOT EXISTS words (
language text NOT NULL REFERENCES languages(code) ON DELETE CASCADE,
word text NOT NULL,
PRIMARY KEY (language, word)
)
`,
)
// Apply patches that must be executed after every table is created.
// Add indexes once every table and column exists.
......@@ -171,6 +225,30 @@ async function configureDatabase(): Promise<void> {
`,
)
await db.none(
dedent`
CREATE INDEX IF NOT EXISTS nodes_text_search_idx
ON nodes_text_search
USING GIN (text_search)
`,
)
await db.none(
dedent`
CREATE INDEX IF NOT EXISTS variables_idx
ON variables
USING GIN (text_search)
`,
)
await db.none(
dedent`
CREATE INDEX IF NOT EXISTS words_trigrams_idx
ON words
USING GIST (word gist_trgm_ops)
`,
)
// Add comments once every table and column exists.
for (const command of [
"COMMENT ON TABLE code_books IS 'additional informations for nodes of type CodeBook'",
......@@ -179,6 +257,11 @@ async function configureDatabase(): Promise<void> {
"COMMENT ON COLUMN code_books.path IS 'path of node'",
"COMMENT ON COLUMN code_books.version IS 'version of CodeBook used'",
"COMMENT ON TABLE languages IS 'languages used by nodes & CodeBooks'",
"COMMENT ON COLUMN languages.code IS '2-letters ISO code of language'",
"COMMENT ON COLUMN languages.postgresql_configuration_name IS 'name of" +
" PostgreSQL configuration for language'",
"COMMENT ON TABLE nodes IS 'nodes (of the tree of nodes)'",
"COMMENT ON COLUMN nodes.language IS 'language used for node'",
'COMMENT ON COLUMN nodes.path IS \'"/"-separated concatenation of' +
......@@ -191,8 +274,21 @@ async function configureDatabase(): Promise<void> {
"COMMENT ON COLUMN nodes_autocomplete.field IS 'field(s) used for autocompletion'",
"COMMENT ON COLUMN nodes_autocomplete.path IS 'path of node'",
"COMMENT ON TABLE nodes_text_search IS 'full text search vectors for nodes'",
"COMMENT ON COLUMN nodes_text_search.path IS 'path of node'",
"COMMENT ON COLUMN nodes_text_search.text_search IS 'full text search vector of node'",
"COMMENT ON TABLE variables IS 'variables of studies (aka of CodeBooks)'",
"COMMENT ON COLUMN variables.id IS 'ID of variable in node'",
"COMMENT ON COLUMN variables.path IS 'path of node'",
"COMMENT ON COLUMN variables.text_search IS 'full text search vector of variable'",
"COMMENT ON TABLE version IS 'version of database'",
"COMMENT ON COLUMN version.number IS 'version number of database schema'",
"COMMENT ON TABLE words IS 'words used in text columns, for autocompletion'",
"COMMENT ON COLUMN words.language IS 'language of word'",
"COMMENT ON COLUMN words.word IS '(autocompleted) word'",
]) {
await db.none(command)
}
......
......@@ -6,8 +6,8 @@ export enum CodeBookVersion {
}
/**
 * Kind of text an autocompletion entry was built from.
 *
 * The member values are written to the `field` column of the
 * `nodes_autocomplete` table, whose PostgreSQL `field` enum type lists the
 * same labels, so the strings here must stay in sync with that enum.
 */
export enum Field {
Question = "Question",
Title = "Title",
Variable = "Variable",
}
export enum Follow {
......@@ -38,6 +38,7 @@ export interface NodeAutocompletion {
distance?: number
field: Field
path: string
rank?: number
}
export interface NodeBase {
......@@ -75,6 +76,13 @@ export interface RetrievalError {
url: string
}
/**
 * Name of the PostgreSQL text search configuration to use for each language,
 * keyed by language code.
 *
 * The object is seeded with built-in defaults and is deliberately left
 * mutable: entries are added or overwritten at runtime from the content of
 * the `languages` table.
 */
export const postgreSqlConfigurationNameByLanguage: Record<string, string> = {
  [Language.En]: "english",
  [Language.Fr]: "french",
}
/**
 * Exhaustiveness guard for switches over node variants.
 *
 * The parameter is typed `never`, so a call only type-checks when every
 * variant has already been handled; reaching it at runtime means an
 * unexpected value slipped through.
 *
 * @param node - The value that should have been impossible.
 * @throws Error Always; the message starts with "Unexpected node: " followed
 *   by a readable rendering of the value.
 */
export function assertNeverNode(node: never): never {
  const value: unknown = node
  let description: string
  if (typeof value === "object" && value !== null) {
    // Plain concatenation would render every object as "[object Object]",
    // which makes the error useless; serialize it instead.
    try {
      description = JSON.stringify(value) ?? String(value)
    } catch {
      // Circular references or BigInt members make JSON.stringify throw;
      // fall back to the default string conversion rather than masking the
      // original problem with a serialization error.
      description = String(value)
    }
  } else {
    // Primitives keep their usual textual form.
    description = String(value)
  }
  throw new Error("Unexpected node: " + description)
}
......
......@@ -16,6 +16,7 @@ import {
NodeCodeBook,
NodeGroup,
NodeType,
postgreSqlConfigurationNameByLanguage,
} from "./data"
import config from "./config"
import { db } from "./database"
......@@ -133,17 +134,36 @@ export class DataProducer implements JsonData {
orderByClause = "title ASC"
selectClauses = ["*"]
} else {
// // Trigrams-based search
// joinClauses.push(
// "INNER JOIN nodes_autocomplete ON nodes.path = nodes_autocomplete.path",
// )
// orderByClause = "distance ASC, title ASC"
// selectClauses = [
// "nodes.*",
// "nodes_autocomplete.autocomplete",
// "nodes_autocomplete.autocomplete <-> $<term> AS distance",
// "nodes_autocomplete.field",
// ]
// if (fields.length > 0) {
// whereClauses.push("nodes_autocomplete.field IN ($<fields:list>)")
// }
// Full text search
joinClauses.push(
"INNER JOIN nodes_autocomplete" +
" ON nodes.path = nodes_autocomplete.path",
"INNER JOIN nodes_text_search ON nodes.path = nodes_text_search.path",
dedent`
CROSS JOIN (
SELECT plainto_tsquery($<languageConfigurationName>, $<term>) AS query
) AS variables
`,
)
orderByClause = "distance ASC, title ASC"
orderByClause = "rank DESC"
selectClauses = [
"nodes.*",
"nodes_autocomplete.autocomplete",
"nodes_autocomplete.autocomplete <-> $<term> AS distance",
"nodes_autocomplete.field",
"ts_rank_cd(text_search, query) AS rank",
]
whereClauses.push("query @@ text_search")
if (fields.length > 0) {
whereClauses.push("nodes_autocomplete.field IN ($<fields:list>)")
}
......@@ -162,6 +182,8 @@ export class DataProducer implements JsonData {
`,
{
fields,
languageConfigurationName:
postgreSqlConfigurationNameByLanguage[language],
path,
term,
types,
......@@ -185,6 +207,8 @@ export class DataProducer implements JsonData {
`,
{
fields,
languageConfigurationName:
postgreSqlConfigurationNameByLanguage[language],
limit,
offset,
path,
......@@ -195,11 +219,13 @@ export class DataProducer implements JsonData {
autocomplete,
distance,
field,
rank,
...node
}: Node & {
autocomplete?: string
distance: string
field: Field
rank: string
}) => {
if (this.nodeByPath[node.path] === undefined) {
this.addNodeBase(node, childrenOptions)
......@@ -214,6 +240,9 @@ export class DataProducer implements JsonData {
if (distance !== undefined) {
autocompletion.distance = parseFloat(distance)
}
if (rank !== undefined) {
autocompletion.rank = parseFloat(rank)
}
return autocompletion
},
),
......
import assert from "assert"
import dedent from "dedent-js"
import type { IConnected } from "pg-promise"
// The use of "require" instead of "import" for "pg-promise" is needed:
// - to remove circular dependencies:
......@@ -15,6 +16,7 @@ const pgPromiseFactory = require("pg-promise")
import type { IClient } from "pg-promise/typescript/pg-subset"
import config from "./config"
import { postgreSqlConfigurationNameByLanguage } from "./data"
const pgPromise = pgPromiseFactory()
export const db = pgPromise({
......@@ -28,7 +30,7 @@ export let dbSharedConnectionObject: IConnected<{}, IClient> | null = null
export const versionNumber = 1
/// Check that database exists and is up to date.
export async function connectDb() {
export async function connectDb(): Promise<void> {
dbSharedConnectionObject = await db.connect()
assert(
(
......@@ -46,6 +48,26 @@ export async function connectDb() {
versionNumber,
'Database must be upgraded. Run "npm run configure" to do it.',
)
// Update "global constant" postgreSqlConfigurationNameByLanguage from "languages" table.
await db.map(
dedent`
SELECT *
FROM languages
`,
undefined,
({
code,
postgresql_configuration_name,
}: {
code: string
postgresql_configuration_name: string
}) => {
postgreSqlConfigurationNameByLanguage[
code
] = postgresql_configuration_name
},
)
}
export function extractDataFromEntry(entry: any) {
......
......@@ -10,6 +10,7 @@ import {
Language,
NodeGroup,
NodeType,
postgreSqlConfigurationNameByLanguage,
} from "./data"
import { walkDir } from "./file_systems"
import type { CodeBook } from "./raw_types/code_books"
......@@ -17,8 +18,9 @@ import type { CodeBook } from "./raw_types/code_books"
const publicDataDir = path.resolve(config.publicDataDir)
class CodeBooksIndexer {
existingAutocompletes: Set<string> = new Set()
existingAutocompletesSymbol: Set<string> = new Set()
existingPaths: Set<string> = new Set()
existingVariablesSymbol: Set<string> = new Set()
readonly path: string
// Doesn't work, because of "path" module:
......@@ -51,7 +53,7 @@ class CodeBooksIndexer {
this.existingPaths.add(path)
}
const autocompletes = await db.map(
const autocompletesSymbol = await db.map(
dedent`
SELECT autocomplete, field, path
FROM nodes_autocomplete
......@@ -75,15 +77,35 @@ class CodeBooksIndexer {
path: string
}) => `${path}|${field}|${autocomplete}`,
)
for (const autocomplete of autocompletes) {
this.existingAutocompletes.add(autocomplete)
for (const autocompleteSymbol of autocompletesSymbol) {
this.existingAutocompletesSymbol.add(autocompleteSymbol)
}
const variablesSymbol = await db.map(
dedent`
SELECT id, path
FROM variables
WHERE path IN (
SELECT path
FROM nodes
${whereClause}
)
`,
{
path: this.path,
types: [...(this.types ?? [])],
},
({ id, path }: { id: string; path: string }) => `${path}|${id}`,
)
for (const symbol of variablesSymbol) {
this.existingVariablesSymbol.add(symbol)
}
}
/// Delete obsolete nodes from database.
async stop(): Promise<void> {
for (const key of this.existingAutocompletes) {
const [path, field, autocomplete] = key.split("|")
for (const symbol of this.existingAutocompletesSymbol) {
const [path, field, autocomplete] = symbol.split("|")
await db.none(
dedent`
DELETE FROM nodes_autocomplete
......@@ -99,6 +121,23 @@ class CodeBooksIndexer {
},
)
}
for (const symbol of this.existingVariablesSymbol) {
const [path, id] = symbol.split("|")
await db.none(
dedent`
DELETE FROM variables
WHERE
id = $<id>
AND path = $<path>
`,
{
id,
path,
},
)
}
for (const path of this.existingPaths) {
await db.none(
dedent`
......@@ -196,38 +235,130 @@ class CodeBooksIndexer {
path: nodePath,
},
)
this.existingAutocompletes.delete(`${nodePath}|${Field.Title}|${title}`)
this.existingAutocompletesSymbol.delete(
`${nodePath}|${Field.Title}|${title}`,
)
const dataDscr = codeBook.dataDscr
if (typeof dataDscr !== "string") {
for (const variable of dataDscr.var) {
const question = variable.labl
const autocomplete = `${variable["@ID"]} - ${variable.labl}`
await db.none(
dedent`
INSERT INTO nodes_autocomplete (
autocomplete,
field,
path
)
VALUES (
$<autocomplete>,
$<field>,
$<path>
)
ON CONFLICT
DO NOTHING
`,
INSERT INTO nodes_autocomplete (
autocomplete,
field,
path
)
VALUES (
$<autocomplete>,
$<field>,
$<path>
)
ON CONFLICT
DO NOTHING
`,
{
autocomplete: question,
field: Field.Question,
autocomplete,
field: Field.Variable,
path: nodePath,
},
)
this.existingAutocompletes.delete(
`${nodePath}|${Field.Question}|${question}`,
this.existingAutocompletesSymbol.delete(
`${nodePath}|${Field.Variable}|${autocomplete}`,
)
}
}
const textA = title
const textBFragments = []
if (typeof dataDscr !== "string") {
for (const variable of dataDscr.var) {
textBFragments.push(variable["@ID"])
textBFragments.push(variable.labl)
}
}
const { vector }: { vector: string } = await db.one(
dedent`
SELECT
setweight(to_tsvector($<languageConfigurationName>, $<textA>), 'A') ||
setweight(to_tsvector($<languageConfigurationName>, $<textB>), 'B')
AS vector
`,
{
languageConfigurationName:
postgreSqlConfigurationNameByLanguage[language] ?? "english",
textA,
textB: textBFragments.join(" "),
},
)
await db.none(
dedent`
INSERT INTO nodes_text_search(
path,
text_search
)
VALUES (
$<path>,
$<vector>
)
ON CONFLICT (path)
DO UPDATE SET
text_search = $<vector>
`,
{
path: nodePath,
vector,
},
)
if (typeof dataDscr !== "string") {
for (const variable of dataDscr.var) {
const id = variable["@ID"]
const title = variable.labl
const textA = [id, title].join(" ")
const { vector }: { vector: string } = await db.one(
dedent`
SELECT
setweight(to_tsvector($<languageConfigurationName>, $<textA>), 'A')
AS vector
`,
{
languageConfigurationName:
postgreSqlConfigurationNameByLanguage[language] ?? "english",
textA,
},
)
await db.none(
dedent`
INSERT INTO variables (
id,
path,
text_search,
title
)
VALUES (
$<id>,
$<path>,
$<vector>,
$<title>
)
ON CONFLICT (path, id)
DO UPDATE SET
text_search = $<vector>,
title = $<title>
`,
{
id,
path: nodePath,
title,
vector,
},
)
this.existingVariablesSymbol.delete(`${nodePath}|${id}`)
}
}
}
}
......@@ -268,7 +399,7 @@ export async function indexCodeBooks(
}
export async function indexGroup(node: NodeGroup) {