Commit d55be5d3 authored by Emmanuel Raviart
Browse files

Improve extraction of IDs and DOIs.

parent 8384aaba
......@@ -117,7 +117,8 @@ node --experimental-specifier-resolution=node package/scripts/repair_ined_nessta
# node --experimental-specifier-resolution=node -- package/scripts/index_codebooks.js --language=fr --path=adisp ../public_data/adisp-oai-pmh-ddi-repaired/
node --experimental-specifier-resolution=node -- package/scripts/index_codebooks.js --language=fr --path=adisp ../public_data/adisp-ddis-repaired/
node --experimental-specifier-resolution=node -- package/scripts/index_codebooks.js --language=fr --path=cdsp ../public_data/sciencespo-dataverse-ddi/
node --experimental-specifier-resolution=node -- package/scripts/index_codebooks.js --language=fr --path=ined ../public_data/ined-nesstar-ddi-repaired/
# node --experimental-specifier-resolution=node -- package/scripts/index_codebooks.js --language=fr --path=ined ../public_data/ined-nesstar-ddi-repaired/
node --experimental-specifier-resolution=node -- package/scripts/index_codebooks.js --language=fr --path=ined ../public_data/ined-manual-ddi/
```
#### Indexing Other (non Progedo-related) DDI Files
......
......@@ -109,16 +109,24 @@ export function extractCodeBookText(
}
export function getCodeBookDoi(codeBook: CodeBook): string | undefined {
let ids: string | IDNoElement | (string | IDNoElement)[] =
let studyIds: string | IDNoElement | (string | IDNoElement)[] =
codeBook.stdyDscr.citation.titlStmt.IDNo
if (!Array.isArray(ids)) {
ids = [ids]
if (!Array.isArray(studyIds)) {
studyIds = [studyIds]
}
for (const id of ids) {
let documentIds: string | IDNoElement | (string | IDNoElement)[] =
codeBook.docDscr.citation.titlStmt.IDNo
if (!Array.isArray(documentIds)) {
documentIds = [documentIds]
}
for (const id of [...studyIds, ...documentIds]) {
if (["DataCite", "DOI"].includes(id["@agency"])) {
const idText = extractCodeBookText(id)
if (idText !== undefined && idText.startsWith("doi:")) {
return idText.substring("doi:".length)
if (
idText !== undefined &&
(idText.startsWith("doi:") || idText.match(/^\d+\.\d+\//))
) {
return idText.replace(/^doi:/, "")
}
}
}
......@@ -446,20 +454,23 @@ export function* iterCodeBookFilesDescription(
/**
 * Yield every distinct textual identifier attached to a codebook.
 *
 * Candidates are taken, in this order, from:
 *   1. the codebook's own `@ID` attribute,
 *   2. the `IDNo` entry/entries of the study description (`stdyDscr`),
 *   3. the `IDNo` entry/entries of the document description (`docDscr`).
 *
 * Each candidate is converted to text with `extractCodeBookText`; candidates
 * for which it returns `undefined` are skipped. Each text is yielded once,
 * at the position of its first occurrence.
 *
 * NOTE(review): assumes `stdyDscr` and `docDscr` (and their `citation` /
 * `titlStmt` children) are always present on `CodeBook` — confirm against
 * the type declaration and the indexed DDI files.
 *
 * @param codeBook - The parsed codebook to read identifiers from.
 * @returns A generator over the distinct identifier texts.
 */
export function* iterCodeBookIds(
  codeBook: CodeBook,
): Generator<string, void, void> {
  // `IDNo` may hold a single entry or an array of entries; normalise both
  // sources to arrays so they can be processed uniformly.
  const studyIds = codeBook.stdyDscr.citation.titlStmt.IDNo
  const documentIds = codeBook.docDscr.citation.titlStmt.IDNo
  const candidates = [
    codeBook["@ID"],
    ...(Array.isArray(studyIds) ? studyIds : [studyIds]),
    ...(Array.isArray(documentIds) ? documentIds : [documentIds]),
  ]

  // A Set iterates in insertion order, so de-duplicating on the extracted
  // text keeps the first occurrence of each identifier. The explicit type
  // predicate narrows the element type to `string`, which the generator's
  // declared yield type requires.
  const ids = new Set(
    candidates
      .map((id) => extractCodeBookText(id))
      .filter((id): id is string => id !== undefined),
  )

  yield* ids
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment