Commit 5a575c19 authored by Emmanuel Raviart
Browse files

DDI Parsing: Add script to repair language of DDI files retrieved from INED Nesstar server.

parent b3c79c47
......@@ -88,6 +88,7 @@ npx babel-node --extensions ".ts" src/scripts/retrieve_nesstar_ddis.ts --url htt
# ADISP
npx babel-node --extensions ".ts" src/scripts/repair_adisp_oai-pmh_ddis.ts --source=../public_data/adisp-oai-pmh-ddi/ ../public_data/adisp-oai-pmh-ddi-repaired/
npx babel-node --extensions ".ts" src/scripts/repair_adisp_nesstar_ddis.ts --source=../public_data/adisp-nesstar-ddi/ ../public_data/adisp-nesstar-ddi-repaired/
npx babel-node --extensions ".ts" src/scripts/repair_ined_nesstar_ddis.ts --source=../public_data/ined-nesstar-ddi/ ../public_data/ined-nesstar-ddi-repaired/
```
### Indexing DDI files
......@@ -97,7 +98,7 @@ npx babel-node --extensions ".ts" src/scripts/repair_adisp_nesstar_ddis.ts --sou
```bash
npx babel-node --extensions ".ts" -- src/scripts/index_codebooks.ts --language=fr --path=adisp ../public_data/adisp-oai-pmh-ddi-repaired/
npx babel-node --extensions ".ts" -- src/scripts/index_codebooks.ts --language=fr --path=cdsp ../public_data/sciencespo-dataverse-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_codebooks.ts --language=fr --path=ined ../public_data/ined-nesstar-ddi/
npx babel-node --extensions ".ts" -- src/scripts/index_codebooks.ts --language=fr --path=ined ../public_data/ined-nesstar-ddi-repaired/
```
#### Indexing Other (non Progedo-related) DDI Files
......
import {
Audit,
auditBoolean,
auditRequire,
auditSetNullish,
auditTrimString,
cleanAudit,
} from "@auditors/core"
import commandLineArgs from "command-line-args"
import fs from "fs-extra"
import path from "path"
/**
 * Command line option definitions handed to `commandLineArgs` by `main`.
 *
 * `target` is the default option, so it is given as a bare positional argument,
 * while the directory to read is passed with `--source` (or `-S`).
 */
const optionsDefinitions = [
  // Where the DDI files to repair are read from.
  {
    name: "source",
    alias: "S",
    type: String,
    help: "directory containing XML DDI CodeBook files to repair",
  },
  // Logging switch: mute every message.
  { name: "silent", alias: "s", type: Boolean, help: "don't log anything" },
  // Where the repaired DDI files are written to (positional argument).
  {
    name: "target",
    defaultOption: true,
    type: String,
    help: "directory containing repaired XML DDI CodeBook files",
  },
  // Logging switch: extra details.
  { name: "verbose", alias: "v", type: Boolean, help: "verbose logs" },
]
/**
 * Audits the parsed command line options of this script.
 *
 * A shallow copy of `data` is audited attribute by attribute:
 * - `silent` and `verbose` go through `auditBoolean` then `auditSetNullish(false)`;
 * - `source` and `target` go through `auditTrimString` then `auditRequire`.
 *
 * Keys that were not audited are left to `audit.reduceRemaining`.
 *
 * @param audit - auditing context providing `attribute`, `unexpectedType` and `reduceRemaining`
 * @param data - raw options, as returned by the command line parser
 * @returns a `[value, error]` pair; `[data, null]` when `data` is `null` or `undefined`
 */
export function auditCommandLineOptions(audit: Audit, data: any): [any, any] {
  if (data === null || data === undefined) {
    return [data, null]
  }
  if (typeof data !== "object") {
    return audit.unexpectedType(data, "object")
  }

  // Audit a copy, so that the caller's object is never modified.
  const options = { ...data }
  const issues: { [key: string]: any } = {}
  const pendingKeys = new Set(Object.keys(options))

  const flagAuditors = [auditBoolean, auditSetNullish(false)] as const
  const directoryAuditors = [auditTrimString, auditRequire] as const

  // The third argument (`true`) is passed because `options` is already a copy.
  audit.attribute(options, "silent", true, issues, pendingKeys, ...flagAuditors)
  audit.attribute(
    options,
    "source",
    true,
    issues,
    pendingKeys,
    ...directoryAuditors,
  )
  audit.attribute(
    options,
    "target",
    true,
    issues,
    pendingKeys,
    ...directoryAuditors,
  )
  audit.attribute(
    options,
    "verbose",
    true,
    issues,
    pendingKeys,
    ...flagAuditors,
  )

  return audit.reduceRemaining(options, issues, pendingKeys)
}
/** Outcome of repairing the language attribute of a DDI `<codeBook>` start tag. */
interface CodeBookLanguageRepair {
  /** Language that was set, or `null` when the document was left untouched. */
  language: "english" | "french" | null
  /** Document text after the repair (identical to the input when nothing changed). */
  xml: string
}

/**
 * Repairs the `xml-lang` attribute of the first `<codeBook>` start tag of `xml`.
 *
 * - `xml-lang="en"` is rewritten to `xml-lang="fr"`.
 * - When the tag has no `xml-lang=` attribute, `xml-lang="en"` is added, right
 *   after the tag's `ID` attribute when it has one, otherwise right after the
 *   element name.
 * - A tag carrying any other `xml-lang` value is left untouched.
 *
 * Only the start tag itself is edited; the rest of the document is copied as is.
 *
 * @param xml - text of a DDI CodeBook document
 * @returns the repair outcome, or `null` when `xml` has no `<codeBook>` start tag
 */
function repairCodeBookLanguage(xml: string): CodeBookLanguageRepair | null {
  // `[^>]` also matches line breaks (`.` does not), so a start tag spread over
  // several lines is found too.
  const tagStart = xml.search(/<codeBook(?:\s[^>]*)?>/)
  if (tagStart === -1) {
    return null
  }
  // The pattern above guarantees that the first ">" after `tagStart` closes the tag.
  const tagEnd = xml.indexOf(">", tagStart) + 1
  const tag = xml.slice(tagStart, tagEnd)

  let language: CodeBookLanguageRepair["language"] = null
  let repairedTag = tag
  if (tag.includes('xml-lang="en"')) {
    repairedTag = tag.replace('xml-lang="en"', 'xml-lang="fr"')
    language = "french"
  } else if (!tag.includes("xml-lang=")) {
    const idAttribute = /\bID="([^"]*)"/
    repairedTag = idAttribute.test(tag)
      ? tag.replace(idAttribute, 'ID="$1" xml-lang="en"')
      : tag.replace("<codeBook", '<codeBook xml-lang="en"')
    language = "english"
  }
  if (language === null) {
    return { language, xml }
  }

  // Splice by position instead of calling `replace` on the whole document, so
  // that nothing outside of the start tag can ever be modified.
  return {
    language,
    xml: xml.slice(0, tagStart) + repairedTag + xml.slice(tagEnd),
  }
}

/**
 * Script entry point: copies every `.xml` file of the source directory to the
 * target directory, repairing the language of its `<codeBook>` element on the way.
 *
 * The target directory is deleted and recreated first, which is why it must
 * differ from the source directory. Files whose name doesn't end with `.xml`
 * are skipped.
 *
 * Exits the process with status 1 when the command line options are invalid.
 *
 * @throws Error when source and target resolve to the same directory, or when
 *   an XML file contains no `<codeBook>` start tag
 */
async function main(): Promise<void> {
  const [options, optionsError] = auditCommandLineOptions(
    cleanAudit,
    commandLineArgs(optionsDefinitions),
  )
  if (optionsError !== null) {
    console.error("Errors in command line arguments:")
    console.error("Arguments:", JSON.stringify(options, null, 2))
    console.error("Errors:", JSON.stringify(optionsError, null, 2))
    process.exit(1)
  }
  const { source: sourceDir, target: targetDir } = options

  // The target is wiped below: refuse to run when that would destroy the input.
  if (path.resolve(sourceDir) === path.resolve(targetDir)) {
    throw new Error(
      `Source and target directories must differ (both are "${sourceDir}").`,
    )
  }

  await fs.remove(targetDir)
  await fs.ensureDir(targetDir)

  for (const filename of await fs.readdir(sourceDir)) {
    if (!filename.endsWith(".xml")) {
      continue
    }

    const xml: string = await fs.readFile(path.join(sourceDir, filename), {
      encoding: "utf8",
    })

    const repair = repairCodeBookLanguage(xml)
    if (repair === null) {
      // Fail with the name of the offending file rather than with an opaque
      // "cannot read properties of null" TypeError.
      throw new Error(`No <codeBook> start tag found in ${filename}.`)
    }
    if (repair.xml !== xml && !options.silent) {
      console.log(`Set language of ${filename} to ${repair.language}.`)
    }

    await fs.writeFile(path.join(targetDir, filename), repair.xml, {
      encoding: "utf8",
    })
  }
}
// Run the script, then terminate explicitly: status 0 when `main` completes,
// status 1 (after printing the error) when it fails.
void (async () => {
  try {
    await main()
    process.exit(0)
  } catch (error) {
    console.error(error)
    process.exit(1)
  }
})()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment