Commit 616a0bdb authored by Emmanuel Raviart
Browse files

Add script to retrieve DDIs from Dataverse.

parent 55eda996
......@@ -54,8 +54,8 @@ npm run configure
```bash
# ADISP
npx babel-node --extensions ".ts" src/scripts/retrieve_nesstar_ddis.ts --url http://nesstar.progedo-adisp.fr/ ../public_data/adisp-ddi/
# CDSP Sciences Po
npx babel-node --extensions ".ts" src/scripts/retrieve_nesstar_ddis.ts --url http://nesstar.sciences-po.fr/ ../public_data/cdsp-ddi/
# CDSP Sciences Po (obsolete & closed)
# npx babel-node --extensions ".ts" src/scripts/retrieve_nesstar_ddis.ts --url http://nesstar.sciences-po.fr/ ../public_data/cdsp-ddi/
# INED
npx babel-node --extensions ".ts" src/scripts/retrieve_nesstar_ddis.ts --url http://nesstar.ined.fr/ ../public_data/ined-ddi/
# INED - Generations and Gender Survey
......@@ -66,6 +66,13 @@ npx babel-node --extensions ".ts" src/scripts/retrieve_nesstar_ddis.ts --url htt
npx babel-node --extensions ".ts" src/scripts/retrieve_nesstar_ddis.ts --url http://nsddata.nsd.uib.no ../public_data/nsddata-ddi/
```
### Fetching Dataverse Servers
```bash
# data.sciencespo
npx babel-node --extensions ".ts" -- src/scripts/retrieve_dataverse_ddis.ts --url https://data.sciencespo.fr/ --verbose ../public_data/sciences-po/
```
### Repairing DDI files
```bash
......
import {
Audit,
auditBoolean,
auditHttpUrl,
auditRequire,
auditSetNullish,
auditTrimString,
cleanAudit,
} from "@auditors/core"
import assert from "assert"
import commandLineArgs from "command-line-args"
import fs from "fs-extra"
import fetch from "node-fetch"
import path from "path"
import stream from "stream"
import util from "util"
import { stringifyQuery } from "../urls"
// Command line options of this script, handed to `commandLineArgs` below.
// Each entry carries the long option name, its one-letter alias, the
// constructor used to convert the raw value and a short help text.
const optionsDefinitions = [
  { name: "silent", alias: "s", type: Boolean, help: "don't log anything" },
  {
    name: "url",
    alias: "u",
    type: String,
    help: "base URL of Dataverse server",
  },
  { name: "verbose", alias: "v", type: Boolean, help: "verbose logs" },
  {
    // Flagged as the default option; presumably this is what lets the target
    // directory be given without an option name on the command line.
    name: "xmlDir",
    defaultOption: true,
    type: String,
    help: "directory to store XML DDI Codebook files",
  },
]
// Parse the raw command line, then validate it. `auditCommandLineOptions` is
// a function declaration, so it can be called here although it is written
// further down in the file.
const rawOptions = commandLineArgs(optionsDefinitions)
const [options, optionsError] = auditCommandLineOptions(cleanAudit, rawOptions)

// Any validation error is fatal: report what was received and what is wrong
// with it, then stop with a non-zero exit code.
if (optionsError !== null) {
  const prettyPrint = (value: unknown): string =>
    JSON.stringify(value, null, 2)
  console.error("Errors in command line arguments:")
  console.error("Arguments:", prettyPrint(options))
  console.error("Errors:", prettyPrint(optionsError))
  process.exit(1)
}

// Promise-returning flavour of `stream.pipeline`, usable with `await`.
const pipeline = util.promisify(stream.pipeline)
/**
 * Checks the options parsed from the command line.
 *
 * Nullish input is returned untouched with no error, and a non-object input
 * is reported through `audit.unexpectedType`. Otherwise a shallow copy of the
 * input is audited attribute by attribute, so the caller's object is never
 * modified.
 *
 * @param audit Audit instance providing `attribute`, `unexpectedType` and
 *   `reduceRemaining`.
 * @param data Raw options, as produced by the command line parser.
 * @returns A `[value, error]` pair; the script treats a `null` error as
 *   "options are valid".
 */
export function auditCommandLineOptions(audit: Audit, data: any): [any, any] {
  if (data == null) {
    return [data, null]
  }
  if (typeof data !== "object") {
    return audit.unexpectedType(data, "object")
  }

  const cleaned = { ...data }
  const errors: { [key: string]: any } = {}
  const unvisitedKeys = new Set(Object.keys(cleaned))

  // Auditor chain of each known option, listed in the order they are checked.
  // `url` and `xmlDir` go through `auditRequire`; both flags go through
  // `auditSetNullish(false)` (presumably: default to `false` when absent).
  const auditorsByOption = {
    silent: [auditBoolean, auditSetNullish(false)],
    url: [auditHttpUrl, auditRequire],
    verbose: [auditBoolean, auditSetNullish(false)],
    xmlDir: [auditTrimString, auditRequire],
  }
  for (const [optionName, auditors] of Object.entries(auditorsByOption)) {
    audit.attribute(
      cleaned,
      optionName,
      true,
      errors,
      unvisitedKeys,
      ...auditors,
    )
  }

  // Keys that no auditor consumed are handed to the audit together with the
  // collected errors to build the final result.
  return audit.reduceRemaining(cleaned, errors, unvisitedKeys)
}
// async function fetchDataset(global_id: string) {
// const query = stringifyQuery({
// persistentId: global_id,
// })
// const url = new URL(
// `api/datasets/:persistentId/?${query}`,
// options.url,
// ).toString()
// console.log(`Retrieving dataset "${global_id}" at ${url}…`)
// const response = await fetch(url)
// const result = await response.json()
// console.log(JSON.stringify(result, null, 2))
// }
/**
 * Downloads the DDI export of a dataset and stores it below `options.xmlDir`.
 *
 * The file location mirrors the persistent identifier: each "/"-separated
 * part of the identifier (without its `doi:` prefix) becomes a path segment
 * and ".xml" is appended, e.g. `doi:10.1234/ABC` → `<xmlDir>/10.1234/ABC.xml`.
 *
 * @param global_id Persistent identifier of the dataset; must start with
 *   `doi:`.
 * @throws AssertionError when the identifier is not a DOI or contains a `.`
 *   or `..` segment.
 * @throws Error when the server answers with a non-successful HTTP status.
 */
async function fetchDatasetMetadataDdi(global_id: string): Promise<void> {
  // Validate the identifier before any network access, so that an unusable
  // identifier fails fast instead of after a wasted download.
  const doiPrefix = "doi:"
  assert(global_id.startsWith(doiPrefix))
  const pathSegments = global_id.slice(doiPrefix.length).split("/")
  // The identifier comes from the remote server: refuse segments that would
  // let the joined path climb out of the target directory.
  assert(
    pathSegments.every((segment) => segment !== "." && segment !== ".."),
    `Unsafe path segment in dataset identifier "${global_id}"`,
  )

  const query = stringifyQuery({
    exporter: "ddi",
    persistentId: global_id,
  })
  const url = new URL(`api/datasets/export?${query}`, options.url).toString()
  console.log(`Retrieving DDI metadata of dataset "${global_id}" at ${url}…`)
  const response = await fetch(url)
  if (!response.ok) {
    console.error(response.status, response.statusText)
    console.error(await response.text())
    throw new Error(`Fetch failed at ${url}`)
  }

  const ddiFilePath = path.join(options.xmlDir, ...pathSegments) + ".xml"
  await fs.ensureDir(path.dirname(ddiFilePath))
  // Stream the response body straight into the file.
  await pipeline(response.body, fs.createWriteStream(ddiFilePath))
}
/**
 * Iterates over the dataset summaries returned by the search API of the
 * server at `options.url`, following the pagination until every item has
 * been yielded.
 *
 * @param identifier Identifier of the dataverse passed as `subtree` to the
 *   search; presumably the search is not narrowed when it is omitted.
 * @yields The items of the search results, one by one.
 * @throws Error when the server answers with a non-successful HTTP status.
 * @throws AssertionError when a response doesn't have the expected status,
 *   offset or item count.
 */
async function* iterDatasets(identifier?: string) {
  console.log(`Iterating datasets of dataverse "${identifier}"…`)
  let start = 0
  for (;;) {
    const query = stringifyQuery({
      q: "*",
      start,
      subtree: identifier,
      type: "dataset",
    })
    const url = new URL(`api/search?${query}`, options.url).toString()
    if (options.verbose) {
      console.log(`  Retrieving (partial) list of datasets at ${url}…`)
    }
    const response = await fetch(url)
    // Report HTTP failures explicitly instead of letting the JSON parsing or
    // the assertions below fail with a less helpful message.
    if (!response.ok) {
      console.error(response.status, response.statusText)
      console.error(await response.text())
      throw new Error(`Fetch failed at ${url}`)
    }
    const result = await response.json()
    assert.strictEqual(result.status, "OK")
    const { data } = result
    assert.strictEqual(data.start, start)
    assert.strictEqual(data.items.length, data.count_in_response)
    yield* data.items
    start += data.items.length
    // An empty page can't make `start` progress: stop there too, otherwise
    // the same URL would be requested forever when `total_count` is never
    // reached.
    if (data.items.length === 0 || start >= data.total_count) {
      break
    }
  }
}
/**
 * Iterates over the dataverse summaries returned by the search API of the
 * server at `options.url`, following the pagination until every item has
 * been yielded.
 *
 * @yields The items of the search results, one by one.
 * @throws Error when the server answers with a non-successful HTTP status.
 * @throws AssertionError when a response doesn't have the expected status,
 *   offset or item count.
 */
async function* iterDataverses() {
  console.log("Iterating dataverses…")
  let start = 0
  for (;;) {
    const query = stringifyQuery({
      q: "*",
      start,
      type: "dataverse",
    })
    const url = new URL(`api/search?${query}`, options.url).toString()
    if (options.verbose) {
      console.log(`  Retrieving (partial) list of dataverses at ${url}…`)
    }
    const response = await fetch(url)
    // Report HTTP failures explicitly instead of letting the JSON parsing or
    // the assertions below fail with a less helpful message.
    if (!response.ok) {
      console.error(response.status, response.statusText)
      console.error(await response.text())
      throw new Error(`Fetch failed at ${url}`)
    }
    const result = await response.json()
    assert.strictEqual(result.status, "OK")
    const { data } = result
    assert.strictEqual(data.start, start)
    assert.strictEqual(data.items.length, data.count_in_response)
    yield* data.items
    start += data.items.length
    // An empty page can't make `start` progress: stop there too, otherwise
    // the same URL would be requested forever when `total_count` is never
    // reached.
    if (data.items.length === 0 || start >= data.total_count) {
      break
    }
  }
}
/**
 * Entry point: downloads the DDI metadata of every dataset of every
 * dataverse found on the server into `options.xmlDir`.
 *
 * Datasets are fetched sequentially, one request at a time.
 *
 * NOTE(review): if the `subtree` search parameter also matches datasets of
 * nested dataverses, a dataset may be downloaded once per enclosing
 * dataverse — confirm against the Dataverse Search API.
 */
async function main(): Promise<void> {
  // Wait for the directory to exist, so that a creation failure is reported
  // through the returned promise instead of as an unhandled rejection.
  await fs.ensureDir(options.xmlDir)
  for await (const dataverseSummary of iterDataverses()) {
    // console.log(JSON.stringify(dataverseSummary, null, 2))
    for await (const datasetSummary of iterDatasets(
      dataverseSummary.identifier,
    )) {
      // console.log(JSON.stringify(datasetSummary, null, 2))
      // await fetchDataset(datasetSummary.global_id)
      await fetchDatasetMetadataDdi(datasetSummary.global_id)
    }
  }
}
// Run the script and turn its outcome into the process exit code:
// 0 when everything was retrieved, 1 (after logging the error) otherwise.
const exitWithSuccess = (): never => process.exit(0)
const exitWithFailure = (error: unknown): never => {
  console.error(error)
  process.exit(1)
}

main().then(exitWithSuccess).catch(exitWithFailure)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment