From 5e409bedcfcbada1ccdecdbd4e678420e0ff08cb Mon Sep 17 00:00:00 2001
From: doogyb
Date: Mon, 20 Jun 2022 15:05:51 +0100
Subject: [PATCH] AI map entities classes

This PR introduces logic that labels entities with their DBpedia
ontological classes. Using this logic, it creates a file which tags every
unique entity found in the AI map dataset with its respective classes.

closes #120
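
Example invocation of the new script (the index name and output path below
are placeholders, not values shipped with this PR):

    node src/bin/dbpedia/getClasses.mjs \
        --index <some_es_index> \
        --out data/dbpedia/entity_classes.json

Exactly one of --index (read titles from Elasticsearch) or --path (read
titles from a JSON file) must be provided; the script exits with an error
otherwise.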
---
 src/bin/dbpedia/getClasses.mjs        | 44 ++++++++++++++++++++++++++++
 src/node_modules/dbpedia/ontology.mjs | 23 +++++++++++++++
 src/node_modules/dbpedia/requests.mjs | 46 ++++++++++++++++++++++++++---
 src/node_modules/es/entities.mjs      | 35 ++++++++++++++++++++
 src/node_modules/util/array.mjs       |  8 --------
 5 files changed, 146 insertions(+), 10 deletions(-)
 create mode 100644 src/bin/dbpedia/getClasses.mjs
 create mode 100644 src/node_modules/dbpedia/ontology.mjs
 create mode 100644 src/node_modules/es/entities.mjs

diff --git a/src/bin/dbpedia/getClasses.mjs b/src/bin/dbpedia/getClasses.mjs
new file mode 100644
index 00000000..8a641567
--- /dev/null
+++ b/src/bin/dbpedia/getClasses.mjs
@@ -0,0 +1,44 @@
+import { promises as fs } from 'fs';
+
+import { Command, InvalidArgumentError } from 'commander';
+
+import { arxliveCopy } from 'conf/config.mjs';
+import { getClasses } from 'dbpedia/requests.mjs';
+import { getEntities } from 'es/entities.mjs';
+
+const program = new Command();
+program.option(
+	'-d, --domain <domain>',
+	'ES domain on which to query',
+	arxliveCopy
+);
+program.option('-i, --index <index>', 'ES index from which to get titles');
+program.option('-f, --path <path>', 'Path to file containing DBpedia titles');
+program.requiredOption('-o, --out <path>', 'Path of output file');
+
+program.parse();
+const options = program.opts();
+
+const main = async () => {
+	if (!options.index && !options.path) {
+		throw new InvalidArgumentError(`
+			You must specify either the index containing the DBpedia entities
+			or a path to the file containing the entities' titles`
+		);
+	}
+	if (options.index && options.path) {
+		throw new InvalidArgumentError(`
+			Ambiguous input. Do you want to use the index or the file as input?`
+		);
+	}
+
+	// the previous checks guarantee that exactly one input source is set
+	const titles = options.index
+		? await getEntities(options.index, options.domain)
+		: JSON.parse(await fs.readFile(options.path));
+
+	const classes = await getClasses(titles, { depth: 10, squash: true, fullURI: true });
+	await fs.writeFile(options.out, JSON.stringify(classes, null, 2));
+};
+
+main();
diff --git a/src/node_modules/dbpedia/ontology.mjs b/src/node_modules/dbpedia/ontology.mjs
new file mode 100644
index 00000000..4f813849
--- /dev/null
+++ b/src/node_modules/dbpedia/ontology.mjs
@@ -0,0 +1,23 @@
+import { promises as fs } from 'fs';
+
+import * as _ from 'lamb';
+
+import { dbo } from 'dbpedia/util.mjs';
+
+const FILE_ONTOLOGY_JSON = 'data/dbpedia/ontology.json';
+
+// Loads the DBpedia class hierarchy from the local JSON snapshot, keeping
+// only classes whose depth in the hierarchy is <= `depth`.
+export const loadOntology = async (depth, { squash=false, fullURI=true }={}) => {
+	const data = await fs.readFile(FILE_ONTOLOGY_JSON, { encoding: 'utf-8' });
+	const changedURIs = fullURI
+		? JSON.parse(data)
+		: JSON.parse(data.replaceAll(dbo, ''));
+
+	const selectAtDepth = _.pickIf(value => _.getIn(value, 'depth') <= depth);
+	const ontology = squash
+		? _.values(_.mapValues(selectAtDepth(changedURIs), _.getKey('class_')))
+		: selectAtDepth(changedURIs);
+
+	return ontology;
+};
diff --git a/src/node_modules/dbpedia/requests.mjs b/src/node_modules/dbpedia/requests.mjs
index 8a7b0824..26810b8e 100644
--- a/src/node_modules/dbpedia/requests.mjs
+++ b/src/node_modules/dbpedia/requests.mjs
@@ -1,7 +1,8 @@
 import * as _ from 'lamb';
 
-import { getValue, isIterableLongerThan1 } from '@svizzle/utils';
-import { dbr, prefixes } from 'dbpedia/util.mjs';
+import { getValue, isIterableLongerThan1, stringify } from '@svizzle/utils';
+import { dbr, dbo, prefixes } from 'dbpedia/util.mjs';
+import { loadOntology } from 'dbpedia/ontology.mjs';
 import { query } from 'sparql/query.mjs';
 
 const sanitizeInput = input => {
@@ -84,3 +85,44 @@
 	const disambiguations = _.mapValues(groups, isIterableLongerThan1);
 	return disambiguations;
 };
+
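+// Maps each input title to the DBpedia ontology classes it belongs to,
+// keeping only classes present in the local ontology snapshot up to `depth`.
+// The shape below is illustrative (assuming default options); the exact
+// classes returned depend on the ontology snapshot and the endpoint:
+//   await getClasses(['Earth'])
+//   // => { 'http://dbpedia.org/resource/Earth': ['http://dbpedia.org/ontology/Planet', ...] }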
+export const getClasses = async (
+	input,
+	{ depth=Infinity,
+		squash=true,
+		fullURI=true } = {}
+) => {
+
+	const sanitizedURIs = sanitizeInput(input);
+	const queries = _.map(sanitizedURIs, URI =>
+		`{
+			BIND (${URI} as ?title)
+			OPTIONAL { ${URI} rdf:type ?type . }
+		}`,
+	);
+	const sparql = buildQuery(queries);
+	const results = await makeRequest(sparql);
+	const groups = _.group(results, _.getKey('title'));
+	const types = _.mapValues(groups, group => _.map(group, _.getKey('type')));
+	const classFilter = await loadOntology(depth);
+	const filteredTypes = _.mapValues(
+		types,
+		typeList => {
+			const filtered = _.filter(typeList, t => t in classFilter);
+			const squashed = squash
+				? filtered
+				: _.map(filtered, key => _.getIn(classFilter, key));
+			const URIs = fullURI
+				? squashed
+				: JSON.parse(stringify(squashed).replaceAll(dbo, ''));
+			return URIs;
+		}
+	);
+	return filteredTypes;
+};
diff --git a/src/node_modules/es/entities.mjs b/src/node_modules/es/entities.mjs
new file mode 100644
index 00000000..07649d77
--- /dev/null
+++ b/src/node_modules/es/entities.mjs
@@ -0,0 +1,35 @@
+import * as _ from 'lamb';
+
+import { arxliveCopy } from 'conf/config.mjs';
+import { dbr } from 'dbpedia/util.mjs';
+import { scroll, clearScroll } from 'es/search.mjs';
+
+// Titles are Wikipedia page names with whitespace replaced by underscores,
+// e.g. World War 1 => World_War_1.
+// We use this terminology to stay consistent with Wikimedia's API, where
+// this parameter is also named title:
+// https://api.wikimedia.org/wiki/API_reference/Core/Pages/Get_page
+export const getEntities = async (
+	index,
+	domain=arxliveCopy,
+	{ asTitle=true } = {}
+) => {
+
+	const scroller = scroll(domain, index, { size: 10000 });
+	const uriCounts = {};
+	for await (let page of scroller) {
+		_.forEach(page.hits.hits, doc => {
+			_.forEach(doc._source.dbpedia_entities, ({ URI }) => {
+				const key = asTitle
+					? URI.replace(dbr, '')
+					: URI;
+				uriCounts[key] = uriCounts[key] ? uriCounts[key] + 1 : 1;
+			});
+		});
+	}
+
+	clearScroll(domain);
+
+	const entities = _.keys(uriCounts);
+	return entities;
+};
diff --git a/src/node_modules/util/array.mjs b/src/node_modules/util/array.mjs
index 6f65d8e6..7ae73df8 100644
--- a/src/node_modules/util/array.mjs
+++ b/src/node_modules/util/array.mjs
@@ -11,11 +11,3 @@
 };
 
 export const batch = _.pipe([_batch, _.filterWith(isNotNil)]);
-<<<<<<< HEAD
-<<<<<<< HEAD
-=======
-export const range = n => Array.from(Array(n).keys());
-
->>>>>>> 71c302d (Get ontological classes from Sparql Endpoint!)
-=======
->>>>>>> d7385f9 (Implement requested changes)