Skip to content

Commit

Permalink
Merge pull request #453 from impresso/develop
Browse files Browse the repository at this point in the history
Release v3.0.6
  • Loading branch information
theorm authored Oct 24, 2024
2 parents 02ee6fe + 79fbe96 commit 9c50712
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 29 deletions.
36 changes: 32 additions & 4 deletions src/schema/schemas/ImpressoNerEntity.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,30 @@
"org.adm",
"org.ent",
"org.ent.pressagency",
"org.ent.pressagency.AFP",
"org.ent.pressagency.ANSA",
"org.ent.pressagency.AP",
"org.ent.pressagency.APA",
"org.ent.pressagency.ATS-SDA",
"org.ent.pressagency.Belga",
"org.ent.pressagency.CTK",
"org.ent.pressagency.DDP-DAPD",
"org.ent.pressagency.DNB",
"org.ent.pressagency.DPA",
"org.ent.pressagency.Domei",
"org.ent.pressagency.Europapress",
"org.ent.pressagency.Extel",
"org.ent.pressagency.Havas",
"org.ent.pressagency.Kipa",
"org.ent.pressagency.Reuters",
"org.ent.pressagency.SPK-SMP",
"org.ent.pressagency.Stefani",
"org.ent.pressagency.TASS",
"org.ent.pressagency.UP-UPI",
"org.ent.pressagency.Wolff",
"org.ent.pressagency.Xinhua",
"org.ent.pressagency.ag",
"org.ent.pressagency.unk",
"pers",
"pers.coll",
"pers.ind",
Expand All @@ -44,7 +68,8 @@
"prod.media",
"time",
"time.date.abs",
"time.hour.abs"
"time.hour.abs",
"unk"
]
},
"surfaceForm": {
Expand Down Expand Up @@ -82,8 +107,7 @@
"type": "number",
"description": "Confidence score for the named entity linking"
}
},
"required": ["ner"]
}
},
"wikidata": {
"type": "object",
Expand All @@ -96,6 +120,10 @@
"wikipediaPageName": {
"type": "string",
"description": "Wikipedia page name of the entity"
},
"wikipediaPageUrl": {
"type": "string",
"description": "Wikipedia page URL of the entity"
}
},
"required": ["id"]
Expand All @@ -109,5 +137,5 @@
"description": "Name of the entity"
}
},
"required": ["id", "type", "surfaceForm", "offset", "confidence"]
"required": ["id", "type", "confidence"]
}
129 changes: 104 additions & 25 deletions src/services/impresso-ner/impresso-ner.class.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type { Params } from '@feathersjs/feathers'
import axios, { AxiosResponse } from 'axios'
import { Agent, request } from 'undici'
import { logger } from '../../logger'

export interface RequestPayload {
text: string
Expand All @@ -13,6 +14,7 @@ interface DownstreamRequestBody {
// See
// https://github.com/impresso/impresso-annotation/blob/740a31e2c925e4a4d59be97710e390871754674d/frontend/impresso_annotation/templates/landing_page.html#L157
// https://github.com/impresso/newsagency-classification/blob/7031c3992edf0d4354d9a29dea769fe7320f455f/lib/bert_classification/HIPE-scorer/tagset.txt
// https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json
type NerType =
| 'comp.demonym'
| 'comp.function'
Expand All @@ -36,6 +38,30 @@ type NerType =
| 'org.adm'
| 'org.ent'
| 'org.ent.pressagency'
| 'org.ent.pressagency.AFP'
| 'org.ent.pressagency.ANSA'
| 'org.ent.pressagency.AP'
| 'org.ent.pressagency.APA'
| 'org.ent.pressagency.ATS-SDA'
| 'org.ent.pressagency.Belga'
| 'org.ent.pressagency.CTK'
| 'org.ent.pressagency.DDP-DAPD'
| 'org.ent.pressagency.DNB'
| 'org.ent.pressagency.DPA'
| 'org.ent.pressagency.Domei'
| 'org.ent.pressagency.Europapress'
| 'org.ent.pressagency.Extel'
| 'org.ent.pressagency.Havas'
| 'org.ent.pressagency.Kipa'
| 'org.ent.pressagency.Reuters'
| 'org.ent.pressagency.SPK-SMP'
| 'org.ent.pressagency.Stefani'
| 'org.ent.pressagency.TASS'
| 'org.ent.pressagency.UP-UPI'
| 'org.ent.pressagency.Wolff'
| 'org.ent.pressagency.Xinhua'
| 'org.ent.pressagency.ag'
| 'org.ent.pressagency.unk'
| 'pers'
| 'pers.coll'
| 'pers.ind'
Expand All @@ -46,41 +72,62 @@ type NerType =
| 'time'
| 'time.date.abs'
| 'time.hour.abs'
| 'unk'
| 'UNK'

/**
* See https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json
*/
interface DownstreamNes {
confidence_nel?: number // named entity linking confidence score
confidence_ner: number // named entity recognition confidence score
id: string
lOffset: number // left offset
// fields not in the schema
index?: number // index
id: string | string[]
nested: boolean // is nested
rOffset: number // right offset
surface: string // surface form (text)

// fields from the schema

// required:
lOffset: number | null // left offset
rOffset: number | null // right offset
surface: string | null // surface form (text)
type: NerType

// optional:
confidence_nel?: number // named entity linking confidence score
confidence_ner?: number // named entity recognition confidence score

wkd_id?: string // Wikidata ID
wkpedia_pagename?: string // Wikipedia page name
wkpedia_url?: string // Wikipedia URL

function?: string // function
name?: string // entity name

title?: string
}

/**
* Loosely based on https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json
* Some extra fields come from https://github.com/impresso/impresso-annotation/blob/main/backend/model_handler.py
*/
interface DownstreamResponse {
sys_id: string // model id
text: string // input text
text?: string // input text
ts: string // ISO timestamp
nes: DownstreamNes[]
}

export interface ImpressoNerEntity {
id: string
type: NerType
surfaceForm: string
offset: { start: number; end: number }
surfaceForm?: string
offset?: { start: number; end: number }
isTypeNested: boolean
confidence: { ner: number; nel?: number }
confidence: { ner?: number; nel?: number }
wikidata?: {
id: string
wikipediaPageName?: string
wikipediaPageUrl?: string
}
function?: string
name?: string
Expand Down Expand Up @@ -111,37 +158,69 @@ export class ImpressoNerService {

const url = `${this.baseUrl}/${MethodToUrl[method]}/`

const response = await axios.post<DownstreamResponse, AxiosResponse<DownstreamResponse>, DownstreamRequestBody>(
url,
{ data: text }
)
if (response.status !== 200) {
console.error(`Failed to fetch downstream data. Error (${response.status}): `, response.data)
const response = await request(url, {
method: 'POST',
body: JSON.stringify({ data: text }),
headers: { 'Content-Type': 'application/json' },
dispatcher: new Agent({
connectTimeout: 1 * 60 * 1000, // 1 minute
headersTimeout: 5 * 60 * 1000, // 5 minutes
bodyTimeout: 5 * 60 * 1000, // 5 minutes
}),
})

if (response.statusCode !== 200) {
let bodyText = ''
try {
bodyText = await response.body.text()
} catch {
/* ignore */
}

logger.error(`Failed to fetch downstream data. Error (${response.statusCode}): `, bodyText)
throw new Error('Failed to fetch downstream data')
}
return convertDownstreamResponse(response.data)

try {
const responseBody = await response.body.json()
return convertDownstreamResponse(responseBody as DownstreamResponse, data)
} catch (error) {
logger.error('Failed to parse downstream response', error)
throw new Error('Failed to parse downstream response')
}
}
}

const convertDownstreamResponse = (response: DownstreamResponse): ImpressoNerResponse => ({
const convertDownstreamResponse = (response: DownstreamResponse, request: RequestPayload): ImpressoNerResponse => ({
modelId: response.sys_id,
text: response.text,
text: response.text != null ? response.text : request.text,
timestamp: response.ts,
entities: response.nes.map(convertDownstreamEntity),
})

const convertDownstreamEntity = (entity: DownstreamNes): ImpressoNerEntity => ({
id: entity.id,
type: entity.type,
surfaceForm: entity.surface,
offset: { start: entity.lOffset, end: entity.rOffset },
id: typeof entity.id === 'string' ? entity.id : entity.id.join(','),
type: sanitizeType(entity.type),
...(entity.surface != null ? { surfaceForm: entity.surface } : {}),
...(entity.lOffset != null && entity.rOffset != null
? { offset: { start: entity.lOffset, end: entity.rOffset } }
: {}),
isTypeNested: entity.nested,
confidence: { ner: entity.confidence_ner, nel: entity.confidence_nel },
...(entity.wkd_id != null && entity.wkd_id != 'NIL'
? {
wikidata: { id: entity.wkd_id, wikipediaPageName: entity.wkpedia_pagename },
wikidata: {
id: entity.wkd_id,
wikipediaPageName: entity.wkpedia_pagename,
wikipediaPageUrl: [null, undefined, 'N/A'].includes(entity.wkpedia_url) ? undefined : entity.wkpedia_url,
},
}
: {}),
...(entity.function != null ? { function: entity.function } : {}),
...(entity.name != null ? { name: entity.name } : {}),
})

const sanitizeType = (type: NerType): NerType => {
if (type === 'UNK') return 'unk'
return type
}

0 comments on commit 9c50712

Please sign in to comment.