From 79fbe96ce5af2a8666d4fab0b201ad19cdd6b7b8 Mon Sep 17 00:00:00 2001 From: Roman Kalyakin Date: Thu, 24 Oct 2024 11:59:35 +0200 Subject: [PATCH] [chore] NER schema update (#452) * reconciled schema with the downstream service and updated the code and the schema * more tweaks to align the schema * using undici as the NER service http client --- src/schema/schemas/ImpressoNerEntity.json | 36 ++++- .../impresso-ner/impresso-ner.class.ts | 129 ++++++++++++++---- 2 files changed, 136 insertions(+), 29 deletions(-) diff --git a/src/schema/schemas/ImpressoNerEntity.json b/src/schema/schemas/ImpressoNerEntity.json index 8dbe33a0..b0cae1e7 100644 --- a/src/schema/schemas/ImpressoNerEntity.json +++ b/src/schema/schemas/ImpressoNerEntity.json @@ -35,6 +35,30 @@ "org.adm", "org.ent", "org.ent.pressagency", + "org.ent.pressagency.AFP", + "org.ent.pressagency.ANSA", + "org.ent.pressagency.AP", + "org.ent.pressagency.APA", + "org.ent.pressagency.ATS-SDA", + "org.ent.pressagency.Belga", + "org.ent.pressagency.CTK", + "org.ent.pressagency.DDP-DAPD", + "org.ent.pressagency.DNB", + "org.ent.pressagency.DPA", + "org.ent.pressagency.Domei", + "org.ent.pressagency.Europapress", + "org.ent.pressagency.Extel", + "org.ent.pressagency.Havas", + "org.ent.pressagency.Kipa", + "org.ent.pressagency.Reuters", + "org.ent.pressagency.SPK-SMP", + "org.ent.pressagency.Stefani", + "org.ent.pressagency.TASS", + "org.ent.pressagency.UP-UPI", + "org.ent.pressagency.Wolff", + "org.ent.pressagency.Xinhua", + "org.ent.pressagency.ag", + "org.ent.pressagency.unk", "pers", "pers.coll", "pers.ind", @@ -44,7 +68,8 @@ "prod.media", "time", "time.date.abs", - "time.hour.abs" + "time.hour.abs", + "unk" ] }, "surfaceForm": { @@ -82,8 +107,7 @@ "type": "number", "description": "Confidence score for the named entity linking" } - }, - "required": ["ner"] + } }, "wikidata": { "type": "object", @@ -96,6 +120,10 @@ "wikipediaPageName": { "type": "string", "description": "Wikipedia page name of the entity" + }, + "wikipediaPageUrl": { + "type": "string", + "description": "Wikipedia page URL of the entity" } }, "required": ["id"] @@ -109,5 +137,5 @@ "description": "Name of the entity" } }, - "required": ["id", "type", "surfaceForm", "offset", "confidence"] + "required": ["id", "type", "confidence"] } diff --git a/src/services/impresso-ner/impresso-ner.class.ts b/src/services/impresso-ner/impresso-ner.class.ts index 262d07aa..b23997a7 100644 --- a/src/services/impresso-ner/impresso-ner.class.ts +++ b/src/services/impresso-ner/impresso-ner.class.ts @@ -1,5 +1,6 @@ import type { Params } from '@feathersjs/feathers' -import axios, { AxiosResponse } from 'axios' +import { Agent, request } from 'undici' +import { logger } from '../../logger' export interface RequestPayload { text: string @@ -13,6 +14,7 @@ interface DownstreamRequestBody { // See // https://github.com/impresso/impresso-annotation/blob/740a31e2c925e4a4d59be97710e390871754674d/frontend/impresso_annotation/templates/landing_page.html#L157 // https://github.com/impresso/newsagency-classification/blob/7031c3992edf0d4354d9a29dea769fe7320f455f/lib/bert_classification/HIPE-scorer/tagset.txt +// https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json type NerType = | 'comp.demonym' | 'comp.function' @@ -36,6 +38,30 @@ type NerType = | 'org.adm' | 'org.ent' | 'org.ent.pressagency' + | 'org.ent.pressagency.AFP' + | 'org.ent.pressagency.ANSA' + | 'org.ent.pressagency.AP' + | 'org.ent.pressagency.APA' + | 'org.ent.pressagency.ATS-SDA' + | 'org.ent.pressagency.Belga' + | 'org.ent.pressagency.CTK' + | 'org.ent.pressagency.DDP-DAPD' + | 'org.ent.pressagency.DNB' + | 'org.ent.pressagency.DPA' + | 'org.ent.pressagency.Domei' + | 'org.ent.pressagency.Europapress' + | 'org.ent.pressagency.Extel' + | 'org.ent.pressagency.Havas' + | 'org.ent.pressagency.Kipa' + | 'org.ent.pressagency.Reuters' + | 'org.ent.pressagency.SPK-SMP' + | 'org.ent.pressagency.Stefani' + | 'org.ent.pressagency.TASS' + | 'org.ent.pressagency.UP-UPI' + | 'org.ent.pressagency.Wolff' + | 'org.ent.pressagency.Xinhua' + | 'org.ent.pressagency.ag' + | 'org.ent.pressagency.unk' | 'pers' | 'pers.coll' | 'pers.ind' @@ -46,27 +72,47 @@ type NerType = | 'time' | 'time.date.abs' | 'time.hour.abs' + | 'unk' + | 'UNK' +/** + * See https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json + */ interface DownstreamNes { - confidence_nel?: number // named entity linking confidence score - confidence_ner: number // named entity recognition confidence score - id: string - lOffset: number // left offset + // fields not in the schema + index?: number // index + id: string | string[] nested: boolean // is nested - rOffset: number // right offset - surface: string // surface form (text) + + // fields from the schema + + // required: + lOffset: number | null // left offset + rOffset: number | null // right offset + surface: string | null // surface form (text) type: NerType + // optional: + confidence_nel?: number // named entity linking confidence score + confidence_ner?: number // named entity recognition confidence score + wkd_id?: string // Wikidata ID wkpedia_pagename?: string // Wikipedia page name + wkpedia_url?: string // Wikipedia URL function?: string // function name?: string // entity name + + title?: string } +/** + * Loosely based on https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json + * Some extra fields come from https://github.com/impresso/impresso-annotation/blob/main/backend/model_handler.py + */ interface DownstreamResponse { sys_id: string // model id - text: string // input text + text?: string // input text ts: string // ISO timestamp nes: DownstreamNes[] } @@ -74,13 +120,14 @@ interface DownstreamResponse { export interface ImpressoNerEntity { id: string type: NerType - surfaceForm: string - offset: { start: number; end: number } + surfaceForm?: string + offset?: { start: number; end: number } isTypeNested: boolean - confidence: { ner: number; nel?: number } + confidence: { ner?: number; nel?: number } wikidata?: { id: string wikipediaPageName?: string + wikipediaPageUrl?: string } function?: string name?: string @@ -111,37 +158,69 @@ export class ImpressoNerService { const url = `${this.baseUrl}/${MethodToUrl[method]}/` - const response = await axios.post, DownstreamRequestBody>( - url, - { data: text } - ) - if (response.status !== 200) { - console.error(`Failed to fetch downstream data. Error (${response.status}): `, response.data) + const response = await request(url, { + method: 'POST', + body: JSON.stringify({ data: text }), + headers: { 'Content-Type': 'application/json' }, + dispatcher: new Agent({ + connectTimeout: 1 * 60 * 1000, // 1 minute + headersTimeout: 5 * 60 * 1000, // 5 minutes + bodyTimeout: 5 * 60 * 1000, // 5 minutes + }), + }) + + if (response.statusCode !== 200) { + let bodyText = '' + try { + bodyText = await response.body.text() + } catch { + /* ignore */ + } + + logger.error(`Failed to fetch downstream data. Error (${response.statusCode}): `, bodyText) throw new Error('Failed to fetch downstream data') } - return convertDownstreamResponse(response.data) + + try { + const responseBody = await response.body.json() + return convertDownstreamResponse(responseBody as DownstreamResponse, data) + } catch (error) { + logger.error('Failed to parse downstream response', error) + throw new Error('Failed to parse downstream response') + } } } -const convertDownstreamResponse = (response: DownstreamResponse): ImpressoNerResponse => ({ +const convertDownstreamResponse = (response: DownstreamResponse, request: RequestPayload): ImpressoNerResponse => ({ modelId: response.sys_id, - text: response.text, + text: response.text != null ? response.text : request.text, timestamp: response.ts, entities: response.nes.map(convertDownstreamEntity), }) const convertDownstreamEntity = (entity: DownstreamNes): ImpressoNerEntity => ({ - id: entity.id, - type: entity.type, - surfaceForm: entity.surface, - offset: { start: entity.lOffset, end: entity.rOffset }, + id: typeof entity.id === 'string' ? entity.id : entity.id.join(','), + type: sanitizeType(entity.type), + ...(entity.surface != null ? { surfaceForm: entity.surface } : {}), + ...(entity.lOffset != null && entity.rOffset != null + ? { offset: { start: entity.lOffset, end: entity.rOffset } } + : {}), isTypeNested: entity.nested, confidence: { ner: entity.confidence_ner, nel: entity.confidence_nel }, ...(entity.wkd_id != null && entity.wkd_id != 'NIL' ? { - wikidata: { id: entity.wkd_id, wikipediaPageName: entity.wkpedia_pagename }, + wikidata: { + id: entity.wkd_id, + wikipediaPageName: entity.wkpedia_pagename, + wikipediaPageUrl: [null, undefined, 'N/A'].includes(entity.wkpedia_url) ? undefined : entity.wkpedia_url, + }, } : {}), ...(entity.function != null ? { function: entity.function } : {}), ...(entity.name != null ? { name: entity.name } : {}), }) + +const sanitizeType = (type: NerType): NerType => { + if (type === 'UNK') return 'unk' + return type +}