-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Base implementation of the word embedding search service * use word match * updated code after testing * updated old class to convert words to new format
- Loading branch information
Showing
16 changed files
with
482 additions
and
79 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
{ | ||
"$schema": "http://json-schema.org/draft-07/schema#", | ||
"title": "WordMatch", | ||
"description": "Represents a word match result from word embeddings similarity search", | ||
"type": "object", | ||
"properties": { | ||
"id": { | ||
"type": "string", | ||
"description": "Unique identifier for the word" | ||
}, | ||
"languageCode": { | ||
"type": "string", | ||
"description": "The language code of the word" | ||
}, | ||
"word": { | ||
"type": "string", | ||
"description": "The word" | ||
} | ||
}, | ||
"required": ["id", "languageCode", "word"], | ||
"additionalProperties": false | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
{ | ||
"$schema": "http://json-schema.org/draft-07/schema#", | ||
"title": "WordMatch", | ||
"description": "Represents a word match result from word embeddings similarity search", | ||
"type": "object", | ||
"properties": { | ||
"id": { | ||
"type": "string", | ||
"description": "Unique identifier for the word" | ||
}, | ||
"languageCode": { | ||
"type": "string", | ||
"description": "The language code of the word" | ||
}, | ||
"word": { | ||
"type": "string", | ||
"description": "The word" | ||
} | ||
}, | ||
"required": ["id", "languageCode", "word"], | ||
"additionalProperties": false | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import { HookContext } from '@feathersjs/feathers' | ||
import { ImpressoApplication } from '../../types' | ||
import { v4 } from 'uuid' | ||
import { WordMatch } from '../../models/generated/schemas' | ||
|
||
const { queryWithCommonParams, validate } = require('../../hooks/params') | ||
|
||
/**
 * Whole-term validation: letters, apostrophes, parentheses and whitespace only.
 *
 * The ASCII range is spelled `A-Za-z` on purpose: the shorthand `A-z` also
 * covers the six punctuation characters between `Z` and `a`
 * (`[`, `\`, `]`, `^`, `_` and the backtick). Likewise the Latin-1 range is
 * split so that `×` (U+00D7) and `÷` (U+00F7) are not treated as letters.
 */
const ValidTermPattern = /^[A-Za-zÀ-ÖØ-öø-ÿ'()\s]+$/

/** Matches every character that is not a letter (same alphabet as `ValidTermPattern`). */
const NonLetterPattern = /[^A-Za-zÀ-ÖØ-öø-ÿ]/g

/**
 * Reduces a free-text term to a single lower-cased word: the longest run of
 * letters in the input. When several words share the maximum length the last
 * one wins. Returns an empty string when the input contains no letters.
 */
const pickLongestWord = (input: string): string =>
  input
    .replace(NonLetterPattern, ' ')
    .toLowerCase()
    .split(/\s+/)
    .reduce((longest, word) => (word.length >= longest.length ? word : longest), '')

/**
 * Feathers hooks for the word-embeddings search service.
 *
 * - `before.find` validates `language_code` and `term` and normalises `term`
 *   to a single word, then applies the common query parameters.
 * - `after.find` converts a list of plain words into `WordMatch` objects.
 */
export default {
  before: {
    all: [],
    find: [
      validate(
        {
          language_code: {
            choices: ['fr', 'de', 'lb'],
          },
          term: {
            required: true,
            regex: ValidTermPattern,
            max_length: 500,
            transform: pickLongestWord,
          },
        },
        'GET'
      ),
      queryWithCommonParams(),
    ],
    get: [],
    create: [],
    update: [],
    patch: [],
    remove: [],
  },

  after: {
    all: [],
    find: [
      (context: HookContext<ImpressoApplication>) => {
        const words = context.result?.data
        if (!Array.isArray(words)) return

        // The validated parameter is `language_code`; `language` is still
        // honoured as a fallback so existing callers keep working.
        const query = context.params?.query ?? {}
        const languageCode = query.language_code ?? query.language ?? 'fr'

        context.result.data = words.map((word: string) => {
          return {
            word,
            // The result carries no identifier of its own, so a random one is generated.
            id: v4(),
            languageCode,
          } satisfies WordMatch
        })
      },
    ],
    get: [],
    create: [],
    update: [],
    patch: [],
    remove: [],
  },

  error: {
    all: [],
    find: [],
    get: [],
    create: [],
    update: [],
    patch: [],
    remove: [],
  },
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
import type { ClientService, Params } from '@feathersjs/feathers' | ||
import { SimpleSolrClient } from '../../internalServices/simpleSolr' | ||
import { PublicFindResponse as FindResponse } from '../../models/common' | ||
import { ImpressoApplication } from '../../types' | ||
import { escapeValue } from '../../util/solr/filterReducers' | ||
import { WordMatch } from '../../models/generated/schemasPublic' | ||
|
||
/** Language codes accepted by the `language_code` query parameter of the embeddings search. */
export type ValidLanguageCodes = 'de' | 'fr' | 'lb' | ||
|
||
/**
 * Query parameters understood by `EmbeddingsService.find`: the pagination
 * window (`limit`, `offset`) plus the search-specific fields below.
 */
type FindQuery = Pick<FindResponse<unknown>['pagination'], 'limit' | 'offset'> & { | ||
// The word whose embedding vector is looked up and used as the search seed.
term: string | ||
/** filter baseline vectors search by language. */ | ||
language_code?: ValidLanguageCodes | ||
// Passed as `topK` to the Solr knn query; falls back to `DefaultTopK` when omitted.
top_k?: number | ||
} | ||
|
||
// Name of the Solr document field that stores the embedding vector. It is used
// both as the field requested when fetching a term's vector and as the `f=`
// target of the knn query.
// NOTE(review): the name suggests 100-dimensional fastText vectors — confirm against the Solr schema.
const EmbeddingProperty = 'fastText_emb_v100' | ||
|
||
/** Shape of a document in the Solr word-embeddings collection, as read by this service. */
interface SolrEmbeddingsDoc { | ||
// The word itself; exposed as `WordMatch.word`.
word_s: string | ||
// Embedding vector of the word, stored under the `EmbeddingProperty` field.
[EmbeddingProperty]: number[] | ||
// Language of the word; exposed as `WordMatch.languageCode`.
lg_s: string | ||
// Document identifier; exposed as `WordMatch.id`.
id: string | ||
} | ||
|
||
/**
 * Maps a Solr embeddings document (without its vector) onto the public
 * `WordMatch` shape.
 *
 * @param doc Solr document carrying `id`, `word_s` and `lg_s`.
 * @returns The equivalent `WordMatch` (`id`, `word`, `languageCode`).
 */
const asWordMatch = (doc: Omit<SolrEmbeddingsDoc, typeof EmbeddingProperty>): WordMatch => {
  const { id, word_s: word, lg_s: languageCode } = doc
  return { id, word, languageCode }
}
|
||
/**
 * Builds the Solr query that looks up the embeddings document of a word.
 *
 * @param term Word to look up; it is passed through `escapeValue` before being
 *   placed in the `word_s` clause.
 * @param language Optional language code; when provided (and non-empty) the
 *   query is additionally restricted with an `lg_s` clause.
 * @returns The Solr query string.
 */
export const buildGetTermEmbeddingVectorSolrQuery = (term: string, language?: string): string => {
  const termClause = `word_s:(${escapeValue(term)})`
  if (!language) {
    return termClause
  }
  return `${termClause} AND lg_s:${language}`
}
|
||
/**
 * Builds a Solr query made of one knn clause per vector, joined with `OR`.
 *
 * @param vectors Embedding vectors to search around; each is serialised with
 *   `JSON.stringify`.
 * @param topK Value of the `topK` parameter of every knn clause.
 * @returns The Solr query string; an empty string when `vectors` is empty.
 */
export const buildFindBySimilarEmbeddingsSolrQuery = (vectors: number[][], topK: number): string => {
  const clauses: string[] = []
  for (const vector of vectors) {
    clauses.push(`({!knn f=${EmbeddingProperty} topK=${topK}}${JSON.stringify(vector)})`)
  }
  return clauses.join(' OR ')
}
|
||
/** Page size used by `EmbeddingsService.find` when the query carries no `limit`. */
export const DefaultPageSize = 20 | ||
/** `topK` value used by `EmbeddingsService.find` when the query carries no `top_k`. */
export const DefaultTopK = 20 | ||
|
||
/**
 * Feathers service that finds words similar to a given term using the
 * embedding vectors stored in the Solr `WordEmbeddings` namespace.
 *
 * A search is done in two steps: the embedding vector of the term is fetched
 * first, then a knn query is run with that vector to retrieve the closest
 * words.
 */
export class EmbeddingsService
  implements Pick<ClientService<WordMatch, unknown, unknown, FindResponse<WordMatch>>, 'find'>
{
  private readonly app: ImpressoApplication

  constructor({ app }: { app: ImpressoApplication }) {
    this.app = app
  }

  /** Solr client, resolved from the application on every access. */
  private get solr(): SimpleSolrClient {
    return this.app.service('simpleSolrClient')
  }

  /**
   * Fetches the embedding vector(s) stored for `term`.
   *
   * @param term Word to look up.
   * @param language Optional language code restricting the lookup.
   * @returns The vectors found (at most one, because the query limit is 1);
   *   an empty array when the term is unknown.
   */
  private async getTermEmbeddingVectors(term: string, language?: string): Promise<number[][]> {
    const result = await this.solr.select<Pick<SolrEmbeddingsDoc, typeof EmbeddingProperty>>(
      this.solr.namespaces.WordEmbeddings,
      {
        body: {
          query: buildGetTermEmbeddingVectorSolrQuery(term, language),
          fields: EmbeddingProperty,
          limit: 1,
          offset: 0,
        },
      }
    )
    const docs = result?.response?.docs ?? []
    return (
      docs
        .map(item => item[EmbeddingProperty])
        // A document without the vector field would otherwise be serialised
        // as `undefined` inside the knn query.
        .filter((vector): vector is number[] => Array.isArray(vector))
    )
  }

  /**
   * Runs the knn search around the given vectors.
   *
   * @param vectors Seed vectors; when empty no request is sent.
   * @param topK `topK` parameter of each knn clause.
   * @param offset Index of the first document to return.
   * @param limit Maximum number of documents to return.
   * @returns The page of documents together with the total number of hits
   *   reported by Solr (falling back to the page length when Solr reports none).
   */
  private async getWordsMatchingVectors(
    vectors: number[][],
    topK: number,
    offset: number,
    limit: number
  ): Promise<{ docs: Omit<SolrEmbeddingsDoc, typeof EmbeddingProperty>[]; total: number }> {
    if (vectors.length === 0) return { docs: [], total: 0 }
    const result = await this.solr.select<Omit<SolrEmbeddingsDoc, typeof EmbeddingProperty>>(
      this.solr.namespaces.WordEmbeddings,
      {
        body: {
          query: buildFindBySimilarEmbeddingsSolrQuery(vectors, topK),
          fields: ['word_s', 'lg_s', 'id'].join(','),
          limit,
          offset,
        },
      }
    )
    const docs = result?.response?.docs ?? []
    return { docs, total: result?.response?.numFound ?? docs.length }
  }

  /**
   * Finds words whose embeddings are closest to the embedding of `term`.
   *
   * @param params Feathers params; `params.query` must be present and carries
   *   `term`, optional `language_code`, `top_k`, `limit` and `offset`.
   * @returns A page of matches. `pagination.total` is the total number of
   *   hits, not the size of the returned page, so clients can paginate.
   * @throws Error when no query parameters are provided.
   */
  async find(params?: Params<FindQuery>): Promise<FindResponse<WordMatch>> {
    if (!params?.query) {
      throw new Error('Query parameters are required')
    }

    const {
      term,
      language_code: languageCode,
      top_k: topK = DefaultTopK,
      limit = DefaultPageSize,
      offset = 0,
    } = params.query

    // An empty term would produce the malformed Solr query `word_s:()`;
    // there is nothing to look up, so answer with an empty page instead.
    if (term == null || term.trim() === '') {
      return { pagination: { limit, offset, total: 0 }, data: [] }
    }

    const vectors = await this.getTermEmbeddingVectors(term, languageCode)
    const { docs, total } = await this.getWordsMatchingVectors(vectors, topK, offset, limit)

    return {
      pagination: {
        limit,
        offset,
        total,
      },
      data: docs.map(asWordMatch),
    }
  }
}
Oops, something went wrong.