Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Improve consistency of JSONPath parsing & execution across frontend/backend #482

Merged
merged 1 commit into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,10 @@ export enum PROCESSOR_CONTEXT {
SEARCH_REQUEST = 'search_request',
SEARCH_RESPONSE = 'search_response',
}
export enum TRANSFORM_CONTEXT {
INPUT = 'input',
OUTPUT = 'output',
}
export const START_FROM_SCRATCH_WORKFLOW_NAME = 'Start From Scratch';
export const DEFAULT_NEW_WORKFLOW_NAME = 'new_workflow';
export const DEFAULT_NEW_WORKFLOW_DESCRIPTION = 'My new workflow';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import {
IConfigField,
PROCESSOR_CONTEXT,
WorkflowConfig,
JSONPATH_ROOT_SELECTOR,
WorkflowFormValues,
ModelInterface,
IndexMappings,
Expand All @@ -45,6 +44,7 @@ import {
getDataSourceId,
parseModelInputs,
parseModelOutputs,
sanitizeJSONPath,
} from '../../../../utils';
import { ConfigFieldList } from '../config_field_list';
import { OverrideQueryModal } from './modals/override_query_modal';
Expand Down Expand Up @@ -186,7 +186,13 @@ export function MLProcessorInputs(props: MLProcessorInputsProps) {
setDocFields(
docObjKeys.map((key) => {
return {
label: key,
label:
// ingest inputs can handle dot notation, and hence don't need
// sanitizing to handle JSONPath edge cases. The other contexts
// only support JSONPath, and hence need some post-processing/sanitizing.
props.context === PROCESSOR_CONTEXT.INGEST
? key
: sanitizeJSONPath(key),
};
})
);
Expand All @@ -202,7 +208,13 @@ export function MLProcessorInputs(props: MLProcessorInputsProps) {
setQueryFields(
queryObjKeys.map((key) => {
return {
label: key,
label:
// ingest inputs can handle dot notation, and hence don't need
// sanitizing to handle JSONPath edge cases. The other contexts
// only support JSONPath, and hence need some post-processing/sanitizing.
props.context === PROCESSOR_CONTEXT.INGEST
? key
: sanitizeJSONPath(key),
};
})
);
Expand Down Expand Up @@ -391,7 +403,15 @@ export function MLProcessorInputs(props: MLProcessorInputsProps) {
? 'Specify a query field'
: 'Define a document field'
}
valueHelpText={`Specify a document field or define JSONPath to transform the document to map to a model input field.`}
valueHelpText={`Specify a ${
props.context === PROCESSOR_CONTEXT.SEARCH_REQUEST
? 'query'
: 'document'
} field or define JSONPath to transform the ${
props.context === PROCESSOR_CONTEXT.SEARCH_REQUEST
? 'query'
: 'document'
} to map to a model input field.`}
valueOptions={
props.context === PROCESSOR_CONTEXT.INGEST
? docFields
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import {
PROCESSOR_CONTEXT,
SearchHit,
SimulateIngestPipelineResponse,
TRANSFORM_CONTEXT,
WorkflowConfig,
WorkflowFormValues,
customStringify,
Expand Down Expand Up @@ -185,11 +186,15 @@ export function InputTransformModal(props: InputTransformModalProps) {
Array.isArray(sampleSourceInput)
? generateArrayTransform(
sampleSourceInput as [],
tempInputMap[selectedTransformOption]
tempInputMap[selectedTransformOption],
props.context,
TRANSFORM_CONTEXT.INPUT
)
: generateTransform(
sampleSourceInput,
tempInputMap[selectedTransformOption]
tempInputMap[selectedTransformOption],
props.context,
TRANSFORM_CONTEXT.INPUT
);

setTransformedInput(customStringify(output));
Expand Down Expand Up @@ -303,7 +308,15 @@ export function InputTransformModal(props: InputTransformModalProps) {
? 'Specify a query field'
: 'Define a document field'
}
valueHelpText={`Specify a document field or define JSONPath to transform the document to map to a model input field.`}
valueHelpText={`Specify a ${
props.context === PROCESSOR_CONTEXT.SEARCH_REQUEST
? 'query'
: 'document'
} field or define JSONPath to transform the ${
props.context === PROCESSOR_CONTEXT.SEARCH_REQUEST
? 'query'
: 'document'
} to map to a model input field.`}
valueOptions={props.valueOptions}
// If the map we are adding is the first one, populate the selected option to index 0
onMapAdd={(curArray) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import {
SearchHit,
SearchPipelineConfig,
SimulateIngestPipelineResponse,
TRANSFORM_CONTEXT,
WorkflowConfig,
WorkflowFormValues,
customStringify,
Expand Down Expand Up @@ -157,7 +158,9 @@ export function OutputTransformModal(props: OutputTransformModalProps) {
sampleSourceOutput = JSON.parse(sourceOutput);
const output = generateTransform(
sampleSourceOutput,
reverseKeysAndValues(tempOutputMap[selectedTransformOption])
reverseKeysAndValues(tempOutputMap[selectedTransformOption]),
props.context,
TRANSFORM_CONTEXT.OUTPUT
);
setTransformedOutput(customStringify(output));
} catch {}
Expand Down
5 changes: 3 additions & 2 deletions public/utils/config_to_template_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import {
SearchPipelineConfig,
} from '../../common';
import { processorConfigToFormik } from './config_to_form_utils';
import { sanitizeJSONPath } from './utils';

/*
**************** Config -> template utils **********************
Expand Down Expand Up @@ -451,11 +452,11 @@ function mergeMapIntoSingleObj(
curMap = reverse
? {
...curMap,
[mapEntry.value]: mapEntry.key,
[sanitizeJSONPath(mapEntry.value)]: sanitizeJSONPath(mapEntry.key),
}
: {
...curMap,
[mapEntry.key]: mapEntry.value,
[sanitizeJSONPath(mapEntry.key)]: sanitizeJSONPath(mapEntry.value),
};
});
return curMap;
Expand Down
114 changes: 98 additions & 16 deletions public/utils/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ import {
ModelInterface,
ModelOutput,
ModelOutputFormField,
PROCESSOR_CONTEXT,
SimulateIngestPipelineDoc,
SimulateIngestPipelineResponse,
TRANSFORM_CONTEXT,
WORKFLOW_RESOURCE_TYPE,
WORKFLOW_STEP_TYPE,
Workflow,
Expand Down Expand Up @@ -182,14 +184,21 @@ export function unwrapTransformedDocs(

// ML inference processors will use standard dot notation or JSONPath depending on the input.
// We follow the same logic here to generate consistent results.
export function generateTransform(input: {} | [], map: MapFormValue): {} {
export function generateTransform(
input: {} | [],
map: MapFormValue,
context: PROCESSOR_CONTEXT,
transformContext: TRANSFORM_CONTEXT
): {} {
let output = {};
map.forEach((mapEntry) => {
try {
const transformedResult = getTransformedResult(
mapEntry,
input,
mapEntry.value
mapEntry.value,
context,
transformContext
);
output = {
...output,
Expand All @@ -204,12 +213,23 @@ export function generateTransform(input: {} | [], map: MapFormValue): {} {
// a single field value in the transformed output.
// A specialty scenario for when configuring input on search response processors, one-to-one is false,
// and the input is an array.
export function generateArrayTransform(input: [], map: MapFormValue): {}[] {
export function generateArrayTransform(
input: [],
map: MapFormValue,
context: PROCESSOR_CONTEXT,
transformContext: TRANSFORM_CONTEXT
): {}[] {
let output = [] as {}[];
map.forEach((mapEntry) => {
try {
const transformedResult = input.map((inputEntry) =>
getTransformedResult(mapEntry, inputEntry, mapEntry.value)
getTransformedResult(
mapEntry,
inputEntry,
mapEntry.value,
context,
transformContext
)
);
output = {
...output,
Expand All @@ -223,19 +243,62 @@ export function generateArrayTransform(input: [], map: MapFormValue): {}[] {
function getTransformedResult(
mapEntry: MapEntry,
input: {},
path: string
path: string,
context: PROCESSOR_CONTEXT,
transformContext: TRANSFORM_CONTEXT
): any {
// Edge case: if the path is ".", it implies returning
// the entire value. This may happen if full_response_path=false
// and the input is the entire result with nothing else to parse out.
// get() does not cover this case, so we override manually.
return path === '.'
? input
: mapEntry.value.startsWith(JSONPATH_ROOT_SELECTOR)
? // JSONPath transform
jsonpath.query(input, path)
: // Standard dot notation
get(input, path);
// Regular dot notation can only be executed if 1/ the JSONPath selector is not explicitly defined,
// and 2/ it is in the context of ingest, and 3/ it is transforming the input (the source document).
// For all other scenarios, it can only be JSONPath, due to backend parsing limitations.
if (
!mapEntry.value.startsWith(JSONPATH_ROOT_SELECTOR) &&
context === PROCESSOR_CONTEXT.INGEST &&
transformContext === TRANSFORM_CONTEXT.INPUT
) {
// sub-edge case: if the path is ".", it implies returning
// the entire value. This may happen if full_response_path=false
// and the input is the entire result with nothing else to parse out.
// get() does not cover this case, so we override manually.
if (path === '.') {
return input;
} else {
return get(input, path);
}
} else {
// The backend sets a JSONPath setting ALWAYS_RETURN_LIST=false, which
// dynamically returns a list or single value, based on whether
// the path is definite or not. We try to mimic that with a
// custom fn isIndefiniteJsonPath(), since this setting, nor
// knowing if the path is definite or indefinite, is not exposed
// by any known jsonpath JS-based / NPM libraries.
// if found to be definite, we remove the outermost array, which
// will always be returned by default when running query().
const isIndefinite = isIndefiniteJsonPath(path);
const res = jsonpath.query(input, path);
if (isIndefinite) {
return res;
} else {
return res[0];
}
}
}

// Indefinite/definite path defns:
// https://github.com/json-path/JsonPath?tab=readme-ov-file#what-is-returned-when
// Note this may not cover every use case, as the true definition requires low-level
// branch navigation of the path nodes, which is not exposed by this npm library.
// Hence, we do our best to cover the majority of use cases and common patterns.
function isIndefiniteJsonPath(path: string): boolean {
// regex has 3 overall OR checks:
// 1. consecutive '.'s, indicating deep scan - \.{2}
// 2. ?(<anything>), indicating an expression - \?\(.*\)
// 3. multiple array indices - \[\d+,\d+\] | \[.*:.*\] | \[\*\]
// if any are met, then we call the path indefinite.
const indefiniteRegEx = new RegExp(
/\.{2}|\?\(.*\)|\[\d+,\d+\]|\[.*:.*\]|\[\*\]/,
'g'
);
return indefiniteRegEx.test(path);
}

// Derive the collection of model inputs from the model interface JSONSchema into a form-ready list
Expand Down Expand Up @@ -422,3 +485,22 @@ export const getErrorMessageForStepType = (
return '';
}
};

// Sanitize the nested keys in a given JSONPath definition.
// to ensure it works consistently on the frontend & backend. There are several discrepancies
// between the frontend and the backend packages, such that some
// scenarios will succeed on the frontend and fail on the backend,
// or vice versa.
export function sanitizeJSONPath(path: string): string {
return path.split('.').reduce((prevValue, curValue, idx) => {
// Case 1: accessing array via dot notation. Fails on the backend.
if (!isNaN(parseInt(curValue))) {
return prevValue + `[${curValue}]`;
// Case 2: accessing key with a dash via dot notation. Fails on the frontend.
} else if (curValue.includes('-')) {
return prevValue + `["${curValue}"]`;
} else {
return prevValue + '.' + curValue;
}
});
}
Loading