Skip to content

Commit

Permalink
Improve consistency of JSONPath parsing & execution across frontend/b…
Browse files Browse the repository at this point in the history
…ackend (#481)

Signed-off-by: Tyler Ohlsen <[email protected]>
  • Loading branch information
ohltyler authored Nov 15, 2024
1 parent 697241e commit 75ae094
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 26 deletions.
4 changes: 4 additions & 0 deletions common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,10 @@ export enum PROCESSOR_CONTEXT {
SEARCH_REQUEST = 'search_request',
SEARCH_RESPONSE = 'search_response',
}
export enum TRANSFORM_CONTEXT {
INPUT = 'input',
OUTPUT = 'output',
}
export const START_FROM_SCRATCH_WORKFLOW_NAME = 'Start From Scratch';
export const DEFAULT_NEW_WORKFLOW_NAME = 'new_workflow';
export const DEFAULT_NEW_WORKFLOW_DESCRIPTION = 'My new workflow';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import {
IConfigField,
PROCESSOR_CONTEXT,
WorkflowConfig,
JSONPATH_ROOT_SELECTOR,
WorkflowFormValues,
ModelInterface,
IndexMappings,
Expand All @@ -45,6 +44,7 @@ import {
getDataSourceId,
parseModelInputs,
parseModelOutputs,
sanitizeJSONPath,
} from '../../../../utils';
import { ConfigFieldList } from '../config_field_list';
import { OverrideQueryModal } from './modals/override_query_modal';
Expand Down Expand Up @@ -186,7 +186,13 @@ export function MLProcessorInputs(props: MLProcessorInputsProps) {
setDocFields(
docObjKeys.map((key) => {
return {
label: key,
label:
// ingest inputs can handle dot notation, and hence don't need
// sanitizing to handle JSONPath edge cases. The other contexts
// only support JSONPath, and hence need some post-processing/sanitizing.
props.context === PROCESSOR_CONTEXT.INGEST
? key
: sanitizeJSONPath(key),
};
})
);
Expand All @@ -202,7 +208,13 @@ export function MLProcessorInputs(props: MLProcessorInputsProps) {
setQueryFields(
queryObjKeys.map((key) => {
return {
label: key,
label:
// ingest inputs can handle dot notation, and hence don't need
// sanitizing to handle JSONPath edge cases. The other contexts
// only support JSONPath, and hence need some post-processing/sanitizing.
props.context === PROCESSOR_CONTEXT.INGEST
? key
: sanitizeJSONPath(key),
};
})
);
Expand Down Expand Up @@ -391,7 +403,15 @@ export function MLProcessorInputs(props: MLProcessorInputsProps) {
? 'Specify a query field'
: 'Define a document field'
}
valueHelpText={`Specify a document field or define JSONPath to transform the document to map to a model input field.`}
valueHelpText={`Specify a ${
props.context === PROCESSOR_CONTEXT.SEARCH_REQUEST
? 'query'
: 'document'
} field or define JSONPath to transform the ${
props.context === PROCESSOR_CONTEXT.SEARCH_REQUEST
? 'query'
: 'document'
} to map to a model input field.`}
valueOptions={
props.context === PROCESSOR_CONTEXT.INGEST
? docFields
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import {
PROCESSOR_CONTEXT,
SearchHit,
SimulateIngestPipelineResponse,
TRANSFORM_CONTEXT,
WorkflowConfig,
WorkflowFormValues,
customStringify,
Expand Down Expand Up @@ -185,11 +186,15 @@ export function InputTransformModal(props: InputTransformModalProps) {
Array.isArray(sampleSourceInput)
? generateArrayTransform(
sampleSourceInput as [],
tempInputMap[selectedTransformOption]
tempInputMap[selectedTransformOption],
props.context,
TRANSFORM_CONTEXT.INPUT
)
: generateTransform(
sampleSourceInput,
tempInputMap[selectedTransformOption]
tempInputMap[selectedTransformOption],
props.context,
TRANSFORM_CONTEXT.INPUT
);

setTransformedInput(customStringify(output));
Expand Down Expand Up @@ -303,7 +308,15 @@ export function InputTransformModal(props: InputTransformModalProps) {
? 'Specify a query field'
: 'Define a document field'
}
valueHelpText={`Specify a document field or define JSONPath to transform the document to map to a model input field.`}
valueHelpText={`Specify a ${
props.context === PROCESSOR_CONTEXT.SEARCH_REQUEST
? 'query'
: 'document'
} field or define JSONPath to transform the ${
props.context === PROCESSOR_CONTEXT.SEARCH_REQUEST
? 'query'
: 'document'
} to map to a model input field.`}
valueOptions={props.valueOptions}
// If the map we are adding is the first one, populate the selected option to index 0
onMapAdd={(curArray) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import {
SearchHit,
SearchPipelineConfig,
SimulateIngestPipelineResponse,
TRANSFORM_CONTEXT,
WorkflowConfig,
WorkflowFormValues,
customStringify,
Expand Down Expand Up @@ -157,7 +158,9 @@ export function OutputTransformModal(props: OutputTransformModalProps) {
sampleSourceOutput = JSON.parse(sourceOutput);
const output = generateTransform(
sampleSourceOutput,
reverseKeysAndValues(tempOutputMap[selectedTransformOption])
reverseKeysAndValues(tempOutputMap[selectedTransformOption]),
props.context,
TRANSFORM_CONTEXT.OUTPUT
);
setTransformedOutput(customStringify(output));
} catch {}
Expand Down
5 changes: 3 additions & 2 deletions public/utils/config_to_template_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import {
SearchPipelineConfig,
} from '../../common';
import { processorConfigToFormik } from './config_to_form_utils';
import { sanitizeJSONPath } from './utils';

/*
**************** Config -> template utils **********************
Expand Down Expand Up @@ -451,11 +452,11 @@ function mergeMapIntoSingleObj(
curMap = reverse
? {
...curMap,
[mapEntry.value]: mapEntry.key,
[sanitizeJSONPath(mapEntry.value)]: sanitizeJSONPath(mapEntry.key),
}
: {
...curMap,
[mapEntry.key]: mapEntry.value,
[sanitizeJSONPath(mapEntry.key)]: sanitizeJSONPath(mapEntry.value),
};
});
return curMap;
Expand Down
114 changes: 98 additions & 16 deletions public/utils/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ import {
ModelInterface,
ModelOutput,
ModelOutputFormField,
PROCESSOR_CONTEXT,
SimulateIngestPipelineDoc,
SimulateIngestPipelineResponse,
TRANSFORM_CONTEXT,
WORKFLOW_RESOURCE_TYPE,
WORKFLOW_STEP_TYPE,
Workflow,
Expand Down Expand Up @@ -182,14 +184,21 @@ export function unwrapTransformedDocs(

// ML inference processors will use standard dot notation or JSONPath depending on the input.
// We follow the same logic here to generate consistent results.
export function generateTransform(input: {} | [], map: MapFormValue): {} {
export function generateTransform(
input: {} | [],
map: MapFormValue,
context: PROCESSOR_CONTEXT,
transformContext: TRANSFORM_CONTEXT
): {} {
let output = {};
map.forEach((mapEntry) => {
try {
const transformedResult = getTransformedResult(
mapEntry,
input,
mapEntry.value
mapEntry.value,
context,
transformContext
);
output = {
...output,
Expand All @@ -204,12 +213,23 @@ export function generateTransform(input: {} | [], map: MapFormValue): {} {
// a single field value in the transformed output.
// A specialty scenario for when configuring input on search response processors, one-to-one is false,
// and the input is an array.
export function generateArrayTransform(input: [], map: MapFormValue): {}[] {
export function generateArrayTransform(
input: [],
map: MapFormValue,
context: PROCESSOR_CONTEXT,
transformContext: TRANSFORM_CONTEXT
): {}[] {
let output = [] as {}[];
map.forEach((mapEntry) => {
try {
const transformedResult = input.map((inputEntry) =>
getTransformedResult(mapEntry, inputEntry, mapEntry.value)
getTransformedResult(
mapEntry,
inputEntry,
mapEntry.value,
context,
transformContext
)
);
output = {
...output,
Expand All @@ -223,19 +243,62 @@ export function generateArrayTransform(input: [], map: MapFormValue): {}[] {
function getTransformedResult(
mapEntry: MapEntry,
input: {},
path: string
path: string,
context: PROCESSOR_CONTEXT,
transformContext: TRANSFORM_CONTEXT
): any {
// Edge case: if the path is ".", it implies returning
// the entire value. This may happen if full_response_path=false
// and the input is the entire result with nothing else to parse out.
// get() does not cover this case, so we override manually.
return path === '.'
? input
: mapEntry.value.startsWith(JSONPATH_ROOT_SELECTOR)
? // JSONPath transform
jsonpath.query(input, path)
: // Standard dot notation
get(input, path);
// Regular dot notation can only be executed if 1/ the JSONPath selector is not explicitly defined,
// and 2/ it is in the context of ingest, and 3/ it is transforming the input (the source document).
// For all other scenarios, it can only be JSONPath, due to backend parsing limitations.
if (
!mapEntry.value.startsWith(JSONPATH_ROOT_SELECTOR) &&
context === PROCESSOR_CONTEXT.INGEST &&
transformContext === TRANSFORM_CONTEXT.INPUT
) {
// sub-edge case: if the path is ".", it implies returning
// the entire value. This may happen if full_response_path=false
// and the input is the entire result with nothing else to parse out.
// get() does not cover this case, so we override manually.
if (path === '.') {
return input;
} else {
return get(input, path);
}
} else {
// The backend sets a JSONPath setting ALWAYS_RETURN_LIST=false, which
// dynamically returns a list or single value, based on whether
// the path is definite or not. We try to mimic that with a
// custom fn isIndefiniteJsonPath(), since this setting, nor
// knowing if the path is definite or indefinite, is not exposed
// by any known jsonpath JS-based / NPM libraries.
// if found to be definite, we remove the outermost array, which
// will always be returned by default when running query().
const isIndefinite = isIndefiniteJsonPath(path);
const res = jsonpath.query(input, path);
if (isIndefinite) {
return res;
} else {
return res[0];
}
}
}

// Indefinite/definite path defns:
// https://github.com/json-path/JsonPath?tab=readme-ov-file#what-is-returned-when
// Note this may not cover every use case, as the true definition requires low-level
// branch navigation of the path nodes, which is not exposed by this npm library.
// Hence, we do our best to cover the majority of use cases and common patterns.
function isIndefiniteJsonPath(path: string): boolean {
// regex has 3 overall OR checks:
// 1. consecutive '.'s, indicating deep scan - \.{2}
// 2. ?(<anything>), indicating an expression - \?\(.*\)
// 3. multiple array indices - \[\d+,\d+\] | \[.*:.*\] | \[\*\]
// if any are met, then we call the path indefinite.
const indefiniteRegEx = new RegExp(
/\.{2}|\?\(.*\)|\[\d+,\d+\]|\[.*:.*\]|\[\*\]/,
'g'
);
return indefiniteRegEx.test(path);
}

// Derive the collection of model inputs from the model interface JSONSchema into a form-ready list
Expand Down Expand Up @@ -422,3 +485,22 @@ export const getErrorMessageForStepType = (
return '';
}
};

// Sanitize the nested keys in a given JSONPath definition.
// to ensure it works consistently on the frontend & backend. There are several discrepancies
// between the frontend and the backend packages, such that some
// scenarios will succeed on the frontend and fail on the backend,
// or vice versa.
export function sanitizeJSONPath(path: string): string {
return path.split('.').reduce((prevValue, curValue, idx) => {
// Case 1: accessing array via dot notation. Fails on the backend.
if (!isNaN(parseInt(curValue))) {
return prevValue + `[${curValue}]`;
// Case 2: accessing key with a dash via dot notation. Fails on the frontend.
} else if (curValue.includes('-')) {
return prevValue + `["${curValue}"]`;
} else {
return prevValue + '.' + curValue;
}
});
}

0 comments on commit 75ae094

Please sign in to comment.