diff --git a/dist/commonjs/atomic.d.ts b/dist/commonjs/atomic.d.ts index 4586faa..e88c7e6 100644 --- a/dist/commonjs/atomic.d.ts +++ b/dist/commonjs/atomic.d.ts @@ -13,6 +13,14 @@ export interface StringComparisonOptions { * Useful when only the differences in content are important, but not the order of the content * */ reorder?: boolean; + /** + * When `reorder` is used this determines how to split each string into the tokens that will be reordered. + * + * The value of this property is used in String.split() -- https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/split#separator + * + * @default " " + * */ + delimiter?: string | RegExp; } export interface StringSamenessResult { strategies: { diff --git a/dist/commonjs/index.d.ts b/dist/commonjs/index.d.ts index 480e7c1..7bf1740 100644 --- a/dist/commonjs/index.d.ts +++ b/dist/commonjs/index.d.ts @@ -2,7 +2,7 @@ import { ComparisonStrategyResult, StringComparisonOptions, StringSamenessResult import { strDefaultTransforms, transforms } from "./normalization/index.js"; declare const defaultStrategies: import("./atomic.js").ComparisonStrategy[]; declare const stringSameness: (valA: string, valB: string, options?: StringComparisonOptions) => StringSamenessResult; -export declare const reorderStr: (cleanA: string, cleanB: string, options?: StringComparisonOptions) => string; +export declare const reorderStr: (strA: string, strB: string, options?: StringComparisonOptions) => [string, string]; declare const createStringSameness: (defaults: StringComparisonOptions) => (valA: string, valB: string, options?: StringComparisonOptions) => StringSamenessResult; declare const strategies: { diceStrategy: import("./atomic.js").ComparisonStrategy; diff --git a/dist/commonjs/index.js b/dist/commonjs/index.js index 67ad753..3c0ba3d 100644 --- a/dist/commonjs/index.js +++ b/dist/commonjs/index.js @@ -17,16 +17,18 @@ const defaultStrategies = [ ]; exports.defaultStrategies = defaultStrategies; const stringSameness = (valA, valB, options) => { - const { transforms = index_js_2.strDefaultTransforms, strategies = defaultStrategies, reorder = false, } = options || {}; - const cleanA = transforms.reduce((acc, curr) => curr(acc), valA); + const { transforms = index_js_2.strDefaultTransforms, strategies = defaultStrategies, reorder = false, delimiter = ' ' } = options || {}; + let cleanA = transforms.reduce((acc, curr) => curr(acc), valA); let cleanB = transforms.reduce((acc, curr) => curr(acc), valB); - const shortest = cleanA.length > cleanB.length ? cleanB : cleanA; if (reorder) { // we want to ignore order of tokens as much as possible (user does not care about differences in word order, just absolute differences in characters overall) - // so we will reorder cleanB so its tokens match the order or tokens in cleanA as closely as possible + // so we will reorder the shorter of the two strings so its tokens match the order of tokens in the longer string as closely as possible // before we run strategies - cleanB = (0, exports.reorderStr)(cleanA, cleanB); + const [orderedX, orderedY] = (0, exports.reorderStr)(cleanA, cleanB); + cleanA = orderedX; + cleanB = orderedY; } + const shortest = cleanA.length > cleanB.length ? cleanB : cleanA; const stratResults = []; for (const strat of strategies) { if (strat.isValid !== undefined && !strat.isValid(cleanA, cleanB)) { @@ -60,32 +62,58 @@ const stringSameness = (valA, valB, options) => { }; }; exports.stringSameness = stringSameness; -const reorderStr = (cleanA, cleanB, options) => { - // to do the reordering we will use stringSameness with the provided strats to match against each token in cleanA and choose the closest token in cleanB - // and add the end concat any remaining tokens from cleanB to the reordered string - const aTokens = cleanA.split(' '); - const bTokens = cleanB.split(' '); - const orderedCandidateTokens = aTokens.reduce((acc, curr) => { +const reorderStr = (strA, strB, options) => { + const { transforms = index_js_2.strDefaultTransforms, strategies = defaultStrategies, delimiter = ' ' } = options || {}; + const cleanA = transforms.reduce((acc, curr) => curr(acc), strA); + const cleanB = transforms.reduce((acc, curr) => curr(acc), strB); + // split by "token" + const eTokens = cleanA.split(delimiter); + const cTokens = cleanB.split(delimiter); + let longerTokens, shorterTokens; + if (eTokens.length > cTokens.length) { + longerTokens = eTokens; + shorterTokens = cTokens; + } + else { + longerTokens = cTokens; + shorterTokens = eTokens; + } + // we will use longest string (token list) as the reducer and order the shorter list to match it + // so we don't have to deal with undefined positions in the shorter list + const orderedCandidateTokens = longerTokens.reduce((acc, curr) => { + // if we've run out of tokens in the shorter list just return + if (acc.remaining.length === 0) { + return acc; + } + // on each iteration of tokens in the long list + // we iterate through remaining tokens from the shorter list and find the token with the most sameness let highScore = 0; let highIndex = 0; let index = 0; for (const token of acc.remaining) { - const result = stringSameness(curr, token, { ...options, reorder: false }); - if (result.highScore > highScore) { - highScore = result.highScore; + const result = stringSameness(curr, token, { strategies }); + if (result.highScoreWeighted > highScore) { + highScore = result.highScoreWeighted; highIndex = index; } index++; } + // then remove the most same token from the remaining short list tokens const splicedRemaining = [...acc.remaining]; - if (highIndex <= splicedRemaining.length - 1) { - splicedRemaining.splice(highIndex, 1); - } - const ordered = highIndex <= acc.remaining.length - 1 ? acc.ordered.concat(acc.remaining[highIndex]) : acc.ordered; - return { ordered: ordered, remaining: splicedRemaining }; - }, { ordered: [], remaining: bTokens }); - const allOrderedCandidateTokens = orderedCandidateTokens.ordered.concat(orderedCandidateTokens.remaining); - return allOrderedCandidateTokens.join(' '); + splicedRemaining.splice(highIndex, 1); + return { + // finally add the most same token to the ordered short list + ordered: acc.ordered.concat(acc.remaining[highIndex]), + // and return the remaining short list tokens + remaining: splicedRemaining + }; + }, { + // "ordered" is the result of ordering tokens in the shorter list to match longer token order + ordered: [], + // remaining is the initial shorter list + remaining: shorterTokens + }); + return [longerTokens.join(' '), orderedCandidateTokens.ordered.join(' ')]; }; exports.reorderStr = reorderStr; const createStringSameness = (defaults) => { diff --git a/dist/commonjs/index.js.map b/dist/commonjs/index.js.map index e75075e..ccd1a95 100644 --- a/dist/commonjs/index.js.map +++ b/dist/commonjs/index.js.map @@ -1 +1 @@ -{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;AAAA,4DAAoH;AAQpH,uDAA0E;AA2HtE,qGA3HI,+BAAoB,OA2HJ;AAFpB,2FAzH0B,qBAAU,OAyH1B;AAvHd,MAAM,oBAAoB,GAAG,CAAC,MAAc,EAAE,EAAE;IAC5C,oBAAoB;IACpB,4BAA4B;IAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;AACzC,CAAC,CAAA;AAED,MAAM,iBAAiB,GAAG;IACtB,uBAAY;IACZ,wBAAa;IACb,yBAAc;CACjB,CAAA;AA2GG,8CAAiB;AAzGrB,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAwB,EAAE;IAE3G,MAAM,EACF,UAAU,GAAG,+BAAoB,EACjC,UAAU,GAAG,iBAAiB,EAC9B,OAAO,GAAG,KAAK,GAClB,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IACjE,IAAI,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAE/D,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IAEjE,IAAI,OAAO,EAAE;QACT,8JAA8J;QAC9J,qGAAqG;QACrG,2BAA2B;QAC3B,MAAM,GAAG,IAAA,kBAAU,EAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KACvC;IAED,MAAM,YAAY,GAA0C,EAAE,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE;QAC5B,IAAI,KAAK,CAAC,OAAO,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE;YAC/D,SAAS;SACZ;QACD,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3C,MAAM,MAAM,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAC,KAAK,EAAE,GAAG,EAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC5D,YAAY,CAAC,IAAI,CAAC;YACd,GAAG,MAAM;YACT,IAAI,EAAE,KAAK,CAAC,IAAI;SACnB,CAAC,CAAC;KACN;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,oBAAoB,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IAE1D,qBAAqB;IACrB,MAAM,SAAS,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;IAChG,kCAAkC;IAClC,MAAM,iBAAiB,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAChE,MAAM,QAAQ,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAgD,EAAE,IAAI,EAAE,EAAE;QAC5F,MAAM,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,IAAI,EAAC,GAAG,IAAI,CAAC;QACpC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YACb,GAAG,IAAI;YACP,KAAK;SACR,CAAC;QACF,OAAO,GAAG,CAAC;IACf,CAAC,EAAE,EAAE,CAAC,CAAC;IACP,OAAO;QACH,UAAU,EAAE,QAAQ;QACpB,SAAS;QACT,iBAAiB;KACpB,CAAA;AACL,CAAC,CAAA;AAiDG,wCAAc;AA/CX,MAAM,UAAU,GAAG,CAAC,MAAc,EAAE,MAAc,EAAE,OAAiC,EAAU,EAAE;IACpG,wJAAwJ;IACxJ,kFAAkF;IAClF,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,MAAM,sBAAsB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAA+C,EAAE,IAAI,EAAE,EAAE;QACpG,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,SAAS,GAAuB,CAAC,CAAC;QACtC,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,MAAM,KAAK,IAAI,GAAG,CAAC,SAAS,EAAE;YAC/B,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,EAAE,KAAK,EAAE,EAAC,GAAG,OAAO,EAAE,OAAO,EAAE,KAAK,EAAC,CAAC,CAAC;YACzE,IAAI,MAAM,CAAC,SAAS,GAAG,SAAS,EAAE;gBAC9B,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;gBAC7B,SAAS,GAAG,KAAK,CAAC;aACrB;YACD,KAAK,EAAE,CAAC;SACX;QAED,MAAM,gBAAgB,GAAG,CAAC,GAAG,GAAG,CAAC,SAAS,CAAC,CAAC;QAC5C,IAAG,SAAS,IAAI,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE;YACzC,gBAAgB,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;SACzC;QACD,MAAM,OAAO,GAAG,SAAS,IAAI,GAAG,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC;QAEnH,OAAO,EAAC,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,gBAAgB,EAAC,CAAC;IAC3D,CAAC,EAAE,EAAC,OAAO,EAAE,EAAE,EAAE,SAAS,EAAE,OAAO,EAAC,CAAC,CAAC;IACtC,MAAM,yBAAyB,GAAG,sBAAsB,CAAC,OAAO,CAAC,MAAM,CAAC,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAC1G,OAAO,yBAAyB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC/C,CAAC,CAAA;AA5BY,QAAA,UAAU,cA4BtB;AAED,MAAM,oBAAoB,GAAG,CAAC,QAAiC,EAAE,EAAE;IAC/D,OAAO,CAAC,IAAY,EAAE,IAAY,EAAE,UAAmC,EAAE,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,EAAC,GAAG,QAAQ,EAAE,GAAG,OAAO,EAAC,CAAC,CAAC;AACxI,CAAC,CAAA;AAgBG,oDAAoB;AAdxB,MAAM,UAAU,GAAG;IACf,YAAY,EAAZ,uBAAY;IACZ,aAAa,EAAb,wBAAa;IACb,cAAc,EAAd,yBAAc;IACd,wBAAwB,EAAxB,mCAAwB;CAC3B,CAAC;AAWE,gCAAU;AATd,4BAA4B;AAC5B,MAAM,+BAA+B,GAAG,+BAAoB,CAAC;AAUzD,0EAA+B"} \ No newline at end of file +{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;AAAA,4DAAoH;AAQpH,uDAA0E;AAuKtE,qGAvKI,+BAAoB,OAuKJ;AAFpB,2FArK0B,qBAAU,OAqK1B;AAnKd,MAAM,oBAAoB,GAAG,CAAC,MAAc,EAAE,EAAE;IAC5C,oBAAoB;IACpB,4BAA4B;IAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;AACzC,CAAC,CAAA;AAED,MAAM,iBAAiB,GAAG;IACtB,uBAAY;IACZ,wBAAa;IACb,yBAAc;CACjB,CAAA;AAuJG,8CAAiB;AArJrB,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAwB,EAAE;IAE3G,MAAM,EACF,UAAU,GAAG,+BAAoB,EACjC,UAAU,GAAG,iBAAiB,EAC9B,OAAO,GAAG,KAAK,EACf,SAAS,GAAG,GAAG,EAClB,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,IAAI,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAC/D,IAAI,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAE/D,IAAI,OAAO,EAAE;QACT,8JAA8J;QAC9J,wIAAwI;QACxI,2BAA2B;QAC3B,MAAM,CAAC,QAAQ,EAAE,QAAQ,CAAC,GAAG,IAAA,kBAAU,EAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACxD,MAAM,GAAG,QAAQ,CAAC;QAClB,MAAM,GAAG,QAAQ,CAAC;KACrB;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IAEjE,MAAM,YAAY,GAA0C,EAAE,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE;QAC5B,IAAI,KAAK,CAAC,OAAO,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE;YAC/D,SAAS;SACZ;QACD,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3C,MAAM,MAAM,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAC,KAAK,EAAE,GAAG,EAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC5D,YAAY,CAAC,IAAI,CAAC;YACd,GAAG,MAAM;YACT,IAAI,EAAE,KAAK,CAAC,IAAI;SACnB,CAAC,CAAC;KACN;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,oBAAoB,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IAE1D,qBAAqB;IACrB,MAAM,SAAS,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;IAChG,kCAAkC;IAClC,MAAM,iBAAiB,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAChE,MAAM,QAAQ,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAgD,EAAE,IAAI,EAAE,EAAE;QAC5F,MAAM,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,IAAI,EAAC,GAAG,IAAI,CAAC;QACpC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YACb,GAAG,IAAI;YACP,KAAK;SACR,CAAC;QACF,OAAO,GAAG,CAAC;IACf,CAAC,EAAE,EAAE,CAAC,CAAC;IACP,OAAO;QACH,UAAU,EAAE,QAAQ;QACpB,SAAS;QACT,iBAAiB;KACpB,CAAA;AACL,CAAC,CAAA;AA0FG,wCAAc;AAxFX,MAAM,UAAU,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAoB,EAAE;IAE1G,MAAM,EACF,UAAU,GAAG,+BAAoB,EACjC,UAAU,GAAG,iBAAiB,EAC9B,SAAS,GAAG,GAAG,EAClB,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IACjE,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAEjE,mBAAmB;IACnB,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IACxC,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAGxC,IAAI,YAAsB,EACtB,aAAuB,CAAC;IAE5B,IAAI,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE;QACjC,YAAY,GAAG,OAAO,CAAC;QACvB,aAAa,GAAG,OAAO,CAAC;KAC3B;SAAM;QACH,YAAY,GAAG,OAAO,CAAC;QACvB,aAAa,GAAG,OAAO,CAAC;KAC3B;IAED,gGAAgG;IAChG,wEAAwE;IAExE,MAAM,sBAAsB,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAA+C,EAAE,IAAI,EAAE,EAAE;QACzG,6DAA6D;QAC7D,IAAI,GAAG,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE;YAC5B,OAAO,GAAG,CAAC;SACd;QAED,+CAA+C;QAC/C,sGAAsG;QAEtG,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,MAAM,KAAK,IAAI,GAAG,CAAC,SAAS,EAAE;YAC/B,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,EAAE,KAAK,EAAE,EAAC,UAAU,EAAC,CAAC,CAAC;YACzD,IAAI,MAAM,CAAC,iBAAiB,GAAG,SAAS,EAAE;gBACtC,SAAS,GAAG,MAAM,CAAC,iBAAiB,CAAC;gBACrC,SAAS,GAAG,KAAK,CAAC;aACrB;YACD,KAAK,EAAE,CAAC;SACX;QAED,uEAAuE;QACvE,MAAM,gBAAgB,GAAG,CAAC,GAAG,GAAG,CAAC,SAAS,CAAC,CAAC;QAC5C,gBAAgB,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;QAEtC,OAAO;YACH,4DAA4D;YAC5D,OAAO,EAAE,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC;YACrD,6CAA6C;YAC7C,SAAS,EAAE,gBAAgB;SAC9B,CAAC;IACN,CAAC,EAAE;QACC,6FAA6F;QAC7F,OAAO,EAAE,EAAE;QACX,wCAAwC;QACxC,SAAS,EAAE,aAAa;KAC3B,CAAC,CAAC;IAEH,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,sBAAsB,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC9E,CAAC,CAAA;AArEY,QAAA,UAAU,cAqEtB;AAED,MAAM,oBAAoB,GAAG,CAAC,QAAiC,EAAE,EAAE;IAC/D,OAAO,CAAC,IAAY,EAAE,IAAY,EAAE,UAAmC,EAAE,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,EAAC,GAAG,QAAQ,EAAE,GAAG,OAAO,EAAC,CAAC,CAAC;AACxI,CAAC,CAAA;AAgBG,oDAAoB;AAdxB,MAAM,UAAU,GAAG;IACf,YAAY,EAAZ,uBAAY;IACZ,aAAa,EAAb,wBAAa;IACb,cAAc,EAAd,yBAAc;IACd,wBAAwB,EAAxB,mCAAwB;CAC3B,CAAC;AAWE,gCAAU;AATd,4BAA4B;AAC5B,MAAM,+BAA+B,GAAG,+BAAoB,CAAC;AAUzD,0EAA+B"} \ No newline at end of file diff --git a/dist/commonjs/normalization/index.d.ts b/dist/commonjs/normalization/index.d.ts index ae891c2..09d40c3 100644 --- a/dist/commonjs/normalization/index.d.ts +++ b/dist/commonjs/normalization/index.d.ts @@ -3,6 +3,7 @@ declare const lowercase: StringTransformFunc; declare const trim: StringTransformFunc; declare const replaceUnicode: StringTransformFunc; declare const removePunctuation: StringTransformFunc; +declare const removeNonAlphanumeric: StringTransformFunc; declare const removeWhitespace: StringTransformFunc; declare const replaceMultiWhitespace: StringTransformFunc; declare const transforms: { @@ -14,4 +15,4 @@ declare const transforms: { removePunctuation: StringTransformFunc; }; declare const strDefaultTransforms: StringTransformFunc[]; -export { lowercase, trim, replaceUnicode, removePunctuation, removeWhitespace, replaceMultiWhitespace, transforms, strDefaultTransforms }; +export { lowercase, trim, replaceUnicode, removePunctuation, removeWhitespace, removeNonAlphanumeric, replaceMultiWhitespace, transforms, strDefaultTransforms }; diff --git a/dist/commonjs/normalization/index.js b/dist/commonjs/normalization/index.js index cd77c4a..60d07d5 100644 --- a/dist/commonjs/normalization/index.js +++ b/dist/commonjs/normalization/index.js @@ -1,7 +1,8 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); -exports.strDefaultTransforms = exports.transforms = exports.replaceMultiWhitespace = exports.removeWhitespace = exports.removePunctuation = exports.replaceUnicode = exports.trim = exports.lowercase = void 0; -const PUNCTUATION_REGEX = new RegExp(/[^\w\s]|_/g); +exports.strDefaultTransforms = exports.transforms = exports.replaceMultiWhitespace = exports.removeNonAlphanumeric = exports.removeWhitespace = exports.removePunctuation = exports.replaceUnicode = exports.trim = exports.lowercase = void 0; +const PUNCTUATION_REGEX = new RegExp(/[`=(){}<>;',.~!@#$%^&*_+|:"?\-\\\[\]\/]/g); +const NON_ALPHANUMERIC_REGEX = new RegExp(/[^\w\s]|_/g); const WHITESPACE_REGEX = new RegExp(/\s/g); const MULTI_WHITESPACE_REGEX = new RegExp(/\s{2,}/g); const lowercase = (str) => str.toLocaleLowerCase(); @@ -12,6 +13,8 @@ const replaceUnicode = (str) => str.normalize('NFD').replace(/[\u0300-\u036f]/g, exports.replaceUnicode = replaceUnicode; const removePunctuation = (str) => str.replace(PUNCTUATION_REGEX, ''); exports.removePunctuation = removePunctuation; +const removeNonAlphanumeric = (str) => str.replace(NON_ALPHANUMERIC_REGEX, ''); +exports.removeNonAlphanumeric = removeNonAlphanumeric; const removeWhitespace = (str) => str.replace(WHITESPACE_REGEX, ''); exports.removeWhitespace = removeWhitespace; const replaceMultiWhitespace = (str) => str.replace(MULTI_WHITESPACE_REGEX, ' '); diff --git a/dist/commonjs/normalization/index.js.map b/dist/commonjs/normalization/index.js.map index f19e49f..fd2c87f 100644 --- a/dist/commonjs/normalization/index.js.map +++ b/dist/commonjs/normalization/index.js.map @@ -1 +1 @@ -{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/normalization/index.ts"],"names":[],"mappings":";;;AAEA,MAAM,iBAAiB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AACnD,MAAM,gBAAgB,GAAG,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;AAC3C,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,SAAS,CAAC,CAAC;AAErD,MAAM,SAAS,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;AAyB5E,8BAAS;AAxBb,MAAM,IAAI,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;AAyB1D,oBAAI;AAxBR,MAAM,cAAc,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;AAyB9G,wCAAc;AAxBlB,MAAM,iBAAiB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAyB/F,8CAAiB;AAxBrB,MAAM,gBAAgB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AAyB7F,4CAAgB;AAxBpB,MAAM,sBAAsB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,GAAG,CAAC,CAAC;AAyB1G,wDAAsB;AAvB1B,MAAM,UAAU,GAAG;IACf,SAAS;IACT,IAAI;IACJ,sBAAsB;IACtB,cAAc;IACd,gBAAgB;IAChB,iBAAiB;CACpB,CAAA;AAiBG,gCAAU;AAfd,MAAM,oBAAoB,GAA0B;IAChD,cAAc;IACd,iBAAiB;IACjB,IAAI;IACJ,sBAAsB;IACtB,SAAS;CACZ,CAAC;AAUE,oDAAoB"} \ No newline at end of file +{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/normalization/index.ts"],"names":[],"mappings":";;;AAEA,MAAM,iBAAiB,GAAG,IAAI,MAAM,CAAC,0CAA0C,CAAC,CAAC;AACjF,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AACxD,MAAM,gBAAgB,GAAG,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;AAC3C,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,SAAS,CAAC,CAAC;AAErD,MAAM,SAAS,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;AA0B5E,8BAAS;AAzBb,MAAM,IAAI,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;AA0B1D,oBAAI;AAzBR,MAAM,cAAc,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;AA0B9G,wCAAc;AAzBlB,MAAM,iBAAiB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AA0B/F,8CAAiB;AAzBrB,MAAM,qBAAqB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,EAAE,CAAC,CAAC;AA2BxG,sDAAqB;AA1BzB,MAAM,gBAAgB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AAyB7F,4CAAgB;AAxBpB,MAAM,sBAAsB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,GAAG,CAAC,CAAC;AA0B1G,wDAAsB;AAxB1B,MAAM,UAAU,GAAG;IACf,SAAS;IACT,IAAI;IACJ,sBAAsB;IACtB,cAAc;IACd,gBAAgB;IAChB,iBAAiB;CACpB,CAAA;AAkBG,gCAAU;AAhBd,MAAM,oBAAoB,GAA0B;IAChD,cAAc;IACd,iBAAiB;IACjB,IAAI;IACJ,sBAAsB;IACtB,SAAS;CACZ,CAAC;AAWE,oDAAoB"} \ No newline at end of file diff --git a/dist/esm/atomic.d.ts b/dist/esm/atomic.d.ts index 4586faa..e88c7e6 100644 --- a/dist/esm/atomic.d.ts +++ b/dist/esm/atomic.d.ts @@ -13,6 +13,14 @@ export interface StringComparisonOptions { * Useful when only the differences in content are important, but not the order of the content * */ reorder?: boolean; + /** + * When `reorder` is used this determines how to split each string into the tokens that will be reordered. + * + * The value of this property is used in String.split() -- https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/split#separator + * + * @default " " + * */ + delimiter?: string | RegExp; } export interface StringSamenessResult { strategies: { diff --git a/dist/esm/index.d.ts b/dist/esm/index.d.ts index 480e7c1..7bf1740 100644 --- a/dist/esm/index.d.ts +++ b/dist/esm/index.d.ts @@ -2,7 +2,7 @@ import { ComparisonStrategyResult, StringComparisonOptions, StringSamenessResult import { strDefaultTransforms, transforms } from "./normalization/index.js"; declare const defaultStrategies: import("./atomic.js").ComparisonStrategy[]; declare const stringSameness: (valA: string, valB: string, options?: StringComparisonOptions) => StringSamenessResult; -export declare const reorderStr: (cleanA: string, cleanB: string, options?: StringComparisonOptions) => string; +export declare const reorderStr: (strA: string, strB: string, options?: StringComparisonOptions) => [string, string]; declare const createStringSameness: (defaults: StringComparisonOptions) => (valA: string, valB: string, options?: StringComparisonOptions) => StringSamenessResult; declare const strategies: { diceStrategy: import("./atomic.js").ComparisonStrategy; diff --git a/dist/esm/index.js b/dist/esm/index.js index 65ee2e7..9a862b4 100644 --- a/dist/esm/index.js +++ b/dist/esm/index.js @@ -11,16 +11,18 @@ const defaultStrategies = [ cosineStrategy ]; const stringSameness = (valA, valB, options) => { - const { transforms = strDefaultTransforms, strategies = defaultStrategies, reorder = false, } = options || {}; - const cleanA = transforms.reduce((acc, curr) => curr(acc), valA); + const { transforms = strDefaultTransforms, strategies = defaultStrategies, reorder = false, delimiter = ' ' } = options || {}; + let cleanA = transforms.reduce((acc, curr) => curr(acc), valA); let cleanB = transforms.reduce((acc, curr) => curr(acc), valB); - const shortest = cleanA.length > cleanB.length ? cleanB : cleanA; if (reorder) { // we want to ignore order of tokens as much as possible (user does not care about differences in word order, just absolute differences in characters overall) - // so we will reorder cleanB so its tokens match the order or tokens in cleanA as closely as possible + // so we will reorder the shorter of the two strings so its tokens match the order of tokens in the longer string as closely as possible // before we run strategies - cleanB = reorderStr(cleanA, cleanB); + const [orderedX, orderedY] = reorderStr(cleanA, cleanB); + cleanA = orderedX; + cleanB = orderedY; } + const shortest = cleanA.length > cleanB.length ? cleanB : cleanA; const stratResults = []; for (const strat of strategies) { if (strat.isValid !== undefined && !strat.isValid(cleanA, cleanB)) { @@ -53,32 +55,58 @@ const stringSameness = (valA, valB, options) => { highScoreWeighted, }; }; -export const reorderStr = (cleanA, cleanB, options) => { - // to do the reordering we will use stringSameness with the provided strats to match against each token in cleanA and choose the closest token in cleanB - // and add the end concat any remaining tokens from cleanB to the reordered string - const aTokens = cleanA.split(' '); - const bTokens = cleanB.split(' '); - const orderedCandidateTokens = aTokens.reduce((acc, curr) => { +export const reorderStr = (strA, strB, options) => { + const { transforms = strDefaultTransforms, strategies = defaultStrategies, delimiter = ' ' } = options || {}; + const cleanA = transforms.reduce((acc, curr) => curr(acc), strA); + const cleanB = transforms.reduce((acc, curr) => curr(acc), strB); + // split by "token" + const eTokens = cleanA.split(delimiter); + const cTokens = cleanB.split(delimiter); + let longerTokens, shorterTokens; + if (eTokens.length > cTokens.length) { + longerTokens = eTokens; + shorterTokens = cTokens; + } + else { + longerTokens = cTokens; + shorterTokens = eTokens; + } + // we will use longest string (token list) as the reducer and order the shorter list to match it + // so we don't have to deal with undefined positions in the shorter list + const orderedCandidateTokens = longerTokens.reduce((acc, curr) => { + // if we've run out of tokens in the shorter list just return + if (acc.remaining.length === 0) { + return acc; + } + // on each iteration of tokens in the long list + // we iterate through remaining tokens from the shorter list and find the token with the most sameness let highScore = 0; let highIndex = 0; let index = 0; for (const token of acc.remaining) { - const result = stringSameness(curr, token, { ...options, reorder: false }); - if (result.highScore > highScore) { - highScore = result.highScore; + const result = stringSameness(curr, token, { strategies }); + if (result.highScoreWeighted > highScore) { + highScore = result.highScoreWeighted; highIndex = index; } index++; } + // then remove the most same token from the remaining short list tokens const splicedRemaining = [...acc.remaining]; - if (highIndex <= splicedRemaining.length - 1) { - splicedRemaining.splice(highIndex, 1); - } - const ordered = highIndex <= acc.remaining.length - 1 ? acc.ordered.concat(acc.remaining[highIndex]) : acc.ordered; - return { ordered: ordered, remaining: splicedRemaining }; - }, { ordered: [], remaining: bTokens }); - const allOrderedCandidateTokens = orderedCandidateTokens.ordered.concat(orderedCandidateTokens.remaining); - return allOrderedCandidateTokens.join(' '); + splicedRemaining.splice(highIndex, 1); + return { + // finally add the most same token to the ordered short list + ordered: acc.ordered.concat(acc.remaining[highIndex]), + // and return the remaining short list tokens + remaining: splicedRemaining + }; + }, { + // "ordered" is the result of ordering tokens in the shorter list to match longer token order + ordered: [], + // remaining is the initial shorter list + remaining: shorterTokens + }); + return [longerTokens.join(' '), orderedCandidateTokens.ordered.join(' ')]; }; const createStringSameness = (defaults) => { return (valA, valB, options = {}) => stringSameness(valA, valB, { ...defaults, ...options }); diff --git a/dist/esm/index.js.map b/dist/esm/index.js.map index eabec69..1c07647 100644 --- a/dist/esm/index.js.map +++ b/dist/esm/index.js.map @@ -1 +1 @@ -{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,cAAc,EAAE,aAAa,EAAE,YAAY,EAAE,wBAAwB,EAAC,MAAM,+BAA+B,CAAC;AAQpH,OAAO,EAAC,oBAAoB,EAAE,UAAU,EAAC,MAAM,0BAA0B,CAAC;AAE1E,MAAM,oBAAoB,GAAG,CAAC,MAAc,EAAE,EAAE;IAC5C,oBAAoB;IACpB,4BAA4B;IAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;AACzC,CAAC,CAAA;AAED,MAAM,iBAAiB,GAAG;IACtB,YAAY;IACZ,aAAa;IACb,cAAc;CACjB,CAAA;AAED,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAwB,EAAE;IAE3G,MAAM,EACF,UAAU,GAAG,oBAAoB,EACjC,UAAU,GAAG,iBAAiB,EAC9B,OAAO,GAAG,KAAK,GAClB,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IACjE,IAAI,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAE/D,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IAEjE,IAAI,OAAO,EAAE;QACT,8JAA8J;QAC9J,qGAAqG;QACrG,2BAA2B;QAC3B,MAAM,GAAG,UAAU,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KACvC;IAED,MAAM,YAAY,GAA0C,EAAE,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE;QAC5B,IAAI,KAAK,CAAC,OAAO,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE;YAC/D,SAAS;SACZ;QACD,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3C,MAAM,MAAM,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAC,KAAK,EAAE,GAAG,EAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC5D,YAAY,CAAC,IAAI,CAAC;YACd,GAAG,MAAM;YACT,IAAI,EAAE,KAAK,CAAC,IAAI;SACnB,CAAC,CAAC;KACN;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,oBAAoB,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IAE1D,qBAAqB;IACrB,MAAM,SAAS,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;IAChG,kCAAkC;IAClC,MAAM,iBAAiB,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAChE,MAAM,QAAQ,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAgD,EAAE,IAAI,EAAE,EAAE;QAC5F,MAAM,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,IAAI,EAAC,GAAG,IAAI,CAAC;QACpC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YACb,GAAG,IAAI;YACP,KAAK;SACR,CAAC;QACF,OAAO,GAAG,CAAC;IACf,CAAC,EAAE,EAAE,CAAC,CAAC;IACP,OAAO;QACH,UAAU,EAAE,QAAQ;QACpB,SAAS;QACT,iBAAiB;KACpB,CAAA;AACL,CAAC,CAAA;AAED,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,MAAc,EAAE,MAAc,EAAE,OAAiC,EAAU,EAAE;IACpG,wJAAwJ;IACxJ,kFAAkF;IAClF,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,MAAM,sBAAsB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAA+C,EAAE,IAAI,EAAE,EAAE;QACpG,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,SAAS,GAAuB,CAAC,CAAC;QACtC,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,MAAM,KAAK,IAAI,GAAG,CAAC,SAAS,EAAE;YAC/B,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,EAAE,KAAK,EAAE,EAAC,GAAG,OAAO,EAAE,OAAO,EAAE,KAAK,EAAC,CAAC,CAAC;YACzE,IAAI,MAAM,CAAC,SAAS,GAAG,SAAS,EAAE;gBAC9B,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;gBAC7B,SAAS,GAAG,KAAK,CAAC;aACrB;YACD,KAAK,EAAE,CAAC;SACX;QAED,MAAM,gBAAgB,GAAG,CAAC,GAAG,GAAG,CAAC,SAAS,CAAC,CAAC;QAC5C,IAAG,SAAS,IAAI,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE;YACzC,gBAAgB,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;SACzC;QACD,MAAM,OAAO,GAAG,SAAS,IAAI,GAAG,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC;QAEnH,OAAO,EAAC,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,gBAAgB,EAAC,CAAC;IAC3D,CAAC,EAAE,EAAC,OAAO,EAAE,EAAE,EAAE,SAAS,EAAE,OAAO,EAAC,CAAC,CAAC;IACtC,MAAM,yBAAyB,GAAG,sBAAsB,CAAC,OAAO,CAAC,MAAM,CAAC,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAC1G,OAAO,yBAAyB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC/C,CAAC,CAAA;AAED,MAAM,oBAAoB,GAAG,CAAC,QAAiC,EAAE,EAAE;IAC/D,OAAO,CAAC,IAAY,EAAE,IAAY,EAAE,UAAmC,EAAE,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,EAAC,GAAG,QAAQ,EAAE,GAAG,OAAO,EAAC,CAAC,CAAC;AACxI,CAAC,CAAA;AAED,MAAM,UAAU,GAAG;IACf,YAAY;IACZ,aAAa;IACb,cAAc;IACd,wBAAwB;CAC3B,CAAC;AAEF,4BAA4B;AAC5B,MAAM,+BAA+B,GAAG,oBAAoB,CAAC;AAE7D,OAAO,EAGH,cAAc,EACd,oBAAoB,EACpB,iBAAiB,EACjB,UAAU,EACV,UAAU,EACV,+BAA+B,EAC/B,oBAAoB,EAGvB,CAAA"} \ No newline at end of file +{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,cAAc,EAAE,aAAa,EAAE,YAAY,EAAE,wBAAwB,EAAC,MAAM,+BAA+B,CAAC;AAQpH,OAAO,EAAC,oBAAoB,EAAE,UAAU,EAAC,MAAM,0BAA0B,CAAC;AAE1E,MAAM,oBAAoB,GAAG,CAAC,MAAc,EAAE,EAAE;IAC5C,oBAAoB;IACpB,4BAA4B;IAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;AACzC,CAAC,CAAA;AAED,MAAM,iBAAiB,GAAG;IACtB,YAAY;IACZ,aAAa;IACb,cAAc;CACjB,CAAA;AAED,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAwB,EAAE;IAE3G,MAAM,EACF,UAAU,GAAG,oBAAoB,EACjC,UAAU,GAAG,iBAAiB,EAC9B,OAAO,GAAG,KAAK,EACf,SAAS,GAAG,GAAG,EAClB,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,IAAI,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAC/D,IAAI,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAE/D,IAAI,OAAO,EAAE;QACT,8JAA8J;QAC9J,wIAAwI;QACxI,2BAA2B;QAC3B,MAAM,CAAC,QAAQ,EAAE,QAAQ,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACxD,MAAM,GAAG,QAAQ,CAAC;QAClB,MAAM,GAAG,QAAQ,CAAC;KACrB;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IAEjE,MAAM,YAAY,GAA0C,EAAE,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE;QAC5B,IAAI,KAAK,CAAC,OAAO,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE;YAC/D,SAAS;SACZ;QACD,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3C,MAAM,MAAM,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAC,KAAK,EAAE,GAAG,EAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC5D,YAAY,CAAC,IAAI,CAAC;YACd,GAAG,MAAM;YACT,IAAI,EAAE,KAAK,CAAC,IAAI;SACnB,CAAC,CAAC;KACN;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,oBAAoB,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IAE1D,qBAAqB;IACrB,MAAM,SAAS,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;IAChG,kCAAkC;IAClC,MAAM,iBAAiB,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAChE,MAAM,QAAQ,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAgD,EAAE,IAAI,EAAE,EAAE;QAC5F,MAAM,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,IAAI,EAAC,GAAG,IAAI,CAAC;QACpC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YACb,GAAG,IAAI;YACP,KAAK;SACR,CAAC;QACF,OAAO,GAAG,CAAC;IACf,CAAC,EAAE,EAAE,CAAC,CAAC;IACP,OAAO;QACH,UAAU,EAAE,QAAQ;QACpB,SAAS;QACT,iBAAiB;KACpB,CAAA;AACL,CAAC,CAAA;AAED,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAoB,EAAE;IAE1G,MAAM,EACF,UAAU,GAAG,oBAAoB,EACjC,UAAU,GAAG,iBAAiB,EAC9B,SAAS,GAAG,GAAG,EAClB,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IACjE,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAEjE,mBAAmB;IACnB,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IACxC,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAGxC,IAAI,YAAsB,EACtB,aAAuB,CAAC;IAE5B,IAAI,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE;QACjC,YAAY,GAAG,OAAO,CAAC;QACvB,aAAa,GAAG,OAAO,CAAC;KAC3B;SAAM;QACH,YAAY,GAAG,OAAO,CAAC;QACvB,aAAa,GAAG,OAAO,CAAC;KAC3B;IAED,gGAAgG;IAChG,wEAAwE;IAExE,MAAM,sBAAsB,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAA+C,EAAE,IAAI,EAAE,EAAE;QACzG,6DAA6D;QAC7D,IAAI,GAAG,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE;YAC5B,OAAO,GAAG,CAAC;SACd;QAED,+CAA+C;QAC/C,sGAAsG;QAEtG,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,MAAM,KAAK,IAAI,GAAG,CAAC,SAAS,EAAE;YAC/B,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,EAAE,KAAK,EAAE,EAAC,UAAU,EAAC,CAAC,CAAC;YACzD,IAAI,MAAM,CAAC,iBAAiB,GAAG,SAAS,EAAE;gBACtC,SAAS,GAAG,MAAM,CAAC,iBAAiB,CAAC;gBACrC,SAAS,GAAG,KAAK,CAAC;aACrB;YACD,KAAK,EAAE,CAAC;SACX;QAED,uEAAuE;QACvE,MAAM,gBAAgB,GAAG,CAAC,GAAG,GAAG,CAAC,SAAS,CAAC,CAAC;QAC5C,gBAAgB,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;QAEtC,OAAO;YACH,4DAA4D;YAC5D,OAAO,EAAE,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC;YACrD,6CAA6C;YAC7C,SAAS,EAAE,gBAAgB;SAC9B,CAAC;IACN,CAAC,EAAE;QACC,6FAA6F;QAC7F,OAAO,EAAE,EAAE;QACX,wCAAwC;QACxC,SAAS,EAAE,aAAa;KAC3B,CAAC,CAAC;IAEH,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,sBAAsB,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC9E,CAAC,CAAA;AAED,MAAM,oBAAoB,GAAG,CAAC,QAAiC,EAAE,EAAE;IAC/D,OAAO,CAAC,IAAY,EAAE,IAAY,EAAE,UAAmC,EAAE,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,EAAC,GAAG,QAAQ,EAAE,GAAG,OAAO,EAAC,CAAC,CAAC;AACxI,CAAC,CAAA;AAED,MAAM,UAAU,GAAG;IACf,YAAY;IACZ,aAAa;IACb,cAAc;IACd,wBAAwB;CAC3B,CAAC;AAEF,4BAA4B;AAC5B,MAAM,+BAA+B,GAAG,oBAAoB,CAAC;AAE7D,OAAO,EAGH,cAAc,EACd,oBAAoB,EACpB,iBAAiB,EACjB,UAAU,EACV,UAAU,EACV,+BAA+B,EAC/B,oBAAoB,EAGvB,CAAA"} \ No newline at end of file diff --git a/dist/esm/normalization/index.d.ts b/dist/esm/normalization/index.d.ts index ae891c2..09d40c3 100644 --- a/dist/esm/normalization/index.d.ts +++ b/dist/esm/normalization/index.d.ts @@ -3,6 +3,7 @@ declare const lowercase: StringTransformFunc; declare const trim: StringTransformFunc; declare const replaceUnicode: StringTransformFunc; declare const removePunctuation: StringTransformFunc; +declare const removeNonAlphanumeric: StringTransformFunc; declare const removeWhitespace: StringTransformFunc; declare const replaceMultiWhitespace: StringTransformFunc; declare const transforms: { @@ -14,4 +15,4 @@ declare const transforms: { removePunctuation: StringTransformFunc; }; declare const strDefaultTransforms: StringTransformFunc[]; -export { lowercase, trim, replaceUnicode, removePunctuation, removeWhitespace, replaceMultiWhitespace, transforms, strDefaultTransforms }; +export { lowercase, trim, replaceUnicode, removePunctuation, removeWhitespace, removeNonAlphanumeric, replaceMultiWhitespace, transforms, strDefaultTransforms }; diff --git a/dist/esm/normalization/index.js b/dist/esm/normalization/index.js index 1d95387..bc7ae0b 100644 --- a/dist/esm/normalization/index.js +++ b/dist/esm/normalization/index.js @@ -1,10 +1,12 @@ -const PUNCTUATION_REGEX = new RegExp(/[^\w\s]|_/g); +const PUNCTUATION_REGEX = new RegExp(/[`=(){}<>;',.~!@#$%^&*_+|:"?\-\\\[\]\/]/g); +const NON_ALPHANUMERIC_REGEX = new RegExp(/[^\w\s]|_/g); const WHITESPACE_REGEX = new RegExp(/\s/g); const MULTI_WHITESPACE_REGEX = new RegExp(/\s{2,}/g); const lowercase = (str) => str.toLocaleLowerCase(); const trim = (str) => str.trim(); const replaceUnicode = (str) => str.normalize('NFD').replace(/[\u0300-\u036f]/g, ""); const removePunctuation = (str) => str.replace(PUNCTUATION_REGEX, ''); +const removeNonAlphanumeric = (str) => str.replace(NON_ALPHANUMERIC_REGEX, ''); const removeWhitespace = (str) => str.replace(WHITESPACE_REGEX, ''); const replaceMultiWhitespace = (str) => str.replace(MULTI_WHITESPACE_REGEX, ' '); const transforms = { @@ -22,5 +24,5 @@ const strDefaultTransforms = [ replaceMultiWhitespace, lowercase ]; -export { lowercase, trim, replaceUnicode, removePunctuation, removeWhitespace, replaceMultiWhitespace, transforms, strDefaultTransforms }; +export { lowercase, trim, replaceUnicode, removePunctuation, removeWhitespace, removeNonAlphanumeric, replaceMultiWhitespace, transforms, strDefaultTransforms }; //# sourceMappingURL=index.js.map \ No newline at end of file diff --git a/dist/esm/normalization/index.js.map b/dist/esm/normalization/index.js.map index 522b8b1..4817cb4 100644 --- a/dist/esm/normalization/index.js.map +++ b/dist/esm/normalization/index.js.map @@ -1 +1 @@ -{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/normalization/index.ts"],"names":[],"mappings":"AAEA,MAAM,iBAAiB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AACnD,MAAM,gBAAgB,GAAG,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;AAC3C,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,SAAS,CAAC,CAAC;AAErD,MAAM,SAAS,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;AAChF,MAAM,IAAI,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;AAC9D,MAAM,cAAc,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;AAClH,MAAM,iBAAiB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AACnG,MAAM,gBAAgB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AACjG,MAAM,sBAAsB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,GAAG,CAAC,CAAC;AAE9G,MAAM,UAAU,GAAG;IACf,SAAS;IACT,IAAI;IACJ,sBAAsB;IACtB,cAAc;IACd,gBAAgB;IAChB,iBAAiB;CACpB,CAAA;AAED,MAAM,oBAAoB,GAA0B;IAChD,cAAc;IACd,iBAAiB;IACjB,IAAI;IACJ,sBAAsB;IACtB,SAAS;CACZ,CAAC;AAEF,OAAO,EACH,SAAS,EACT,IAAI,EACJ,cAAc,EACd,iBAAiB,EACjB,gBAAgB,EAChB,sBAAsB,EACtB,UAAU,EACV,oBAAoB,EACvB,CAAA"} \ No newline at end of file +{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/normalization/index.ts"],"names":[],"mappings":"AAEA,MAAM,iBAAiB,GAAG,IAAI,MAAM,CAAC,0CAA0C,CAAC,CAAC;AACjF,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AACxD,MAAM,gBAAgB,GAAG,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;AAC3C,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,SAAS,CAAC,CAAC;AAErD,MAAM,SAAS,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;AAChF,MAAM,IAAI,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;AAC9D,MAAM,cAAc,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;AAClH,MAAM,iBAAiB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AACnG,MAAM,qBAAqB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,EAAE,CAAC,CAAC;AAC5G,MAAM,gBAAgB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AACjG,MAAM,sBAAsB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,GAAG,CAAC,CAAC;AAE9G,MAAM,UAAU,GAAG;IACf,SAAS;IACT,IAAI;IACJ,sBAAsB;IACtB,cAAc;IACd,gBAAgB;IAChB,iBAAiB;CACpB,CAAA;AAED,MAAM,oBAAoB,GAA0B;IAChD,cAAc;IACd,iBAAiB;IACjB,IAAI;IACJ,sBAAsB;IACtB,SAAS;CACZ,CAAC;AAEF,OAAO,EACH,SAAS,EACT,IAAI,EACJ,cAAc,EACd,iBAAiB,EACjB,gBAAgB,EAChB,qBAAqB,EACrB,sBAAsB,EACtB,UAAU,EACV,oBAAoB,EACvB,CAAA"} \ No newline at end of file