diff --git a/lib/db/query-parsers/sql.js b/lib/db/query-parsers/sql.js index c728704dbc..6e766f77eb 100644 --- a/lib/db/query-parsers/sql.js +++ b/lib/db/query-parsers/sql.js @@ -5,28 +5,37 @@ 'use strict' -const logger = require('../../logger').child({ component: 'sql_query_parser' }) -const StatementMatcher = require('../statement-matcher') +const defaultLogger = require('../../logger').child({ component: 'sql_query_parser' }) const stringify = require('json-stringify-safe') -const OPERATIONS = [ - new StatementMatcher( - 'select', - /^[^\S]*?select\b[\s\S]+?\bfrom[\s\n\r\[\(]+([^\]\s\n\r,)(;]*)/gim - ), - new StatementMatcher('update', /^[^\S]*?update[^\S]+?([^\s\n\r,;]+)/gim), - new StatementMatcher( - 'insert', - /^[^\S]*?insert(?:[^\S]+ignore)?[^\S]+into[^\S]+([^\s\n\r(,;]+)/gim - ), - new StatementMatcher('delete', /^[^\S]*?delete[^\S]+?from[^\S]+([^\s\n\r,(;]+)/gim) -] -const COMMENT_PATTERN = /\/\\*.*?\\*\//g - -// This must be called synchronously after the initial db call for backtraces to -// work correctly - -module.exports = function parseSql(sql) { +/** + * In a query like `select * from (select * from foo)`, extract the subquery + * as the statement to retrieve the target identifier from. + * + * @type {RegExp} + */ +const selectSubquery = /from\s*?\((?select.*?)\)\s*?/i + +/** + * Matches queries with leading common table expressions and assigns the + * actual query to a match group named `query`. + * + * @type {RegExp} + */ +const cteMatcher = /^\s*?with[\w\W]*?\)\s*?(?(?:insert|update|delete|select)[\w\W]*)/i + +/** + * Parses a SQL statement into the parts we want to report as metadata in + * database transactions. + * + * @param {string} sql The statement to parse. + * @param {object} [deps] A set of optional dependencies. + * @param {object} [deps.logger] A logger instance. + * + * @returns {{query: string, collection: null|string, operation: string}} Parsed + * metadata. + */ +module.exports = function parseSql(sql, { logger = defaultLogger } = {}) { // Sometimes we get an object here from MySQL. We have been unable to // reproduce it, so we'll just log what that object is and return a statement // type of `other`. @@ -36,9 +45,9 @@ module.exports = function parseSql(sql) { if (typeof sql !== 'string') { if (logger.traceEnabled()) { try { - logger.trace('parseSQL got an a non-string sql that looks like: %s', stringify(sql)) + logger.trace('parseSQL got a non-string sql that looks like: %s', stringify(sql)) } catch (err) { - logger.debug(err, 'Unabler to stringify SQL') + logger.debug(err, 'Unable to stringify SQL') } } return { @@ -48,24 +57,287 @@ module.exports = function parseSql(sql) { } } - sql = sql.replace(COMMENT_PATTERN, '').trim() + sql = removeMultiLineComments(sql).trim() + sql = removeSingleLineComments(sql).trim() + let result = { + operation: 'other', + collection: null, + query: sql + } + + // We want to remove the CTE _after_ assigning the statement to the result's + // `query` property. Otherwise, the actual query will not be recorded in + // the trace. + sql = removeCte(sql) + + // After all of our normalizing of the overall query, if it doesn't actually + // look like an SQL statement, short-circuit the parsing routine. + if (looksLikeValidSql(sql) === false) { + return result + } + + const lines = sql.split('\n') + result = { ...result, ...parseLines(lines) } + result.query = sql.trim() + + return result +} - let parsedStatement +/** + * Iterates the lines of an SQL statement, reducing them to the relevant lines, + * and returns the metadata found within. + * + * We do not inline this in `parseSql` because doing so will violate a + * code complexity linting rule. + * + * @param {string[]} lines Set of SQL statement lines. + * + * @returns {{collection: null, operation: string}} SQL statement metadata. + */ +function parseLines(lines) { + let result = { + operation: 'other', + collection: null + } + + parser: for (let i = 0; i < lines.length; i += 1) { + const line = lines[i].toLowerCase().trim() + switch (true) { + case line.startsWith('select'): { + const statement = lines.slice(i).join(' ') + result.operation = 'select' + result = { ...result, ...parseStatement(statement, 'select') } + break parser + } + + case line.startsWith('update'): { + const statement = lines.slice(i).join(' ') + result.operation = 'update' + result = { ...result, ...parseStatement(statement, 'update') } + break parser + } + + case line.startsWith('insert'): { + const statement = lines.slice(i).join(' ') + result.operation = 'insert' + result = { ...result, ...parseStatement(statement, 'insert') } + break parser + } + + case line.startsWith('delete'): { + const statement = lines.slice(i).join(' ') + result.operation = 'delete' + result = { ...result, ...parseStatement(statement, 'delete') } + break parser + } + } + } + + return result +} + +/** + * Iterates through the provided string and removes all multi-line comments + * found therein. + * + * @param {string} input The string to parse. + * + * @returns {string} Cleaned up string. + */ +function removeMultiLineComments(input) { + const startPos = input.indexOf('/*') + if (startPos === -1) { + return input + } + + const endPos = input.indexOf('*/', startPos + 2) + const part1 = input.slice(0, startPos).trim() + const part2 = input.slice(endPos + 2).trim() + return removeMultiLineComments(`${part1} ${part2}`) +} + +/** + * Removes all single line, and trailing, comments from the input query. + * These are comments that start with `--` or `#`. + * + * @param {string} input The query that might contain comments. + * @returns {string} The query without any comments. + */ +function removeSingleLineComments(input) { + const resultLines = [] + const lines = input.split('\n') + for (let i = 0; i < lines.length; i += 1) { + let line = lines[i] + if (/^(--|#)/.test(line) === true) { + continue + } + let pos = line.indexOf(' --') + if (pos > -1) { + line = line.slice(0, pos) + resultLines.push(line) + continue + } + pos = line.indexOf(' #') + if (pos > -1) { + line = line.slice(0, pos) + resultLines.push(line) + continue + } + + resultLines.push(line) + } + return resultLines.join('\n') +} - for (let i = 0, l = OPERATIONS.length; i < l; i++) { - parsedStatement = OPERATIONS[i].getParsedStatement(sql) - if (parsedStatement) { +/** + * Removes any leading common table expression (CTE) from the query and returns + * the query that targets the CTE. The metadata we are interested in, is not + * contained in the CTE, but in the query targeting the CTE. + * + * @param {string} statement The SQL statement that might have a CTE. + * @returns {string} The SQL statement without a leading CTE. + */ +function removeCte(statement) { + const matches = cteMatcher.exec(statement) + if (matches === null) { + return statement + } + return matches.groups.query +} + +/** + * Tests the start of the statement to determine if it looks like a valid + * SQL statement. + * + * @param {string} sql SQL statement with any comments stripped. + * + * @returns {boolean} True if the statement looks good. Otherwise, false. + */ +function looksLikeValidSql(sql) { + return /^\s*?(?:with|select|insert|update|delete)/i.test(sql.toLowerCase()) +} + +/** + * Extracts the collection, database, and table information from an SQL + * statement. + * + * @param {string} statement The SQL statement to parse. + * @param {string} [kind] The type of SQL statement being parsed. This + * dictates how the algorithm will determine where the desired fields are. + * Valid values are: `insert`, `delete`, `select`, and `update`. + * + * @returns {{database: string, collection, table}} The found information. + */ +function parseStatement(statement, kind = 'insert') { + let splitter + switch (kind) { + case 'insert': { + splitter = /\s*?\binto\b\s*?/i + break + } + + case 'delete': { + splitter = /\s*?\bfrom\b\s*?/i + break + } + + case 'select': { + const subqueryMatch = selectSubquery.exec(statement) + if (subqueryMatch !== null) { + statement = subqueryMatch.groups.subquery + } + + if (/\bfrom\b/i.test(statement) === false) { + // Statement does not specify a table. We don't need further processing. + // E.g., we have a statement like `select 1 + 1 as added`. + return { collection: 'unknown', table: 'unknown' } + } + + splitter = /\s*?\bfrom\b\s*?/i + break + } + + case 'update': { + splitter = /\s*?\bupdate\b\s*?/i break } } - if (parsedStatement) { - return parsedStatement + const targetIdentifier = statement.split(splitter).pop().trim().split(/\s/).shift() + return parseTableIdentifier(targetIdentifier) +} + +function parseTableIdentifier(identifier) { + const leadingChars = /^[`'"]/ + const trailingChars = /[`'"]$/ + let collection + let database + let table + + const separatorPos = identifier.indexOf('.') + if (separatorPos > 0) { + const parts = identifier.split('.', 2) + database = parts[0] + table = parts[1] + } else { + table = identifier.replace(leadingChars, '').replace(trailingChars, '') + table = normalizeTableName(identifier) } - return { - operation: 'other', - collection: null, - query: sql + if (table !== undefined) { + table = table.replace(leadingChars, '').replace(trailingChars, '') + table = normalizeTableName(table) } + if (database !== undefined) { + database = database.replace(leadingChars, '').replace(trailingChars, '') + collection = `${database}.${table}` + } + if (collection === undefined) { + collection = table + } + + return { collection, database, table } +} + +/** + * Our cross-application tests have tests that do not match any known SQL + * engine's valid syntax for table names. But we need to support them, so this + * function will inspect table names and try to return the correct thing. + * + * @param {string} tableIdentifier Something that _should_ represent a table + * name. + * + * @returns {string} The normalized table name. + */ +function normalizeTableName(tableIdentifier) { + // Some of our tests add non-standard characters to table names and expects + // they will be stripped. + tableIdentifier = tableIdentifier.replace(/[;]/g, '') + + if (tableIdentifier[0] === '(') { + // We might have a subquery. If there is a single word between the + // parentheticals, we return it as the table name (even though this is not + // valid SQL). Otherwise, we return a special value. + + const parts = tableIdentifier.replace(/[()]/g, '').split(/\s/) + if (parts.length === 1) { + return parts[0] + } + } + + const parenPos = tableIdentifier.indexOf('(') + if (parenPos > 0) { + // We seem to accept `into foo(x,y)` as a valid table name, where we + // decide that "foo" is the actual table name. + return tableIdentifier.slice(0, parenPos) + } + + const commaPos = tableIdentifier.indexOf(',') + if (commaPos > -1) { + // For some reason, we accept `from foo,bar` and decide that "foo" is + // the actual table name. + return tableIdentifier.slice(0, commaPos) + } + + return tableIdentifier } diff --git a/lib/db/statement-matcher.js b/lib/db/statement-matcher.js deleted file mode 100644 index 1e79a38ef1..0000000000 --- a/lib/db/statement-matcher.js +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2020 New Relic Corporation. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - */ - -'use strict' - -// ( ` database` . ` table ` ) -const CLEANER = /^\(?(?:([`'"]?)(.*?)\1\.)?([`'"]?)(.*?)\3\)?$/ - -function StatementMatcher(operation, operationPattern) { - this.operation = operation - this.matcher = new RegExp('^\\s*' + operation, 'ig') - this.operationPattern = operationPattern -} - -StatementMatcher.prototype.getParsedStatement = function getParsedStatement(sql) { - this.operationPattern.lastIndex = 0 - this.matcher.lastIndex = 0 - CLEANER.lastIndex = 0 - - if (this.matcher.test(sql)) { - const queryMatch = this.operationPattern.exec(sql) - let collection = queryMatch ? queryMatch[1] : 'unknown' - let database = null - - // If the cleaner can match this collection, pull out the cleaned up names - // from there. The spec doesn't want the database names in the collection - // name, but for legacy reasons we keep it. - // TODO: Either update the spec (and CATs) to accept database name in the - // collection name or remove it here. - const cleanerMatch = CLEANER.exec(collection) - if (cleanerMatch && cleanerMatch[4]) { - collection = cleanerMatch[4] - if (cleanerMatch[2]) { - database = cleanerMatch[2] - collection = database + '.' + collection - } - } - - // TODO: Pass through the database here to the parsed statement. It could - // be used for datastore attributes. - return { - operation: this.operation, - database: database, - collection: collection, - query: sql - } - } - - return null -} - -module.exports = StatementMatcher diff --git a/test/unit/db/query-parsers/sql.test.js b/test/unit/db/query-parsers/sql.test.js index d9168b5549..2b7a3acf3e 100644 --- a/test/unit/db/query-parsers/sql.test.js +++ b/test/unit/db/query-parsers/sql.test.js @@ -7,6 +7,8 @@ const test = require('node:test') const assert = require('node:assert') + +const match = require('../../../lib/custom-assertions/match') const parseSql = require('../../../../lib/db/query-parsers/sql') const CATs = require('../../../lib/cross_agent_tests/sql_parsing') @@ -161,3 +163,207 @@ test('database query parser', async (t) => { } }) }) + +test('logs correctly if input is incorrect', () => { + let logs = [] + const logger = { + traceEnabled() { + return true + }, + trace(msg, data) { + logs.push([msg, data]) + }, + debug(error, msg) { + logs.push([error, msg]) + } + } + + let result = parseSql({ an: 'invalid object' }, { logger }) + assert.deepStrictEqual(result, { operation: 'other', collection: null, query: '' }) + assert.deepStrictEqual(logs, [ + ['parseSQL got a non-string sql that looks like: %s', `{"an":"invalid object"}`] + ]) + + logs = [] + logger.trace = () => { + throw Error('boom') + } + result = parseSql({ an: 'invalid object' }, { logger }) + assert.deepStrictEqual(result, { operation: 'other', collection: null, query: '' }) + assert.equal(logs[0][1], 'Unable to stringify SQL') +}) + +test('reports correct info if single line comments present', () => { + const expected = { operation: 'insert', collection: 'bar', table: 'bar' } + let statement = `-- insert into bar some stuff + insert into bar + (col1, col2) -- the columns + values('a', 'b') -- the values + ` + let found = parseSql(statement) + match(found, expected) + + statement = `# insert into bar some stuff + insert into bar + (col1, col2) # the columns + values('a', 'b') # the values + ` + found = parseSql(statement) + match(found, expected) + + statement = `--insert into bar some stuff + insert into bar + (col1, col2) --the columns + values('--hoorah', '#foobar') #the values + ` + found = parseSql(statement) + match(found, expected) +}) + +test('reports correct info if multi-line comments present', () => { + const expected = { operation: 'insert', collection: 'foo', table: 'foo' } + + let statement = `/*insert into bar some stuff*/ + insert into foo (col1) values('bar') + ` + let found = parseSql(statement) + match(found, expected) + + statement = `/**** + insert into bar some stuff + ****/ + insert into + foo (col1) + values('bar') + ` + found = parseSql(statement) + match(found, expected) + + statement = `insert /* insert into bar */ into foo` + found = parseSql(statement) + match(found, expected) + + statement = `/* insert into bar some stuff */ insert into foo (col1)` + found = parseSql(statement) + match(found, expected) + + statement = `insert into /* insert into bar some stuff */ foo (col1)` + found = parseSql(statement) + match(found, expected) + + statement = `insert /* comments! */ into /* insert into bar some stuff */ foo /* MOAR */ (col1)` + found = parseSql(statement) + match(found, expected) +}) + +test('handles quoted names', () => { + const expected = { operation: 'insert', collection: 'foo', table: 'foo' } + + let statement = 'insert into `foo` (col1)' + let found = parseSql(statement) + match(found, expected) + + statement = `insert into 'foo' (col1)` + found = parseSql(statement) + match(found, expected) + + statement = `insert into "foo" (col1)` + found = parseSql(statement) + match(found, expected) + + expected.collection = 'foo``foo' + expected.table = 'foo``foo' + statement = 'insert into `foo``foo`' + found = parseSql(statement) + match(found, expected) + + expected.collection = `foo''foo` + expected.table = `foo''foo` + statement = "insert into `foo''foo`" + found = parseSql(statement) + match(found, expected) + + expected.collection = `foo"foo` + expected.table = `foo"foo` + statement = 'insert into `foo"foo`' + found = parseSql(statement) + match(found, expected) +}) + +test('handles fully qualified names', () => { + const expected = { operation: 'insert', collection: 'myDb.foo', table: 'foo', database: 'myDb' } + + let statement = 'insert into `myDb`.`foo` (col1)' + let found = parseSql(statement) + match(found, expected) + + statement = `insert into 'myDb'.'foo' (col1)` + found = parseSql(statement) + match(found, expected) + + statement = `insert into "myDb"."foo" (col1)` + found = parseSql(statement) + match(found, expected) +}) + +test('handles leading CTE', () => { + let statement = `with cte1 as ( + select + linking_col + from + linking_table + ) + select + foo_col + from + foo_table a + join cte1 linking_col + where + a.bar_col = 'bar'` + let found = parseSql(statement) + match(found, { + operation: 'select', + collection: 'foo_table', + table: 'foo_table', + database: undefined + }) + + statement = `with cte1 as (select * from foo) update bar set bar.a = cte1.a` + found = parseSql(statement) + match(found, { + operation: 'update', + collection: 'bar', + table: 'bar', + database: undefined + }) +}) + +test('maps `SELECT ? + ? AS solution` to "unknown" collection', () => { + const statement = 'SELECT ? + ? AS solution' + const found = parseSql(statement) + match(found, { + operation: 'select', + collection: 'unknown', + table: 'unknown' + }) +}) + +test('handles odd characters attached to table names', () => { + const expected = { operation: 'select', collection: 'unit-test', table: 'unit-test' } + + let statement = 'select test from unit-test;' + let found = parseSql(statement) + match(found, expected) + + expected.collection = 'schema.unit-test' + statement = 'select test from schema.unit-test;' + found = parseSql(statement) + match(found, expected) +}) + +test('handles subqueries in place of table identifiers', () => { + const expected = { operation: 'select', collection: 'foo', table: 'foo' } + const statement = 'select * from (select foo from foo) where bar = "baz"' + const found = parseSql(statement) + match(found, expected) +})