From 4fa92bd983a789cae3c45bb8604f7e233f623c3a Mon Sep 17 00:00:00 2001 From: Yash Mittal Date: Sat, 9 Mar 2024 21:21:02 +0530 Subject: [PATCH] Tutorial 21 (#28) * Prisma: Initial Wiring * Chores: Remove redundant code * Chores: Readme for Prisma Adapter; GitHub CI Refactoring * Chores: Remove random file that got added * Demo: Fix server URL * Fix Readme.md * Fix main readme * Fix CLI Tests * Fix CI * Fix CI * Fix CI * Fix CI * Tutorial-21 * docs: add tutorial for step 21 * fix: generate command * fix: update workflows to not use node v14 --------- Co-authored-by: Chakshu Gautam --- .github/workflows/db-tests.yml | 3 +- .gitignore | 4 +- docs/tutorials/21.md | 173 ++++++++++++++++++++ package-lock.json | 150 +++++++++++++++-- package.json | 3 + src/csvStorage.js | 21 ++- src/queryExecuter.js | 29 +++- src/queryParser.js | 47 +++++- tests/appoximateLargeFile.test.js | 63 ++++++++ tests/queryExecuter.test.js | 56 +++++++ tests/queryParser.test.js | 257 +++++++++++++++++++++++++++--- util/generateLargeFile.js | 35 ++++ 12 files changed, 798 insertions(+), 43 deletions(-) create mode 100644 docs/tutorials/21.md create mode 100644 tests/appoximateLargeFile.test.js create mode 100644 util/generateLargeFile.js diff --git a/.github/workflows/db-tests.yml b/.github/workflows/db-tests.yml index c0cd8a3..2540e7c 100644 --- a/.github/workflows/db-tests.yml +++ b/.github/workflows/db-tests.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: - node-version: [14.x, 16.x, 18.x] + node-version: [16.x, 18.x] steps: - uses: actions/checkout@v3 @@ -22,4 +22,5 @@ jobs: with: node-version: ${{ matrix.node-version }} - run: npm i + - run: npm run generate - run: npm test diff --git a/.gitignore b/.gitignore index 656790b..93ee4f1 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,6 @@ dist .pnp.* -**/.DS_Store \ No newline at end of file +**/.DS_Store + +student_large.csv \ No newline at end of file diff --git a/docs/tutorials/21.md b/docs/tutorials/21.md new file mode 100644 index 
0000000..d662cb6 --- /dev/null +++ b/docs/tutorials/21.md @@ -0,0 +1,173 @@ +## Step 21: Add Approximate Counting using `HyperLogLog` + +### 21.1 First things first, create a utility to generate large files + +Since the effect of `HyperLogLog` is best seen on large files, we need to create a utility function which generates large files, let's say with `10_000_000` data points. To do this, create a file named `generateLargeFile.js` in a `util` folder and add the following logic to it. + +```js +const fs = require('fs'); +const { faker, da } = require('@faker-js/faker'); +const { parse } = require('json2csv'); + +async function generateLargeCSV(filename) { + let data = []; + for (let i = 1; i <= 10_000_000; i++) { + const record = { + id: i, + name: faker.person.firstName(), + age: faker.number.int({ min: 18, max: 100 }), + }; + data.push(record); + + let rows; + if (i % 500_000 === 0) { + console.log(`Generated ${i} records`); + if (!fs.existsSync(filename)) { + rows = parse(data, { header: true }); + } else { + // Rows without headers. + rows = parse(data, { header: false }); + } + fs.appendFileSync(filename, rows); + data = []; + } + + } + // Append file function can create new file too. + + // Always add new line if file already exists. + fs.appendFileSync(filename, "\r\n"); +} + +generateLargeCSV('student_large.csv') +``` + +### 21.2 Implement CSV reader for `HyperLogLog` + +Since `HyperLogLog` is a data structure which keeps data stored in a hashed format, we implement a separate CSV reader for it. Create a function named `readCSVForHLL` in your `csvStorage.js`. 
+Sample logic for it can be found here: +```js +function readCSVForHLL(filePath, bitSampleSize = 12, digestSize = 128) { + const results = []; + var h = hll({ bitSampleSize: bitSampleSize, digestSize: digestSize }); + + return new Promise((resolve, reject) => { + fs.createReadStream(filePath) + .pipe(csv()) + .on('data', (data) => h.insert(JSON.stringify(data))) + .on('end', () => { + resolve(h); + }) + .on('error', (error) => { + reject(error); + }); + }); +} +``` + +### 21.3 Update the `queryParser` implementation to identify `COUNT` and `APPROXIMATE_COUNT` as valid tokens + +Since our SQL queries will now be accepting the `COUNT` and `APPROXIMATE_COUNT` tokens as valid tokens, we need to update the logic of our parser to identify and process them accordingly. Update your `queryParser.js` with the required logic (regex) to identify that. + + +### 21.4 Update the `executeSELECTQuery` function to add support for `COUNT` and `APPROXIMATE_COUNT` + +Update the existing logic in the `executeSELECTQuery` function to identify and process the `COUNT` and `APPROXIMATE_COUNT` commands in your SQL query. 
+Some snippets that might be helpful are: +```js +// getting approx counts +if (isApproximateCount && fields.length === 1 && fields[0] === 'COUNT(*)' && whereClauses.length === 0) { + let hll = await readCSVForHLL(`${table}.csv`); + return [{ 'APPROXIMATE_COUNT(*)': hll.estimate() }]; +} + + // Distinct inside count - example "SELECT COUNT (DISTINCT student.name) FROM student" + if (isCountDistinct) { + + if (isApproximateCount) { + var h = hll({ bitSampleSize: 12, digestSize: 128 }); + orderedResults.forEach(row => h.insert(distinctFields.map(field => row[field]).join('|'))); + return [{ [`APPROXIMATE_${fields[0]}`]: h.estimate() }]; + } + else { + let distinctResults = [...new Map(orderedResults.map(item => [distinctFields.map(field => item[field]).join('|'), item])).values()]; + return [{ [fields[0]]: distinctResults.length }]; + } + } +``` + + +### 21.5 Write a test case for approximate count + +Since we are following `TDD` in this tutorial, we are going to be writing a test case to test our implementation now. 
+Create a file named `approximateLargeFile.test.js` in your `tests` folder and add the following test cases: + +```js +const fs = require('fs'); +const { executeSELECTQuery } = require('../src/queryExecuter'); +const jestConsole = console; + +beforeEach(() => { + global.console = require('console'); +}); + +afterEach(() => { + global.console = jestConsole; +}); + +test('Large File Count(*) - Approximate and Exact', async () => { + // Test Exact Count + + const startMemoryUsageExact = process.memoryUsage().heapUsed; + const startTimeExact = performance.now(); + + const queryExact = "SELECT COUNT(*) FROM student_large"; + const resultExact = await executeSELECTQuery(queryExact); + const exactResult = resultExact[0]['COUNT(*)']; + + const endTimeExact = performance.now(); + const endMemoryUsageExact = process.memoryUsage().heapUsed; + + console.log(`Execution Time for Exact Count: ${(endTimeExact - startTimeExact).toFixed(2)} ms`); + console.log(`Start Memory for Exact Count: ${startMemoryUsageExact / 1024 / 1024} MB`); + console.log(`End Memory for Exact Count: ${endMemoryUsageExact / 1024 / 1024} MB`); + console.log(`Memory Used for Exact Count: ${(endMemoryUsageExact - startMemoryUsageExact) / 1024 / 1024} MB`); + + const startMemoryUsage = process.memoryUsage().heapUsed; + const startTime = performance.now(); + + const query = "SELECT APPROXIMATE_COUNT(*) FROM student_large"; + const result = await executeSELECTQuery(query); + + // Expect the approximate count to be within 5% of the actual count + expect(result[0]['APPROXIMATE_COUNT(*)']).toBeGreaterThan(exactResult - 0.05 * exactResult); + expect(result[0]['APPROXIMATE_COUNT(*)']).toBeLessThan(exactResult + 0.05 * exactResult); + + const endTime = performance.now(); + const endMemoryUsage = process.memoryUsage().heapUsed; + + console.log(`Execution Time for Approximate Count: ${(endTime - startTime).toFixed(2)} ms`); + console.log(`Start Memory: ${startMemoryUsage / 1024 / 1024} MB`); + console.log(`End Memory: 
${endMemoryUsage / 1024 / 1024} MB`); + console.log(`Memory Used for Approximate Count: ${(endMemoryUsage - startMemoryUsage) / 1024 / 1024} MB`); + +}, 120000); + +test('Execute SQL Query with COUNT with DISTINCT on a column', async () => { + const queryExact = "SELECT COUNT(DISTINCT (name, age)) FROM student_large"; + const resultExact = await executeSELECTQuery(queryExact); + console.log({ resultExact }); + const exactResult = resultExact[0]['COUNT(DISTINCT (name, age))']; + + const query = "SELECT APPROXIMATE_COUNT(DISTINCT (name, age)) FROM student_large"; + const result = await executeSELECTQuery(query); + + // Expect the approximate count to be within 5% of the actual count + expect(result[0]['APPROXIMATE_COUNT(DISTINCT (name, age))']).toBeGreaterThan(exactResult - 0.05 * exactResult); + expect(result[0]['APPROXIMATE_COUNT(DISTINCT (name, age))']).toBeLessThan(exactResult + 0.05 * exactResult); +}, 120000); +``` + +### 21.6 Update the tests for other files to test for the updates you made in other parts of the implementation + +Since we have made changes to other parts of the implementation, such as `csvStorage.js`, `queryParser.js` and `queryExecuter.js`, we need to update the tests for those files to cover the new functionality. 
\ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 3afaec3..9b5dd73 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,6 +10,7 @@ "license": "ISC", "dependencies": { "csv-parser": "^3.0.0", + "hll": "^2.0.0", "json2csv": "^6.0.0-alpha.2", "xterm": "^5.3.0" }, @@ -17,6 +18,7 @@ "stylusdb-cli": "node ./src/cli.js" }, "devDependencies": { + "@faker-js/faker": "^8.4.1", "jest": "^29.7.0" } }, @@ -655,6 +657,22 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, + "node_modules/@faker-js/faker": { + "version": "8.4.1", + "resolved": "https://registry.npmjs.org/@faker-js/faker/-/faker-8.4.1.tgz", + "integrity": "sha512-XQ3cU+Q8Uqmrbf2e0cIC/QN43sTBSC8KF12u29Mb47tWrt2hAgBXSgpZMj4Ao8Uk0iJcU99QsOCaIL8934obCg==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/fakerjs" + } + ], + "engines": { + "node": "^14.17.0 || ^16.13.0 || >=18.0.0", + "npm": ">=6.14.13" + } + }, "node_modules/@istanbuljs/load-nyc-config": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@istanbuljs/load-nyc-config/-/load-nyc-config-1.1.0.tgz", @@ -1200,6 +1218,11 @@ "sprintf-js": "~1.0.2" } }, + "node_modules/async": { + "version": "3.2.5", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.5.tgz", + "integrity": "sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==" + }, "node_modules/babel-jest": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/babel-jest/-/babel-jest-29.7.0.tgz", @@ -1310,14 +1333,12 @@ "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", - "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", - "dev": true + "integrity": 
"sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==" }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, "dependencies": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -1525,8 +1546,7 @@ "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", - "dev": true + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==" }, "node_modules/convert-source-map": { "version": "2.0.0", @@ -1796,8 +1816,7 @@ "node_modules/fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", - "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", - "dev": true + "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==" }, "node_modules/fsevents": { "version": "2.3.3", @@ -1865,7 +1884,6 @@ "version": "7.2.3", "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", - "dev": true, "dependencies": { "fs.realpath": "^1.0.0", "inflight": "^1.0.4", @@ -1917,6 +1935,17 @@ "node": ">= 0.4" } }, + "node_modules/hll": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/hll/-/hll-2.0.0.tgz", + "integrity": "sha512-PV92xkmczdzgSDIf8ii7zInuvw40X+XFovrtSOaVx7TfuKnb4EWvgjLHTcCVm5FGXMDUNe2v/4g0vyMtQESJdg==", + "dependencies": { + "murmurhash3": "^0.5.0" + }, + "engines": { + "node": ">=10 <15" + } + }, 
"node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", @@ -1964,7 +1993,6 @@ "version": "1.0.6", "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", - "dev": true, "dependencies": { "once": "^1.3.0", "wrappy": "1" @@ -1973,8 +2001,7 @@ "node_modules/inherits": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", - "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", - "dev": true + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" }, "node_modules/is-arrayish": { "version": "0.2.1", @@ -2726,6 +2753,17 @@ "url": "https://github.com/chalk/supports-color?sponsor=1" } }, + "node_modules/js-beautify-node": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/js-beautify-node/-/js-beautify-node-1.0.0.tgz", + "integrity": "sha512-P5t/LUeK2PUIEzhJ3ehjHiMGQjstXgCBpX4HJkG0Igq6XwnSI0c4HUGsJep34pZK5BEpkCyIzOCcthV/ucSitg==", + "engines": [ + "node >= 0.3.0" + ], + "bin": { + "jsbeautify": "beautify-node.js" + } + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -2931,7 +2969,6 @@ "version": "3.1.2", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, "dependencies": { "brace-expansion": "^1.1.7" }, @@ -2947,12 +2984,41 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/mkdirp": { + "version": "0.5.6", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz", + "integrity": 
"sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==", + "dependencies": { + "minimist": "^1.2.6" + }, + "bin": { + "mkdirp": "bin/cmd.js" + } + }, "node_modules/ms": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", "dev": true }, + "node_modules/murmurhash3": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/murmurhash3/-/murmurhash3-0.5.0.tgz", + "integrity": "sha512-bgvgIBctpwslE9kIurdGH9knDDyYCZca5upcy9MLzsc16afyBdwkfhOJmxN607exbyesZgTQEtAcqs3euEsFlA==", + "hasInstallScript": true, + "dependencies": { + "nan": "^2.14.2", + "shipitjs": "^0.3.2" + }, + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/nan": { + "version": "2.19.0", + "resolved": "https://registry.npmjs.org/nan/-/nan-2.19.0.tgz", + "integrity": "sha512-nO1xXxfh/RWNxfd/XPfbIfFk5vgLsAxUR9y5O0cHMJu/AW9U95JLXqthYHjEp+8gQ5p96K9jUp8nbVOxCdRbtw==" + }, "node_modules/natural-compare": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", @@ -2996,7 +3062,6 @@ "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", - "dev": true, "dependencies": { "wrappy": "1" } @@ -3098,7 +3163,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", - "dev": true, "engines": { "node": ">=0.10.0" } @@ -3274,6 +3338,17 @@ "node": ">=10" } }, + "node_modules/rimraf": { + "version": "2.6.3", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz", + "integrity": 
"sha512-mwqeW5XsA2qAejG46gYdENaxXjx9onRNCfn7L0duuP4hCuTIi/QO7PDK07KJfp1d+izWPrzEJDcSqBa0OZQriA==", + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + } + }, "node_modules/semver": { "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", @@ -3304,6 +3379,33 @@ "node": ">=8" } }, + "node_modules/shipitjs": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/shipitjs/-/shipitjs-0.3.2.tgz", + "integrity": "sha512-uLtgZP2PBzxwoCDfEhARWpuQewMwRLGhkrD8O/rqXHdyYjaV8LY0F4aMi5Lr3l4XWm5YiHINsCMhzD0scu7GzQ==", + "dependencies": { + "async": "", + "commander": ">=0.6.1", + "js-beautify-node": "", + "semver": "^5.5.1", + "temp": "", + "underscore": "" + }, + "bin": { + "shipitjs": "bin/shipitjs" + }, + "engines": { + "node": ">=0.4.x" + } + }, + "node_modules/shipitjs/node_modules/semver": { + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", + "bin": { + "semver": "bin/semver" + } + }, "node_modules/signal-exit": { "version": "3.0.7", "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", @@ -3455,6 +3557,18 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/temp": { + "version": "0.9.4", + "resolved": "https://registry.npmjs.org/temp/-/temp-0.9.4.tgz", + "integrity": "sha512-yYrrsWnrXMcdsnu/7YMYAofM1ktpL5By7vZhf15CrXijWWrEYZks5AXBudalfSWJLlnen/QUJUB5aoB0kqZUGA==", + "dependencies": { + "mkdirp": "^0.5.1", + "rimraf": "~2.6.2" + }, + "engines": { + "node": ">=6.0.0" + } + }, "node_modules/test-exclude": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/test-exclude/-/test-exclude-6.0.0.tgz", @@ -3517,6 +3631,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/underscore": { + "version": "1.13.6", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.6.tgz", + 
"integrity": "sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==" + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", @@ -3611,8 +3730,7 @@ "node_modules/wrappy": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", - "dev": true + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" }, "node_modules/write-file-atomic": { "version": "4.0.2", diff --git a/package.json b/package.json index 4eb4c53..ba8cdcc 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,7 @@ "doc": "docs" }, "scripts": { + "generate": "node ./util/generateLargeFile.js", "test": "jest", "test-verbose": "jest --verbose", "server": "node ./src/server.js" @@ -18,10 +19,12 @@ "author": "Chakshu Gautam", "license": "ISC", "devDependencies": { + "@faker-js/faker": "^8.4.1", "jest": "^29.7.0" }, "dependencies": { "csv-parser": "^3.0.0", + "hll": "^2.0.0", "json2csv": "^6.0.0-alpha.2", "xterm": "^5.3.0" } diff --git a/src/csvStorage.js b/src/csvStorage.js index 4729599..c4c33b9 100644 --- a/src/csvStorage.js +++ b/src/csvStorage.js @@ -1,9 +1,11 @@ const fs = require('fs'); const csv = require('csv-parser'); const { parse } = require('json2csv'); +const hll = require('hll'); function readCSV(filePath) { const results = []; + var h = hll(); return new Promise((resolve, reject) => { fs.createReadStream(filePath) @@ -18,9 +20,26 @@ function readCSV(filePath) { }); } +function readCSVForHLL(filePath, bitSampleSize = 12, digestSize = 128) { + const results = []; + var h = hll({ bitSampleSize: bitSampleSize, digestSize: digestSize }); + + return new Promise((resolve, reject) => { + fs.createReadStream(filePath) + .pipe(csv()) + .on('data', (data) => 
h.insert(JSON.stringify(data))) + .on('end', () => { + resolve(h); + }) + .on('error', (error) => { + reject(error); + }); + }); +} + async function writeCSV(filename, data) { const csv = parse(data); fs.writeFileSync(filename, csv); } -module.exports = { readCSV, writeCSV }; \ No newline at end of file +module.exports = { readCSV, readCSVForHLL, writeCSV }; \ No newline at end of file diff --git a/src/queryExecuter.js b/src/queryExecuter.js index ce210a3..863cd1a 100644 --- a/src/queryExecuter.js +++ b/src/queryExecuter.js @@ -1,5 +1,7 @@ const { parseSelectQuery, parseInsertQuery, parseDeleteQuery } = require('./queryParser.js'); -const { readCSV, writeCSV } = require('./csvStorage.js'); +const { readCSV, readCSVForHLL, writeCSV } = require('./csvStorage.js'); +const hll = require('hll'); + function performInnerJoin(data, joinData, joinCondition, fields, table) { return data.flatMap(mainRow => { @@ -203,8 +205,14 @@ function applyGroupBy(data, groupByFields, aggregateFunctions) { async function executeSELECTQuery(query) { try { + const { fields, table, whereClauses, joinType, joinTable, joinCondition, groupByFields, hasAggregateWithoutGroupBy, isApproximateCount, orderByFields, limit, isDistinct, distinctFields, isCountDistinct } = parseSelectQuery(query); + + + if (isApproximateCount && fields.length === 1 && fields[0] === 'COUNT(*)' && whereClauses.length === 0) { + let hll = await readCSVForHLL(`${table}.csv`); + return [{ 'APPROXIMATE_COUNT(*)': hll.estimate() }]; + } - const { fields, table, whereClauses, joinType, joinTable, joinCondition, groupByFields, hasAggregateWithoutGroupBy, orderByFields, limit, isDistinct } = parseSelectQuery(query); let data = await readCSV(`${table}.csv`); // Perform INNER JOIN if specified @@ -229,6 +237,7 @@ async function executeSELECTQuery(query) { ? 
data.filter(row => whereClauses.every(clause => evaluateCondition(row, clause))) : data; + let groupResults = filteredData; if (hasAggregateWithoutGroupBy) { // Special handling for queries like 'SELECT COUNT(*) FROM table' @@ -293,6 +302,20 @@ async function executeSELECTQuery(query) { }); } + // Distinct inside count - example "SELECT COUNT (DISTINCT student.name) FROM student" + if (isCountDistinct) { + + if (isApproximateCount) { + var h = hll({ bitSampleSize: 12, digestSize: 128 }); + orderedResults.forEach(row => h.insert(distinctFields.map(field => row[field]).join('|'))); + return [{ [`APPROXIMATE_${fields[0]}`]: h.estimate() }]; + } + else { + let distinctResults = [...new Map(orderedResults.map(item => [distinctFields.map(field => item[field]).join('|'), item])).values()]; + return [{ [fields[0]]: distinctResults.length }]; + } + } + // Select the specified fields let finalResults = orderedResults.map(row => { const selectedRow = {}; @@ -303,6 +326,8 @@ async function executeSELECTQuery(query) { return selectedRow; }); + // console.log("CP-2", orderedResults) + // Remove duplicates if specified let distinctResults = finalResults; if (isDistinct) { diff --git a/src/queryParser.js b/src/queryParser.js index 4230a15..c81aada 100644 --- a/src/queryParser.js +++ b/src/queryParser.js @@ -8,7 +8,25 @@ function parseSelectQuery(query) { query = query.trim(); // Initialize distinct flag - let isDistinct = false; + let isDistinct = false; // Global DISTINCT, not within COUNT + let isCountDistinct = false; // New flag for DISTINCT within COUNT + let distinctFields = []; // Array to hold fields after DISTINCT within COUNT or APPROXIMATE_COUNT + + + // Detect APPROXIMATE_COUNT + let isApproximateCount = false; + const approximateCountRegex = /APPROXIMATE_COUNT\((DISTINCT\s)?(.+?)\)/i; + const approximateCountMatch = query.match(approximateCountRegex); + if (approximateCountMatch) { + isApproximateCount = true; + // If DISTINCT is used within APPROXIMATE_COUNT, capture 
the fields + if (approximateCountMatch[1]) { + isCountDistinct = true; + // distinctFields.push(approximateCountMatch[2].trim()); + } + // Simplify further processing by normalizing to COUNT (adjust as necessary for your logic) + query = query.replace(approximateCountRegex, `COUNT(${approximateCountMatch[1] || ''}${approximateCountMatch[2]})`); + } // Check for DISTINCT keyword and update the query if (query.toUpperCase().includes('SELECT DISTINCT')) { @@ -59,13 +77,24 @@ function parseSelectQuery(query) { // Extract JOIN information const { joinType, joinTable, joinCondition } = parseJoinClause(queryWithoutWhere); + const countDistinctRegex = /COUNT\((DISTINCT\s\((.*?)\))\)/gi; + let countDistinctMatch; + while ((countDistinctMatch = countDistinctRegex.exec(query)) !== null) { + isCountDistinct = true; + if (isApproximateCount) { + distinctFields.push(...countDistinctMatch[2].trim().split(',').map(field => field.trim())); + } else { + distinctFields.push(...countDistinctMatch[2].trim().split(',').map(field => field.trim())); + } + } + // Parse SELECT part const selectRegex = /^SELECT\s(.+?)\sFROM\s(.+)/i; const selectMatch = selectPart.match(selectRegex); if (!selectMatch) { throw new Error('Invalid SELECT format'); } - const [, fields, table] = selectMatch; + let [, fields, table] = selectMatch; // Parse WHERE part if it exists let whereClauses = []; @@ -76,8 +105,17 @@ function parseSelectQuery(query) { // Check for aggregate functions without GROUP BY const hasAggregateWithoutGroupBy = checkAggregateWithoutGroupBy(query, groupByFields); + // Temporarily replace commas within parentheses to avoid incorrect splitting + const tempPlaceholder = '__TEMP_COMMA__'; // Ensure this placeholder doesn't appear in your actual queries + fields = fields.replace(/\(([^)]+)\)/g, (match) => match.replace(/,/g, tempPlaceholder)); + + // Now split fields and restore any temporary placeholders + const parsedFields = fields.split(',').map(field => + field.trim().replace(new 
RegExp(tempPlaceholder, 'g'), ',')); + + return { - fields: fields.split(',').map(field => field.trim()), + fields: parsedFields, table: table.trim(), whereClauses, joinType, @@ -86,7 +124,10 @@ function parseSelectQuery(query) { groupByFields, orderByFields, hasAggregateWithoutGroupBy, + isApproximateCount, + isCountDistinct, limit, + distinctFields, isDistinct }; } catch (error) { diff --git a/tests/appoximateLargeFile.test.js b/tests/appoximateLargeFile.test.js new file mode 100644 index 0000000..cb419da --- /dev/null +++ b/tests/appoximateLargeFile.test.js @@ -0,0 +1,63 @@ +const fs = require('fs'); +const { executeSELECTQuery } = require('../src/queryExecuter'); +const jestConsole = console; + +beforeEach(() => { + global.console = require('console'); +}); + +afterEach(() => { + global.console = jestConsole; +}); + +test('Large File Count(*) - Approximate and Exact', async () => { + // Test Exact Count + + const startMemoryUsageExact = process.memoryUsage().heapUsed; + const startTimeExact = performance.now(); + + const queryExact = "SELECT COUNT(*) FROM student_large"; + const resultExact = await executeSELECTQuery(queryExact); + const exactResult = resultExact[0]['COUNT(*)']; + + const endTimeExact = performance.now(); + const endMemoryUsageExact = process.memoryUsage().heapUsed; + + console.log(`Execution Time for Exact Count: ${(endTimeExact - startTimeExact).toFixed(2)} ms`); + console.log(`Start Memory for Exact Count: ${startMemoryUsageExact / 1024 / 1024} MB`); + console.log(`End Memory for Exact Count: ${endMemoryUsageExact / 1024 / 1024} MB`); + console.log(`Memory Used for Exact Count: ${(endMemoryUsageExact - startMemoryUsageExact) / 1024 / 1024} MB`); + + const startMemoryUsage = process.memoryUsage().heapUsed; + const startTime = performance.now(); + + const query = "SELECT APPROXIMATE_COUNT(*) FROM student_large"; + const result = await executeSELECTQuery(query); + + // Expect the approximate count to be within 5% of the actual count + 
expect(result[0]['APPROXIMATE_COUNT(*)']).toBeGreaterThan(exactResult - 0.05 * exactResult); + expect(result[0]['APPROXIMATE_COUNT(*)']).toBeLessThan(exactResult + 0.05 * exactResult); + + const endTime = performance.now(); + const endMemoryUsage = process.memoryUsage().heapUsed; + + console.log(`Execution Time for Approximate Count: ${(endTime - startTime).toFixed(2)} ms`); + console.log(`Start Memory: ${startMemoryUsage / 1024 / 1024} MB`); + console.log(`End Memory: ${endMemoryUsage / 1024 / 1024} MB`); + console.log(`Memory Used for Approximate Count: ${(endMemoryUsage - startMemoryUsage) / 1024 / 1024} MB`); + +}, 120000); + +test('Execute SQL Query with COUNT with DISTINCT on a column', async () => { + const queryExact = "SELECT COUNT(DISTINCT (name, age)) FROM student_large"; + const resultExact = await executeSELECTQuery(queryExact); + console.log({ resultExact }); + const exactResult = resultExact[0]['COUNT(DISTINCT (name, age))']; + + const query = "SELECT APPROXIMATE_COUNT(DISTINCT (name, age)) FROM student_large"; + const result = await executeSELECTQuery(query); + + // Expect the approximate count to be within 2% of the actual count + expect(result[0]['APPROXIMATE_COUNT(DISTINCT (name, age))']).toBeGreaterThan(exactResult - 0.05 * exactResult); + expect(result[0]['APPROXIMATE_COUNT(DISTINCT (name, age))']).toBeLessThan(exactResult + 0.05 * exactResult); +}, 120000); \ No newline at end of file diff --git a/tests/queryExecuter.test.js b/tests/queryExecuter.test.js index 3dc121c..891f062 100644 --- a/tests/queryExecuter.test.js +++ b/tests/queryExecuter.test.js @@ -418,3 +418,59 @@ test('LIKE with ORDER BY and LIMIT', async () => { // Expecting the first two names alphabetically that contain 'a' expect(result).toEqual([{ name: 'Alice' }, { name: 'Jane' }]); }); + + +test('Execute SQL Query with APPROXIMATE_COUNT Function', async () => { + const query = "SELECT APPROXIMATE_COUNT(id) FROM student"; + const result = await executeSELECTQuery(query); + // 
Assuming APPROXIMATE_COUNT behaves like COUNT for testing + // Expecting the count of all student records + expect(result).toEqual([{ 'COUNT(id)': 5 }]); // Assuming there are 5 records in student.csv +}); + +test('Execute SQL Query with APPROXIMATE_COUNT and GROUP BY Clauses', async () => { + const query = "SELECT APPROXIMATE_COUNT(id), course FROM enrollment GROUP BY course"; + const result = await executeSELECTQuery(query); + // Assuming APPROXIMATE_COUNT behaves like COUNT for testing + // Expecting the count of student records grouped by course + expect(result).toEqual([ + { 'COUNT(id)': 2, course: 'Mathematics' }, // Assuming 2 students are enrolled in Mathematics + { 'COUNT(id)': 2, course: 'Physics' }, // Assuming 1 student is enrolled in Physics + { 'COUNT(id)': 1, course: 'Chemistry' }, // Assuming 1 student is enrolled in Chemistry + { 'COUNT(id)': 1, course: 'Biology' } // Assuming 1 student is enrolled in Biology + ]); +}); + +test('Execute SQL Query with APPROXIMATE_COUNT, WHERE, and ORDER BY Clauses', async () => { + const query = "SELECT APPROXIMATE_COUNT(id) FROM student WHERE age > '20' ORDER BY age DESC"; + const result = await executeSELECTQuery(query); + // Assuming APPROXIMATE_COUNT behaves like COUNT for testing + // Expecting the count of students older than 20, ordered by age in descending order + // Note: The ORDER BY clause does not affect the outcome for a single aggregated result + expect(result).toEqual([{ 'COUNT(id)': 5 }]); // Assuming there are 4 students older than 20 +}); + + +test('Execute SQL Query with APPROXIMATE_COUNT only', async () => { + const query = "SELECT APPROXIMATE_COUNT(*) FROM student"; + const result = await executeSELECTQuery(query); + expect(result).toEqual([{ 'APPROXIMATE_COUNT(*)': 5 }]); +}); + +test('Execute SQL Query with APPROXIMATE_COUNT with DISTINCT on a column', async () => { + const query = "SELECT APPROXIMATE_COUNT(DISTINCT (name)) FROM student"; + const result = await executeSELECTQuery(query); + 
expect(result).toEqual([{ 'APPROXIMATE_COUNT(DISTINCT (name))': 4 }]); +}); + +test('Execute SQL Query with COUNT with DISTINCT on a column', async () => { + const query = "SELECT COUNT(DISTINCT (name)) FROM student"; + const result = await executeSELECTQuery(query); + expect(result).toEqual([{ 'COUNT(DISTINCT (name))': 4 }]); +}); + +test('Execute SQL Query with COUNT with DISTINCT on a column', async () => { + const query = "SELECT COUNT(DISTINCT (name, age)) FROM student"; + const result = await executeSELECTQuery(query); + expect(result).toEqual([{ 'COUNT(DISTINCT (name, age))': 5 }]); +}); \ No newline at end of file diff --git a/tests/queryParser.test.js b/tests/queryParser.test.js index 664463b..39e12e5 100644 --- a/tests/queryParser.test.js +++ b/tests/queryParser.test.js @@ -16,6 +16,9 @@ test('Parse SQL Query', () => { orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -38,6 +41,9 @@ test('Parse SQL Query with WHERE Clause', () => { orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -64,6 +70,9 @@ test('Parse SQL Query with Multiple WHERE Clauses', () => { orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -82,6 +91,9 @@ test('Parse SQL Query with INNER JOIN', async () => { orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }) }); @@ -100,6 +112,9 @@ test('Parse SQL Query with INNER JOIN and WHERE Clause', async () => { orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }) }); @@ -160,6 +175,9 @@ test('Parse LEFT Join Query Completely', () => { orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + 
isCountDistinct: false, + distinctFields: [] }) }) @@ -178,6 +196,9 @@ test('Parse LEFT Join Query Completely', () => { orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }) }) @@ -196,6 +217,9 @@ test('Parse SQL Query with LEFT JOIN with a WHERE clause filtering the main tabl orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -214,6 +238,9 @@ test('Parse SQL Query with LEFT JOIN with a WHERE clause filtering the join tabl orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -232,6 +259,9 @@ test('Parse SQL Query with RIGHT JOIN with a WHERE clause filtering the main tab orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -250,6 +280,9 @@ test('Parse SQL Query with RIGHT JOIN with a WHERE clause filtering the join tab orderByFields: null, limit: null, isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -268,7 +301,10 @@ test('Parse COUNT Aggregate Query', () => { joinType: null, orderByFields: null, limit: null, - isDistinct: false + isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -287,7 +323,10 @@ test('Parse SUM Aggregate Query', () => { joinType: null, orderByFields: null, limit: null, - isDistinct: false + isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -305,7 +344,10 @@ test('Parse AVG Aggregate Query', () => { joinType: null, orderByFields: null, limit: null, - isDistinct: false + isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -323,7 +365,10 @@ test('Parse MIN Aggregate Query', () => { joinType: 
null, orderByFields: null, limit: null, - isDistinct: false + isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -341,7 +386,10 @@ test('Parse MAX Aggregate Query', () => { joinType: null, orderByFields: null, limit: null, - isDistinct: false + isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -359,7 +407,10 @@ test('Parse basic GROUP BY query', () => { hasAggregateWithoutGroupBy: false, orderByFields: null, limit: null, - isDistinct: false + isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -377,7 +428,10 @@ test('Parse GROUP BY query with WHERE clause', () => { hasAggregateWithoutGroupBy: false, orderByFields: null, limit: null, - isDistinct: false + isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -395,7 +449,10 @@ test('Parse GROUP BY query with multiple fields', () => { hasAggregateWithoutGroupBy: false, orderByFields: null, limit: null, - isDistinct: false + isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -416,7 +473,10 @@ test('Parse GROUP BY query with JOIN and WHERE clauses', () => { hasAggregateWithoutGroupBy: false, orderByFields: null, limit: null, - isDistinct: false + isDistinct: false, + isApproximateCount: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -483,6 +543,7 @@ test('Parse SQL Query with Basic DISTINCT', () => { fields: ['age'], table: 'student', isDistinct: true, + isApproximateCount: false, whereClauses: [], groupByFields: null, joinType: null, @@ -490,7 +551,9 @@ test('Parse SQL Query with Basic DISTINCT', () => { joinCondition: null, orderByFields: null, limit: null, - hasAggregateWithoutGroupBy: false + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -501,6 +564,7 @@ test('Parse SQL Query with 
DISTINCT and Multiple Columns', () => { fields: ['student_id', 'course'], table: 'enrollment', isDistinct: true, + isApproximateCount: false, whereClauses: [], groupByFields: null, joinType: null, @@ -508,7 +572,9 @@ test('Parse SQL Query with DISTINCT and Multiple Columns', () => { joinCondition: null, orderByFields: null, limit: null, - hasAggregateWithoutGroupBy: false + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -519,6 +585,7 @@ test('Parse SQL Query with DISTINCT and WHERE Clause', () => { fields: ['course'], table: 'enrollment', isDistinct: true, + isApproximateCount: false, whereClauses: [{ field: 'student_id', operator: '=', value: '"1"' }], groupByFields: null, joinType: null, @@ -526,7 +593,9 @@ test('Parse SQL Query with DISTINCT and WHERE Clause', () => { joinCondition: null, orderByFields: null, limit: null, - hasAggregateWithoutGroupBy: false + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -537,6 +606,7 @@ test('Parse SQL Query with DISTINCT and JOIN Operations', () => { fields: ['student.name'], table: 'student', isDistinct: true, + isApproximateCount: false, whereClauses: [], groupByFields: null, joinType: 'INNER', @@ -547,7 +617,9 @@ test('Parse SQL Query with DISTINCT and JOIN Operations', () => { }, orderByFields: null, limit: null, - hasAggregateWithoutGroupBy: false + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -558,6 +630,7 @@ test('Parse SQL Query with DISTINCT, ORDER BY, and LIMIT', () => { fields: ['age'], table: 'student', isDistinct: true, + isApproximateCount: false, whereClauses: [], groupByFields: null, joinType: null, @@ -565,7 +638,9 @@ test('Parse SQL Query with DISTINCT, ORDER BY, and LIMIT', () => { joinCondition: null, orderByFields: [{ fieldName: 'age', order: 'DESC' }], limit: 2, - hasAggregateWithoutGroupBy: false + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + 
distinctFields: [] }); }); @@ -576,6 +651,7 @@ test('Parse SQL Query with DISTINCT on All Columns', () => { fields: ['*'], table: 'student', isDistinct: true, + isApproximateCount: false, whereClauses: [], groupByFields: null, joinType: null, @@ -583,7 +659,9 @@ test('Parse SQL Query with DISTINCT on All Columns', () => { joinCondition: null, orderByFields: null, limit: null, - hasAggregateWithoutGroupBy: false + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -595,13 +673,16 @@ test('Parse SQL Query with LIKE Clause', () => { table: 'student', whereClauses: [{ field: 'name', operator: 'LIKE', value: '%Jane%' }], isDistinct: false, + isApproximateCount: false, groupByFields: null, joinType: null, joinTable: null, joinCondition: null, orderByFields: null, limit: null, - hasAggregateWithoutGroupBy: false + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -613,13 +694,16 @@ test('Parse SQL Query with LIKE Clause and Wildcards', () => { table: 'student', whereClauses: [{ field: 'name', operator: 'LIKE', value: 'J%' }], isDistinct: false, + isApproximateCount: false, groupByFields: null, joinType: null, joinTable: null, joinCondition: null, orderByFields: null, limit: null, - hasAggregateWithoutGroupBy: false + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -634,13 +718,16 @@ test('Parse SQL Query with Multiple LIKE Clauses', () => { { field: 'age', operator: 'LIKE', value: '2%' } ], isDistinct: false, + isApproximateCount: false, groupByFields: null, joinType: null, joinTable: null, joinCondition: null, orderByFields: null, limit: null, - hasAggregateWithoutGroupBy: false + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] }); }); @@ -653,6 +740,95 @@ test('Parse SQL Query with LIKE and ORDER BY Clauses', () => { whereClauses: [{ field: 'name', operator: 'LIKE', value: '%e%' }], orderByFields: [{ 
fieldName: 'age', order: 'DESC' }], isDistinct: false, + isApproximateCount: false, + groupByFields: null, + joinType: null, + joinTable: null, + joinCondition: null, + limit: null, + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] + }); +}); + +test('Parse SQL Query with APPROXIMATE_COUNT Function', () => { + const query = "SELECT APPROXIMATE_COUNT(id) FROM student"; + const parsed = parseSelectQuery(query); + expect(parsed).toEqual({ + fields: ['COUNT(id)'], // Assuming APPROXIMATE_COUNT is replaced with COUNT for simplicity + table: 'student', + whereClauses: [], + isDistinct: false, + isApproximateCount: true, // This flag should be true when APPROXIMATE_COUNT is used + groupByFields: null, + joinType: null, + joinTable: null, + joinCondition: null, + orderByFields: null, + limit: null, + hasAggregateWithoutGroupBy: true, + isCountDistinct: false, + distinctFields: [] + }); +}); + +test('Parse SQL Query with APPROXIMATE_COUNT and GROUP BY Clauses', () => { + const query = "SELECT APPROXIMATE_COUNT(id), course FROM enrollment GROUP BY course"; + const parsed = parseSelectQuery(query); + expect(parsed).toEqual({ + fields: ['COUNT(id)', 'course'], // Assuming APPROXIMATE_COUNT is replaced with COUNT for simplicity + table: 'enrollment', + whereClauses: [], + isDistinct: false, + isApproximateCount: true, // This flag should be true when APPROXIMATE_COUNT is used + groupByFields: ['course'], + joinType: null, + joinTable: null, + joinCondition: null, + orderByFields: null, + limit: null, + hasAggregateWithoutGroupBy: false, + isCountDistinct: false, + distinctFields: [] + }); +}); + +test('Parse SQL Query with APPROXIMATE_COUNT, WHERE, and ORDER BY Clauses', () => { + const query = "SELECT APPROXIMATE_COUNT(id) FROM student WHERE age > 20 ORDER BY age DESC"; + const parsed = parseSelectQuery(query); + expect(parsed).toEqual({ + fields: ['COUNT(id)'], + table: 'student', + whereClauses: [ + { field: 'age', operator: '>', value: '20' 
} + ], + orderByFields: [{ fieldName: 'age', order: 'DESC' }], + isDistinct: false, + isApproximateCount: true, + groupByFields: null, + joinType: null, + joinTable: null, + joinCondition: null, + limit: null, + hasAggregateWithoutGroupBy: true, + isCountDistinct: false, + distinctFields: [] + }); +}); + +test('Parse SQL Query with APPROXIMATE_COUNT with DISTINCT on a column', () => { + const query = "SELECT APPROXIMATE_COUNT(DISTINCT (name)) FROM student"; + const parsed = parseSelectQuery(query); + expect(parsed).toEqual({ + fields: ['COUNT(DISTINCT (name))'], + table: 'student', + whereClauses: [], + orderByFields: null, + isDistinct: false, + distinctFields: ['name'], + isCountDistinct: true, + isApproximateCount: true, groupByFields: null, joinType: null, joinTable: null, @@ -660,4 +836,47 @@ test('Parse SQL Query with LIKE and ORDER BY Clauses', () => { limit: null, hasAggregateWithoutGroupBy: false }); -}); \ No newline at end of file +}); + +test('Parse SQL Query with COUNT with DISTINCT on a column', () => { + const query = "SELECT COUNT(DISTINCT (name)) FROM student"; + const parsed = parseSelectQuery(query); + expect(parsed).toEqual({ + fields: ['COUNT(DISTINCT (name))'], + table: 'student', + whereClauses: [], + orderByFields: null, + isDistinct: false, + distinctFields: ['name'], + isCountDistinct: true, + isApproximateCount: false, + groupByFields: null, + joinType: null, + joinTable: null, + joinCondition: null, + limit: null, + hasAggregateWithoutGroupBy: false + }); +}); + +test('Parse SQL Query with COUNT with DISTINCT on multiple column', () => { + const query = "SELECT COUNT(DISTINCT (name, age)) FROM student"; + const parsed = parseSelectQuery(query); + expect(parsed).toEqual({ + fields: ['COUNT(DISTINCT (name, age))'], + table: 'student', + whereClauses: [], + orderByFields: null, + isDistinct: false, + distinctFields: ['name', 'age'], + isCountDistinct: true, + isApproximateCount: false, + groupByFields: null, + joinType: null, + joinTable: 
null, + joinCondition: null, + limit: null, + hasAggregateWithoutGroupBy: false + }); +}); + diff --git a/util/generateLargeFile.js b/util/generateLargeFile.js new file mode 100644 index 0000000..d70c913 --- /dev/null +++ b/util/generateLargeFile.js @@ -0,0 +1,35 @@ +const fs = require('fs'); +const { faker, da } = require('@faker-js/faker'); +const { parse } = require('json2csv'); + +async function generateLargeCSV(filename) { + let data = []; + for (let i = 1; i <= 10_000_000; i++) { + const record = { + id: i, + name: faker.person.firstName(), + age: faker.number.int({ min: 18, max: 100 }), + }; + data.push(record); + + let rows; + if (i % 500_000 === 0) { + console.log(`Generated ${i} records`); + if (!fs.existsSync(filename)) { + rows = parse(data, { header: true }); + } else { + // Rows without headers. + rows = parse(data, { header: false }); + } + fs.appendFileSync(filename, rows); + data = []; + } + + } + // Append file function can create new file too. + + // Always add new line if file already exists. + fs.appendFileSync(filename, "\r\n"); +} + +generateLargeCSV('student_large.csv') \ No newline at end of file