diff --git a/package-lock.json b/package-lock.json index 791740e..05cbcbb 100644 --- a/package-lock.json +++ b/package-lock.json @@ -33,6 +33,7 @@ "@trpc/client": "^10.4.2", "@trpc/react-query": "^10.4.2", "@trpc/server": "^10.4.2", + "async": "^3.2.5", "chalk": "^5.3.0", "crisp-sdk-web": "^1.0.13", "csv-parse": "^5.5.2", @@ -118,6 +119,7 @@ "@testing-library/react": "^13.4.0", "@testing-library/user-event": "^14.4.3", "@trivago/prettier-plugin-sort-imports": "^4.3.0", + "@types/async": "^3.2.24", "@types/bcrypt": "^5.0.0", "@types/chalk": "^2.2.0", "@types/content-disposition": "^0.5.5", @@ -13710,6 +13712,12 @@ "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz", "integrity": "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==" }, + "node_modules/@types/async": { + "version": "3.2.24", + "resolved": "https://registry.npmjs.org/@types/async/-/async-3.2.24.tgz", + "integrity": "sha512-8iHVLHsCCOBKjCF2KwFe0p9Z3rfM9mL+sSP8btyR5vTjJRAqpBYD28/ZLgXPf0pjG1VxOvtCV/BgXkQbpSe8Hw==", + "dev": true + }, "node_modules/@types/babel__core": { "version": "7.20.5", "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", @@ -15665,8 +15673,7 @@ "node_modules/async": { "version": "3.2.5", "resolved": "https://registry.npmjs.org/async/-/async-3.2.5.tgz", - "integrity": "sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==", - "dev": true + "integrity": "sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==" }, "node_modules/async-limiter": { "version": "1.0.1", @@ -50394,6 +50401,12 @@ "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz", "integrity": "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==" }, + "@types/async": { + "version": "3.2.24", + "resolved": "https://registry.npmjs.org/@types/async/-/async-3.2.24.tgz", + 
"integrity": "sha512-8iHVLHsCCOBKjCF2KwFe0p9Z3rfM9mL+sSP8btyR5vTjJRAqpBYD28/ZLgXPf0pjG1VxOvtCV/BgXkQbpSe8Hw==", + "dev": true + }, "@types/babel__core": { "version": "7.20.5", "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", @@ -52086,8 +52099,7 @@ "async": { "version": "3.2.5", "resolved": "https://registry.npmjs.org/async/-/async-3.2.5.tgz", - "integrity": "sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==", - "dev": true + "integrity": "sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==" }, "async-limiter": { "version": "1.0.1", diff --git a/package.json b/package.json index f413191..541aca8 100644 --- a/package.json +++ b/package.json @@ -102,6 +102,7 @@ "@trpc/client": "^10.4.2", "@trpc/react-query": "^10.4.2", "@trpc/server": "^10.4.2", + "async": "^3.2.5", "chalk": "^5.3.0", "crisp-sdk-web": "^1.0.13", "csv-parse": "^5.5.2", @@ -187,6 +188,7 @@ "@testing-library/react": "^13.4.0", "@testing-library/user-event": "^14.4.3", "@trivago/prettier-plugin-sort-imports": "^4.3.0", + "@types/async": "^3.2.24", "@types/bcrypt": "^5.0.0", "@types/chalk": "^2.2.0", "@types/content-disposition": "^0.5.5", diff --git a/src/features/domain.ts b/src/features/domain.ts index a3658db..bc40a24 100644 --- a/src/features/domain.ts +++ b/src/features/domain.ts @@ -1,5 +1,6 @@ import { Prisma } from '@prisma/client'; import { PrismaClientUnknownRequestError } from '@prisma/client/runtime/library'; +import { eachOfLimit } from 'async'; import { parse } from 'csv-parse'; import { minutesToMilliseconds } from 'date-fns/minutesToMilliseconds'; import fsSync from 'fs'; @@ -528,7 +529,10 @@ export async function updateWebsiteDataOnDomains() { executablePath: process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH, }); try { - for (const [rawDomainIndex, rawDomain] of Object.entries(rawDomains)) { + // Since the underlying content fetching is based on waiting a timeout 
on the website "to be sure" single page applications (SPA) + // have rendered their content, it takes a long time when the iterations run consecutively. Because of that, the loop now processes a few domains concurrently + // Note: previously it took on average 6 seconds per website (due to 2 page renderings with a timeout), and we tried to keep it short (other long-running jobs take about ~50ms per page loaded) + await eachOfLimit(rawDomains, 15, async function (rawDomain, rawDomainIndex) { watchGracefulExitInLoop(); console.log( @@ -546,7 +550,7 @@ export async function updateWebsiteDataOnDomains() { handleReachabilityError(error); // Skip this one to perform other domains - continue; + return; } else { throw error; } @@ -622,7 +626,7 @@ export async function updateWebsiteDataOnDomains() { handleReachabilityError(error); // Skip this one to perform other domains - continue; + return; } else { throw error; } @@ -636,7 +640,7 @@ export async function updateWebsiteDataOnDomains() { } } catch (error) { // The `href` may not be a valid URL, just skip this link - continue; + return; } } } @@ -764,7 +768,7 @@ export async function updateWebsiteDataOnDomains() { }, }); - continue; + return; } else if (error instanceof PrismaClientUnknownRequestError) { handlePrismaErrorDueToContent(error); } else { @@ -774,7 +778,7 @@ export async function updateWebsiteDataOnDomains() { // Do not flood network (tiny delay since it's unlikely a lot consecutive domains would be managed by the same provider) await sleep(50); - } + }); } finally { await browser.close(); }