Skip to content

Commit

Permalink
chore: parallelize loop iterations when fetching website data since i…
Browse files Browse the repository at this point in the history
…t requires a lot of time for each
  • Loading branch information
sneko committed Feb 26, 2024
1 parent f97d2a4 commit 4667a8d
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 10 deletions.
20 changes: 16 additions & 4 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
"@trpc/client": "^10.4.2",
"@trpc/react-query": "^10.4.2",
"@trpc/server": "^10.4.2",
"async": "^3.2.5",
"chalk": "^5.3.0",
"crisp-sdk-web": "^1.0.13",
"csv-parse": "^5.5.2",
Expand Down Expand Up @@ -187,6 +188,7 @@
"@testing-library/react": "^13.4.0",
"@testing-library/user-event": "^14.4.3",
"@trivago/prettier-plugin-sort-imports": "^4.3.0",
"@types/async": "^3.2.24",
"@types/bcrypt": "^5.0.0",
"@types/chalk": "^2.2.0",
"@types/content-disposition": "^0.5.5",
Expand Down
16 changes: 10 additions & 6 deletions src/features/domain.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Prisma } from '@prisma/client';
import { PrismaClientUnknownRequestError } from '@prisma/client/runtime/library';
import { eachOfLimit } from 'async';
import { parse } from 'csv-parse';
import { minutesToMilliseconds } from 'date-fns/minutesToMilliseconds';
import fsSync from 'fs';
Expand Down Expand Up @@ -528,7 +529,10 @@ export async function updateWebsiteDataOnDomains() {
executablePath: process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH,
});
try {
for (const [rawDomainIndex, rawDomain] of Object.entries(rawDomains)) {
// Since the underlying content fetching relies on waiting for a timeout on each website "to be sure" single page applications (SPA)
// have rendered their content, it takes a long time when the iterations run consecutively. Because of that, the loop now processes a few domains concurrently
// Note: previously it took on average 6 seconds per website (due to 2 page renderings with a timeout), so we tried to keep it short (other long-running jobs take about ~50ms per page loaded)
await eachOfLimit(rawDomains, 15, async function (rawDomain, rawDomainIndex) {
watchGracefulExitInLoop();

console.log(
Expand All @@ -546,7 +550,7 @@ export async function updateWebsiteDataOnDomains() {
handleReachabilityError(error);

// Skip this one to perform other domains
continue;
return;
} else {
throw error;
}
Expand Down Expand Up @@ -622,7 +626,7 @@ export async function updateWebsiteDataOnDomains() {
handleReachabilityError(error);

// Skip this one to perform other domains
continue;
return;
} else {
throw error;
}
Expand All @@ -636,7 +640,7 @@ export async function updateWebsiteDataOnDomains() {
}
} catch (error) {
// The `href` may not be a valid URL, just skip this link
continue;
return;
}
}
}
Expand Down Expand Up @@ -764,7 +768,7 @@ export async function updateWebsiteDataOnDomains() {
},
});

continue;
return;
} else if (error instanceof PrismaClientUnknownRequestError) {
handlePrismaErrorDueToContent(error);
} else {
Expand All @@ -774,7 +778,7 @@ export async function updateWebsiteDataOnDomains() {

// Do not flood the network (tiny delay since it's unlikely that a lot of consecutive domains would be managed by the same provider)
await sleep(50);
}
});
} finally {
await browser.close();
}
Expand Down

0 comments on commit 4667a8d

Please sign in to comment.