From c75bab226e3c3a93be2956352644b063d0d2beed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Rame=CC=81?= Date: Thu, 22 Feb 2024 18:36:16 +0100 Subject: [PATCH] chore: change the logic to not break jobs when a domain has a network issue (temporary or permanent) --- src/features/domain.ts | 54 ++++++++++++++++++++++++++++++++++++++---- src/utils/request.ts | 13 ++++++++-- 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/src/features/domain.ts b/src/features/domain.ts index 020f7a4..16e0a5f 100644 --- a/src/features/domain.ts +++ b/src/features/domain.ts @@ -13,7 +13,7 @@ import { PeerCertificate, TLSSocket } from 'tls'; import z from 'zod'; import { downloadFile } from '@etabli/src/common'; -import { getWebsiteData, guessWebsiteNameFromPageTitles } from '@etabli/src/features/website'; +import { getWebsiteData, getWebsiteDataResponse, guessWebsiteNameFromPageTitles } from '@etabli/src/features/website'; import { LitePeerCertificateSchema } from '@etabli/src/models/entities/certificate'; import { BusinessDomainError, unexpectedDomainRedirectionError } from '@etabli/src/models/entities/errors'; import { LiteRawDomainSchema, LiteRawDomainSchemaType } from '@etabli/src/models/entities/raw-domain'; @@ -313,7 +313,20 @@ export async function updateRobotsTxtOnDomains() { try { const rootUrl = new URL(`https://${rawDomain.name}`); const robotsUrl = `${rootUrl.toString()}robots.txt`; - const result = await fetch(robotsUrl); + + let result: Response; + try { + result = await fetch(robotsUrl); + } catch (error) { + if (error instanceof Error) { + handleReachabilityError(error); + + // Skip this one to perform other domains + continue; + } else { + throw error; + } + } // We want to prevent redirection on another domain to keep integrity but we let pathname redirection pass, so looking at domain only const resultingUrl = new URL(result.url); @@ -424,6 +437,7 @@ export async function updateWildcardCertificateOnDomains() { `try to process SSL certificate for 
domain ${rawDomain.name} (${rawDomain.id}) ${formatArrayProgress(rawDomainIndex, rawDomains.length)}` ); + let toSkip: boolean = false; const certificate = await new Promise((resolve, reject) => { const request = https.request( { @@ -445,6 +459,8 @@ export async function updateWildcardCertificateOnDomains() { if (error instanceof Error) { handleReachabilityError(error); + // Skip this one to perform other domains + toSkip = true; resolve(null); } else { throw error; @@ -457,6 +473,10 @@ export async function updateWildcardCertificateOnDomains() { request.end(); }); + if (toSkip) { + continue; + } + // The content has no defined structure, we just kept the library format to debug if needed main processed values as it comes let certificateContent: object | null = null; if (certificate) { @@ -497,7 +517,20 @@ export async function updateWebsiteDataOnDomains() { try { const url = new URL(`https://${rawDomain.name}`); - const websiteData = await getWebsiteData(url.toString()); + + let websiteData: getWebsiteDataResponse; + try { + websiteData = await getWebsiteData(url.toString()); + } catch (error) { + if (error instanceof Error) { + handleReachabilityError(error); + + // Skip this one to perform other domains + continue; + } else { + throw error; + } + } if (websiteData.status >= 200 && websiteData.status < 300) { if (containsHtml(websiteData.html)) { @@ -560,7 +593,20 @@ export async function updateWebsiteDataOnDomains() { // Wait a bit to not flood this website (tiny delay in this loop because it's just the second request to this domain in this iteration) await sleep(50); - const anotherPageData = await getWebsiteData(anotherPageUrl); + let anotherPageData: getWebsiteDataResponse; + try { + anotherPageData = await getWebsiteData(anotherPageUrl); + } catch (error) { + if (error instanceof Error) { + handleReachabilityError(error); + + // Skip this one to perform other domains + continue; + } else { + throw error; + } + } + anotherPageTitle = anotherPageData.title; 
break; diff --git a/src/utils/request.ts b/src/utils/request.ts index fa114db..9080bf8 100644 --- a/src/utils/request.ts +++ b/src/utils/request.ts @@ -1,4 +1,8 @@ -// When it's an error related to the remote server or the connection we skip it since we have no control over it +import * as Sentry from '@sentry/nextjs'; + +// This should be used close to network calls because it silences errors +// And in our case of long-running jobs we want the loop to continue despite network errors because the domain will be fetched again next time +// TODO: in the future we could register the error into the database for specific domains so we can tell the list source to consider removing them if appropriate export function handleReachabilityError(error: Error) { if ( ![ @@ -15,8 +19,13 @@ export function handleReachabilityError(error: Error) { // Hostname/IP does not match certificate's altnames (which makes the certificate invalid) 'ERR_TLS_CERT_ALTNAME_INVALID', 'DEPTH_ZERO_SELF_SIGNED_CERT', + 'UNABLE_TO_VERIFY_LEAF_SIGNATURE', + 'CERT_HAS_EXPIRED', ].includes((error.cause as any)?.code || '') ) { - throw error; + // Since we do not throw the error, we log it for the record and also notify Sentry so we can keep the list above updated with the appropriate codes + console.error(error); + + Sentry.captureException(error); } }