-
Which package is this bug report for? If unsure which one to select, leave blank:
@crawlee/puppeteer (PuppeteerCrawler)

Issue description
Code sample:

const { PuppeteerCrawler, Configuration } = require('crawlee')
// Turn off on-disk storage persistence for this crawl via the global config.
const globalConfig = Configuration.getGlobalConfig()
globalConfig.set('persistStorage', false)
/**
 * Returns the page viewport as a "WIDTHxHEIGHT" string, or null when the
 * page is missing, already closed, or has no viewport set.
 * @param {object} page - Puppeteer Page instance.
 * @returns {string|null}
 */
function getPageViewPort (page) {
  if (!page || page.isClosed()) return null
  // page.viewport() returns null when no viewport is configured;
  // destructuring null would throw, so guard first.
  const viewport = page.viewport()
  if (!viewport) return null
  return `${viewport.width}x${viewport.height}`
}
/**
 * Resolves the browser-reported user agent string for the given page,
 * or null when the page is missing or already closed.
 * @param {object} page - Puppeteer Page instance.
 * @returns {Promise<string|null>}
 */
async function getPageUserAgent (page) {
  const usable = page && !page.isClosed()
  if (!usable) return null
  // Evaluated inside the page context; the async wrapper resolves it.
  return page.evaluate(() => navigator.userAgent)
}
// Single-concurrency Puppeteer crawler used to reproduce the reported issue.
const crawler = new PuppeteerCrawler({
// Allow up to 120 seconds per request before the handler times out.
requestHandlerTimeoutSecs: 120,
retryOnBlocked: true,
// Keep the crawler alive after the request queue drains.
keepAlive: true,
// Wait at least 5 seconds between requests to the same domain.
sameDomainDelaySecs: 5,
launchContext: {
useChrome: true,
// Each page runs in its own incognito browser context.
useIncognitoPages: true,
launchOptions: {
headless: true,
ignoreHTTPSErrors: true
}
},
// Pin the pool to exactly one concurrent page.
autoscaledPoolOptions: {
minConcurrency: 1,
desiredConcurrency: 1,
maxConcurrency: 1
},
requestHandler: async ({ request, page, response }) => {
const pageUserAgent = await getPageUserAgent(page)
const pageViewPort = getPageViewPort(page)
console.log(pageUserAgent, pageViewPort)
// NOTE(review): per the maintainer reply in this thread, overriding
// waitUntil to 'domcontentloaded' (see preNavigationHooks below) is what
// triggers the reported errors — confirm `response` is non-null before
// relying on response.url() / response.status() here.
console.info(`Successfully crawled ${response.url()} with code ${response.status()}`)
},
errorHandler: ({ request, response }, err) => {
console.error('Error handler: ', err)
},
failedRequestHandler: async ({ request, page }, err) => {
try {
console.error('Failed request handler: ', err)
} catch (error) {
console.error('Error in failed request handler: ', error)
}
},
preNavigationHooks: [
async (crawlingContext, gotoOptions) => {
// Navigate only until DOMContentLoaded instead of the default 'load' event.
gotoOptions.waitUntil = 'domcontentloaded'
}
]
})
// Seed URL for the crawl; also stored in userData so handlers can read it.
const url = 'https://www.quizerry.com/category/computer-science/'
// NOTE(review): the promise returned by crawler.run() is not awaited here,
// so a rejection would go unhandled — consider awaiting it or adding .catch().
crawler.run([
{
url,
userData: {
url
}
}
])

Package version: 3.5.4
Node.js version: 18.17.0
Operating system: Mac OS
Apify platform
I have tested this on the
|
Beta Was this translation helpful? Give feedback.
Replies: 3 comments
-
Works fine if |
Beta Was this translation helpful? Give feedback.
-
Sounds like a clash between |
Beta Was this translation helpful? Give feedback.
-
Hello @teammakdi, it's actually because of the `gotoOptions.waitUntil = 'domcontentloaded'` line. In case you want to save some bandwidth and time and not wait to load image, CSS, and font files, you can use the `blockRequests` context helper in the `preNavigation` hook like this:

preNavigationHooks: [
    async ({ blockRequests }) => await blockRequests(),
],

You can read more about this helper in our documentation. Please let us know if this helped. Thank you! |
Beta Was this translation helpful? Give feedback.
Hello @teammakdi, it's actually because of the
gotoOptions.waitUntil = 'domcontentloaded'
line. By default, our crawlers wait until all the resources on the page are loaded (the load
event), so these exact errors don't happen. Your crawler works just fine if you remove this line. In case you want to save some bandwidth and time and not wait to load image, CSS, and font files, you can use the
blockRequests
context helper in the preNavigation
hook like this: You can read more about this helper in our documentation.
Please let us know if this helped. Thank you!