-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhandler.js
83 lines (70 loc) · 1.94 KB
/
handler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
'use strict';
require('dotenv').config();
const launchChrome = require('@serverless-chrome/lambda');
const CDP = require('chrome-remote-interface');
const puppeteer = require('puppeteer');
const AWS = require('aws-sdk');
const cheerio = require('cheerio');
/**
 * Awaits a single cleanup step and logs any error it raises instead of
 * propagating it, so one failing step cannot skip the steps that follow it.
 *
 * @param {string} label - Short name of the step, used in the log message.
 * @param {Function} step - Performs the cleanup; may return a promise.
 * @returns {Promise<void>} Always resolves, even when the step fails.
 */
const runCleanupStep = async (label, step) => {
  try {
    await step();
  } catch (err) {
    console.error(`cleanup step failed: ${label}`, err);
  }
};

/**
 * Serverless handler: loads the page at `SCRAPE_URL` in headless Chrome,
 * uploads its HTML to the `S3_BUCKET` bucket and logs the entry links found
 * in that HTML.
 *
 * Environment variables read: `SCRAPE_URL`, `S3_BUCKET`, `AWS_REGION`,
 * `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_SESSION_TOKEN`.
 *
 * Failures are logged (`result: 'NG'`) and deliberately not rethrown, so the
 * returned promise resolves with `undefined` on both success and failure.
 *
 * @param {object} event - Invocation event (not used).
 * @param {object} context - Invocation context (not used).
 * @returns {Promise<void>}
 */
module.exports.crawling = async (event, context) => {
  let slsChrome = null;
  let browser = null;
  let page = null;

  try {
    const targetUrl = process.env.SCRAPE_URL;
    if (!targetUrl) {
      // Fail before paying for a Chrome launch; the catch below reports it.
      throw new Error('SCRAPE_URL environment variable is not set');
    }

    // Launch serverless-chrome.
    slsChrome = await launchChrome();

    // Puppeteer attaches to the already running Chrome over its WebSocket
    // debugger endpoint.
    const { webSocketDebuggerUrl } = await CDP.Version();
    browser = await puppeteer.connect({
      ignoreHTTPSErrors: true,
      browserWSEndpoint: webSocketDebuggerUrl,
    });

    page = await browser.newPage();
    page.setDefaultNavigationTimeout(30000);
    await page.setJavaScriptEnabled(false);
    await page.goto(targetUrl, { waitUntil: 'networkidle2' });

    // Inner HTML of the <html> element (no doctype, no <html> tag itself).
    const html = await page.evaluate(() => {
      return document.getElementsByTagName('html')[0].innerHTML;
    });

    AWS.config.update({
      accessKeyId: process.env.AWS_ACCESS_KEY_ID,
      // The original code read only the misspelled AWS_SEECRET_ACCESS_KEY;
      // that name is kept as a fallback for environments that define it.
      secretAccessKey:
        process.env.AWS_SECRET_ACCESS_KEY || process.env.AWS_SEECRET_ACCESS_KEY,
      // Temporary credentials are only valid together with their session
      // token, so forward it when the environment provides one.
      sessionToken: process.env.AWS_SESSION_TOKEN,
      region: process.env.AWS_REGION,
    });

    const s3 = new AWS.S3();
    const response = await s3
      .putObject({
        Bucket: process.env.S3_BUCKET,
        // Upload time in epoch milliseconds is used as the object name.
        Key: `${Date.now()}.html`,
        ContentType: 'text/html',
        Body: html,
      })
      .promise();

    const urls = [];
    const $ = cheerio.load(html);
    $('section')
      .find('.entrylist-contents-title')
      .each((i, elem) => {
        const href = $(elem).find('a').attr('href');
        // Titles without a link would otherwise add `undefined` entries.
        if (href) {
          urls.push(href);
        }
      });

    console.log({
      result: 'OK',
      urls: urls,
      s3response: response,
    });
  } catch (err) {
    console.error(err);
    console.log({
      result: 'NG',
    });
  } finally {
    // Each step is isolated so that a failure while closing the page or the
    // connection can never leave the Chrome process running.
    if (page) {
      await runCleanupStep('page.close', () => page.close());
    }
    if (browser) {
      await runCleanupStep('browser.disconnect', () => browser.disconnect());
    }
    if (slsChrome) {
      await runCleanupStep('slsChrome.kill', () => slsChrome.kill());
    }
  }
};