-
Notifications
You must be signed in to change notification settings - Fork 16
/
scrape.js
292 lines (265 loc) · 9.04 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
//Imports
const puppeteer = require("puppeteer");
const jsonfile = require("jsonfile");
const fs = require("fs");
const rxjs = require("rxjs");
const { mergeMap, toArray, filter } = require("rxjs/operators");
require("dotenv").config();
/**
 * Automated login to LinkedIn.
 *
 * Fills in the sign-in form on the page that is currently open, submits it,
 * waits until the page URL starts with https://www.linkedin.com/feed and then
 * stores the session cookies in ./cookie.json.
 *
 * @param {string} username User email
 * @param {string} password User password
 * @param {import("puppeteer").Page} page Page that currently shows the sign-in form
 * @returns {Promise<void>} Resolves once the feed was reached and the attempt
 * to save the cookies has finished (a failed save is logged, not thrown)
 */
const linkedinLogin = async (username, password, page) => {
  // Log the address that is actually typed into the form
  console.log(`Logging in with email: ${username}`);
  await page.type("#session_key", username);
  await page.type("#session_password", password);

  // Resolve as soon as any frame navigation leaves the page on the feed
  let onFrameNavigated;
  const feedReached = new Promise((resolve) => {
    onFrameNavigated = () => {
      if (page.url().startsWith("https://www.linkedin.com/feed")) {
        resolve();
      }
    };
  });

  // Subscribe before submitting so that a fast redirect cannot be missed
  page.on("framenavigated", onFrameNavigated);
  try {
    await page.click(".sign-in-form__submit-button");
    await feedReached;
  } finally {
    // Always unsubscribe: a leftover listener would fire on every later
    // navigation of this page
    page.off("framenavigated", onFrameNavigated);
  }

  // Save the session cookies
  const cookiesObject = await page.cookies();
  // Store cookies in cookie.json to persist the session. The callback API is
  // wrapped so that the caller only continues once the write has finished.
  await new Promise((done) => {
    jsonfile.writeFile(
      "./cookie.json",
      cookiesObject,
      { spaces: 2 },
      (err) => {
        if (err) console.log("Error while writing file: ", err);
        else console.log("Session saved successfully!");
        done();
      }
    );
  });
};
/**
 * Scrolls the given page downwards in small steps.
 *
 * The scrolling runs inside the page: every 100 ms the window is moved down
 * by 100 px, and the loop ends once the accumulated distance is at least the
 * current height of the document body.
 *
 * @param {import("puppeteer").Page} page Page to scroll
 * @returns {Promise<void>} Resolves when the in-page scroll loop has finished
 */
const autoScroll = async (page) => {
  // This function is handed to page.evaluate, so it must not rely on
  // anything from the surrounding scope.
  const scrollStepwise = () =>
    new Promise((done) => {
      const stepPx = 100;
      let scrolledPx = 0;
      const ticker = setInterval(() => {
        // Read the height on every tick; it is compared against the distance
        // scrolled so far
        const documentHeight = document.body.scrollHeight;
        window.scrollBy(0, stepPx);
        scrolledPx += stepPx;
        if (scrolledPx < documentHeight) {
          return;
        }
        clearInterval(ticker);
        done();
      }, 100);
    });

  await page.evaluate(scrollStepwise);
};
/**
 * Fetch all profile links from the search result pages.
 *
 * For every result page the page is scrolled to the bottom, the profile
 * anchors are read, and (except after the last requested page) the "next"
 * pagination button is clicked.
 *
 * @param {import("puppeteer").Page} page Page showing the first result page
 * @param {Number} pagesToVisit Specifies the number of pages to scrape (defaults to 2)
 * @returns {Promise<Array.<String>>} Profile URLs without query parameters;
 * empty when nothing matched
 */
const fetchProfileLinks = async (page, pagesToVisit = 2) => {
  const nextButtonSelector =
    ".artdeco-pagination__button.artdeco-pagination__button--next";
  const profileLinks = [];

  for (let pageNumber = 0; pageNumber < pagesToVisit; pageNumber++) {
    await autoScroll(page);

    //Fetch all profile links from the page
    const linksOnPage = await page.evaluate(() => {
      //Multiple selectors for different displays of LinkedIn(see issue #20)
      const profileListSelectors = [
        ".search-result__info .search-result__result-link",
        ".reusable-search__entity-results-list .entity-result__title-text a",
      ];
      for (const selector of profileListSelectors) {
        const nodes = document.querySelectorAll(selector);
        //Use the first selector that matches anything
        if (nodes.length > 0) {
          return (
            Array.from(nodes)
              .filter((node) => node.href)
              // Remove query params from URL
              .map((node) => node.href.split("?")[0])
          );
        }
      }
      // Always hand back an array so the caller can spread it safely
      return [];
    });
    profileLinks.push(...linksOnPage);

    if (pageNumber < pagesToVisit - 1) {
      // Stop early instead of failing when there is no further result page
      const nextButton = await page.$(nextButtonSelector);
      if (!nextButton) {
        break;
      }
      //Click on next button on the bottom of the profiles page. The wait is
      //started together with the click so the navigation cannot be missed.
      await Promise.all([
        page.waitForNavigation(),
        page.click(nextButtonSelector),
      ]);
    }
  }
  return profileLinks;
};
/**
 * Filter and return active employees (any activity within 1 week)
 * from all employees by visiting their activity page.
 *
 * Every profile is opened in a tab of its own (in the same browser context
 * as `page`, so the session cookies are shared). Concurrent navigations on a
 * single page would overwrite each other and attribute activity to the wrong
 * profile.
 *
 * @param {import("puppeteer").Page} page Logged-in page; used for its browser
 * context and viewport, it is not navigated itself
 * @param {Array.<String>} profileLinks A list of scraped profile links
 * @param {Array.<String>} waitUntilOptions Puppeteer `waitUntil` options for the navigation
 * @param {Number} numOfParallelTabs Number of profiles to visit in parallel tabs
 * @returns {Promise<Observable<Array.<String>>>} Observable emitting a single
 * array with the links of all profiles that showed recent activity
 */
const fetchEachProfileActivityInParallel = async (
  page,
  profileLinks,
  waitUntilOptions,
  numOfParallelTabs = 5
) => {
  const browserContext = page.browserContext();
  const viewport = page.viewport();

  return rxjs.from(profileLinks).pipe(
    mergeMap(async (profileLink) => {
      const tab = await browserContext.newPage();
      try {
        if (viewport) {
          await tab.setViewport(viewport);
        }
        //Visit activity page (avoid a double slash for links ending in "/")
        const activityUrl = `${profileLink.replace(
          /\/+$/,
          ""
        )}/detail/recent-activity`;
        await tab.goto(activityUrl, {
          waitUntil: waitUntilOptions,
          // The main page is configured without a navigation timeout by
          // scrapeLinkedIn; a fresh tab would use Puppeteer's default again
          timeout: 0,
        });
        //Find time of last activities of a user(likes, comments, posts)
        const individualActivities = await tab.evaluate(() => {
          const timeSelector =
            "div.feed-shared-actor__meta.relative >" +
            " span.feed-shared-actor__sub-description.t-12.t-black--light.t-normal" +
            " > span > span.visually-hidden";
          //Only labels up to "1 week ago" count ("2 weeks ago" does not match)
          const recentActivity = /[0-9] (minutes?|hours?|days?|week) ago/;
          return Array.from(document.querySelectorAll(timeSelector))
            .map((item) => item.innerHTML)
            .filter((label) => label && recentActivity.test(label));
        });
        //Return links to active employees
        return individualActivities.length ? profileLink : null;
      } finally {
        // Release the tab even when navigation or evaluation failed
        await tab.close();
      }
    }, numOfParallelTabs),
    filter((profileLink) => !!profileLink),
    toArray()
  );
};
/**
 * Save profile links to a JSON file.
 *
 * The file is written to ./output/<COMPANY><timestamp>.json, where COMPANY is
 * read from the environment, and contains `{ "activeProfiles": [...] }`
 * indented with tabs. The write is synchronous so that a failure reaches the
 * caller instead of being thrown from a detached callback.
 *
 * @param {Array.<String>} activeEmployees List of active employees
 * @throws {Error} When the output directory or the file cannot be written
 */
const saveProfiles = (activeEmployees) => {
  const time = Date.now();
  // The timestamp makes the file name unique for each run of the script
  const fileName = `./output/${process.env.COMPANY}${time}.json`;
  // Creates the directory when missing and is a no-op when it already exists
  fs.mkdirSync("./output", { recursive: true });
  //Save all active employee profiles to a file
  const output = { activeProfiles: activeEmployees };
  // Write (not append): the file must always hold exactly one JSON document
  fs.writeFileSync(fileName, JSON.stringify(output, null, "\t"));
};
/**
 * Scrape LinkedIn to find active users for a given company.
 *
 * Launches a browser, restores the session from ./cookie.json (or logs in
 * when no stored cookies are available), opens the company's employee list,
 * collects the profile links, keeps the profiles with recent activity and
 * saves them. Errors are logged, not rethrown; the browser is always closed.
 *
 * @param {{username: string, password: string, company: string}} data An object with login credentials and the company's LinkedIn handle
 * @returns {Promise<void>}
 */
const scrapeLinkedIn = async (data) => {
  //Launch a chromium automated session
  const browser = await puppeteer.launch({
    headless: false,
    dumpio: true,
    args: ["--no-sandbox"],
  });
  const waitUntilOptions = ["domcontentloaded", "networkidle2"];
  try {
    //Open a new tab
    const page = await browser.newPage();
    //Page configurations
    await page.setViewport({ width: 1200, height: 1200 });
    page.setDefaultNavigationTimeout(0);

    //Check if cookies are stored in cookie.json and use that data to skip login.
    //The file is read relative to the working directory, the same location
    //that is checked here and that the login step writes to.
    let storedCookies = [];
    if (fs.existsSync("./cookie.json")) {
      storedCookies = JSON.parse(fs.readFileSync("./cookie.json", "utf8"));
    }
    if (Array.isArray(storedCookies) && storedCookies.length !== 0) {
      //Set the browser cookies
      await page.setCookie(...storedCookies);
      console.log("Previous session loaded successfully!");
    } else {
      //No usable session (missing or empty cookie file): log in
      //Visit LinkedIn
      await page.goto(`https://www.linkedin.com/`);
      //Login to your account
      await linkedinLogin(data.username, data.password, page);
    }

    try {
      //Visit the company's page and find the list of employees
      await page.goto(`https://www.linkedin.com/company/${data.company}`, {
        waitUntil: waitUntilOptions,
      });
      //Visit all employees from the company's page. The wait is started
      //together with the click so the navigation cannot be missed.
      await Promise.all([
        page.waitForNavigation(),
        page.click("a.ember-view.org-top-card-secondary-content__see-all-link"),
      ]);
    } catch (e) {
      console.error(
        "Oops! An error occured while trying to find the company's page." +
          "\n" +
          "The reason for this error can be either the browser was closed while execution or you entered invalid data in env file." +
          "\n" +
          "Please check the LinkedIn handle of the company you're trying to find and your credentials and try again."
      );
      console.error(e);
      //Nothing more can be done without the employee list
      return;
    }

    //Fetch all profile links
    const profileLinks = await fetchProfileLinks(page);
    //Visit activity page and filter the list of active employees
    const activeEmployeesObservable = await fetchEachProfileActivityInParallel(
      page,
      profileLinks,
      waitUntilOptions
    );
    const activeEmployees = await rxjs.lastValueFrom(activeEmployeesObservable);
    console.log("Active users : ", activeEmployees);
    //Save profiles to a file
    saveProfiles(activeEmployees);
  } catch (err) {
    console.error("Oops! An error occured.");
    console.error(err);
  } finally {
    //Single exit point: the browser is closed exactly once on every path
    await browser.close();
  }
};
// Entry point: take the credentials and the company handle from the
// environment (populated from .env by dotenv) and start the scrape.
const { EMAIL, PASSWORD, COMPANY } = process.env;
scrapeLinkedIn({ username: EMAIL, password: PASSWORD, company: COMPANY });