Merge pull request #546 from apifytech/fix/cheerio-crawler-session
Fixed passing the session id to proxyUrl.
mnmkng authored Jan 8, 2020
2 parents 83b3318 + 73c903b commit 35d769a
Showing 3 changed files with 58 additions and 13 deletions.
docs/api/CheerioCrawler.md (8 changes: 5 additions & 3 deletions)
@@ -108,7 +108,8 @@ await crawler.run();
   request: Request,
   contentType: Object, // Parsed Content-Type header: { type, encoding }
   response: Object // An instance of Node's http.IncomingMessage object,
-  autoscaledPool: AutoscaledPool
+  autoscaledPool: AutoscaledPool,
+  session: Session
 }</code></pre><p> Type of <code>body</code> depends on web page <code>Content-Type</code> header.</p>
 <ul>
 <li><p>String for <code>text/html</code>, <code>application/xhtml+xml</code>, <code>application/xml</code> mime types</p>
@@ -169,8 +170,9 @@ The exceptions are logged to the request using the
 This function is suitable for setting dynamic properties such as cookies to the <a href="request"><code>Request</code></a>.</p>
 <p> The function receives the following object as an argument:</p>
 <pre><code>{
-  request: Request
-}</code></pre><p> where the <a href="request"><code>Request</code></a> instance corresponds to the initialized request.</p>
+  request: Request,
+  session: Session
+}</code></pre><p> where the <a href="request"><code>Request</code></a> instance corresponds to the initialized request and the <a href="session"><code>Session</code></a> instance corresponds to the session used.</p>
 <p> The function should modify the properties of the passed <a href="request"><code>Request</code></a> instance
 in place because there are already earlier references to it. Making a copy and returning it from
 this function is therefore not supported, because it would create inconsistencies where
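For orientation, a hedged sketch of how a crawler could consume the newly documented `session` argument in `prepareRequestFunction`; `requestList` and the exact handler bodies are illustrative assumptions, while `session.getCookieString()` is taken from the source change further down in this commit:

```js
// Sketch only, not part of the commit; assumes an initialized `requestList`.
const crawler = new Apify.CheerioCrawler({
    requestList,
    useSessionPool: true,
    prepareRequestFunction: ({ request, session }) => {
        // Mutate the Request in place, as the docs above require;
        // returning a copy is explicitly unsupported.
        request.headers = {
            ...request.headers,
            Cookie: session.getCookieString(request.url),
        };
    },
    handlePageFunction: async ({ $, request, session }) => {
        // `session` is now exposed here as well (first hunk above).
    },
});
await crawler.run();
```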
src/crawlers/cheerio_crawler.js (28 changes: 18 additions & 10 deletions)
@@ -12,7 +12,7 @@ import BasicCrawler from './basic_crawler';
 import { addTimeoutToPromise, parseContentTypeFromResponse } from '../utils';
 import { getApifyProxyUrl } from '../actor';
 import { BASIC_CRAWLER_TIMEOUT_MULTIPLIER } from '../constants';
-import { requestAsBrowser } from '../utils_request';
+import * as utilsRequest from '../utils_request';
 import { TimeoutError } from '../errors';

 /**
@@ -133,7 +133,8 @@ const DEFAULT_OPTIONS = {
  * request: Request,
  * contentType: Object, // Parsed Content-Type header: { type, encoding }
  * response: Object // An instance of Node's http.IncomingMessage object,
- * autoscaledPool: AutoscaledPool
+ * autoscaledPool: AutoscaledPool,
+ * session: Session
  * }
  * ```
  * Type of `body` depends on web page `Content-Type` header.
@@ -191,10 +192,11 @@ const DEFAULT_OPTIONS = {
  * The function receives the following object as an argument:
  * ```
  * {
- * request: Request
+ * request: Request,
+ * session: Session
  * }
  * ```
- * where the {@link Request} instance corresponds to the initialized request.
+ * where the {@link Request} instance corresponds to the initialized request and the {@link Session} instance corresponds to the session used.
  *
  * The function should modify the properties of the passed {@link Request} instance
  * in place because there are already earlier references to it. Making a copy and returning it from
@@ -327,6 +329,10 @@ class CheerioCrawler {
             throw new Error('Cannot use "options.persistCookiesPerSession" without "options.useSessionPool"');
         }

+        if (apifyProxySession && useSessionPool) {
+            throw new Error('Cannot use "options.apifyProxySession" with "options.useSessionPool"');
+        }
+
         this.supportedMimeTypes = new Set(DEFAULT_MIME_TYPES);
         if (additionalMimeTypes.length) this._extendSupportedMimeTypes(additionalMimeTypes);

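The new guard makes a static `apifyProxySession` and the session pool mutually exclusive. A quick illustration of the two configurations under this change; `handlePageFunction` and `requestList` are assumed to be defined elsewhere:

```js
// Now throws: 'Cannot use "options.apifyProxySession" with "options.useSessionPool"'
new Apify.CheerioCrawler({
    requestList,
    handlePageFunction,
    useSessionPool: true,
    apifyProxySession: 'my_static_session',
});

// Fine: the pool supplies a per-request session id to the proxy URL instead
new Apify.CheerioCrawler({
    requestList,
    handlePageFunction,
    useSessionPool: true,
    useApifyProxy: true,
});
```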
@@ -450,11 +456,11 @@ class CheerioCrawler {
             headers.Cookie = session.getCookieString(request.url);
         }

-        const opts = this._getRequestOptions(request);
+        const opts = this._getRequestOptions(request, session);
         let responseStream;

         try {
-            responseStream = await requestAsBrowser(opts);
+            responseStream = await utilsRequest.requestAsBrowser(opts);
         } catch (e) {
             if (e instanceof TimeoutError) {
                 this._handleRequestTimeout(session);
@@ -494,15 +500,16 @@ class CheerioCrawler {
     /**
      * Combines the provided `requestOptions` with mandatory (non-overridable) values.
      * @param {Request} request
+     * @param {Session?} session
      * @ignore
      */
-    _getRequestOptions(request) {
+    _getRequestOptions(request, session) {
         const mandatoryRequestOptions = {
             url: request.url,
             method: request.method,
             headers: Object.assign({}, request.headers),
             ignoreSslErrors: this.ignoreSslErrors,
-            proxyUrl: this._getProxyUrl(),
+            proxyUrl: this._getProxyUrl(session),
             stream: true,
             useCaseSensitiveHeaders: true,
             abortFunction: (res) => {
@@ -533,13 +540,14 @@ class CheerioCrawler {
     /**
      * Enables the use of a proxy by returning a proxy URL
      * based on configured options or null if no proxy is used.
+     * @param {Session?} session
      * @ignore
      */
-    _getProxyUrl() {
+    _getProxyUrl(session = {}) {
         if (this.useApifyProxy) {
             return getApifyProxyUrl({
                 groups: this.apifyProxyGroups,
-                session: this.apifyProxySession,
+                session: session.id || this.apifyProxySession,
                 groupsParamName: 'options.apifyProxyGroups',
                 sessionParamName: 'options.apifyProxySession',
             });
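To make the behavior change concrete, a minimal walk-through of what `_getProxyUrl` now does when the session pool is active; the exact proxy URL shape shown in the comment is an assumption about Apify Proxy conventions, not something this diff asserts:

```js
// Hypothetical illustration of the new precedence in _getProxyUrl().
// A pool-managed session id now wins over the static apifyProxySession
// option (which the constructor above forbids combining with useSessionPool).
const session = { id: 'my_session_1' };           // supplied by the session pool
const proxyUrl = getApifyProxyUrl({
    groups: undefined,
    session: session.id,                          // previously: this.apifyProxySession
    groupsParamName: 'options.apifyProxyGroups',
    sessionParamName: 'options.apifyProxySession',
});
// Presumably yields something like:
//   'http://session-my_session_1:<password>@proxy.apify.com:8000'
// so each pool session maps to a stable outgoing IP, and retiring the
// session rotates to a new one.
```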
test/crawlers/cheerio_crawler.test.js (35 changes: 35 additions & 0 deletions)
@@ -7,11 +7,13 @@ import log from 'apify-shared/log';
 import { ENV_VARS } from 'apify-shared/consts';
 import express from 'express';
 import bodyParser from 'body-parser';
+import sinon from 'sinon';
 import Apify from '../../build';
 import { sleep } from '../../build/utils';
 import { Session } from '../../build/session_pool/session';
 import { STATUS_CODES_BLOCKED } from '../../build/constants';
 import LocalStorageDirEmulator from '../local_storage_dir_emulator';
+import * as utilsRequest from '../../build/utils_request';

 // Add common props to mocked request responses.
 const responseMock = {
@@ -960,5 +962,38 @@ describe('CheerioCrawler', () => {
         });
         await cheerioCrawler.run();
     });

+    test('should use sessionId in proxyUrl when the session pool is enabled', async () => {
+        const sourcesNew = [
+            { url: 'http://example.com/?q=1' },
+        ];
+        process.env[ENV_VARS.PROXY_PASSWORD] = 'abc123';
+
+        const requestListNew = new Apify.RequestList({ sources: sourcesNew });
+        let sessionUsed;
+        let requestUsed;
+        const handlePageFunction = async ({ session }) => {
+            sessionUsed = session;
+        };
+        const oldCall = utilsRequest.requestAsBrowser;
+        const fakeCall = async (opt) => {
+            requestUsed = opt;
+            return oldCall(opt);
+        };
+        const stub = sinon.stub(utilsRequest, 'requestAsBrowser').callsFake(fakeCall);
+        const cheerioCrawler = new Apify.CheerioCrawler({
+            requestList: requestListNew,
+            maxConcurrency: 1,
+            handlePageFunction,
+            useSessionPool: true,
+            useApifyProxy: true,
+        });
+
+        await requestListNew.initialize();
+        await cheerioCrawler.run();
+
+        expect(requestUsed.proxyUrl.includes(sessionUsed.id)).toBeTruthy();
+        stub.restore();
+    });
 });
});
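A note on why the seemingly cosmetic import change in `cheerio_crawler.js` matters here: the test stubs `requestAsBrowser` on the module-exports object, and calling it as `utilsRequest.requestAsBrowser(opts)` guarantees the lookup goes through that object at call time, so the stub is always hit; whether a plain named import would also be intercepted depends on how the transpiler compiles the binding. A minimal sketch of the pattern with a hypothetical module, assuming CommonJS-transpiled output (as in this repo's `build/` directory); native ES module namespaces are frozen, so `sinon.stub` would throw on them:

```js
// mathUtils.js (hypothetical)
export const double = (x) => x * 2;

// consumer.js: call through the namespace so a stub takes effect
import * as mathUtils from './mathUtils';
export const run = (x) => mathUtils.double(x);

// consumer.test.js
import sinon from 'sinon';
import * as mathUtils from './mathUtils';
import { run } from './consumer';

const stub = sinon.stub(mathUtils, 'double').returns(42);
console.assert(run(3) === 42);  // intercepted by the stub
stub.restore();
console.assert(run(3) === 6);   // original behavior restored
```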
