Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exclude list for URLs and domains #7

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,29 @@ Option | Description | Default
`--ignore-query` | Ignore a query string | `false`
`--ignore-external` | Ignore all external links | `false`
`--ignore-nofollow` | Ignore `rel=nofollow` links | `false`
`--ignore-list` | Path to a JSON file that lists URLs and domains to be ignored | `none`
`--include-images` | Check `<img>` elements | `false`
`--slack-webhook` | Slack incoming webhook url | `none`
`--timeout` | Time to wait for response | `5000`
`--silent` | Run without printing progress line | `false`
`--help` | Output help text |

The `--ignore-list` option takes the path to a JSON file that lists the URLs
and domains to be ignored. The JSON object may have two properties, `urls` and
`domains`, each holding an array of strings. A URL is only ignored when it
matches a link exactly, and a domain when it equals the link's host. For example:

```
{
"urls":[
"https://www.example.com/path/",
"https://foo.com/2"
],
"domains":[
"bar.com",
"www.mydomain.org"
]
}
```

### Examples

Expand All @@ -49,6 +66,7 @@ octopus www.deptagency.com
octopus www.awg-mode.de --ignore-external
octopus www.hardeck.de --ignore-query=isEnergyEfficiencyChartOpen --ignore-query=followSearch
octopus www.golfino.com --silent --slack-webhook=https://hooks.slack.com/services/XXX/XXX/XXX
octopus example.com --ignore-list=exclude.json
```


Expand Down
1 change: 1 addition & 0 deletions config/help.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"--ignore-query Ignore a custom search query (Default: false)",
"--ignore-external Ignore external links (Default: false)",
"--ignore-nofollow Ignore rel=nofollow links (Default: false)",
"--ignore-list Path to JSON file that contains a list of URLs and domains to be ignored (Default: none)",
"--slack-webhook Slack incoming webhook url (Default: none)",
"--timeout Time to wait for the server response (Default: 5000)",
"--silent Run without printing progress line (Default: false)",
Expand Down
2 changes: 2 additions & 0 deletions config/mri.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
"silent": false,
"timeout": 5000,
"ignore-query": [],
"ignore-list": "",
"ignore-external": false,
"ignore-nofollow": false,
"slack-webhook": ""
},
"string": [
"timeout",
"ignore-query",
"ignore-list",
"slack-webhook"
],
"boolean": [
Expand Down
39 changes: 37 additions & 2 deletions lib/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ const prependHttp = require('prepend-http');
const cheerioLoad = require('cheerio')['load'];
const differenceBy = require('lodash.differenceby');
const windowWidth = require('term-size')()['columns'];
const fs = require('fs');

/**
* App defaults
Expand All @@ -27,12 +28,15 @@ let baseHost;
let crawledLinks = [];
// Links collected while crawling; looked up by `requestUrl` so each link is only added once
let inboundLinks = [];
let brokenLinks = [];
// URLs to skip, taken from the `urls` property of the --ignore-list file (exact string match)
let ignoreUrls = [];
// Hosts to skip, taken from the `domains` property and compared with each parsed link's `host`
let ignoreDomains = [];

/**
 * CLI colors
 *
 * ANSI escape sequences that switch the terminal text color;
 * FORMAT_END resets the formatting again.
 */
const COLOR_GRAY = '\x1b[90m';
const COLOR_GREEN = '\x1b[32m';
const COLOR_RED = '\x1b[31m';
const FORMAT_END = '\x1b[0m';

/**
Expand Down Expand Up @@ -67,6 +71,30 @@ const maxLength = windowWidth - 20;
require('draftlog').into(console);
console.stream = console.draft(EOL);

/**
 * Read ignored URLs and domains from a JSON file
 *
 * Loads the file named by `config['ignore-list']` and stores its `urls` and
 * `domains` arrays in `ignoreUrls` and `ignoreDomains`. Both properties are
 * optional; a missing one results in an empty list.
 *
 * A missing file only prints a warning and leaves both lists untouched.
 * An unreadable file, malformed JSON, a root value that is not an object or
 * a property that is not an array of strings prints an error and exits the
 * process with code 1.
 */
const readIgnoreFile = () => {
    const filePath = config['ignore-list'];

    // Print a message in the layout used for CLI errors
    const report = message => {
        console.log(
            justify('❌', null, 1),
            COLOR_RED,
            message,
            FORMAT_END
        );
    };

    // Report a fatal configuration problem and stop the process
    const fail = message => {
        report(`ERROR: ${message}`);
        process.exit(1);
    };

    // The path was given explicitly, so tell the user when nothing is there
    // eslint-disable-next-line
    if ( ! fs.existsSync(filePath) ) {
        report(`WARNING: Ignore list "${filePath}" not found, ignore list was not loaded`);
        return;
    }

    let ignoreList;
    try {
        // eslint-disable-next-line
        ignoreList = JSON.parse(fs.readFileSync(filePath, 'utf8'));
    } catch ( error ) {
        // Keep read failures (permissions, directory, ...) apart from syntax errors
        fail(
            error instanceof SyntaxError
                ? `Invalid JSON object in "${filePath}" configuration file`
                : `Unable to read "${filePath}" configuration file (${error.message})`
        );
        return;
    }

    // The root has to be an object; null, arrays and primitives carry no lists
    if ( ignoreList === null || typeof ignoreList !== 'object' || Array.isArray(ignoreList) ) {
        fail(`Invalid JSON object in "${filePath}" configuration file`);
        return;
    }

    // Validate both lists first so a bad file never leaves a half-updated state
    const lists = {};
    for ( const key of ['urls', 'domains'] ) {
        const value = ignoreList[key];

        if ( value === undefined || value === null ) {
            lists[key] = [];
            continue;
        }

        // Anything but an array would break the `.filter()` lookups during crawling
        if ( ! Array.isArray(value) || ! value.every(item => typeof item === 'string') ) {
            fail(`"${key}" in "${filePath}" configuration file must be an array of strings`);
            return;
        }

        lists[key] = value;
    }

    ignoreUrls = lists.urls;
    ignoreDomains = lists.domains;
};

/**
* Magic function for the brokenLinks object
Expand Down Expand Up @@ -261,11 +289,12 @@ const crawl = ( crawlUrl, referenceUrl = '' ) => {

// Parse url
const parsedLink = new URL(pageLink);

if (
( ! config['ignore-external'] || ( config['ignore-external'] && parsedLink.host === baseHost ) ) &&
( ! parsedLink.searchParams || ( parsedLink.searchParams && ! config['ignore-query'].filter(query => parsedLink.searchParams.get(query)).length ) ) &&
( ! inboundLinks.filter(item => item.requestUrl === pageLink).length )
( ! inboundLinks.filter(item => item.requestUrl === pageLink).length ) &&
( ! ignoreUrls.filter(item => item === pageLink).length ) &&
( ! ignoreDomains.filter(item => item === parsedLink.host).length )
) {
inboundLinks.push( {
'referenceUrl': requestUrl,
Expand Down Expand Up @@ -325,13 +354,19 @@ module.exports = (argv) => {
'ignore-external': Boolean(argv['ignore-external']),
'include-images': Boolean(argv['include-images']),
'slack-webhook': String(argv['slack-webhook']),
'ignore-list': String(argv['ignore-list']),
};

// Skip nofollow links
if ( argv['ignore-nofollow'] ) {
ignoreProtocols.push('[rel~="nofollow"]');
}

// Read ignore list
if ( argv['ignore-list'] ) {
readIgnoreFile();
}

// Base data
baseUrl = prependHttp(argv._[0], {https: true});
baseHost = new URL(baseUrl).host;
Expand Down