Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SEO tweaks #491

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
6 changes: 6 additions & 0 deletions extension.json
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@
"ArticleViewHeader": {
"handler": "MirahezeMagicHooks"
},
"BeforePageDisplay": {
"handler": "MirahezeMagicHooks"
},
"BlockIpComplete": {
"handler": "MirahezeMagicHooks"
},
Expand Down Expand Up @@ -103,6 +106,9 @@
"GlobalUserPageWikis": {
"handler": "MirahezeMagicHooks"
},
"HtmlPageLinkRendererEnd": {
"handler": "MirahezeMagicHooks"
},
"ImportDumpJobAfterImport": {
"handler": "MirahezeMagicHooks"
},
Expand Down
120 changes: 120 additions & 0 deletions includes/Hooks.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
use MediaWiki\Extension\AbuseFilter\Hooks\AbuseFilterShouldFilterActionHook;
use MediaWiki\Extension\AbuseFilter\Variables\VariableHolder;
use MediaWiki\Extension\CentralAuth\User\CentralAuthUser;
use MediaWiki\Hook\BeforePageDisplayHook;
use MediaWiki\Hook\BlockIpCompleteHook;
colleirose marked this conversation as resolved.
Show resolved Hide resolved
use MediaWiki\Hook\ContributionsToolLinksHook;
use MediaWiki\Hook\GetLocalURL__InternalHook;
Expand All @@ -23,6 +24,7 @@
use MediaWiki\Hook\SkinAddFooterLinksHook;
use MediaWiki\Html\Html;
use MediaWiki\Http\HttpRequestFactory;
use MediaWiki\Linker\Hook\HtmlPageLinkRendererEndHook;
use MediaWiki\Linker\Linker;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
Expand Down Expand Up @@ -58,6 +60,7 @@
class Hooks implements
AbuseFilterShouldFilterActionHook,
ArticleViewHeaderHook,
BeforePageDisplayHook,
BlockIpCompleteHook,
ContributionsToolLinksHook,
CreateWikiDeletionHook,
Expand All @@ -67,6 +70,7 @@ class Hooks implements
CreateWikiTablesHook,
CreateWikiWritePersistentModelHook,
GetLocalURL__InternalHook,
HtmlPageLinkRendererEndHook,
ImportDumpJobAfterImportHook,
ImportDumpJobGetFileHook,
MessageCacheFetchOverridesHook,
Expand Down Expand Up @@ -148,6 +152,122 @@ public static function factory(
);
}

/**
* Add nofollow to redlinks to prevent search engines from following these links
* to save crawl budget because redlinks are noindex 404 pages
*
* Crawl budget is not a worry for most sites, but on Miraheze,
* wikis can have thousands of pages, making it a valid concern.
*
* @see https://github.com/marohh/mediawikiRemoveRedlinks/blob/master/includes/RemoveRedlinks.php
* @see https://ahrefs.com/blog/crawl-budget/
*
* @param LinkRenderer $linkRenderer The LinkRenderer object
* @param LinkTarget $target The target of the link
* @param bool $isKnown Whether the page exists or not
* @param HtmlArmor|string &$text The contents of the <a> tag
* @param string[] &$attribs Link attributes
* @param string &$ret The value to return if the hook returns false
*
* @return bool
*/
colleirose marked this conversation as resolved.
Show resolved Hide resolved
public function onHtmlPageLinkRendererEnd(
	$linkRenderer,
	$target,
	$isKnown,
	&$text,
	&$attribs,
	&$ret
) {
	// Only redlinks are touched: links to known pages and links whose target
	// reports itself as external keep their attributes exactly as they are.
	if ( $isKnown || $target->isExternal() ) {
		return true;
	}

	// Merge 'nofollow' into any rel value that is already present instead of
	// overwriting it, so tokens set earlier for this link are not silently lost.
	// A missing or non-string rel falls back to plain 'nofollow'.
	$existingRel = $attribs['rel'] ?? '';
	$relTokens = is_string( $existingRel )
		? preg_split( '/\s+/', $existingRel, -1, PREG_SPLIT_NO_EMPTY )
		: [];

	// rel tokens are case-insensitive, so compare in lower case to avoid
	// appending a duplicate such as "NoFollow nofollow".
	if ( !in_array( 'nofollow', array_map( 'strtolower', $relTokens ), true ) ) {
		$relTokens[] = 'nofollow';
	}

	$attribs['rel'] = implode( ' ', $relTokens );

	// Returning true lets the link renderer build the link with the
	// (possibly modified) attributes; $ret is intentionally left alone.
	return true;
}
colleirose marked this conversation as resolved.
Show resolved Hide resolved

/**
* Add noindex to some pages for SEO purposes. Indexing these pages is bad for SEO because
* it wastes crawl budget, so we'll reduce the indexing of such pages.
*
* @see https://ahrefs.com/blog/content-pruning/
* @see https://ahrefs.com/blog/crawl-budget/
* @see https://gitlab.com/hydrawiki/extensions/seo/-/blob/master/SEOHooks.php?ref_type=heads
colleirose marked this conversation as resolved.
Show resolved Hide resolved
*
* @param OutputPage $out The OutputPage object
* @param Skin $skin The Skin object that will be used to generate the page
*/
public function onBeforePageDisplay( $out, $skin ): void {
	// Namespaces listed here are always served with "noindex,nofollow".
	//
	// If something has absolutely no value to someone searching on Google, then include it
	// in $noIndexNamespaces. For example, a wiki's CSS styles, a user's user page, the talk
	// page of an article, etc. do not need to appear in Google, because they are not
	// something you'd actually look up to find any kind of useful information.
	//
	// Templates and modules are deliberately left out: some wikis, like devwiki, may want
	// their templates to be indexed. Not every possible talk namespace is listed either,
	// because some wikis might keep troubleshooting instructions or other interesting
	// things there.
	//
	// In general, if you can think of any wiki where it might be useful to index a
	// namespace, it's best not to add it here.
	//
	// TODO: Change ManageWiki so that some namespaces default to noindex, then remove this.
	$noIndexNamespaces = [
		NS_SPECIAL,
		NS_CATEGORY_TALK,
		NS_MEDIAWIKI,
		NS_MEDIAWIKI_TALK,
		NS_USER,
		NS_USER_TALK,
		NS_TALK,
		NS_FILE_TALK,
		NS_PROJECT_TALK,
		NS_TEMPLATE_TALK,
	];

	if ( in_array( $out->getTitle()->getNamespace(), $noIndexNamespaces, true ) ) {
		$out->setRobotPolicy( 'noindex,nofollow' );
		return;
	}

	// Request parameters whose mere presence marks the page as noindex,
	// whatever their value is.
	$noIndexURLParamKeys = [
		'action',
		'curid',
		'diff',
		'from',
		'group',
		'mobileaction',
		'oldid',
		'printable',
		'profile',
		'redirect',
		'redlink',
		'stableid',
		'veaction',
	];

	// Request parameters that mark the page as noindex only when they carry
	// one of the listed values.
	$noIndexURLParamKeyValuePairs = [
		'feed' => [ 'rss' ],
		'limit' => [ '500' ],
		'title' => [
			'Category:Noindexed_pages',
			'Category:Hidden_categories',
		],
	];

	foreach ( $out->getRequest()->getValues() as $key => $value ) {
		// PHP turns numeric-looking array keys into integers. Normalise the key to a
		// string and compare strictly, so a parameter such as "?0=x" can never be
		// loosely equal to one of the names above.
		$key = (string)$key;

		// Values are compared strictly as well: only an exact string match counts,
		// and array-valued parameters (e.g. "feed[]=rss") never match.
		if (
			in_array( $key, $noIndexURLParamKeys, true ) ||
			in_array( $value, $noIndexURLParamKeyValuePairs[$key] ?? [], true )
		) {
			$out->setRobotPolicy( 'noindex,nofollow' );
			return;
		}
	}
}
colleirose marked this conversation as resolved.
Show resolved Hide resolved

/**
* Avoid filtering automatic account creation
*
Expand Down