Skip to content

Commit

Permalink
Manage already rewritten URL in dynamic JS URL rewriting
Browse files Browse the repository at this point in the history
This change is needed to properly handle cases where the link is
relative, has already been rewritten, and contains the "original hostname"
in its path
  • Loading branch information
benoit74 committed May 2, 2024
1 parent f2e4263 commit fe62acb
Show file tree
Hide file tree
Showing 6 changed files with 342 additions and 0 deletions.
87 changes: 87 additions & 0 deletions javascript/src/wombatSetup.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,51 @@ export function applyFuzzyRules(path) {
return path;
}

/**
 * Detect (with a heuristic) that `url` has most probably already been rewritten and
 * must hence be kept as-is.
 *
 * Statically rewritten links are relative. When such a link targets another hostname,
 * its path includes that hostname, so resolving it against `orig_url` goes exactly one
 * level too high in the path hierarchy (it only makes sense as a ZIM path).
 *
 * The heuristic requires all three conditions below:
 * - the link has no scheme and starts by going at least one level up (`../`)
 * - the link, when merged with `orig_url`, goes exactly one "path level" too high:
 *   dropping its first segment does not change the resolved URL, but dropping its
 *   two first segments does
 * - the first non relative part of the path (i.e. not `.` or `..`) looks like a
 *   hostname (i.e. it contains a dot)
 *
 * @param {string} original_absolute_url - `url` resolved against `orig_url`
 * @param {string} orig_url - URL against which relative links are resolved
 * @param {object} uri - parsed `url`; only its `scheme` property is read
 *   (presumably the result of `URI.parse(url)` - confirm against caller)
 * @param {string} url - the link to check, exactly as received
 * @returns {boolean} true when the link is assumed to be already rewritten, false
 *   when we don't know and assume it can be safely rewritten
 */
export function hasAlreadyBeenRewritten(
  original_absolute_url,
  orig_url,
  uri,
  url,
) {
  // only scheme-less links starting by going one level up are candidates
  if (typeof uri.scheme !== 'undefined' || !url.startsWith('../')) {
    return false;
  }

  const urlParts = url.split('/');
  const resolvedWithoutFirstPart = URI.resolve(
    orig_url,
    urlParts.slice(1).join('/'),
  );
  const resolvedWithoutTwoFirstParts = URI.resolve(
    orig_url,
    urlParts.slice(2).join('/'),
  );

  // detect that relative link is going exactly one "path level" too high
  if (
    resolvedWithoutFirstPart !== original_absolute_url ||
    resolvedWithoutTwoFirstParts === original_absolute_url
  ) {
    return false;
  }

  // '.' must be skipped as well, otherwise it would wrongly "look like a hostname"
  // simply because it contains a dot
  const firstNonRelativePart = urlParts.find(
    (urlPart) => urlPart !== '..' && urlPart !== '.',
  );

  // a link made only of relative parts (e.g. '../..') has no such part at all and
  // can hence not contain a hostname
  if (firstNonRelativePart === undefined) {
    return false;
  }

  // detect that first non relative part of the path looks like a hostname
  return firstNonRelativePart.includes('.');
}

export function urlRewriteFunction(
current_url, // The current (real) url we are on, e.g. http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/index.html
orig_host, // The host of the original url, e.g. www.example.com
Expand Down Expand Up @@ -61,6 +106,48 @@ export function urlRewriteFunction(
// present ; current URL does not allow to do it easily
const original_absolute_url = URI.resolve(orig_url, url);

// Detect if url has probably already been rewritten and return as-is in such a case
if (hasAlreadyBeenRewritten(original_absolute_url, orig_url, uri, url)) {
return url;
}

// Detect (with a heuristic) that the path has most probably already been rewritten
// and must be kept as-is. We only need to detect relative links (all statically
// rewritten links are relative) whose path includes a hostname. Such a link cannot
// be joined with orig_url: since it includes the hostname, it targets a different
// hostname than orig_url and hence goes one level too high in the path hierarchy,
// so it only works as a ZIM path / relative link.
// The heuristic is:
// - the link must be relative and start by going at least one level up
// - the first non-relative part of the path (i.e. the first part which is not ..)
// looks like a hostname (i.e. it contains a dot)
// - the relative link, when merged with orig_url, goes exactly one "path level"
// too high in the hierarchy
if (typeof uri.scheme == 'undefined' && url.startsWith('../')) {
const urlParts = url.split('/');
const original_absolute_url1 = URI.resolve(
orig_url,
urlParts.slice(1).join('/'),
);
const original_absolute_url2 = URI.resolve(
orig_url,
urlParts.slice(2).join('/'),
);
// detect that relative link is going exactly one "path level" too high
if (
original_absolute_url1 == original_absolute_url &&
original_absolute_url2 != original_absolute_url
) {
const firstNonRelativePart = urlParts.find((urlPart) => urlPart !== '..');
// detect that first non relative part of the path looks like a hostname
if (firstNonRelativePart.indexOf('.') > -1) {
// if all 3 conditions are true, then we do not rewrite the link at all,
// otherwise we continue with normal rewriting
return url;
}
}
}

// We now have to transform this absolute URI into a normalized ZIM path entry
const absolute_url_parts = URI.parse(original_absolute_url);

Expand Down
245 changes: 245 additions & 0 deletions javascript/test/wombatUrlRewriting.js
Original file line number Diff line number Diff line change
Expand Up @@ -882,3 +882,248 @@ test('alreadyRewritenUrlUTF8CharsNotEncoded', (t) => {
'http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/javascript/cont%F0%9F%8E%81nt.txt',
);
});

// Absolute URL whose first path segment is itself a host-like name
// (www.example.com): the expected result keeps that segment in the path.
test('simpleContentDomainNameInPath1', (t) => {
  const { currentUrl, originalHost, originalScheme, originalUrl, prefix } =
    t.context;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    'https://www.example.com/www.example.com/content.txt',
    undefined,
    undefined,
    undefined,
  );
  t.is(
    rewritten,
    'http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/www.example.com/content.txt',
  );
});

// Absolute URL whose last path segment is a host-like name (www.example.com):
// the expected result keeps that segment at the end of the path.
test('simpleContentDomainNameInPath2', (t) => {
  const { currentUrl, originalHost, originalScheme, originalUrl, prefix } =
    t.context;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    'https://www.example.com/javascript/www.example.com',
    undefined,
    undefined,
    undefined,
  );
  t.is(
    rewritten,
    'http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/javascript/www.example.com',
  );
});

// The URL has already been statically rewritten and originally had a query
// parameter (now percent-encoded in the path): the encoding must be preserved.
test('relAlreadyEncoded', (t) => {
  const { currentUrl, originalHost, originalScheme, originalUrl, prefix } =
    t.context;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    '../javascript/content.txt%3Fquery%3Dvalue',
    undefined,
    undefined,
    undefined,
  );
  t.is(
    rewritten,
    'http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/javascript/content.txt%3Fquery%3Dvalue',
  );
});

// Edge case: the URL has already been statically rewritten and is located on a
// different domain name => it must not be touched at all.
test('relAnotherHostAlreadyRewritten', (t) => {
  const { currentUrl, originalHost, originalScheme, originalUrl, prefix } =
    t.context;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    '../../anotherhost.com/javascript/content.txt',
    undefined,
    undefined,
    undefined,
  );
  t.is(rewritten, '../../anotherhost.com/javascript/content.txt');
});

// Edge case: the URL has already been statically rewritten and is located on a
// different domain name, while the current document sits at the root path ('/')
// => it must not be touched at all.
test('relAnotherHostAlreadyRewrittenRootPath', (t) => {
  const { originalHost, originalScheme, prefix } = t.context;
  const documentPath = '/';
  const originalUrl = `${originalScheme}://${originalHost}${documentPath}`;
  const currentUrl = `${prefix}${originalHost}${documentPath}`;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    '../anotherhost.com/javascript/content.txt',
    undefined,
    undefined,
    undefined,
  );
  t.is(rewritten, '../anotherhost.com/javascript/content.txt');
});

// Edge case: the URL has already been statically rewritten and is located on a
// different domain name, while the current document has an empty path ('')
// => it must not be touched at all.
test('relAnotherHostAlreadyRewrittenEmptyPath', (t) => {
  const { originalHost, originalScheme, prefix } = t.context;
  const documentPath = '';
  const originalUrl = `${originalScheme}://${originalHost}${documentPath}`;
  const currentUrl = `${prefix}${originalHost}${documentPath}`;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    '../anotherhost.com/javascript/content.txt',
    undefined,
    undefined,
    undefined,
  );
  t.is(rewritten, '../anotherhost.com/javascript/content.txt');
});

// Edge case: the URL has already been statically rewritten and is located on a
// different, fuzzified domain name => it must not be touched at all.
test('relAnotherFuzzifiedHostAlreadyRewritten', (t) => {
  const { currentUrl, originalHost, originalScheme, originalUrl, prefix } =
    t.context;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    '../../youtube.fuzzy.replayweb.page/get_video_info%3Fvideo_id%3D123ah',
    undefined,
    undefined,
    undefined,
  );
  t.is(
    rewritten,
    '../../youtube.fuzzy.replayweb.page/get_video_info%3Fvideo_id%3D123ah',
  );
});

// Edge case: the URL might look like it has already been statically rewritten,
// since it goes up exactly one level too many, but its first real path segment
// does not look like a hostname at all => it is rewritten again.
test('relTooUpNotLookingLikeAHostname', (t) => {
  const { currentUrl, originalHost, originalScheme, originalUrl, prefix } =
    t.context;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    // one `..` too many; at this stage it means the host home folder
    '../../javascript/content.txt',
    undefined,
    undefined,
    undefined,
  );
  t.is(
    rewritten,
    'http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/javascript/content.txt',
  );
});

// Edge case with the current document at the root path ('/'): the URL might look
// like it has already been statically rewritten, since it goes up exactly one
// level too many, but its first real path segment does not look like a hostname
// at all => it is rewritten again.
test('relTooUpNotLookingLikeAHostnameRootPath', (t) => {
  const { originalHost, originalScheme, prefix } = t.context;
  const documentPath = '/';
  const originalUrl = `${originalScheme}://${originalHost}${documentPath}`;
  const currentUrl = `${prefix}${originalHost}${documentPath}`;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    '../javascript/content.txt',
    undefined,
    undefined,
    undefined,
  );
  t.is(
    rewritten,
    'http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/javascript/content.txt',
  );
});

// Edge case with the current document at an empty path (''): the URL might look
// like it has already been statically rewritten, since it goes up exactly one
// level too many, but its first real path segment does not look like a hostname
// at all => it is rewritten again.
test('relTooUpNotLookingLikeAHostnameEmptyPath', (t) => {
  const { originalHost, originalScheme, prefix } = t.context;
  const documentPath = '';
  const originalUrl = `${originalScheme}://${originalHost}${documentPath}`;
  const currentUrl = `${prefix}${originalHost}${documentPath}`;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    '../javascript/content.txt',
    undefined,
    undefined,
    undefined,
  );
  t.is(
    rewritten,
    'http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/javascript/content.txt',
  );
});

// Edge case: the URL might look like it has already been statically rewritten and
// is located on a different domain name, but it goes way too far up in the
// hierarchy, so it has most probably not really been rewritten yet => it is
// rewritten again.
test('relNotReallyAnotherHostAlreadyRewrittenTooUp', (t) => {
  const { currentUrl, originalHost, originalScheme, originalUrl, prefix } =
    t.context;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    '../../../anotherhost.com/javascript/content.txt',
    undefined,
    undefined,
    undefined,
  );
  t.is(
    rewritten,
    'http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/anotherhost.com/javascript/content.txt',
  );
});

// Edge case: the URL might look like it has already been statically rewritten and
// is located on a different domain name, but it does not go far enough up in the
// hierarchy, so it has most probably not really been rewritten yet => it is
// rewritten again.
test('relNotReallyAnotherHostAlreadyRewrittenNotUpEnough', (t) => {
  const { currentUrl, originalHost, originalScheme, originalUrl, prefix } =
    t.context;
  const rewritten = urlRewriteFunction(
    currentUrl,
    originalHost,
    originalScheme,
    originalUrl,
    prefix,
    '../anotherhost.com/javascript/content.txt',
    undefined,
    undefined,
    undefined,
  );
  t.is(
    rewritten,
    'http://library.kiwix.org/content/myzim_yyyy-mm/www.example.com/anotherhost.com/javascript/content.txt',
  );
});
5 changes: 5 additions & 0 deletions test-website/content/javascript.html
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ <h2>Various Javascript</h2>
09d. Simple module Javascript with importmap required but nasty URL used in JS: <span class="status" id="span09d">This is not working</span>
</p>

<p>
10. Rewriting image src dynamically based on already statically rewritten link: <img id="img10" src="https://not_standard_netloc_punny_encoded/javascript/not_working.png">
</p>

<script>
fetch("https://standard_netloc/javascript/content.txt")
.then(function(response) {
Expand Down Expand Up @@ -425,6 +429,7 @@ <h2>Various Javascript</h2>
<script type="module" src="./javascript/script09b.js"></script>
<script type="module" src="./javascript/script09c.js"></script>
<script type="module" src="./javascript/script09d.js"></script>
<script src="./javascript/script10.js"></script>

</body>

Expand Down
Binary file added test-website/content/javascript/not_working.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 5 additions & 0 deletions test-website/content/javascript/script10.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Test case 10: read the current `src` attribute of the #img10 element, replace
// 'not_working' with 'working' in it, log the change and assign the new value back
// through the `src` property.
const img10 = document.getElementById('img10');
const origSrc = img10.getAttribute('src');
const newSrc = origSrc.replace('not_working', 'working');
console.debug(`Replacing ${origSrc} with ${newSrc}`);
img10.src = newSrc;
Binary file added test-website/content/javascript/working.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit fe62acb

Please sign in to comment.