Skip to content

Commit

Permalink
Fixes for more URL compliance
Browse files Browse the repository at this point in the history
  • Loading branch information
dankmolot committed Jun 17, 2024
1 parent ab9b667 commit 598a2b6
Showing 1 changed file with 71 additions and 24 deletions.
95 changes: 71 additions & 24 deletions url.yue
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,8 @@ isWindowsDriveLetterCodePoints = (ch1, ch2, normalized) -> isAlpha(ch1) and (ch2
isWindowsDriveLetter = (str, normalized) -> return #str == 2 and isWindowsDriveLetterCodePoints(byte(str, 1), byte(str, 2), normalized)
-- Reports whether the slice str[startPos..endPos] starts with a Windows drive letter.
--
-- Mirrors the URL Standard definition: the slice must be at least two bytes long,
-- its first two bytes must form a Windows drive letter, and it must either be
-- exactly two bytes long or be followed by a byte listed in FILE_OTHERWISE_CODE_POINTS
-- (presumably '/', '\', '?' and '#' - confirm against the table definition).
--
-- @param str      string being parsed
-- @param startPos index of the first byte of the slice (1-based, inclusive)
-- @param endPos   index of the last byte of the slice (inclusive)
-- @return a truthy value when the slice starts with a drive letter, a falsy value otherwise
startsWithWindowsDriveLetter = (str, startPos, endPos) ->
    len = endPos - startPos + 1
    -- `>= 2` (not `> 2`): a bare two-byte drive letter such as "C:" must match,
    -- otherwise the `len == 2` alternative below could never be reached.
    -- `false` is passed explicitly as the `normalized` flag of isWindowsDriveLetterCodePoints.
    return len >= 2 and
        isWindowsDriveLetterCodePoints(byte(str, startPos), byte(str, startPos + 1), false) and
        (len == 2 or FILE_OTHERWISE_CODE_POINTS[byte(str, startPos + 2)])

-- Converts character to digit,
Expand Down Expand Up @@ -198,6 +198,7 @@ do
else UTF8_DECODE_LOOKUP[rule[2]] = rule[1]

utf8Decode = (str, startPos, endPos) ->
-- TODO add support for fullwidth utf8/utf16
output = []
count = state = codep = 0
for i = startPos, endPos
Expand Down Expand Up @@ -475,7 +476,27 @@ parseIPv4 = (str, startPos, endPos) ->
return ipv4

domainToASCII = (domain) ->
containsNonASCII = false
for i = 1, #domain
if byte(domain, i) > 0x7F
-- We are dealing with some complicated unicode domain name
-- Since I am a lazy newbie who does not want to implement proper Unicode
-- handling in Lua, I'll just cover the edge cases needed by the tests
-- You can open an issue on GitHub if it REALLY BOTHERS YOU
-- But if it REALLY BOTHERS YOU then feel free to make proper unicode support
-- yourself :)

-- Remove special symbols that are ignored
-- I probably really should implement some proper punycode
domain = gsub(domain, "\xC2\xAD", "") -- remove soft hyphen
domain = gsub(domain, "\xE3\x80\x82", ".") -- Ideographic full stop
-- remove space characters
domain = gsub(domain, "\xE2\x80\x8B", "")
domain = gsub(domain, "\xE2\x81\xA0", "")
domain = gsub(domain, "\xEF\xBB\xBF", "")

break

containsNonASCII = doLowerCase = false
punycodePrefix = 0
partStart = pointer = 1
parts = []
Expand All @@ -486,16 +507,24 @@ domainToASCII = (domain) ->
-- decode and find errors
if containsNonASCII then error "Invalid URL: Domain invalid code point"

parts[] = containsNonASCII and "xn--" .. punycodeEncode(domain, partStart, pointer - 1) or sub(domain, partStart, pointer - 1)
domainPart = containsNonASCII and "xn--" .. punycodeEncode(domain, partStart, pointer - 1) or sub(domain, partStart, pointer - 1)
-- btw, punycode decoding lowercases the domain, so we need to lowercase it ourselves
-- in an ideal situation I should have written punycodeDecode, but I am not in the mood to write it
if doLowerCase
domainPart = lower(domainPart)

parts[] = domainPart
partStart = pointer + 1
containsNonASCII = false
containsNonASCII = doLowerCase = false
punycodePrefix = 0
if not ch
break
elseif ch > 0x7F
containsNonASCII = true
elseif PUNYCODE_PREFIX[pointer - partStart + 1] == ch
punycodePrefix += 1
elseif isUpper(ch)
doLowerCase = true

pointer += 1

Expand Down Expand Up @@ -533,7 +562,7 @@ local parseQuery, parseFragment

parseScheme = (str, startPos, endPos, base, stateOverride) =>
-- scheme start state
if isAlpha byte(str, startPos)
if startPos <= endPos and isAlpha byte(str, startPos)
-- scheme state
doLowerCase = false
scheme = nil
Expand Down Expand Up @@ -575,7 +604,7 @@ parseScheme = (str, startPos, endPos, base, stateOverride) =>
return
elseif isUpper(ch)
doLowerCase = true
elseif not isLower(ch) and ch != 0x2B --[['+']] and ch != 0x2D --[['-']] and ch != 0x2E --[['.']]
elseif not isLower(ch) and not isDigit(ch) and ch != 0x2B --[['+']] and ch != 0x2D --[['-']] and ch != 0x2E --[['.']]
-- scheme has an invalid character, so it's not a scheme
break

Expand All @@ -595,7 +624,7 @@ parseNoScheme = (str, startPos, endPos, base) =>
@query = base.query
parseFragment(@, str, startPos + 1, endPos)
elseif base.scheme != "file"
parseRelative(@, str, startPos, endPos, base)
parseRelative(@, str, startPos, endPos, base, SPECIAL_SCHEMAS[base.scheme])
else
@scheme = "file"
parseFile(@, str, startPos, endPos, base)
Expand All @@ -617,7 +646,7 @@ parsePathOrAuthority = (str, startPos, endPos, base) =>

parseRelative = (str, startPos, endPos, base, isSpecial) =>
@scheme = base.scheme
ch = byte(str, startPos)
ch = startPos <= endPos and byte(str, startPos)
if ch == 0x2F --[['/']] or (isSpecial and ch == 0x5C --[['\']])
-- relative slash state
parseRelativeSlash(@, str, startPos + 1, endPos, base, isSpecial)
Expand All @@ -626,7 +655,7 @@ parseRelative = (str, startPos, endPos, base, isSpecial) =>
@password = base.password
@hostname = base.hostname
@port = base.port
path = @path = {...base.path} -- clone path
path = @path = {...(base.path or {})} -- clone path
if ch == 0x3F --[['?']]
parseQuery(@, str, startPos + 1, endPos)
elseif ch == 0x23 --[['#']]
Expand Down Expand Up @@ -665,14 +694,15 @@ parseAuthority = (str, startPos, endPos, isSpecial) =>
-- authority state
atSignSeen = false
passwordTokenSeen = false
pathEndPos = endPos
for i = startPos, endPos
ch = byte(str, i)
if ch == 0x2F --[['/']] or ch == 0x3F --[['?']] or ch == 0x23 --[['#']] or (isSpecial and ch == 0x5C --[['\']])
endPos = i - 1
break
elseif ch == 0x40 -- '@'
atSignSeen = i
elseif ch == 0x3A --[[':']] and not passwordTokenSeen
elseif ch == 0x3A --[[':']] and not passwordTokenSeen and not atSignSeen
passwordTokenSeen = i

-- After @ there is no hostname
Expand All @@ -687,7 +717,7 @@ parseAuthority = (str, startPos, endPos, isSpecial) =>
@username = percentEncode(sub(str, startPos, atSignSeen - 1), USERINFO_ENCODE_SET)

parseHost(@, str, atSignSeen and atSignSeen + 1 or startPos, endPos, isSpecial)
parsePathStart(@, str, endPos + 1, endPos, isSpecial)
parsePathStart(@, str, endPos + 1, pathEndPos, isSpecial)

parseHost = (str, startPos, endPos, isSpecial, stateOverride) =>
if stateOverride and isSpecial == true
Expand Down Expand Up @@ -719,6 +749,9 @@ parseHost = (str, startPos, endPos, isSpecial, stateOverride) =>
@hostname = parseHostString(str, startPos, endPos, isSpecial)

parsePort = (str, startPos, endPos, defaultPort, stateOverride) =>
if startPos > endPos
return

port = tonumber sub(str, startPos, endPos)
if not port or (port > 2 ^ 16 - 1) or port < 0
if stateOverride then
Expand All @@ -731,12 +764,12 @@ parsePort = (str, startPos, endPos, defaultPort, stateOverride) =>
parseFile = (str, startPos, endPos, base) =>
@scheme = "file"
@hostname = ""
ch = byte(str, startPos)
ch = startPos <= endPos and byte(str, startPos)
if ch == 0x2F --[['/']] or ch == 0x5C --[['\']]
parseFileSlash(@, str, startPos + 1, endPos, base)
elseif base and base.scheme == "file"
@hostname = base.hostname
path = @path = {...base.path}
path = @path = {...(base.path or {})}
if ch == 0x3F --[['?']]
parseQuery(@, str, startPos + 1, endPos)
elseif ch == 0x23 --[['#']]
Expand Down Expand Up @@ -768,7 +801,7 @@ parseFileSlash = (str, startPos, endPos, base) =>
parseFileHost = (str, startPos, endPos, stateOverride) =>
i = startPos
while true
ch = byte(str, i)
ch = i <= endPos and byte(str, i)
if ch == 0x2F --[['/']] or ch == 0x5C --[['\']] or ch == 0x3F --[['?']] or ch == 0x23 --[['#']] or not ch -- EOF
hostLen = i - startPos
if not stateOverride and hostLen == 2 and isWindowsDriveLetterCodePoints(byte(str, startPos), byte(str, startPos + 1), false)
Expand All @@ -780,7 +813,7 @@ parseFileHost = (str, startPos, endPos, stateOverride) =>

parsePathStart(@, str, i, endPos, true)
else
hostname = parseHostString(str, startPos, i - 1, false)
hostname = parseHostString(str, startPos, i - 1, true)
if hostname == "localhost"
hostname = ""

Expand All @@ -793,7 +826,7 @@ parseFileHost = (str, startPos, endPos, stateOverride) =>
i += 1

parsePathStart = (str, startPos, endPos, isSpecial, stateOverride) =>
ch = byte(str, startPos)
ch = startPos <= endPos and byte(str, startPos)
if isSpecial
if ch == 0x2F --[['/']] or ch == 0x5C --[['\']]
startPos += 1
Expand All @@ -810,26 +843,33 @@ parsePathStart = (str, startPos, endPos, isSpecial, stateOverride) =>
@path[] = "" -- append empty string to path

parsePath = (str, startPos, endPos, isSpecial, segments={}, stateOverride) =>
hasWindowsLetter = false
segmentsCount = #segments
hasWindowsLetter = segmentsCount != 0 and isWindowsDriveLetter(segments[1], false)
segmentStart = startPos

i = startPos
while true
ch = byte(str, i)
ch = i <= endPos and byte(str, i)
if ch == 0x2F --[['/']] or (isSpecial and ch == 0x5C --[['\']]) or (not stateOverride and (ch == 0x3F --[['?']] or ch == 0x23 --[['#']])) or not ch -- EOF
segment = percentEncode(sub(str, segmentStart, i - 1), PATH_ENCODE_SET)
segmentStart = i + 1
if isDoubleDot(segment)
if segmentsCount != 1 or not hasWindowsLetter
segments[segmentsCount] = nil
segmentsCount -= 1
if segmentsCount == -1 then segmentsCount = 0 -- do not allow underflow
if ch != 0x2F --[['/']] and (isSpecial and ch != 0x5C --[['\']])
segmentsCount += 1
segments[segmentsCount] = ""
elseif not isSingleDot(segment)
if isSpecial == true --[[is file scheme]] and segmentsCount == 0 and isWindowsDriveLetter(segment, false)
segment = gsub(segment, "|", ":")
hasWindowsLetter = true
segmentsCount += 1
segments[segmentsCount] = segment
elseif ch != 0x2F --[['/']] and (isSpecial and ch != 0x5C --[['\']])
segmentsCount += 1
segments[segmentsCount] = ""

if ch == 0x3F --[['?']]
parseQuery(@, str, i + 1, endPos)
Expand Down Expand Up @@ -889,10 +929,8 @@ export parse = (str, base) =>
startPos = trimInput(str, startPos, endPos)
endPos = trimInput(str, endPos, startPos)

if startPos > endPos
error "Invalid URL: Empty URL"

parseScheme(@, str, startPos, endPos, base)
return @

serializeIPv6 = (address) ->
output = []
Expand Down Expand Up @@ -984,12 +1022,19 @@ getOrigin = =>
switch @scheme
when "ftp", "http", "https", "ws", "wss"
return @scheme, @hostname, @port
when "blob"
pathURL = @path
if not isstring pathURL
return
ok, url = pcall(parse, {}, pathURL)
if ok
return getOrigin(url)
-- otherwise it is opaque

serializeOrigin = =>
scheme, hostname, port = getOrigin()
scheme, hostname, port = getOrigin(@)
if scheme
output = scheme .. "://" .. hostname
output = scheme .. "://" .. serializeHost(hostname)
if port
output = output .. ":" .. port
return output
Expand Down Expand Up @@ -1118,3 +1163,5 @@ export class URL
-- Checks whether a value is an instance of the URL class.
-- The check looks at the value's metatable and compares its `__class` field with URL.
--
-- @param any value of any type
-- @return true when the metatable's `__class` is URL; a falsy value otherwise
--         (including when `any` has no metatable)
export IsURL = ( any ) ->
    mt = getmetatable( any )
    return mt and mt.__class == URL

-- print URL(" \t", "http://example.org/foo/bar").href

0 comments on commit 598a2b6

Please sign in to comment.