url-sanitizer, thoughts on lookups?
Uzlopak opened this issue · 3 comments
Uzlopak commented
@ivan-tymoshenko
What do you think about such a construction?
Also the most expensive operation seems to be te charCodeAt()-call. So maybe if we loaded highCharCode already and skip it if we should not decode that character?
'use strict'
const highCharCodeDefault = new Array(65536)
for (let i = 0; i < 65536; ++i) {
highCharCodeDefault[i] = null
}
const decodeLookup = new Array(65536)
for (let i = 0; i < 65536; ++i) {
decodeLookup[i] = highCharCodeDefault
}
decodeLookup[50] = highCharCodeDefault.slice()
decodeLookup[50][53] = '%'
decodeLookup[50][51] = '#'
decodeLookup[50][52] = '$'
decodeLookup[50][54] = '&'
decodeLookup[50][66] = '+'
decodeLookup[50][98] = '+'
decodeLookup[50][67] = ','
decodeLookup[50][99] = ','
decodeLookup[50][70] = '/'
decodeLookup[50][102] = '/'
decodeLookup[51] = highCharCodeDefault.slice()
decodeLookup[51][65] = ':'
decodeLookup[51][97] = ':'
decodeLookup[51][66] = ';'
decodeLookup[51][98] = ';'
decodeLookup[51][68] = '='
decodeLookup[51][100] = '='
decodeLookup[51][70] = '?'
decodeLookup[51][102] = '?'
decodeLookup[52] = highCharCodeDefault.slice()
decodeLookup[52][48] = '@'
const shouldHighCharCodeDefault = new Array(65536)
for (let i = 0; i < 65536; ++i) {
shouldHighCharCodeDefault[i] = false
}
const shouldDecodeLookup = new Array(65536)
for (let i = 0; i < 65536; ++i) {
shouldDecodeLookup[i] = shouldHighCharCodeDefault
}
shouldDecodeLookup[50] = shouldHighCharCodeDefault.slice()
shouldDecodeLookup[50][53] = true // '%'
shouldDecodeLookup[50][51] = true // '#'
shouldDecodeLookup[50][52] = true // '$'
shouldDecodeLookup[50][54] = true // '&'
shouldDecodeLookup[50][66] = true // '+'
shouldDecodeLookup[50][98] = true // '+'
shouldDecodeLookup[50][67] = true // ','
shouldDecodeLookup[50][99] = true // ','
shouldDecodeLookup[50][70] = true // '/'
shouldDecodeLookup[50][102] = true // '/'
shouldDecodeLookup[51] = shouldHighCharCodeDefault.slice()
shouldDecodeLookup[51][65] = true // ':'
shouldDecodeLookup[51][97] = true // ':'
shouldDecodeLookup[51][66] = true // ';'
shouldDecodeLookup[51][98] = true // ';'
shouldDecodeLookup[51][68] = true // '='
shouldDecodeLookup[51][100] = true // '='
shouldDecodeLookup[51][70] = true // '?'
shouldDecodeLookup[51][102] = true // '?'
shouldDecodeLookup[52] = shouldHighCharCodeDefault.slice()
shouldDecodeLookup[52][48] = true // '@'
const shouldDecodeHighCharCodeLookup = new Array(65536)
for (let i = 0; i < 65536; ++i) {
shouldDecodeHighCharCodeLookup[i] = i === 50 || i === 51 || i === 52
}
const rfcNonConform = new Array(65536)
for (let i = 0; i < 65536; ++i) {
rfcNonConform[i] = i === 63 || i === 59 || i === 35
}
function safeDecodeURI (path) {
let shouldDecode = false
let shouldDecodeParam = false
let querystring = ''
for (let i = 1; i < path.length; i++) {
const charCode = path.charCodeAt(i)
if (charCode === 37) {
const highCharCode = path.charCodeAt(i + 1)
const lowCharCode = (shouldDecodeHighCharCodeLookup[highCharCode] && path.charCodeAt(i + 2)) || 0
if (shouldDecodeLookup[highCharCode][lowCharCode]) {
shouldDecodeParam = true
// %25 - encoded % char. We need to encode one more time to prevent double decoding
if (highCharCode === 50 && lowCharCode === 53) {
shouldDecode = true
path = path.slice(0, i + 1) + '25' + path.slice(i + 1)
i += 2
}
i += 2
} else {
shouldDecode = true
}
// Some systems do not follow RFC and separate the path and query
// string with a `;` character (code 59), e.g. `/foo;jsessionid=123456`.
// Thus, we need to split on `;` as well as `?` and `#`.
} else if (rfcNonConform[charCode]) {
querystring = path.slice(i + 1)
path = path.slice(0, i)
break
}
}
const decodedPath = shouldDecode ? decodeURI(path) : path
return { path: decodedPath, querystring, shouldDecodeParam }
}
function safeDecodeURIComponent (uriComponent) {
const startIndex = uriComponent.indexOf('%')
if (startIndex === -1) return uriComponent
let decoded = ''
let lastIndex = startIndex
const len = uriComponent.length
for (let i = startIndex; i < len; i++) {
if (uriComponent.charCodeAt(i) === 37) {
const highCharCode = uriComponent.charCodeAt(i + 1)
const lowCharCode = (shouldDecodeHighCharCodeLookup[highCharCode] && uriComponent.charCodeAt(i + 2)) || 0
const decodedChar = decodeLookup[highCharCode][lowCharCode]
decoded += uriComponent.slice(lastIndex, i) + decodedChar
lastIndex = i + 3
}
}
return uriComponent.slice(0, startIndex) + decoded + uriComponent.slice(lastIndex)
}
module.exports = { safeDecodeURI, safeDecodeURIComponent }
ivan-tymoshenko commented
It looks a bit expensive. And you still call the charCodeAt
for each char in the path, right?
for (let i = 1; i < path.length; i++) {
const charCode = path.charCodeAt(i) // <-- still have the call
if (charCode === 37) {
const highCharCode = path.charCodeAt(i + 1) // <-- this call is not really expansive, bc it's not a part of the hot path. It's called only when you have an encoded char in the path
Uzlopak commented
Maybe we can avoid the lowCharCode when highCharCode is not 50,51 or 52?
ivan-tymoshenko commented
I would say it doesn't worth it. It's a one additional charCodeAt
call when you meet an encoded char and it's not from the list. It's definitely not a hot path. Theoratically you can reduce this call, but it might kill the code readability.