url-sanitizer, thoughts on lookups?

Question

url-sanitizer, thoughts on lookups?

Uzlopak opened this issue 2 years ago · 3 comments

@ivan-tymoshenko
What do you think about such a construction?

Also the most expensive operation seems to be te charCodeAt()-call. So maybe if we loaded highCharCode already and skip it if we should not decode that character?

'use strict'

const highCharCodeDefault = new Array(65536)
for (let i = 0; i < 65536; ++i) {
  highCharCodeDefault[i] = null
}

const decodeLookup = new Array(65536)
for (let i = 0; i < 65536; ++i) {
  decodeLookup[i] = highCharCodeDefault
}

decodeLookup[50] = highCharCodeDefault.slice()
decodeLookup[50][53] = '%'
decodeLookup[50][51] = '#'
decodeLookup[50][52] = '$'
decodeLookup[50][54] = '&'
decodeLookup[50][66] = '+'
decodeLookup[50][98] = '+'
decodeLookup[50][67] = ','
decodeLookup[50][99] = ','
decodeLookup[50][70] = '/'
decodeLookup[50][102] = '/'

decodeLookup[51] = highCharCodeDefault.slice()
decodeLookup[51][65] = ':'
decodeLookup[51][97] = ':'
decodeLookup[51][66] = ';'
decodeLookup[51][98] = ';'
decodeLookup[51][68] = '='
decodeLookup[51][100] = '='
decodeLookup[51][70] = '?'
decodeLookup[51][102] = '?'

decodeLookup[52] = highCharCodeDefault.slice()
decodeLookup[52][48] = '@'

const shouldHighCharCodeDefault = new Array(65536)
for (let i = 0; i < 65536; ++i) {
  shouldHighCharCodeDefault[i] = false
}

const shouldDecodeLookup = new Array(65536)
for (let i = 0; i < 65536; ++i) {
  shouldDecodeLookup[i] = shouldHighCharCodeDefault
}

shouldDecodeLookup[50] = shouldHighCharCodeDefault.slice()
shouldDecodeLookup[50][53] = true // '%'
shouldDecodeLookup[50][51] = true // '#'
shouldDecodeLookup[50][52] = true // '$'
shouldDecodeLookup[50][54] = true // '&'
shouldDecodeLookup[50][66] = true // '+'
shouldDecodeLookup[50][98] = true // '+'
shouldDecodeLookup[50][67] = true // ','
shouldDecodeLookup[50][99] = true // ','
shouldDecodeLookup[50][70] = true // '/'
shouldDecodeLookup[50][102] = true // '/'

shouldDecodeLookup[51] = shouldHighCharCodeDefault.slice()
shouldDecodeLookup[51][65] = true // ':'
shouldDecodeLookup[51][97] = true // ':'
shouldDecodeLookup[51][66] = true // ';'
shouldDecodeLookup[51][98] = true // ';'
shouldDecodeLookup[51][68] = true // '='
shouldDecodeLookup[51][100] = true // '='
shouldDecodeLookup[51][70] = true // '?'
shouldDecodeLookup[51][102] = true // '?'

shouldDecodeLookup[52] = shouldHighCharCodeDefault.slice()
shouldDecodeLookup[52][48] = true // '@'

const shouldDecodeHighCharCodeLookup = new Array(65536)
for (let i = 0; i < 65536; ++i) {
  shouldDecodeHighCharCodeLookup[i] = i === 50 || i === 51 || i === 52
}

const rfcNonConform = new Array(65536)
for (let i = 0; i < 65536; ++i) {
  rfcNonConform[i] = i === 63 || i === 59 || i === 35
}

function safeDecodeURI (path) {
  let shouldDecode = false
  let shouldDecodeParam = false

  let querystring = ''

  for (let i = 1; i < path.length; i++) {
    const charCode = path.charCodeAt(i)

    if (charCode === 37) {
      const highCharCode = path.charCodeAt(i + 1)

      const lowCharCode = (shouldDecodeHighCharCodeLookup[highCharCode] && path.charCodeAt(i + 2)) || 0

      if (shouldDecodeLookup[highCharCode][lowCharCode]) {
        shouldDecodeParam = true
        // %25 - encoded % char. We need to encode one more time to prevent double decoding
        if (highCharCode === 50 && lowCharCode === 53) {
          shouldDecode = true
          path = path.slice(0, i + 1) + '25' + path.slice(i + 1)
          i += 2
        }
        i += 2
      } else {
        shouldDecode = true
      }
      // Some systems do not follow RFC and separate the path and query
      // string with a `;` character (code 59), e.g. `/foo;jsessionid=123456`.
      // Thus, we need to split on `;` as well as `?` and `#`.
    } else if (rfcNonConform[charCode]) {
      querystring = path.slice(i + 1)
      path = path.slice(0, i)
      break
    }
  }
  const decodedPath = shouldDecode ? decodeURI(path) : path
  return { path: decodedPath, querystring, shouldDecodeParam }
}

function safeDecodeURIComponent (uriComponent) {
  const startIndex = uriComponent.indexOf('%')
  if (startIndex === -1) return uriComponent

  let decoded = ''
  let lastIndex = startIndex
  const len = uriComponent.length

  for (let i = startIndex; i < len; i++) {
    if (uriComponent.charCodeAt(i) === 37) {
      const highCharCode = uriComponent.charCodeAt(i + 1)

      const lowCharCode = (shouldDecodeHighCharCodeLookup[highCharCode] && uriComponent.charCodeAt(i + 2)) || 0

      const decodedChar = decodeLookup[highCharCode][lowCharCode]
      decoded += uriComponent.slice(lastIndex, i) + decodedChar

      lastIndex = i + 3
    }
  }
  return uriComponent.slice(0, startIndex) + decoded + uriComponent.slice(lastIndex)
}

module.exports = { safeDecodeURI, safeDecodeURIComponent }

Answer 1 · 2023-02-08T14:30:14.000Z

It looks a bit expensive. And you still call the charCodeAt for each char in the path, right?

  for (let i = 1; i < path.length; i++) {
    const charCode = path.charCodeAt(i) // <-- still have the call 

    if (charCode === 37) {
      const highCharCode = path.charCodeAt(i + 1) // <-- this call is not really expansive, bc it's not a part of the hot path. It's called only when you have an encoded char in the path

Answer 2 · 2023-02-08T23:09:16.000Z

Maybe we can avoid the lowCharCode when highCharCode is not 50,51 or 52?

Answer 3 · 2023-02-09T07:56:11.000Z

I would say it doesn't worth it. It's a one additional charCodeAt call when you meet an encoded char and it's not from the list. It's definitely not a hot path. Theoratically you can reduce this call, but it might kill the code readability.