paws-r/paws

Consider using stringi as dependency

DyfanJones opened this issue · 1 comments

json_convert_string <- function(string) {
  replace <- list(
    c("\\", "\\\\"),
    c('"', '\\"'),
    c("\b", "\\b"),
    c("\f", "\\f"),
    c("\r", "\\r"),
    c("\t", "\\t"),
    c("\n", "\\n")
  )
  for (elem in replace) {
    string <- gsub(elem[1], elem[2], string, fixed = TRUE)
  }
  string <- json_escape_unicode(string)
  string <- sprintf('"%s"', string)
  return(string)
}

json_escape_unicode <- function(string) {
  from <- intToUtf8(1:31, multiple = T)
  to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
  for (i in 1:31) {
    string <- gsub(from[i], to[i], string, fixed = TRUE)
  }
  return(string)
}

json_convert_string_stringi <- function(string) {
  from <- c("\\", '"', "\b", "\f", "\r", "\t", "\n")
  to <- c("\\\\", '\\"', "\\b", "\\f", "\\r", "\\t", "\\n")
  string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all =F)
  string <- json_escape_unicode_stringi(string)
  string <- sprintf('"%s"', string)
  return(string)
}

json_escape_unicode_stringi <- function(string) {
  from <- intToUtf8(1:31, multiple = T)
  to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
  string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all = F)
  return(string)
}

unicode <- c(intToUtf8(1:31, multiple = T), c("\\", '"', "\b", "\f", "\r", "\t", "\n"))

x <- c(1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6)
bms <- lapply(x, \(i) {
  string <- paste0(sample(c(letters, LETTERS, unicode), i, replace = T), collapse = "")
  bm <- bench::mark(
    paws = paws.common:::json_escape_unicode(string),
    new_mthd = json_escape_unicode(string),
    stringi = json_escape_unicode_stringi(string)
  )
  bm$id <- i
  return(bm)
})

do.call(rbind, bms) |> ggplot2::autoplot() + 
  ggplot2::facet_wrap("id", labeller = ggplot2::label_both, scales = "free_x")

Created on 2024-02-26 with reprex v2.1.0

image

stringi has some significant performance enhancements. Alternatively move some of the looping gsub to cpp, however cpp isn't my forte so will need help regarding that option :)

Found Rcpp solutions developed from: https://stackoverflow.com/questions/25609174/fast-escaping-deparsing-of-character-vectors-in-r

remotes::install_github("dyfanjones/paws/paws.common", ref="json_escape")
json_escape_unicode <- function(string) {
  from <- intToUtf8(1:31, multiple = TRUE)
  to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
  for (i in 1:31) {
    string <- gsub(from[i], to[i], string, fixed = TRUE)
  }
  return(string)
}

json_convert_string <- function(string) {
  replace <- list(
    c("\\", "\\\\"),
    c('"', '\\"'),
    c("\b", "\\b"),
    c("\f", "\\f"),
    c("\r", "\\r"),
    c("\t", "\\t"),
    c("\n", "\\n")
  )
  for (elem in replace) {
    string <- gsub(elem[1], elem[2], string, fixed = TRUE)
  }
  string <- json_escape_unicode(string)
  string <- sprintf('"%s"', string)
  return(string)
}

json_convert_string_stringi <- function(string) {
  from <- c("\\", '"', "\b", "\f", "\r", "\t", "\n")
  to <- c("\\\\", '\\"', "\\b", "\\f", "\\r", "\\t", "\\n")
  string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all =F)
  string <- json_escape_unicode_stringi(string)
  string <- sprintf('"%s"', string)
  return(string)
}

json_escape_unicode_stringi <- function(string) {
  from <- intToUtf8(1:31, multiple = T)
  to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
  string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all = F)
  return(string)
}

unicode <- c(intToUtf8(1:31, multiple = T), "\\", '"', "\b", "\f", "\r", "\t", "\n")
x <- c(1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6)
bms <- lapply(x, \(i) {
  string <- paste0(sample(c(letters, LETTERS, unicode), i, replace = T), collapse = "")
  bm <- bench::mark(
    old_paws = json_convert_string(string),
    stringi = json_convert_string_stringi(string),
    new_paws = paws.common:::json_convert_string(string)
  )
  bm$id <- i
  return(bm)
})

Created on 2024-02-27 with reprex v2.1.0

image

From this we have significant performance improvement.