Consider using stringi as dependency
DyfanJones opened this issue · 1 comments
DyfanJones commented
json_convert_string <- function(string) {
replace <- list(
c("\\", "\\\\"),
c('"', '\\"'),
c("\b", "\\b"),
c("\f", "\\f"),
c("\r", "\\r"),
c("\t", "\\t"),
c("\n", "\\n")
)
for (elem in replace) {
string <- gsub(elem[1], elem[2], string, fixed = TRUE)
}
string <- json_escape_unicode(string)
string <- sprintf('"%s"', string)
return(string)
}
json_escape_unicode <- function(string) {
from <- intToUtf8(1:31, multiple = T)
to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
for (i in 1:31) {
string <- gsub(from[i], to[i], string, fixed = TRUE)
}
return(string)
}
json_convert_string_stringi <- function(string) {
from <- c("\\", '"', "\b", "\f", "\r", "\t", "\n")
to <- c("\\\\", '\\"', "\\b", "\\f", "\\r", "\\t", "\\n")
string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all =F)
string <- json_escape_unicode_stringi(string)
string <- sprintf('"%s"', string)
return(string)
}
json_escape_unicode_stringi <- function(string) {
from <- intToUtf8(1:31, multiple = T)
to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all = F)
return(string)
}
unicode <- c(intToUtf8(1:31, multiple = T), c("\\", '"', "\b", "\f", "\r", "\t", "\n"))
x <- c(1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6)
bms <- lapply(x, \(i) {
string <- paste0(sample(c(letters, LETTERS, unicode), i, replace = T), collapse = "")
bm <- bench::mark(
paws = paws.common:::json_escape_unicode(string),
new_mthd = json_escape_unicode(string),
stringi = json_escape_unicode_stringi(string)
)
bm$id <- i
return(bm)
})
do.call(rbind, bms) |> ggplot2::autoplot() +
ggplot2::facet_wrap("id", labeller = ggplot2::label_both, scales = "free_x")
Created on 2024-02-26 with reprex v2.1.0
stringi
has some significant performance enhancements. Alternatively move some of the looping gsub
to cpp, however cpp isn't my forte so will need help regarding that option :)
DyfanJones commented
Found Rcpp solutions developed from: https://stackoverflow.com/questions/25609174/fast-escaping-deparsing-of-character-vectors-in-r
remotes::install_github("dyfanjones/paws/paws.common", ref="json_escape")
json_escape_unicode <- function(string) {
from <- intToUtf8(1:31, multiple = TRUE)
to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
for (i in 1:31) {
string <- gsub(from[i], to[i], string, fixed = TRUE)
}
return(string)
}
json_convert_string <- function(string) {
replace <- list(
c("\\", "\\\\"),
c('"', '\\"'),
c("\b", "\\b"),
c("\f", "\\f"),
c("\r", "\\r"),
c("\t", "\\t"),
c("\n", "\\n")
)
for (elem in replace) {
string <- gsub(elem[1], elem[2], string, fixed = TRUE)
}
string <- json_escape_unicode(string)
string <- sprintf('"%s"', string)
return(string)
}
json_convert_string_stringi <- function(string) {
from <- c("\\", '"', "\b", "\f", "\r", "\t", "\n")
to <- c("\\\\", '\\"', "\\b", "\\f", "\\r", "\\t", "\\n")
string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all =F)
string <- json_escape_unicode_stringi(string)
string <- sprintf('"%s"', string)
return(string)
}
json_escape_unicode_stringi <- function(string) {
from <- intToUtf8(1:31, multiple = T)
to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all = F)
return(string)
}
unicode <- c(intToUtf8(1:31, multiple = T), "\\", '"', "\b", "\f", "\r", "\t", "\n")
x <- c(1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6)
bms <- lapply(x, \(i) {
string <- paste0(sample(c(letters, LETTERS, unicode), i, replace = T), collapse = "")
bm <- bench::mark(
old_paws = json_convert_string(string),
stringi = json_convert_string_stringi(string),
new_paws = paws.common:::json_convert_string(string)
)
bm$id <- i
return(bm)
})
Created on 2024-02-27 with reprex v2.1.0
From this we have significant performance improvement.