Bitten by this: make sure txt_recode_ngram also works with ngram = 1, although one should not call it with terms that consist of only 1 word
jwijffels opened this issue · 0 comments
jwijffels commented
The function should be defined like this:
#' Recode runs of tokens to compound multi-word terms
#'
#' Looks for runs of `ngram` consecutive elements of `x` which, once combined
#' by `udpipe::txt_nextgram()`, equal one of the terms in `compound`. The
#' first element of such a run is replaced by the compound term and the
#' remaining `ngram - 1` elements are set to `NA`, so the result keeps the
#' length of `x`.
#'
#' When matches of the same size overlap, only the leftmost one is recoded;
#' tokens which are not part of a recoded run are left untouched.
#'
#' @param x Vector of tokens (presumably character; matched positions
#'   receive character values).
#' @param compound Vector of terms to look for; converted with
#'   `as.character()`. `NA` terms are ignored.
#' @param ngram Either a single number giving the number of tokens of every
#'   term in `compound`, or a vector of the same length as `compound` giving
#'   the number of tokens per term. Converted with `as.integer()`.
#' @param sep Separator passed on to `udpipe::txt_nextgram()`.
#' @return `x` with matched runs recoded as described above. `x` is returned
#'   unchanged when there are no (non-missing) terms to look for.
txt_recode_ngram <- function(x, compound, ngram, sep = " ") {
  ngram <- as.integer(ngram)

  if (length(ngram) != 1) {
    stopifnot(length(ngram) == length(compound))
    # Handle one n-gram size at a time, largest size first.
    for (size in sort(unique(ngram), decreasing = TRUE)) {
      # which() drops NA comparisons, so a missing size never selects a term.
      x <- txt_recode_ngram(
        x,
        compound = compound[which(ngram == size)],
        ngram = size,
        sep = sep
      )
    }
    return(x)
  }

  keywords <- as.character(compound)
  # An NA term would match the NA entries of the n-gram vector and blank
  # out tokens which were never part of a compound.
  keywords <- keywords[!is.na(keywords)]
  if (length(keywords) == 0) {
    return(x)
  }
  stopifnot(!is.na(ngram), ngram >= 1L)

  # NOTE(review): assumes txt_nextgram() returns, for each position, the
  # n-gram starting at that position -- confirm against the udpipe docs.
  y <- udpipe::txt_nextgram(x, n = ngram, sep = sep)
  idx <- which(y %in% keywords)

  if (ngram > 1L && length(idx) > 1L) {
    # Keep only non-overlapping matches, chosen greedily from left to right.
    # Recoding an overlapping match would blank out tokens that follow it
    # although its own start position is already consumed by the previous
    # match.
    keep <- logical(length(idx))
    next_free <- 0L
    for (j in seq_along(idx)) {
      if (idx[j] >= next_free) {
        keep[j] <- TRUE
        next_free <- idx[j] + ngram
      }
    }
    idx <- idx[keep]
  }

  x[idx] <- y[idx]

  if (ngram > 1L) {
    # Positions of the 2nd .. n-th token of every recoded run.
    loc <- as.vector(outer(idx, seq_len(ngram - 1L), "+"))
    loc <- loc[loc <= length(x)]
    if (length(loc) > 0) {
      x[loc] <- NA_character_
    }
  }
  x
}