Crash of cwb_makeall() / cwb_huffcode() / cwb_compress_rdx() if the registry path is not expanded
Closed this issue · 0 comments
ablaette commented
library(tm)
library(tokenizers)
library(RcppCWB)
library(magrittr)
# Scratch directory (inside the session's tempdir) for the generated VRT file.
vrt_dir <- file.path(tempdir(), "reuters")
if (!dir.exists(vrt_dir)) {
  dir.create(vrt_dir)
}

# Registry and data directory of the target corpus. Both are tilde-expanded
# right where they are defined, so every consumer receives an absolute path:
# per the issue title, the cwb_* calls crash when the registry path is not
# expanded. path.expand() is idempotent, so expanding again later is harmless.
registry_dir <- path.expand("~/Lab/github/duplicates/inst/extdata/cwb/registry")
data_dir <- path.expand(
  "~/Lab/github/duplicates/inst/extdata/cwb/indexed_corpora/reuters2"
)
# Location of "texts/reuters-21578.xml" as looked up inside the tm package.
# It is not needed to build the corpus below; the assignment is kept only in
# case code outside this excerpt refers to it.
reuters_fname <- system.file(package = "tm", "texts", "reuters-21578.xml")

# Read every file in tm's "texts/crude" directory into a VCorpus, parsing each
# one with readReut21578XMLasPlain.
# - mustWork = TRUE makes a missing sample directory fail at the lookup itself
#   instead of handing an empty path to DirSource().
# - DirSource() receives only the piped-in directory plus `mode`. A second
#   positional argument would be taken as its `encoding` parameter, so a file
#   name must not be passed there.
reuters <- system.file("texts", "crude", package = "tm", mustWork = TRUE) %>%
  DirSource(mode = "binary") %>%
  VCorpus(readerControl = list(reader = readReut21578XMLasPlain))
# Tokenise the text of every document in `reuters`. The result is indexed per
# document further down (reuters_tok[[i]]).
reuters_tok <- reuters %>%
  # Reduce each document's "content" to exactly one string. as.character() on
  # a list deparses any element whose length is not 1 (e.g. "character(0)" or
  # 'c("a", "b")'), and that deparsed text would end up in the corpus.
  vapply(
    function(doc) paste(doc[["content"]], collapse = "\n"),
    character(1),
    USE.NAMES = FALSE
  ) %>%
  # Drop angle brackets from the text -- presumably so that no token can be
  # mistaken for an XML tag in the VRT file written later.
  gsub("[<>]", "", .) %>%
  # `stopwords` is documented as a character vector of words to remove; NULL
  # (the default) means "remove nothing". FALSE is not a valid "off" switch
  # and would presumably be treated as a stop word list -- TODO confirm.
  tokenize_words(lowercase = FALSE, stopwords = NULL, strip_punct = FALSE)
# Wrap each document's tokens, one token per line, in a <doc id="..."> element.
# seq_along() is used instead of 1L:length(): for an empty corpus the latter
# counts down (1, 0) and the function would index documents that do not exist.
docnodes <- lapply(
  seq_along(reuters),
  function(i) {
    sprintf(
      '<doc id="%s">\n%s\n</doc>',
      names(reuters[i]),
      paste(reuters_tok[[i]], collapse = "\n")
    )
  }
)

# Enclose all <doc> elements in a single <xml> root element and write the
# result to "reuters.vrt" inside vrt_dir.
reuters_xml <- sprintf("<xml>\n%s\n</xml>", paste(docnodes, collapse = "\n"))
cat(reuters_xml, file = file.path(vrt_dir, "reuters.vrt"))
# Delete every file currently listed in the data directory before encoding.
file.remove(dir(data_dir, full.names = TRUE))

# Encode the VRT file(s) in vrt_dir as corpus REUTERS2: one positional
# attribute ("word") and two structural attributes ("doc" carrying an "id",
# "xml" without any). The data directory is tilde-expanded for this call.
cwb_encode(
  corpus = "REUTERS2",
  encoding = "utf8",
  registry = registry_dir,
  data_dir = path.expand(data_dir),
  vrt_dir = vrt_dir,
  p_attributes = "word",
  s_attributes = list(doc = "id", xml = character()),
  verbose = TRUE
)

# Follow-up steps for the "word" attribute, run in this order. These are the
# three calls that, per the issue title, crash when the registry path is not
# expanded; registry_dir is handed over unchanged.
cwb_makeall(registry = registry_dir, corpus = "REUTERS2", p_attribute = "word")
cwb_huffcode(registry = registry_dir, corpus = "REUTERS2", p_attribute = "word")
cwb_compress_rdx(registry = registry_dir, corpus = "REUTERS2", p_attribute = "word")