PolMine/polmineR.misc

Check that Duplicates method works

Closed this issue · 1 comment

Sample code ...

library(polmineR.misc)
library(polmineR)
library(pbapply)
library(slam)

# Set up the duplicate detector.
D <- Duplicates$new(
  # characters that are kept
  charRegex = "[a-zA-ZäöüÄÖÜ]",
  # positional attribute to work with
  pAttribute = "word",
  # structural attribute that holds the date
  sAttribute = "date",
  # function turning dates into YYYY-MM-DD; none is supplied here
  datePrep = NULL,
  # Size of the article sample (partition bundle) that the character count is
  # based on. This sample determines the top 10 least frequent characters in
  # the data, which should be used to calculate the similarity.
  sample = 1000L,
  # window, in days, within which duplicates are looked for
  n = 1L,
  # minimum threshold; the reference works with 0.6, the default is 0.9
  threshold = 0.6
)

# Fix the RNG state so the same days are drawn on every run.
set.seed(132)

# Restrict the run to a small part of the corpus: 100 values drawn from the
# "date" attribute.
# NOTE(review): `coi` is not defined in this snippet and has to exist before
# this line runs -- presumably the ID of the corpus of interest; confirm.
sample_days <- sample(s_attributes(coi, "date"), 100)

# Keep only the sampled days, then split what remains by "article_id".
article_bundle <- split(
  subset(corpus(coi), date %in% sample_days),
  s_attribute = "article_id"
)
# results in 3500 elements (as reported in the original example)

# NOTE(review): `mc` presumably sets the number of cores used -- confirm.
D$detectDuplicates(x = article_bundle, mc = 3L)

A modified version of this example is now used both as a documentation example and as a unit test.