ml_word_visualisations

Structure of repository

The lda folder contains the files needed to run LDA experiments:

  1. utils.R which includes functions for testing lda topics

  2. main.R which includes methods for creating and testing lda models from different libraries

Folder data is there to store your data.

Folder results stores the results of your experiment

Example

LDA

0. Preparation

0.1 Set (hyper)parameters

Data

# Path to the CSV file holding the data set (passed to get_dtm() as data_dir).
data_dir <- "./data/depression_anxiety_cleaned.csv"
# Column passed to get_dtm() as data_col -- presumably the free-text
# responses; confirm against the CSV.
data_col <- "dep_all_words"
# Column passed to get_dtm() as id_col.
id_col <- "unique_id"
group_col <- "minidep_diagnose" # currently required, but only used by the t-test

Document Term Matrix

# All settings in this block are forwarded to get_dtm() in step 1.
# Presumably the minimum and maximum n-gram length -- confirm in get_dtm().
ngram_window <- c(1,3)
# English stop words from the "snowball" source of the stopwords package.
stopwords <- stopwords::stopwords("en", source = "snowball")
removalword <- "" # only a single word is supported
occ_rate <- 0
# Passed to get_dtm() as removal_rate_most and removal_rate_least.
removal_num_most <- 1
removal_num_least <- 0
removal_mode <- "absolute" # or "relative"
split <- 1

LDA

# Settings for get_lda_model() (step 2) and get_lda_preds() (step 3).
model_type <- "textmineR" # or "mallet"
num_topics <- 20
num_top_words <- 10
# Passed as num_iterations to get_lda_model() and get_lda_preds() respectively.
num_train_iterations <- 2000
num_pred_iterations <- 200
# NOTE(review): nothing below reads pred_mode as written; it is presumably
# meant for the mode argument of get_lda_preds() -- confirm.
pred_mode <- "function" # or "custom" for mallet

Analysis

cor_var <- "PHQtot" # grouping variable for the t-test; the variable to be predicted for the other methods
control_vars <- c("PHQtot") # variables to control for when test_method is "linear_regression"
# NOTE(review): the calls in section 4 pass the method as a string literal
# rather than reading test_method.
test_method <- "textTrain_regression" # or "linear_regression", "logistic_regression", "t-test"

Miscellaneous

# Seed passed as the seed argument to every pipeline call below.
seed <- 1234
0.2 Create directory to save all computations

All objects created within the pipeline are saved in the directory below. These include

  • Document Term Matrix

  • model

  • predictions

  • analysis results

# Path of the directory that receives every artefact of this run. The name
# encodes the settings from section 0.1 so that runs with different
# configurations do not share a directory.
#
# The last component uses pred_mode. A bare `mode` is not defined anywhere in
# this walkthrough and resolves to the base function mode(), which paste0()
# cannot coerce to a string.
save_dir <- paste0(
  "./results/",
  model_type, "_",
  data_col, "_",
  num_topics,
  "_most_", removal_num_most,
  "_least_", removal_num_least,
  "_occ_", occ_rate,
  "_pred_", pred_mode
)
0.3 Imports
library(textmineR)
library(tidyverse)
library(dplyr)
library(textmineR)
library(mallet)
library(rJava)
library(tokenizers)
library(text2vec)
library(quanteda)
source("./lda/main.R")

1. Compute Document Term Matrix

# Build the document term matrix from the CSV using the settings of section
# 0.1. Later steps read dtms$train_dtm and dtms$train_data from the result.
dtms <- get_dtm(
  data_dir = data_dir,
  id_col = id_col,
  data_col = data_col,
  group_var = group_col,
  ngram_window = ngram_window,
  stopwords = stopwords,
  removalword = removalword,
  occ_rate = occ_rate,
  removal_mode = removal_mode,
  removal_rate_most = removal_num_most,
  removal_rate_least = removal_num_least,
  split = split,
  seed = seed,
  save_dir = save_dir
)

2. Create LDA Model

# Fit the LDA model on the training document term matrix from step 1, using
# the library selected by model_type.
model <- get_lda_model(
  model_type = model_type,
  dtm = dtms$train_dtm,
  num_topics = num_topics,
  num_top_words = num_top_words,
  num_iterations = num_train_iterations,
  seed = seed,
  save_dir = save_dir
)

3. Create Predictions

# Compute predictions for the training data with the fitted model.
#
# The mode argument receives pred_mode from section 0.1. A bare `mode` is not
# defined in this walkthrough and would pass the base function mode() instead
# of the intended string ("function" or "custom").
preds <- get_lda_preds(
  model = model,
  num_iterations = num_pred_iterations,
  data = dtms$train_data,
  dtm = dtms$train_dtm,
  group_var = cor_var,
  seed = seed,
  mode = pred_mode,
  save_dir = save_dir
)

4. Analysis

4.1 textTrain_regression
# Run the "textTrain_regression" analysis on the predictions, with cor_var as
# the variable of interest.
test <- get_lda_test(
  model = model,
  preds = preds,
  group_var = cor_var,
  control_vars = control_vars,
  test_method = "textTrain_regression",
  seed = seed,
  save_dir = save_dir
)
4.2 Linear Regression
# Run the "linear_regression" analysis on the same predictions. The result is
# assigned to `test` again, replacing the value from section 4.1.
test <- get_lda_test(
  model = model,
  preds = preds,
  group_var = cor_var,
  control_vars = control_vars,
  test_method = "linear_regression",
  seed = seed,
  save_dir = save_dir
)