Statistical Analysis
https://gettinggeneticsdone.blogspot.com/2011/01/rstats-function-for-extracting-f-test-p.html
Function to extract the overall ANOVA p-value out of a linear model object
lmp_f <- function (modelobject) {
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
f <- summary(modelobject)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
attributes(p) <- NULL
return(p)
}
if(!require(tidyverse)) install.packages("tidyverse")
library(tidyverse)
dat <- data.frame(x = factor("A"), y = 1)
mutate(dat, x = fct_recode(x, B = "A"))
colSums(is.na(dataset)) #결측값포함된 열 확인
data_filled <- data.frame( sapply(dataset,
function(x) ifelse(is.na(x), mean(x, na.rm=TRUE),
x)) )
https://www.rdocumentation.org/packages/DMwR/versions/0.4.1/topics/knnImputation https://thebook.io/006723/ch09/02/02/
if(!require(DMwR)) install.packages("DMwR")
library(DMwR)
data(algae)
cleanAlgae <- knnImputation(algae)
summary(cleanAlgae)
if(!require(dplyr)) install.packages("dplr")
if(!require(psych)) install.packages("psych")
library(dplyr)
library(psych)
a1 <- c(1, 2, 3, 4)
b1 <- c(1, 2, 3, 4)
c1 <- c(1, 2, 3, 4)
df <- data.frame(a1, b1, c1)
df$var1 <- rowSums(select(df, a1:c1))/ncol(select(df, a1:c1))
s1 <- "sol_001.x"
s2 <- "sol_009.x"
sel <- select(df, s1:s2)
wb <- rowSums(sel) / ncol(sel)
alpha(sel)
변수가 여러개 있는 데이터를 사전분석할 때 sex등과 같은 변수가 미치는 영향을 사전 검토할 때 유용
names(df)
options(digits = 3)
DV <- df[1:27] # 종속변수 선택
IV <- df$sex #독립변수
IVcol <- "sex"
v1 <- levels(IV)[1] #독립변수의 속성1
v2 <- levels(IV)[2] #독립변수의 속성2
t.df <- data.frame()
for (i in 1:ncol(DV)) {
s.t <- t.test(DV[, i] ~ IV)
coef.df <- data.frame( v1 = DV[which(df[,IVcol]== v1), i] %>% mean,
v2 = DV[which(df[,IVcol]== v2), i] %>% mean,
t = s.t$statistic,
p = s.t$p.value )
colnames(coef.df) <- c(v1, v2, "t", "p")
rownames(coef.df) <- names(dfN)[i]
t.df <- rbind(t.df, coef.df)
} ; t.df[order(t.df$p), ]
t.df
names(df)
DV <- df[1:27] # 종속변수 선택
IV <- df$status #독립변수
IVcol <- "status"
v1 <- levels(IV)[1] #독립변수의 속성1
v2 <- levels(IV)[2] #독립변수의 속성2
t.df <- data.frame()
for (i in 1:ncol(DV)) {
s.t <- wilcox.test(DV[, i] ~ IV)
coef.df <- data.frame( v1m = DV[which(df[,IVcol]== v1), i] %>% mean,
v1d = DV[which(df[,IVcol]== v1), i] %>% median,
v1s = DV[which(df[,IVcol]== v1), i] %>% sd,
v2m = DV[which(df[,IVcol]== v2), i] %>% mean,
v2d = DV[which(df[,IVcol]== v2), i] %>% median,
v2s = DV[which(df[,IVcol]== v2), i] %>% sd,
w = s.t$statistic,
p = s.t$p.value )
colnames(coef.df) <- c(paste0(v1, "me"), paste0(v1, "md"), paste0(v1, "sd"),
paste0(v2, "me"), paste0(v2, "md"), paste0(v2, "sd"),
"w", "p")
rownames(coef.df) <- names(dfN)[i]
t.df <- rbind(t.df, coef.df)
} ; t.df[order(t.df$p), ]
t.df
attach(df)
DV <- wbN
IV1 <- year
IV2 <- boeeC
detach(df)
# regression
fit1 <- lm(DV ~ IV1) %>% summary
fit2 <- lm(DV ~ IV1 + IV2) %>% summary
# object2string
o2s <- function(x) {
n <- deparse(substitute(x)); print(n)
}
# combining results data
fit.df <- do.call( "rbind",
list(
fit1$coef[, 3:4] %>% round(., 3) %>% as.data.frame() %>% mutate(var = rownames(.), fit = o2s(fit1)),
fit2$coef[, 3:4] %>% round(., 3) %>% as.data.frame() %>% mutate(var = rownames(.), fit = o2s(fit2))
)
)
fit.df <- fit.df[c(4, 3, 1, 2)]
fit.df
fit.df[order(fit.df$var),]
https://statisticsglobe.com/r-multiple-regressions-in-for-loop
mod_summaries <- list() # Create empty list
for(i in 2:ncol(data)) { # Head of for-loop
predictors_i <- colnames(data)[2:i] # Create vector of predictor names
mod_summaries[[i - 1]] <- summary( # Store regression model summary in list
lm(y ~ ., data[ , c("y", predictors_i)]))
}
## regresssion table
coef.df <- (fit %>% summary)$coef %>% round(.,3)
confint.df <- fit %>% confint %>% as.data.frame() %>% round(.,3)
beta.df <- (fit.s %>% summary)$coef[, 1] %>% as.data.frame() %>% round(.,3)
t.df <- cbind(coef.df, confint.df)
t.df <- cbind(t.df, beta.df)
colnames(t.df) <- c("B", "SE", "t", "p", "95%CI LL", "95%CI UL", "b")
t.df
t.df <- dplyr::select(t.df, "B", "b", "SE", "t", "p", "95%CI LL", "95%CI UL")
## table note
n <- nrow(df)
# F통계
numdf <- (fit %>% summary)$fstatistic[2]
dendf <- (fit %>% summary)$fstatistic[3]
value <- (fit %>% summary)$fstatistic[1] %>% round(.,2)
# R제곱
r2 <- (fit %>% summary)$r.squared %>% round(.,2)
r2adj <- (fit %>% summary)$adj.r.squared %>% round(.,2)
fstat <- paste0("F(", numdf, ",", dendf, ") = ", value)
r.v <- paste0("R2 = ", r2, " , 수정 R2 = ", r2adj)
nn <- paste0("N = ", n)
cf <- "SE = 표준오차; CI = 신뢰구간; LL = 하한계; UL = 상한계."
##
n.v <- c(fstat, r.v, nn, cf)
write.csv(t.df, "table_regression.csv")
write.csv(n.v, "table_regression_note.csv", row.names = F)
v_df %>% mutate(자아존상하 = case_when(자아존중감 < 4 ~ 2, TRUE ~ 6)) -> df
X <- c("대면빈도", "비대면빈도", "관계이용", "정보이용", "오락이용", "소셜이용", "뉴스이용")
M <- "관계_고립"
Y <- "정신건강"
W <- "성+나이+소득+학력"
I <- "자아존상하"
out2_l <- list()
out6_l <- list()
test_l <- list()
for(i in seq_along(X)) {
M_fmla <- paste0(M, "~", X[[i]], "*", I, "+", W)
Y_fmla <- paste0(Y, "~", M, "+", X[[i]], "*", I, "+", M, "*", I, "+", W)
med_fit <- lm(M_fmla, data = df)
out_fit <- lm(Y_fmla, data = df)
med2_out <- mediation::mediate(
med_fit, out_fit, treat = X[[i]], mediator = M, covariates = list(자아존상하 = 2), sims = 100)
med6_out <- mediation::mediate(
med_fit, out_fit, treat = X[[i]], mediator = M, covariates = list(자아존상하 = 6), sims = 100)
med_init <- mediation::mediate(med_fit, out_fit, treat = X[[i]], mediator = M, sims = 2)
test_l[[i]] <- test.modmed(
med_init, covariates.1 = list(자아존상하 = 2), covariates.2 = list(자아존상하 = 6))
plot(med2_out, main = paste(X[[i]], "-", M, "-", Y,": ", I, " = 2", sep = " "))
plot(med6_out, main = paste(X[[i]], "-", M, "-", Y,": ", I, " = 6", sep = " "))
out2_l[[i]] <- summary(med2_out)
out6_l[[i]] <- summary(med6_out)
}
saveRDS(test_l, "iso_mh_outi.rds")
saveRDS(out2_l, "iso_mh_out2.rds")
saveRDS(out6_l, "iso_mh_out6.rds")
varName <- names(iris)[1]
as.name(varName)
object2string <- function(x) {
n <- deparse(substitute(x))
print(n)
}
object2string(a)
# Or use all.vars() all.names()
# Return a character vector containing all the names which occur in an expression or call
modelformula <- mpg ~ cyl + disp + hp + drat + qsec
all.vars(modelformula)
[1] "mpg" "cyl" "disp" "hp" "drat" "qsec"
modelformula <- A ~ B + C
mycars <- lapply(mtcars[, all.vars(modelformula)], scale)
https://www.r-bloggers.com/2013/04/ggplot2-graphics-in-a-loop/ aes_string() 문자형으로 변수
plotHistFunc <- function(x, na.rm = TRUE, ...) {
nm <- names(x)
for (i in seq_along(nm)) {
plots <-ggplot(x,aes_string(x = nm[i])) + geom_histogram(alpha = .5,fill = "dodgerblue")
ggsave(plots,filename=paste("myplot",nm[i],".png",sep=""))
}
}
plotHistFunc(df) ## execute function
https://aosmith.rbind.io/2018/08/20/automating-exploratory-plots/
library(ggplot2) # v. 3.3.3
library(purrr) # v. 0.3.4
set.seed(16)
dat = data.frame(elev = round( runif(20, 100, 500), 1),
resp = round( runif(20, 0, 10), 1),
grad = round( runif(20, 0, 1), 2),
slp = round( runif(20, 0, 35),1),
lat = runif(20, 44.5, 45),
long = runif(20, 122.5, 123.1),
nt = rpois(20, lambda = 25) )
head(dat)
response = names(dat)[1:3]
expl = names(dat)[4:7]
response = set_names(response)
response
expl = set_names(expl)
expl
scatter_fun = function(x, y) {
ggplot(dat, aes(x = .data[[x]], y = .data[[y]]) ) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "grey74") +
theme_bw()
}
scatter_fun = function(x, y) {
ggplot(dat, aes_string(x = x, y = y ) ) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "grey74") +
theme_bw()
}
scatter_fun(x = "lat", y = "elev")
elev_plots = map(expl, ~scatter_fun(.x, "elev") )
elev_plots
all_plots = map(response,
~map(expl, scatter_fun, y = .x) )
all_plots2 = map(response, function(resp) {
map(expl, function(expl) {
scatter_fun(x = expl, y = resp)
})
})
all_plots$grad[1:2]
all_plots$grad$long
all_plots[[3]][[3]]
resp_expl = tidyr::expand_grid(response, expl)
resp_expl
allplots2 = pmap(resp_expl, ~scatter_fun(x = .y, y = .x) )
allplots2_names = pmap(resp_expl, ~paste0(.x, "_", .y, ".png"))
allplots2_names[1:2]
pdf("all_scatterplots.pdf")
all_plots
dev.off()
iwalk(all_plots, ~{
pdf(paste0(.y, "_scatterplots.pdf") )
print(.x)
dev.off()
})
plotnames = imap(all_plots, ~paste0(.y, "_", names(.x), ".png")) %>%
flatten()
plotnames
walk2(plotnames, flatten(all_plots), ~ggsave(filename = .x, plot = .y,
height = 7, width = 7))
cowplot::plot_grid(plotlist = all_plots[[1]])
response_plots = map(all_plots, ~cowplot::plot_grid(plotlist = .x))
response_plots
# look into line by line
rep(1, nrow(df))
t(apply(df, 2, median))
rep(1, nrow(df)) %*% t(apply(df, 2, median)) # 행렬 곱하기
head(rep(1, nrow(df)) %*% t(apply(df, 2, median)), 3)
# function for median centering
center_median <- function(x) {
ones = rep(1, nrow(x))
x_median = ones %*% t(apply(x, 2, median))
x - x_median
}
table() %>% as.data.frame.matrix() -> df
bind_cols(
tibble(변수명 = rownames(df)),
df) -> df
# 행방향 합산
df %>% summarise(across(where(is.numeric), sum)) %>%
add_column(변수명 = "Total", .before = "열이름")
# 열방향 합산
mutate(
Total = rowSums(across(where(is.numeric) ))
)