title author date output
Multivariate_analysis
Chunmei Gao
November 29, 2015
html_document
library(corrplot)
twins <- read.csv("~/Documents/6215-Applied Multivariance Model/project/nmtwins.csv")
#View(twins)
str(twins)
dim(twins)
#seperate the data into two groups
identical <- twins[twins$zygosity==1,]
fraternal <- twins[twins$zygosity==2,]

# describe the data:

hist(twins$moed)
hist(twins$faed)
hist(twins$faminc)
hist(twins$english)

# checking the fraternal twins sex
for(i in 1:839)
twins[which(twins$zygosity==2),2]

# data cleaning
# missing value:
sum(is.na(twins$moed))
sum(is.na(twins$faed))
sum(is.na(twins$faminc))

# test multivariate normal distribution: QQ plot(for sigle variable)  Chi-square (for multi variables)

source("http://www.stat.wmich.edu/wang/561/codes/R/chisqplot.R")
chisqplot(twins[,7:11]) 
# a few points above the line indicate large distances or outlying observations

qqnorm(twins$english)
qqline(twins$english, col="red")

qqnorm(twins$math)
qqline(twins$math, col="red")

qqnorm(twins$socsci)
qqline(twins$socsci, col="red")

qqnorm(twins$natsci)
qqline(twins$natsci, col="red")

qqnorm(twins$vocab)
qqline(twins$vocab, col="red")
# from the qq plot for each score variables, we didint see a significant diviation from normal distribution.
## ??why the multi-normal has some divations? correlation? outlier?

# PCA
#library(corrplot)
corrplot(cor(twins[,7:11]),method="number")
fit <- princomp(twins[,7:11], cor=TRUE)
summary(fit)
loadings(fit)
fit$scores[1:10,]
plot(fit,type="lines")
plot(cars)
##-----------------
##    Haoqi     ##
##-------------------
# I need a confidence interval
twins3=twins[,-c(1,3,4,5,6)]
head(twins3)
df1=twins3[c(twins3$sex==1),]
head(df1)
df2=twins3[c(twins3$sex==2),]
z1=df1[,2:6]
z2=df2[,2:6]
#using t-test to see whether there exists significant (ch5 co)
#difference of mean between the sexes.
x1bar=colMeans(z1)
x2bar=colMeans(z2)
n1=nrow(z1)
n2=nrow(z2)
p=5

S1=var(z1)
S2=var(z2)
Spool=((n1-1)*S1+(n2-1)*S2)/(n1+n2-2)
T2=t(x1bar-x2bar)%*%solve((S1/n1+S2/n2))%*%(x1bar-x2bar)
T2
#[1,] 442.6469
c2 = (n1+n2-2)*p*qf(df1=p,df2=n1+n2-p-1,0.96)/(n1+n2-p-1)
c2
#[1] 11.7024
pval = 1-pf(T2*(n1+n2-p-1)/((n1+n2-2)*p), p, n1+n2-p-1)
pval
#[1,]    0

#Since T2>c2, the conclusion could be drawn that there exists significant
#difference between different sexes.

#using ANOVA 
twins[1:10,]
twins2[1:10,]
head(twins2)
x=twins2[,7:11]
y=twins2[,12:16]
colMeans(twins2)
d=(x+y)/2
d=as.matrix(d)
sex=as.factor(twins2$sex)
zygosity=as.factor(twins2$zygosity)

fit=manova(d~sex*zygosity)
summary(fit)

#Df   Pillai approx F num Df den Df Pr(>F)    
#sex            1 0.272730   62.326      5    831 <2e-16 ***
#  zygosity       1 0.009962    1.672      5    831 0.1387    
#sex:zygosity   1 0.004803    0.802      5    831 0.5483    
#Residuals    835                                           
#---
#  Signif. codes:  0 °Æ***°Ø 0.001 °Æ**°Ø 0.01 °Æ*°Ø 0.05 °Æ.°Ø 0.1 °Æ °Ø 1





twins2=read.csv("C:/Users/lyf/Desktop/twins-in-rows2.csv")
twins2=twins2[,-1]
dat.dist <- dist(twins2,method = "euclidean")
dat.hclust3 <- hclust(dat.dist, method="complete")
plot(dat.hclust3)
##-----------------
##    kelly      ##
##-------------------
twins=read.csv("nmtwins.csv")
head(twins)
twins[1:50,]
rep(c(1,2),times=3)
nrow(twins)
1678/2
index=rep(c(1,2),times=839)
df=data.frame(twins,index)
head(df)
df1=df[df$index==1,]
head(df1)
table(df1$pairnum)
df2=df[df$index==2,]
head(df2)
dfnew=data.frame(df1,df2)
head(dfnew)
dfnew=dfnew[,-c(12,13,15,24)]#…æµÙindex,pairnum,zygosity
head(dfnew)
library(plyr)
dfnew=rename(dfnew,c("sex.1"="sex2","moed.1"="moed2","faed.1"="faed2", "faminc.1"="faminc2","english.1" = "english2", "math.1" = "math2", "socsci.1" = "socsci2","natsci.1"="natsci2", "vocab.1"="vocab2"))
head(dfnew)
write.csv(dfnew,file="twins-in-row.csv",row.names=F)
dfnew=read.csv("twins-in-row.csv")

dfnew1=dfnew[,-c(12,13,14,15)]
head(dfnew1)
write.csv(dfnew1,"twins-in-rows2.csv",row.names=F)
table(dfnew$sex,dfnew$sex2)
table(dfnew$moed,dfnew$moed2)
table(dfnew$faed,dfnew$faed2)
table(dfnew$faminc,dfnew$faminc2)
----------------------------------------------
  ############################
  ##PCA for the test scores##
  ###########################
twin=read.csv("nmtwins.csv")
twin[1:20,]
twin2=read.csv("twins-in-rows2.csv")
twin2[1:50,]
#PCA
library(corrplot)
corrplot(cor(twin[,7:11]),method="number") #plot the correlation matrix
fit <- princomp(twin[,7:11], cor=TRUE)
summary(fit)
##Importance of components:
##                         Comp.1    Comp.2     Comp.3     Comp.4     Comp.5
##Standard deviation     1.8748378 0.7362136 0.62628756 0.57828468 0.46510586
##Proportion of Variance 0.7030034 0.1084021 0.07844722 0.06688264 0.04326469
##Cumulative Proportion  0.7030034 0.8114055 0.88985267 0.95673531 1.00000000
loadings(fit)
##Loadings:
##         Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
##english -0.434 -0.401  0.780  0.143 -0.150
##math    -0.426  0.647  0.211 -0.596       
##socsci  -0.470 -0.191 -0.481        -0.709
##natsci  -0.445  0.433 -0.129  0.753  0.177
##vocab   -0.459 -0.444 -0.315 -0.226  0.665

##Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
##SS loadings       1.0    1.0    1.0    1.0    1.0
##Proportion Var    0.2    0.2    0.2    0.2    0.2
##Cumulative Var    0.2    0.4    0.6    0.8    1.0
fit$scores[1:10,]
plot(fit,type="lines")#indicate 1 component
#reduction data
-0.434*twin$english-0.426*twin$math
PC1=-0.434*twin$english-0.426*twin$math-0.470*twin$socsci-0.445*twin$natsci-0.459*twin$vocab
PC2= -0.401*twin$english+ 0.647*twin$math+0.433*twin$natsci--0.444*twin$vocab
mean(PC2)
-------------------------------------------------------------------
####################################################################################
##the difference in absolute difference between identical twins and fraternal twins##
####################################################################################
twin[1:10,]
twin2[1:10,]
x=twin2[,7:11]
nrow(x)
y=twin2[,12:16]
nrow(y)
z=abs(x-y)#the absolute difference of scores between twins
z
nrow(z)
colMeans(z)
df=data.frame(twin2,z)#dataset we will use
df[1:10,]
z1=df[c(df$zygosity==1),17:21]#data who are identical twins
head(z1)
nrow(z1)##509 ∂‘Õ¨¬—À´∞˚Õ
z2=df[c(df$zygosity==2),17:21]#data who are not identical twins
nrow(z2) ##330 ∂‘“Ϭ—À´∞˚Õ
head(z2)

nrow(z1)
nrow(z2)
##two absokute difference comparison test
x1bar=colMeans(z1)
x1bar
x2bar=colMeans(z2)
x2bar
n1=nrow(z1)
n1
n2=nrow(z2)
n2
p=5
#assuming that the variance is equal within two groups
S1=var(z1)
S2=var(z2)
Spool=((n1-1)*S1+(n2-1)*S2)/(n1+n2-2)
T2=t(x1bar-x2bar)%*%solve((1/n1+1/n2)*Spool)%*%(x1bar-x2bar)
T2
##[,1]
##[1,] 118.1061
T2a=(n1+n2-2)*p*qf(df1=p,df2=n1+n2-p-1,0.96)/(n1+n2-p-1)
T2a 
##[1] 11.76112
#T2>T2a we can reject H0 that two population mean vectors are equivalent
#assuming that the variance is unequal within two groups
T22=t(x1bar-x2bar)%*%solve(S1/n1+S2/n2)%*%(x1bar-x2bar)
T22 
##[,1]
##[1,] 98.26564
pval = 1-pf(T22*(n1+n2-p-1)/((n1+n2-2)*p), p, n1+n2-p-1)
pval
t.test(z1[1],z2[1])
t.test(z1[2],z2[2])
t.test(z1[3],z2[3])
t.test(z1[4],z2[4])
t.test(z1[5],z2[5])
t.test(z1,z2)
nrow(z)
--------------------------------------------------------------------
twin[1:10,]
twin2[1:10,]
head(twin2)
x=twin2[,7:11]
y=twin2[,12:16]
colMeans(twin2)
d=(x+y)/2 #the mean scores between twins
colMeans(d)
df=data.frame(twin2,d)#dataset we will use
df[1:10,]
d1=df[c(df$zygosity==1),17:21]#data who are identical twins
head(d1)
d2=df[c(df$zygosity==2),17:21]#data who are not identical twins
head(d2)
nrow(d1)
nrow(d2)
##two means comparison test
x1bar=colMeans(d1)
x1bar
x2bar=colMeans(d2)
x2bar
n1=nrow(d1)
n2=nrow(d2)
p=5
#assuming that the variance is equal within two groups
S1=var(d1)
S2=var(d2)
Spool=((n1-1)*S1+(n2-1)*S2)/(n1+n2-2)
T2=t(x1bar-x2bar)%*%solve((1/n1+1/n2)*Spool)%*%(x1bar-x2bar)
T2
##    [,1]
##[1,] 8.503705
T2a=(n1+n2-2)*p*qf(df1=p,df2=n1+n2-p-1,0.96)/(n1+n2-p-1)
T2a 
##[1] 11.76112
#T2<T2a we cannot reject H0 that two population mean vectors are equivalent
#assuming that the variance is unequal within two groups
T22=t(x1bar-x2bar)%*%solve(S1/n1+S2/n2)%*%(x1bar-x2bar)
T22 
##    [,1]
##[1,] 8.783256
t.test(d1,d2)
t.test(d1[1],d2[1])
t.test(d1[2],d2[2])
t.test(d1[3],d2[3])
t.test(d1[4],d2[4])
t.test(d1[5],d2[5])
##∂‘Õ¨¬—“Ϭ—À´∞˚Õµƒ5∏ˆ∑÷ ˝◊ˆmanova
nrow(d)
head(twin2)
nrow(twin2)
Y1=d
Y1=as.matrix(Y1)
Y1
x=as.factor(twin2$zygosity)
fit=manova(Y1 ~ x)
summary.aov(fit)             # univariate ANOVA tables
summary(fit, test = "Wilks") # ANOVA table of Wilks' lambda
summary(fit)                # same F statistics as single-df terms  
##Df   Pillai approx F num Df den Df Pr(>F)
##x           1 0.010058   1.6926      5    833 0.1338
##Residuals 837    
-------------------------------------------------------------------------
  #########################################
 ## social class * zygosity interaction##
  ########################################
Y2=as.matrix(z)
zygosity=as.factor(twin2$zygosity)
motheredu=as.factor(twin2$moed)
fatheredu=as.factor(twin2$faed)
faminc=as.factor(twin2$faminc)

fit1=manova(Y1~zygosity*motheredu*fatheredu*faminc)#mean value as response
summary(fit1)
##                                      Df  Pillaiapprox F num Df den Df    Pr(>F)    
##zygosity                              1 0.01043   1.1192      5    531  0.349015    
##motheredu                             5 0.15139   3.3409     25   2675 4.445e-08 ***
##atheredu                             5 0.15530   3.4300     25   2675 2.006e-08 ***
##faminc                                6 0.08280   1.5014     30   2675  0.039428 *  
##zygosity:motheredu                    5 0.03386   0.7295     25   2675  0.831474    
##zygosity:fatheredu                    5 0.03237   0.6971     25   2675  0.864970    
##motheredu:fatheredu                  21 0.20434   1.0855    105   2675  0.263135    
##zygosity:faminc                       6 0.06308   1.1394     30   2675  0.275207    
##motheredu:faminc                     26 0.24992   1.0826    130   2675  0.251162    
##fatheredu:faminc                     27 0.28271   1.1875    135   2675  0.073397 .  
##zygosity:motheredu:fatheredu         19 0.19637   1.1511     95   2675  0.154011    
##zygosity:motheredu:faminc            21 0.25501   1.3692    105   2675  0.008477 ** 
##zygosity:fatheredu:faminc            22 0.19996   1.0130    110   2675  0.445685    
##motheredu:fatheredu:faminc           40 0.34123   0.9796    200   2675  0.566690    
##zygosity:motheredu:fatheredu:faminc  23 0.23482   1.1463    115   2675  0.141050    
##Residuals                           535                                             
##---
##  Signif. codes:  0 °Æ***°Ø 0.001 °Æ**°Ø 0.01 °Æ*°Ø 0.05 °Æ.°Ø 0.1 °Æ °Ø 1
fit2=manova(Y2~zygosity*motheredu*fatheredu*faminc)#difference as response
summary(fit2)
##                                      Df  Pillai approx F num Df den Df  Pr(>F)    
##zygosity                              1 0.19791  26.2039      5    531 < 2e-16 ***
##motheredu                             5 0.05249   1.1352     25   2675 0.29196    
##fatheredu                             5 0.04349   0.9388     25   2675 0.55030    
##faminc                                6 0.07822   1.4171     30   2675 0.06612 .  
##zygosity:motheredu                    5 0.06081   1.3173     25   2675 0.13432    
##zygosity:fatheredu                    5 0.04741   1.0244     25   2675 0.42937    
##motheredu:fatheredu                  21 0.14685   0.7709    105   2675 0.95848    
##zygosity:faminc                       6 0.08280   1.5015     30   2675 0.03942 *  
##motheredu:faminc                     26 0.25557   1.1084    130   2675 0.19454    
##fatheredu:faminc                     27 0.27819   1.1674    135   2675 0.09558 .  
##zygosity:motheredu:fatheredu         19 0.20339   1.1940     95   2675 0.10036    
##zygosity:motheredu:faminc            21 0.23366   1.2489    105   2675 0.04668 *  
##zygosity:fatheredu:faminc            22 0.26074   1.3379    110   2675 0.01207 *  
##motheredu:fatheredu:faminc           40 0.42221   1.2336    200   2675 0.01717 *  
##zygosity:motheredu:fatheredu:faminc  23 0.26721   1.3133    115   2675 0.01575 *  
##  Residuals                           535                                           
##---
##  Signif. codes:  0 °Æ***°Ø 0.001 °Æ**°Ø 0.01 °Æ*°Ø 0.05 °Æ.°Ø 0.1 °Æ °Ø 1
-----------------------------------------------------------------------
  
  
  ##use the reduction data
  ##absolute difference between different types of twins
df=data.frame(twin,PC1)
head(df)
index=rep(c(1,2),times=839)
df=data.frame(df,index)
head(df)
pc11=df[df$index==1,12]
pc12=df[df$index==2,12]
t.test(pc11,pc12,var.equal=F)
##  Welch Two Sample t-test

##data:  pc11 and pc12
##t = 1.7539, df = 1675.902, p-value = 0.07964
##alternative hypothesis: true difference in means is not equal to 0
##95 percent confidence interval:
##  -0.099589  1.783050
##sample estimates:
##  mean of x mean of y 
##-45.31259 -46.15433 

var.test(pc11,pc12)
##F test to compare two variances

##data:  pc11 and pc12
##F = 1.0154, num df = 838, denom df = 838, p-value = 0.8246
##alternative hypothesis: true ratio of variances is not equal to 1
##95 percent confidence interval:
##  0.8867573 1.1627916
##sample estimates:
##  ratio of variances 
##1.015438 

---------------------------------------------------------------------------------
twin[1:20,]
twin2[1:20,]
y=as.matrix(twin[,7:11])
x=as.matrix(twin[,2:6])
lm=lm(y~sex+ zygosity+moed+faed+ faminc,data=twin)
summary(lm)

x=as.matrix(twin[,7:11])
library(outliers)
chisq.out.test(x)
chisq.out.test(x,opposite=TRUE)

#================================================================================
#                              Hongtao Li
#================================================================================

setwd("/Users/Lihongtao/Desktop/6215 Multivariate Statistical Analysis/Project")
data1 = read.csv("nmtwins.csv")  # read csv file 
score = read.csv("scores.csv")
family=data1[,2:6]
attach(data1)
summary(data1)
n=1678
mean=apply(score,2,mean)
#================================================================================

# Detecting Outliers
# step1. One dimenstional dot diagram
dotchart(data1$english,labels=row.names(english),cex=.7,main="Dot Polt", xlab="")
stripchart(score)
#--------------------------------------------------------------------------------

# step2. Sctplot matrix 
plot(score,main="Simple Scatterplot Matrix") # Matrix
#--------------------------------------------------------------------------------

# step3. z
z.eng=rep(NA,1678)
for (i in 1:n)
  {z.eng[i]=(english[i]-mean(english))/sqrt(sd(english))}
z.eng

z.math=rep(NA,1678)
for (i in 1:n)
{z.math[i]=(math[i]-mean(math))/sqrt(sd(math))}
z.math

z.soc=rep(NA,1678)
for (i in 1:n)
{z.soc[i]=(socsci[i]-mean(socsci))/sqrt(sd(socsci))}
z.soc

z.nat=rep(NA,1678)
for (i in 1:n)
{z.nat[i]=(natsci[i]-mean(natsci))/sqrt(sd(natsci))}
z.nat

z.vocab=rep(NA,1678)
for (i in 1:n)
{z.vocab[i]=(vocab[i]-mean(vocab))/sqrt(sd(vocab))}
z.vocab
#--------------------------------------------------------------------------------

# step4. squared generalized distances
S=cov(score)
S.inv=solve(S)
ssd=matrix(1:n,nrow=1)
for(i in 1:n)
{ssd[i]=as.matrix(score[i,]-mean)%*%as.matrix(S.inv)%*%t(score[i,]-mean)}
ssd
ssdsort=sort(ssd)
# Chi-square plots
qchi=c(1:n)
for(i in 1:n)
{qchi[i]=qchisq(p=((i-1/2)/n),df=5)}
plot(ssdsort~qchi)
#================================================================================

#CI
source("http://www.stat.wmich.edu/wang/561/codes/R/ci.R")
confidence(n=1678,xbar=mean,S=S, conf.region=T,alpha=.05)


score=as.matrix(score)
family=as.matrix(family)
fit=lm(score~family)
summary(fit)

library(outliers)
plot(fit,which=4)

chisq.out.test(score,variance=var(score), opposite=FALSE)