diabetes-analysis

Analysis of diabetes dataset using R.

Questions: dataset-analysis-questions.pdf

Dataset description : dataset-description.pdf

Dataset : diabetes-dataset.csv

Answers :

R Markdown : diabetes-analysis.Rmd
HTML : diabetes-analysis.html
Markdown : This file
PDF : diabetes-analysis.pdf

Q. Find total record for class 0 and class 1 from the outcome column.

# Split the records by outcome class (1 = positive, 0 = negative).
positive <- subset(diabetes, Outcome == 1)
negative <- subset(diabetes, Outcome == 0)
# The question asks for the total per class, so report the record counts
# explicitly instead of relying on the data viewer alone.
print(c(class_0 = nrow(negative), class_1 = nrow(positive)))
View(positive)
View(negative)

Q. Find Distinct values for different attributes (Columns) and print total count for each.

# plyr provides ldply(), used here to apply a function to every column of
# the data frame and collect the results in one data frame.
library("plyr")
# For each column, tabulate its distinct values together with their counts.
print(ldply(diabetes, function(x) {
  data.frame(table(x))
}))

Q. What is the Gini coefficient of the attributes(Columns).

# Print the Gini coefficient of each column, one result per column, in the
# order listed below.
# NOTE(review): gini() is not a base R function and no library() call is
# visible in this file - presumably it comes from the reldist package; confirm.
gini_columns <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
  "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"
)
for (column in gini_columns) {
  print(gini(diabetes[[column]]))
}

Q. Find DiabetesPedigreeFunction which is < .200 if skin thickness and Insulin is 0 .

# Rows where SkinThickness and Insulin are both 0 and the
# DiabetesPedigreeFunction is below 0.200 (printed at top level).
subset(
  diabetes,
  SkinThickness == 0 & Insulin == 0 & DiabetesPedigreeFunction < 0.200
)

Q. Check what will be the blood pressure level if pregnancies count is 10 and print the average BP level.

# Blood-pressure readings of the records with exactly 10 pregnancies.
BP_for_preg_10 <- subset(
  diabetes,
  Pregnancies == 10,
  select = "BloodPressure"
)
# Average blood pressure over those records.
print(mean(BP_for_preg_10[["BloodPressure"]]))
View(BP_for_preg_10)

Q. Print first 3 Highest and Lowest count of pregnancies and print the age

# Lowest
# Show Pregnancies and Age for every row whose pregnancy count is one of the
# three smallest distinct values.
x <- diabetes$Pregnancies
# Use %in% (set membership). `==` would recycle the three values along x and
# compare them position by position, silently missing most matching rows.
index <- which(x %in% head(sort(unique(x)), 3))
# Select columns by name rather than by position (1 and 8).
View(diabetes[index, c("Pregnancies", "Age")])

# Highest
# Show Pregnancies and Age for every row whose pregnancy count is one of the
# three largest distinct values.
x <- diabetes$Pregnancies
# Use %in% (set membership). `==` would recycle the three values along x and
# compare them position by position, silently missing most matching rows.
index <- which(x %in% head(sort(unique(x), decreasing = TRUE), 3))
# Select columns by name rather than by position (1 and 8).
View(diabetes[index, c("Pregnancies", "Age")])

Q. Check whether everyone with a BMI level of 0 has a BP > 90.

# Records with BMI equal to 0 whose blood pressure is above 90 ...
BMI_0_BP_90 <- subset(diabetes, BMI == 0 & BloodPressure > 90)
View(BMI_0_BP_90)
# ... and the BMI == 0 records whose blood pressure is 90 or below, i.e. the
# counter-examples to the statement being checked.
BMI_0_BP_NOT90 <- subset(diabetes, BMI == 0 & BloodPressure <= 90)
View(BMI_0_BP_NOT90)

Q. Print all the rows which contains only one 0 in at least one column.

# Count the zero entries in each row (NA comparisons are ignored) and show
# the rows that contain exactly one zero.
View(subset(
  diabetes,
  apply(diabetes, 1, function(row) sum(row == 0, na.rm = TRUE)) == 1
))

Q. Print all the rows which contains 0 in more than one columns.

# Count the zero entries in each row (NA comparisons are ignored) and keep
# the rows that contain more than one zero.
morethan_1_0 <- subset(
  diabetes,
  apply(diabetes, 1, function(row) sum(row == 0, na.rm = TRUE)) > 1
)
View(morethan_1_0)

Q. Print all the rows if Glucose level is < 100.

# Rows whose glucose level is below 100. The column is referenced by name
# instead of by position (2), so the filter does not depend on column order
# and matches how the other snippets address columns.
View(subset(diabetes, Glucose < 100))

Q. Print all the ages that are less than 35 where BP is > 60.

# Ages of the records with Age below 35 and blood pressure above 60;
# only the Age column is kept.
agelessthan_35_BPmorethan_60 <- subset(
  diabetes,
  Age < 35 & BloodPressure > 60,
  select = "Age"
)
View(agelessthan_35_BPmorethan_60)

Q. Print decision tree of pregnancies on the basis of Age,BP and Insulin.

# Fit a tree with Pregnancies as the response and Age, BloodPressure and
# Insulin as predictors, then draw it.
# NOTE(review): ctree() is not base R and no library() call is visible here -
# presumably party or partykit; confirm which one is loaded.
tree_formula <- Pregnancies ~ Age + BloodPressure + Insulin
output <- ctree(tree_formula, data = diabetes)
plot(output)

Q. Print all the values in BP columns where values in equal digit (Ex. 66 ,99).

# Rows whose blood pressure consists of one digit repeated (66, 99, 111, ...).
# The pattern is anchored at both ends: an unanchored "(digit)\\1" would also
# accept values that merely contain a repeated digit, such as 100, 110 or 122.
doubledigit <- subset(diabetes, grepl("^([0-9])\\1+$", BloodPressure))
View(doubledigit)

Q. Find ANOVA method for the database.

# Response: Insulin. Predictor: Glucose. Grouping variable: Outcome.
# result1 includes the Glucose:Outcome interaction; result2 has main
# effects only.
result1 <- aov(Insulin ~ Glucose * Outcome, data = diabetes)
result2 <- aov(Insulin ~ Glucose + Outcome, data = diabetes)
# Compare the two fits to see whether the interaction term matters.
model_comparison <- anova(result1, result2)
print(model_comparison)

Q. Print all the Glucose value where insulin value is equal to 0.

# Glucose values of the records whose Insulin value is 0;
# only the Glucose column is kept.
glucose_insulin0 <- subset(diabetes, Insulin == 0, select = "Glucose")
View(glucose_insulin0)

Q. Find Binomial distribution of database

# Binomial probability mass evaluated at each Insulin value, with the number
# of trials set to the number of records and success probability 0.5.
insulin.level <- diabetes[["Insulin"]]
binom.value <- dbinom(
  insulin.level,
  size = length(insulin.level),
  prob = 0.5
)
plot(insulin.level, binom.value)

Q. Check if database is normally distributed

# Normal Q-Q plot of BloodPressure: points lying close to the reference line
# indicate an approximately normal distribution.
bp.level <- diabetes[["BloodPressure"]]
qqnorm(bp.level)
# Reference line drawn in palette colour 2.
qqline(bp.level, col = 2)

Q. Find prediction of pregnancies on the basis of DiabetesPedigreeFunction.

# Simple linear regression of Pregnancies on DiabetesPedigreeFunction.
model <- lm(Pregnancies ~ DiabetesPedigreeFunction, data = diabetes)
# Average DiabetesPedigreeFunction: the point at which to predict.
avg <- mean(diabetes$DiabetesPedigreeFunction)
# Fitted intercept and slope.
intercpt <- coef(model)[1]
XDPF <- coef(model)[2]
# Predicted pregnancies at the average DPF, computed from the fitted line.
newpredict <- intercpt + XDPF * avg
print(newpredict)

Q. Print scatter plot for the database.

# Pairwise scatter plots of all columns, drawn from the first 10 rows only.
plot(diabetes[1:10, ])

Q. Print time series analysis basis on Age (30-80) as per glucose value.

# Records for ages 30 through 80 inclusive.
age_30_80 <- subset(diabetes, Age > 29 & Age < 81)
# Take the series length from the data instead of hard-coding 371, so that a
# different or updated dataset is not silently truncated or recycled by ts().
n_obs <- nrow(age_30_80)
time_series_age <- ts(age_30_80$Age, start = 1, end = n_obs, frequency = 1)
time_series_glucose <- ts(
  age_30_80$Glucose,
  start = 1, end = n_obs, frequency = 1
)
# Two-column matrix: column 1 = Age, column 2 = Glucose.
age_glucose_matrix <- matrix(
  c(time_series_age, time_series_glucose),
  nrow = n_obs
)
time_series_matrix <- ts(
  age_glucose_matrix,
  start = 1, end = n_obs, frequency = 1
)
# Draws one panel per column against the record index.
plot(time_series_matrix)

Q. Find logical distribution basis on outcomes.

# Linear model of Outcome on all eight predictors.
# NOTE(review): the question may mean a logistic model; this fits an
# ordinary linear model with lm() - confirm the intent.
model <- lm(
  Outcome ~ Pregnancies + Glucose + BloodPressure + SkinThickness +
    Insulin + BMI + DiabetesPedigreeFunction + Age,
  data = diabetes
)
# Coefficients of the eight predictors (position 1 is the intercept).
x <- coef(model)[2:9]
# The three largest coefficients.
max_3 <- sort(x, decreasing = TRUE)
max_3 <- max_3[1:3]
# Refit using three predictors; they are hard-coded here, presumably the ones
# named in max_3 - verify against the values above.
model <- lm(Outcome ~ BMI + DiabetesPedigreeFunction + Age, data = diabetes)
# Tree on the same three predictors, then draw it.
tree <- ctree(Outcome ~ BMI + DiabetesPedigreeFunction + Age, data = diabetes)
plot(tree)

rohit3463/diabetes-analysis

diabetes-analysis

Q. Find total record for class 0 and class 1 from the outcome column.

Q. Find Distinct values for different attributes (Columns) and print total count for each.

Q. What is the Gini coefficient of the attributes(Columns).

Q. Find DiabetesPedigreeFunction which is < .200 if skin thickness and Insulin is 0 .

Q. Check what will be the blood pressure level if pregnancies count is 10 and print the average BP level.

Q. Print first 3 Highest and Lowest count of pregnancies and print the age

Q. Check whether everyone with a BMI level of 0 has a BP > 90.

Q. Print all the rows which contains only one 0 in at least one column.

Q. Print all the rows which contains 0 in more than one columns.

Q. Print all the rows if Glucose level is < 100.

Q. Print all the ages that are less than 35 where BP is > 60.

Q. Print decision tree of pregnancies on the basis of Age,BP and Insulin.

Q. Print all the values in BP columns where values in equal digit (Ex. 66 ,99).

Q. Find ANOVA method for the database.

Q. Print all the Glucose value where insulin value is equal to 0.

Q. Find Binomial distribution of database

Q. Check if database is normally distributed

Q. Find prediction of pregnancies on the basis of DiabetesPedigreeFunction.

Q. Print scatter plot for the database.

Q. Print time series analysis basis on Age (30-80) as per glucose value.

Q. Find logical distribution basis on outcomes.