title | author | date | output |
---|---|---|---|
Cleaning Data Project |
Rene Broekhoven |
Thursday, July 23, 2015 |
html_document |
The datafile for this project can be obtained at :
https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip
This is a ZIP file which you should unzip with your own tools (Windows or Mac) into a directory of your choosing, for example: c:/desktop/project. After unzipping, you will get a folder "getdata-projectfiles-UCI HAR Dataset". In this folder you will find "UCI HAR Dataset": this directory will be your starting point. This script can be executed from the directory where you unzipped the file (in this example: c:/desktop/project); it changes into the data directory itself and returns to the directory it was started from when it finishes.
## Unzipping the dataset creates the directory "getdata-projectfiles-UCI HAR Dataset".
## The working directory is set to its subdirectory:
##   "<your unzip directory>/getdata-projectfiles-UCI HAR Dataset/UCI HAR Dataset"
## Cleaning everything and setting the working directory ##
# NOTE(review): this removes every object from the current workspace; kept so
# the script behaves as before, but consider dropping it.
rm(list = ls())
# Remember the directory we started in so it can be restored at the end.
startwd <- getwd()
## !! Change the dots (....) to the directory where you unzipped the UCI HAR dataset.
## After unzipping, that directory should contain the folder
## "getdata-projectfiles-UCI HAR Dataset".
## EXAMPLE (the author's own directory):
##   my_unzip_dir <- "c:/users/rene/desktop/cleaningdata/project/"
my_unzip_dir <- "...."
# setting the working directory
project_working_dir <- "getdata-projectfiles-UCI HAR Dataset/UCI HAR Dataset"
# Insert a separator only when my_unzip_dir is non-empty and does not already
# end in one, so the path is correct with or without a trailing slash.
path_sep <- if (nzchar(my_unzip_dir) && !grepl("[/\\\\]$", my_unzip_dir)) "/" else ""
data_dir <- paste0(my_unzip_dir, path_sep, project_working_dir)
# Fail early with a message that names the path instead of a bare setwd() error.
if (!dir.exists(data_dir)) {
  stop("Data directory not found: '", data_dir,
       "'. Set my_unzip_dir to the directory where you unzipped the dataset.",
       call. = FALSE)
}
setwd(data_dir)
# using library(dplyr) to make things easy
library(dplyr)
## QUESTION 1 ##
# Read the training and test measurement tables and stack them into one
# data frame: training rows first, test rows after.
train_df <- read.table("./train/X_train.txt")
test_df <- read.table("./test/X_test.txt")
my_df <- rbind(train_df, test_df)
## QUESTION 2 ##
# Keep only the mean and std variables.
# The column names are taken from the second column of features.txt.
features <- read.table("features.txt")
features <- as.character(features[[2]])
names(my_df) <- features
# Drop columns whose name repeats an earlier column name (the first
# occurrence is kept).
my_df <- my_df[, !duplicated(names(my_df))]
# Select the columns with a simple regular expression, then drop the meanFreq
# variables: the author's interpretation is that these are derived variables
# and therefore 'untidy'.
my_df <- my_df %>%
  select(matches("mean|std")) %>%  # as requested in the project
  select(-matches("meanFreq"))     # my interpretation ! (see above)
# Adding the activity and subject columns from train and test.
# Each file holds a single column, one row per measurement row.
train_activity <- read.table("./train/y_train.txt")
train_subject <- read.table("./train/subject_train.txt")
test_activity <- read.table("./test/y_test.txt")
test_subject <- read.table("./test/subject_test.txt")
# Stack training rows first, then test rows. This MUST match the order used
# for the measurements (my_df was built as rbind(train_df, test_df));
# otherwise every measurement row is paired with the wrong activity/subject.
tot_activity <- rbind(train_activity, test_activity)
tot_activity[, 1] <- as.factor(tot_activity[, 1])  # activity as Factor
tot_subject <- rbind(train_subject, test_subject)
# Guard against silent recycling or misalignment when binding the columns.
stopifnot(
  nrow(tot_activity) == nrow(my_df),
  nrow(tot_subject) == nrow(my_df)
)
# Append the two columns as the last columns, named "activity" and "subject".
my_df <- cbind(my_df, activity = tot_activity[, 1], subject = tot_subject[, 1])
## QUESTION 3 ##
# Replace the activity codes with the descriptive labels from
# activity_labels.txt (first column: code, second column: label).
labels <- read.table("activity_labels.txt")
# Map each code to its label by code value rather than by position, so the
# result does not depend on the row order of the label file or on how the
# codes happen to sort as strings.
my_df$activity <- factor(my_df$activity,
                         levels = labels[, 1],
                         labels = as.character(labels[, 2]))
# A code without a matching label would have become NA; stop instead of
# silently carrying it into the tidy data.
stopifnot(!any(is.na(my_df$activity)))
labels <- labels[, 2]
## QUESTION 4 ##
# Appropriately label the data set with descriptive variable names derived
# from the names in features.txt ("activity" and "subject" are already
# labelled and are not affected by the patterns below).
# Author's note: this was a lot of work; I had little clue as to what the
# variables really meant.
temp <- names(my_df)
temp <- tolower(temp)
# Expand the leading domain prefix and the abbreviations.
temp <- gsub("^t", "time", temp)
temp <- gsub("^f", "frequency", temp)
temp <- gsub("acc", "acceleration", temp)
temp <- gsub("gyro", "gyroscope", temp)
temp <- gsub("mag", "magnitude", temp)
# Strip the empty call parentheses, e.g. "mean()" -> "mean".
temp <- gsub("\\(\\)", "", temp)
# Inside the angle(...) variables the "t" prefix follows "(" instead of
# starting the name, so expand it here.
temp <- gsub("\\(tbody", "timebody", temp)
# Remove any remaining opening AND closing parentheses. The second pattern
# used to repeat "\\(", which left a stray ")" in the angle(...) names.
temp <- gsub("\\(", "", temp)
temp <- gsub("\\)", "", temp)
names(my_df) <- temp
# creating the table format for use in dplyr
my_tb <- tbl_df(my_df)
## QUESTION 5 ##
# Average every variable for each activity/subject combination, using
# summarise_each from dplyr.
my_tidy_data <- my_tb %>%
  arrange(activity, subject) %>%
  group_by(activity, subject) %>%
  summarise_each(funs(mean))
# Show the result in a separate viewer window, but only in an interactive
# session: View() can fail when the script is run non-interactively (e.g. via
# Rscript), which would stop the script before the file is written and the
# working directory is restored.
if (interactive()) {
  View(my_tidy_data)
}
# writing the result as a .txt file to my_tidy_data.txt
write.table(my_tidy_data, file = "my_tidy_data.txt", row.names = FALSE)
# get back to the original directory from where we started
setwd(startwd)