reproducible-research-assignment-1: An HTML repository from gabrielverta

title	author	date	output
Reproducible Research - Course Project 1	Gabriel Verta	March 6, 2016	html_document

To parse dates we're going to use lubridate, dplyr to parse data, and to plot some graphs ggplot2


library(dplyr)
library(lubridate)
library(ggplot2)

Reading the data, and parsing dates. Grouping activities per day

activities <- read.csv("activity.csv")
activities$date <- ymd(activities$date)

activities_per_day <- group_by(activities, date)
activities_per_interval <- group_by(activities, interval)

Histogram of the total number of steps taken each day:

items <- tapply(activities$steps, activities$date, FUN=sum)
barplot(items, las=2, cex.names = 0.5)

Mean and median number of steps taken each day. Time series plot of the average number of steps taken.

mean_and_median <- summarise(activities_per_day, mean=mean(steps, na.rm = TRUE), n=median(steps, na.rm = TRUE))
plot(mean_and_median$mean, type="l", col="red", ylab = "Steps", xlab = "Day", las=2, main="Average number of steps taken")
lines(mean_and_median$median, type="l", col="blue")
legend("topright", legend=c("mean", "median"), pch=1, col=c("red", "blue"))

The 5-minute interval that, on average, contains the maximum number of steps

average_of_steps_by_interval <- summarise(activities_per_interval, average=mean(steps, na.rm = TRUE))
ranking_of_intervals <- arrange(average_of_steps_by_interval, desc(average))
print(ranking_of_intervals$interval[1])

Code to describe and show a strategy for imputing missing data

Total number of missing values in the dataset

nas <- is.na(activities$steps)
total_empty <- sum(nas)
percentage_empty <- mean(nas)
# Total empty results
print(total_empty)

# Percentage
print(paste(sprintf("%.2f", percentage_empty * 100), "%", sep=""))

Using average of that 5-minute to populate NAs

not_empty <- mutate(activities)
na_intervals <- not_empty$interval[nas]
average_na_intervals <- sapply(na_intervals, function(item) {average_of_steps_by_interval$average[average_of_steps_by_interval$interval==item]})
not_empty$steps[nas] = average_na_intervals
#average_of_steps_by_interval$average[average_of_steps_by_interval$interval==0]

Histogram of the total number of steps taken each day after missing values are imputed

not_empty_items <- tapply(not_empty$steps, not_empty$date, FUN=sum)

par(mfrow=c(2,1))
barplot(items, las=2, cex.names = 0.5, main="Histogram with missing values (NAs)")
barplot(not_empty_items, las=2, cex.names = 0.5, main="Histogram without missing values (NAs)")

As we can see, missing values can change the final result of an analysis.

Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends

activities_with_days <- mutate(activities, weekday=weekdays(date))
activities_by_day_interval <- group_by(activities_with_days, weekday, interval)

results_per_day <- summarise(activities_by_day_interval, steps=mean(steps, na.rm=TRUE))
qplot(interval, steps, data=results_per_day, facets= weekday ~ .) + geom_line()

gabrielverta/reproducible-research-assignment-1