forked from rdpeng/RepData_PeerAssessment1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPA1_template.Rmd
137 lines (93 loc) · 3.75 KB
/
PA1_template.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
---
title: "Reproducible Research: Peer Assessment 1"
author: "Adam Ficke"
date: "2/18/2021"
output:
  html_document:
    keep_md: yes
---
## Loading and preprocessing the data
```{r preprocessing}
# Attach the tidyverse; the chunks below use its read_csv(), pipe, dplyr
# verbs, drop_na(), ggplot2 and fct_collapse().
library(tidyverse)

# Read the activity data from "activity.zip" and keep only complete rows:
# drop_na() removes every row that contains an NA in any column.
df1 <- drop_na(read_csv("activity.zip"))
```
## What is mean total number of steps taken per day?
```{r mean_steps, echo=TRUE}
# Collapse the NA-free observations to one row per day holding that day's
# total step count. Note that this overwrites df1 with the daily summary.
df1 <- df1 %>%
  group_by(date) %>%
  summarize(tot_steps = sum(steps))

# Histogram of the daily totals. The title names what is actually plotted
# (the total per day); the mean and median are reported separately below.
df1 %>%
  ggplot(aes(tot_steps)) +
  geom_histogram(binwidth = 600) +
  ggtitle("Total Number of Steps Taken per Day") +
  theme(plot.title = element_text(hjust = 0.5))

# Mean and median of the daily totals, rounded to whole steps. These are
# plain numeric scalars, so paste() formats them directly instead of
# coercing a one-cell tibble.
mean_steps <- round(mean(df1$tot_steps))
median_steps <- round(median(df1$tot_steps))

print(paste(
  "The mean number of steps is", mean_steps,
  "and the median number of steps is", median_steps
))
```
## What is the average daily activity pattern?
```{r avg_daily_pattern}
# Re-read the raw data; unlike df1, rows with missing steps are kept here.
df2 <- read_csv("activity.zip")

# Mean number of steps for each interval across all days, ignoring NAs.
df_int <- summarise(
  group_by(df2, interval),
  avg_steps = mean(steps, na.rm = TRUE)
)

# Line plot of the per-interval average, with a centred title.
plot_1 <- ggplot(df_int, aes(x = interval, y = avg_steps)) +
  geom_line() +
  ggtitle("Average Number of Steps per 5 Minute Intervals") +
  theme(plot.title = element_text(hjust = 0.5))
plot_1

# Locate the row with the largest average. Column 1 is the interval and
# column 2 its average; tibble subsetting keeps each as a one-cell tibble.
interval_max <- df_int[which.max(df_int$avg_steps), 1]
steps_max <- round(df_int[which.max(df_int$avg_steps), 2])

print(paste(
  interval_max,
  "is the interval with the highest number of steps,",
  "which has an average of",
  steps_max,
  "steps."
))
```
## Imputing missing values
```{r impute, echo=TRUE}
# Calculate and report the total number of missing values in the dataset
# (i.e. the total number of rows with NAs in the steps column).
sum(is.na(df2$steps))

# Replace each missing step count with the mean of the observed values for
# the same interval. The grouping is dropped afterwards so that df_impute
# does not carry a hidden group_by(interval) into later chunks.
df_impute <- df2 %>%
  group_by(interval) %>%
  mutate(steps = ifelse(is.na(steps), mean(steps, na.rm = TRUE), steps)) %>%
  ungroup()

# Total steps per day after imputation. Computed once and reused for both
# the histogram and the mean/median below.
df_impute_dt <- df_impute %>%
  group_by(date) %>%
  summarise(tot_steps = sum(steps))

# Make a histogram of the total number of steps taken each day and
# calculate and report the mean and median total number of steps per day.
# Do these values differ from the estimates from the first part of the
# assignment? What is the impact of imputing missing data on the estimates
# of the total daily number of steps?
plot_hist_impute <- df_impute_dt %>%
  ggplot(aes(tot_steps)) +
  geom_histogram(binwidth = 500) +
  ggtitle("Total Number of Steps Taken per Day After Imputation") +
  theme(plot.title = element_text(hjust = 0.5))
plot_hist_impute

# Mean and median of the imputed daily totals, rounded to whole steps and
# kept as plain numeric scalars so paste() formats them directly.
mean_steps_i <- round(mean(df_impute_dt$tot_steps))
median_steps_i <- round(median(df_impute_dt$tot_steps))

print(paste(
  "The mean number of steps after imputing is", mean_steps_i,
  "and the median number of steps after imputing is", median_steps_i
))
```
## Are there differences in activity patterns between weekdays and weekends?
```{r weekend, echo=TRUE}
# Label every observation as "Weekday" or "Weekend".
# The day of week is taken from as.POSIXlt()$wday (0 = Sunday, 6 = Saturday)
# rather than weekdays(), whose day names depend on the session locale and
# would not match English labels on a non-English system.
# ungroup() guards against any grouping left on df_impute by earlier steps.
# The explicit levels keep the facet order Weekday, Weekend.
df_impute <- df_impute %>%
  ungroup() %>%
  mutate(
    weekend = factor(
      ifelse(as.POSIXlt(date)$wday %in% c(0L, 6L), "Weekend", "Weekday"),
      levels = c("Weekday", "Weekend")
    )
  )

# Average steps per interval, separately for weekdays and weekends.
# .groups = "drop" returns an ungrouped table and avoids the regrouping
# message summarize() prints for multi-variable groupings.
df_impute_plot <- df_impute %>%
  group_by(interval, weekend) %>%
  summarize(avg_steps = mean(steps, na.rm = TRUE), .groups = "drop")

# Plot weekend vs. weekday: one line-plot panel per day type.
plot_3 <- df_impute_plot %>%
  ggplot(aes(x = interval, y = avg_steps)) +
  geom_line() +
  facet_grid(~ weekend) +
  ggtitle("Average Number of Steps per 5 Minute Intervals") +
  theme(plot.title = element_text(hjust = 0.5))
plot_3
```