Boot strapping and confidence intervals Based on chapter 8 of ModernDive. Code for quiz 12.
moderndive::pennies_sample
# A tibble: 50 x 2
ID year
<int> <dbl>
1 1 2002
2 2 1986
3 3 2017
4 4 1988
5 5 2008
6 6 1983
7 7 2008
8 8 1996
9 9 2004
10 10 2000
# ... with 40 more rows
ggplot(pennies_sample, aes(x = year)) +
geom_histogram(binwidth = 10, color = "white")
pennies_resample <- tibble(
year = c(1976, 1962, 1976, 1983, 2017, 2015, 2015, 1962, 2016, 1976,
2006, 1997, 1988, 2015, 2015, 1988, 2016, 1978, 1979, 1997,
1974, 2013, 1978, 2015, 2008, 1982, 1986, 1979, 1981, 2004,
2000, 1995, 1999, 2006, 1979, 2015, 1979, 1998, 1981, 2015,
2000, 1999, 1988, 2017, 1992, 1997, 1990, 1988, 2006, 2000)
)
ggplot(pennies_resample, aes(x = year)) +
geom_histogram(binwidth = 10, color = "white") +
labs(title = "Resample of 50 pennies")
ggplot(pennies_sample, aes(x = year)) +
geom_histogram(binwidth = 10, color = "white") +
labs(title = "Original sample of 50 pennies")
moderndive::pennies_resamples
# A tibble: 1,750 x 3
# Groups: name [35]
replicate name year
<int> <chr> <dbl>
1 1 Arianna 1988
2 1 Arianna 2002
3 1 Arianna 2015
4 1 Arianna 1998
5 1 Arianna 1979
6 1 Arianna 1971
7 1 Arianna 1971
8 1 Arianna 2015
9 1 Arianna 1988
10 1 Arianna 1979
# ... with 1,740 more rows
ggplot(resampled_means, aes(x = mean_year)) +
geom_histogram(binwidth = 1, color = "white", boundary = 1990) +
labs(x = "Sampled mean year")
virtual_shovel <- bowl %>%
rep_sample_n(size = 50)
virtual_resample <- pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE)
# A tibble: 1 x 2
replicate resample_mean
<int> <dbl>
1 1 1997.
virtual_resamples <- pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE, reps = 35)
ggplot(virtual_resampled_means, aes(x = mean_year)) +
geom_histogram(binwidth = 1, color = "white", boundary = 1990) +
labs(x = "Resample mean year")
virtual_resamples <- pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE, reps = 1000)
ggplot(virtual_resampled_means, aes(x = mean_year)) +
geom_histogram(binwidth = 1, color = "white", boundary = 1990) +
labs(x = "sample mean")
# A tibble: 1 x 1
mean_of_means
<dbl>
1 1995.
pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE, reps = 1000)
# A tibble: 50,000 x 3
# Groups: replicate [1,000]
replicate ID year
<int> <int> <dbl>
1 1 38 1999
2 1 47 1982
3 1 10 2000
4 1 12 1995
5 1 12 1995
6 1 50 2017
7 1 24 2017
8 1 3 2017
9 1 26 1979
10 1 15 1974
# ... with 49,990 more rows
pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE, reps = 1000) %>%
group_by(replicate)
# A tibble: 50,000 x 3
# Groups: replicate [1,000]
replicate ID year
<int> <int> <dbl>
1 1 12 1995
2 1 23 1998
3 1 6 1983
4 1 49 2006
5 1 33 1979
6 1 10 2000
7 1 27 1993
8 1 34 1985
9 1 35 1985
10 1 43 2018
# ... with 49,990 more rows
pennies_sample %>%
rep_sample_n(size = 50, replace = TRUE, reps = 1000) %>%
group_by(replicate) %>%
summarize(mean_year = mean(year))
# A tibble: 1,000 x 2
replicate mean_year
<int> <dbl>
1 1 1996.
2 2 1989.
3 3 1996.
4 4 1995.
5 5 1998.
6 6 1997.
7 7 1994.
8 8 1995.
9 9 1995.
10 10 1996.
# ... with 990 more rows
Response: year (numeric)
# A tibble: 1 x 1
stat
<dbl>
1 1995.
Response: year (numeric)
# A tibble: 50 x 1
year
<dbl>
1 2002
2 1986
3 2017
4 1988
5 2008
6 1983
7 2008
8 1996
9 2004
10 2000
# ... with 40 more rows
Response: year (numeric)
# A tibble: 50 x 1
year
<dbl>
1 2002
2 1986
3 2017
4 1988
5 2008
6 1983
7 2008
8 1996
9 2004
10 2000
# ... with 40 more rows
Response: year (numeric)
# A tibble: 50,000 x 2
# Groups: replicate [1,000]
replicate year
<int> <dbl>
1 1 2004
2 1 1971
3 1 1996
4 1 1985
5 1 2006
6 1 1996
7 1 2015
8 1 1962
9 1 2017
10 1 1985
# ... with 49,990 more rows
visualize(bootstrap_distribution)
percentile_ci <- bootstrap_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
visualize(bootstrap_distribution) +
shade_confidence_interval(endpoints = percentile_ci)
standard_error_ci <- bootstrap_distribution %>%
get_confidence_interval(type = "se", point_estimate = x_bar)
visualize(bootstrap_distribution) +
shade_confidence_interval(endpoints = standard_error_ci)
moderndive::bowl_sample_1
# A tibble: 50 x 1
color
<chr>
1 white
2 white
3 red
4 red
5 white
6 white
7 red
8 white
9 white
10 white
# ... with 40 more rows
Response: color (factor)
# A tibble: 50 x 1
color
<fct>
1 white
2 white
3 red
4 red
5 white
6 white
7 red
8 white
9 white
10 white
# ... with 40 more rows
bowl_sample_1 %>%
specify(response = color, success = "red") %>%
generate(reps = 1000, type = "bootstrap")
Response: color (factor)
# A tibble: 50,000 x 2
# Groups: replicate [1,000]
replicate color
<int> <fct>
1 1 white
2 1 white
3 1 red
4 1 red
5 1 white
6 1 white
7 1 white
8 1 red
9 1 white
10 1 red
# ... with 49,990 more rows
percentile_ci_1 <- sample_1_bootstrap %>%
get_confidence_interval(level = 0.95, type = "percentile")
sample_1_bootstrap %>%
visualize(bins = 15) +
shade_confidence_interval(endpoints = percentile_ci_1) +
geom_vline(xintercept = 0.42, linetype = "dashed")
bowl_sample_2 <- bowl %>% rep_sample_n(size = 50)
percentile_ci_2 <- sample_2_bootstrap %>%
get_confidence_interval(level = 0.95, type = "percentile")
moderndive::mythbusters_yawn
# A tibble: 50 x 3
subj group yawn
<int> <chr> <chr>
1 1 seed yes
2 2 control yes
3 3 seed no
4 4 seed yes
5 5 seed no
6 6 control no
7 7 seed yes
8 8 control no
9 9 control no
10 10 seed no
# ... with 40 more rows
# A tibble: 4 x 3
# Groups: group [2]
group yawn count
<chr> <chr> <int>
1 control no 12
2 control yes 4
3 seed no 24
4 seed yes 10
Response: yawn (factor)
Explanatory: group (factor)
# A tibble: 50 x 2
yawn group
<fct> <fct>
1 yes seed
2 yes control
3 no seed
4 yes seed
5 no seed
6 no control
7 yes seed
8 no control
9 no control
10 no seed
# ... with 40 more rows
first_six_rows <- head(mythbusters_yawn)
# A tibble: 6 x 3
subj group yawn
<int> <chr> <chr>
1 3 seed no
2 2 control yes
3 2 control yes
4 5 seed no
5 2 control yes
6 1 seed yes
mythbusters_yawn %>%
specify(formula = yawn ~ group, success = "yes") %>%
generate(reps = 1000, type = "bootstrap")
Response: yawn (factor)
Explanatory: group (factor)
# A tibble: 50,000 x 3
# Groups: replicate [1,000]
replicate yawn group
<int> <fct> <fct>
1 1 yes seed
2 1 no control
3 1 no seed
4 1 no seed
5 1 no seed
6 1 no seed
7 1 yes seed
8 1 yes seed
9 1 no seed
10 1 no control
# ... with 49,990 more rows
mythbusters_yawn %>%
specify(formula = yawn ~ group, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in props")
Response: yawn (factor)
Explanatory: group (factor)
# A tibble: 1,000 x 2
replicate stat
<int> <dbl>
1 1 -0.189
2 2 -0.373
3 3 0.0517
4 4 0.0333
5 5 -0.0921
6 6 0.0676
7 7 0.0167
8 8 -0.0963
9 9 -0.0164
10 10 0.190
# ... with 990 more rows
visualize(bootstrap_distribution_yawning) +
geom_vline(xintercept = 0)
bootstrap_distribution_yawning %>%
get_confidence_interval(type = "percentile", level = 0.95)
# A tibble: 1 x 2
lower_ci upper_ci
<dbl> <dbl>
1 -0.221 0.295
myth_ci_se <- bootstrap_distribution_yawning %>%
get_confidence_interval(type = "se", point_estimate = obs_diff_in_props)
myth_ci_se
# A tibble: 1 x 2
lower_ci upper_ci
<dbl> <dbl>
1 -0.215 0.304
virtual_samples <- bowl %>%
rep_sample_n(size = 50, reps = 1000)
ggplot(sampling_distribution, aes(x = prop_red)) +
geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
labs(x = "Proportion of 50 balls that were red",
title = "Sampling distribution")