Top-down design (and the bootstrap)

What does it mean?

General approach to problem solving.

For example, if you’re designing a house, you might think:

In programming:

big_function = function(arguments) {
    piece_1 = get_piece_1(some_arguments)
    piece_2 = get_piece_2(other_arguments)
    output = put_pieces_together(piece_1, piece_2)
}

Then fill out the functions get_piece_1, get_piece_2, put_pieces_together.

Advantages:

A very short introduction to the bootstrap

Let \(x_1, \dots, x_n\) be a sample of size \(n\) from a population, and let \(f\) be an estimator of some population parameter \(\theta\) of interest, so that \(\hat \theta = f(x_1, \ldots, x_n)\) is our point estimate of the population parameter.

We can’t do this because we can’t draw new samples from the population, but…

Idea behind the bootstrap is to pretend that the observed data \(x_1, \ldots, x_n\) is the population.

The distribution of the \(\hat \theta^*\)’s can be used to approximate the sampling distribution of \(\hat \theta\).

In particular, if we have \(\hat \theta^*_1, \ldots, \hat \theta^*_{B}\), where \(B\) is the number of “bootstrap replicates”, then

We’ll try to compute bootstrap confidence intervals today.

Example: Bootstrap confidence intervals

First try at a function to compute bootstrap confidence intervals.

bootstrap_mean_ci = function(data, alpha, B) {
    n = length(data)
    boot_means = replicate(B, {
        boot_indices = sample(x = 1:n, size = n, replace = TRUE)
        boot_data = data[boot_indices]
        boot_mean = mean(boot_data)
    })
    q_lo = alpha / 2
    q_hi = 1 - (alpha / 2)
    boot_ci = quantile(boot_means, probs = c(q_lo, q_hi))
    return(boot_ci)
}

Top-down design way of building the function:

The confidence interval we want comes from the quantiles of the resampled estimates, so we:

  1. Get the resampled estimates of the mean.

  2. Compute the quantiles of the resampled estimates of the mean.

  3. Return the quantiles.

bootstrap_mean_ci = function(data, alpha, B) {
    boot_means = get_boot_means(data, B)
    boot_ci = get_ci(boot_means, alpha)
    return(boot_ci)
}

Notice that now would be a good time to write tests for the get_ci and get_boot_means functions.

In the bootstrap_mean_ci function, we assume that

We might also want to check that

Then we write the two sub-functions:

get_boot_means = function(data, B) {
    n = length(data)
    boot_means = replicate(B, {
        boot_indices = sample(x = 1:n, size = n, replace = TRUE)
        boot_data = data[boot_indices]
        boot_mean = mean(boot_data)
    })
    return(boot_means)
}
get_ci = function(x, alpha){
    q_lo = alpha / 2
    q_hi = 1 - (alpha / 2)
    ci = quantile(x, probs = c(q_lo, q_hi))
    return(ci)
}

Go through the informal tests:

## informal tests
length(get_boot_means(data = rnorm(10), B = 20))
## [1] 20
get_ci(0:100, .05)
##  2.5% 97.5% 
##   2.5  97.5

Then check the whole function:

bootstrap_mean_ci(data = rnorm(10), alpha = .05, B = 100)
##       2.5%      97.5% 
## -0.6770114  0.6726687

Example 2

bootstrap_mean_ci = function(data, alpha, B) {
    n = length(data)
    boot_means = replicate(B, {
        boot_indices = sample(x = 1:n, size = n, replace = TRUE)
        boot_data = data[boot_indices]
        boot_mean = mean(boot_data)
    })
    q_lo = alpha / 2
    q_hi = 1 - (alpha / 2)
    boot_ci = quantile(boot_means, probs = c(q_lo, q_hi))
    return(boot_ci)
}
bootstrap_lm_ci = function(data, formula, alpha, B) {
    n = nrow(data)
    boot_sampled_coefs = replicate(B, {
        boot_indices = sample(x = 1:n, size = n, replace = TRUE)
        boot_data = data[boot_indices,]
        boot_lm = lm(formula = formula, data = boot_data)
        boot_coefs = coef(boot_lm)
    })
    q_lo = alpha / 2
    q_hi = 1 - (alpha / 2)
    boot_ci = plyr::aaply(boot_sampled_coefs, 1, function(x) quantile(x, probs = c(q_lo, q_hi)))
    return(boot_ci)
}

Notice:

Within each function, we have two steps:

bootstrap_ci = function(data, estimator, alpha, B) {
    boot_estimates = get_boot_estimates(data, estimator, B)
    boot_ci = get_ci(boot_estimates, alpha)
    return(boot_ci)
}

We’re assuming:

This would be a good time to write tests for the functions we’ve defined.

Tests for return types:

Test for actual values:

Next step: fill out the functions

get_ci = function(estimates, alpha) {
    q_lo = alpha / 2
    q_hi = 1 - (alpha / 2)
    if(!is.null(dim(estimates))) {
        ## if we have multi-dimensional estimates
        cis = plyr::aaply(estimates, 1, function(x) quantile(x, probs = c(q_lo, q_hi)))
    } else {
        ## if we have one-dimensional estimates
        cis = quantile(estimates, probs = c(q_lo, q_hi))
    }
    return(cis)
}
get_boot_estimates = function(data, estimator, B) {
    boot_estimates = replicate(B, expr = {
        resampled_data = get_bootstrap_sample(data)
        boot_estimate = estimator(resampled_data)
        return(boot_estimate)
    })
    return(boot_estimates)
}

We made another problem for ourselves! Still need get_bootstrap_sample.

Fill out get_bootstrap_sample

get_bootstrap_sample = function(data) {
    if(!is.null(dim(data))) {
        ## in this case, data is rectangular, and we want to sample rows
        n = nrow(data)
        boot_idx = sample(1:n, size = n, replace = TRUE)
        bootstrap_sample = data[boot_idx,]
    } else {
        ## in this case, data is a vector and we want to sample elements of the vector
        n = length(data)
        boot_idx = sample(1:n, size = n, replace = TRUE)
        bootstrap_sample = data[boot_idx]
    }
    return(bootstrap_sample)
}

Could make an argument for breaking into two functions:

get_bootstrap_sample = function(data) {
    if(!is.null(dim(data))) {
        boot_sample = bootstrap_sample_rows(data)
    } else {
        boot_sample =  bootstrap_sample_elements(data)
    }
    return(boot_sample)
}

bootstrap_sample_rows = function(data) {
    n = nrow(data)
    boot_idx = sample(1:n, size = n, replace = TRUE)
    bootstrap_sample = data[boot_idx,]
    return(bootstrap_sample)
}

bootstrap_sample_elements = function(data) {
    n = length(data)
    boot_idx = sample(1:n, size = n, replace = TRUE)
    bootstrap_sample = data[boot_idx]
    return(bootstrap_sample)
}

Does it work?

bootstrap_ci(data = rnorm(10), estimator = mean, alpha = .05, B = 1000)
##       2.5%      97.5% 
## -0.9940405  0.1403266
bootstrap_ci(data = rnorm(100, sd = 2),
             estimator = sd,
             alpha = .05, B = 1000)
##     2.5%    97.5% 
## 1.663007 2.178305

The more complicated example: bootstrap confidence intervals for coefficients from a linear model:

data(iris)
iris_coef_estimator = function(d) {
    iris_lm = lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = d)
    iris_coef = coef(iris_lm)
    return(iris_coef)
}
bootstrap_ci(data = iris,
             estimator = iris_coef_estimator,
             alpha = .05,
             B = 1000)
##               
## X1                  2.5%     97.5%
##   (Intercept)  1.7679421 2.7262926
##   Sepal.Width  0.4582632 0.7256203
##   Petal.Length 0.4389810 0.5048571
## compare with
iris_lm = lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
confint(iris_lm)
##                  2.5 %    97.5 %
## (Intercept)  1.7590943 2.7391860
## Sepal.Width  0.4585161 0.7325334
## Petal.Length 0.4380915 0.5057486