Split/apply/combine part 2

Reading: Hadley Wickham, "The Split/Apply/Combine Strategy for Data Analysis".

Agenda for today:

Last time

Split/apply/combine with base R, lots of different functions for different tasks

plyr is going to clean this up for us. The plyr functions have the goals as base R functions, but with

Before we get into that, let's go back to functions...

Functions as objects

Remember that functions in R are objects that we can create, manipulate, and pass to other functions.

simple_function = function(x) x^2
simple_function
## function(x) x^2
formals(simple_function)
## $x
environment(simple_function)
## <environment: R_GlobalEnv>

There is a difference between a function and what the function evaluates to:

simple_function
## function(x) x^2
class(simple_function)
## [1] "function"
simple_function(2)
## [1] 4
class(simple_function(2))
## [1] "numeric"

When we're doing split/apply/combine, we want to pass in a function, not what the function evaluates to.

x = sample(0:1, size = 20, replace = TRUE)
type = rep(letters[1:2], each = 10)
x
##  [1] 0 1 0 0 0 1 1 0 0 1 0 0 0 1 1 0 0 0 1 1
type
##  [1] "a" "a" "a" "a" "a" "a" "a" "a" "a" "a" "b" "b" "b" "b" "b" "b" "b"
## [18] "b" "b" "b"
tapply(x, type, mean) ## mean is a function
##   a   b 
## 0.4 0.4
tapply(x, type, mean(x)) ## mean(x) is a number, not a function
## Error in match.fun(FUN): 'mean(x)' is not a function, character or symbol
## the line above is the same as the following:
mean_x = mean(x)
mean_x
## [1] 0.4
tapply(x, type, mean_x)
## Error in get(as.character(FUN), mode = "function", envir = envir): object 'mean_x' of mode 'function' was not found

Anonymous functions

If you have a function you only want to use once, you don't have to assign it to anything.

When you use a function this way it is called an anonymous function because it doesn't have a name in your code.

These are often used in the context of the apply family.

data(diamonds)
## here the function for computing the coefficients is an anonymous function
diamond_coefs = by(diamonds, diamonds$color, FUN = function(data_subset) {
    diamond_lm = lm(log(price) ~ carat, data = data_subset)
    diamond_coefficients = coef(diamond_lm)
    return(diamond_coefficients)
})
diamond_coefs
## diamonds$color: D
## (Intercept)       carat 
##    6.048811    2.383864 
## -------------------------------------------------------- 
## diamonds$color: E
## (Intercept)       carat 
##    6.034513    2.348335 
## -------------------------------------------------------- 
## diamonds$color: F
## (Intercept)       carat 
##    6.088442    2.272790 
## -------------------------------------------------------- 
## diamonds$color: G
## (Intercept)       carat 
##    6.109554    2.178489 
## -------------------------------------------------------- 
## diamonds$color: H
## (Intercept)       carat 
##    6.180284    1.906300 
## -------------------------------------------------------- 
## diamonds$color: I
## (Intercept)       carat 
##    6.175315    1.799199 
## -------------------------------------------------------- 
## diamonds$color: J
## (Intercept)       carat 
##    6.254074    1.627947

Equivalent to:

get_diamond_coefs = function(data_subset) {
    diamond_lm = lm(log(price) ~ carat, data = data_subset)
    diamond_coefficients = coef(diamond_lm)
    return(diamond_coefficients)
}
diamond_coefs = by(diamonds, diamonds$color, get_diamond_coefs)
diamond_coefs
## diamonds$color: D
## (Intercept)       carat 
##    6.048811    2.383864 
## -------------------------------------------------------- 
## diamonds$color: E
## (Intercept)       carat 
##    6.034513    2.348335 
## -------------------------------------------------------- 
## diamonds$color: F
## (Intercept)       carat 
##    6.088442    2.272790 
## -------------------------------------------------------- 
## diamonds$color: G
## (Intercept)       carat 
##    6.109554    2.178489 
## -------------------------------------------------------- 
## diamonds$color: H
## (Intercept)       carat 
##    6.180284    1.906300 
## -------------------------------------------------------- 
## diamonds$color: I
## (Intercept)       carat 
##    6.175315    1.799199 
## -------------------------------------------------------- 
## diamonds$color: J
## (Intercept)       carat 
##    6.254074    1.627947

plyr naming convention

All plyr functions named **ply

The possible vaues for the positions are

Syntax will have you specify:

Array input: a*ply

syntax: a*ply(data, margin, fun)

Example:

library(plyr)
X = matrix(1:6, nrow = 2, ncol = 3)
X
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
aaply(X, 1, sum)
##  1  2 
##  9 12
adply(X, 1, sum)
##   X1 V1
## 1  1  9
## 2  2 12
alply(X, 1, sum)
## $`1`
## [1] 9
## 
## $`2`
## [1] 12
## 
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
##   X1
## 1  1
## 2  2

A couple of extras:

Example on a 3-dimensional array:

data(HairEyeColor)
HairEyeColor
## , , Sex = Male
## 
##        Eye
## Hair    Brown Blue Hazel Green
##   Black    32   11    10     3
##   Brown    53   50    25    15
##   Red      10   10     7     7
##   Blond     3   30     5     8
## 
## , , Sex = Female
## 
##        Eye
## Hair    Brown Blue Hazel Green
##   Black    36    9     5     2
##   Brown    66   34    29    14
##   Red      16    7     7     7
##   Blond     4   64     5     8
dim(HairEyeColor)
## [1] 4 4 2
dimnames(HairEyeColor)
## $Hair
## [1] "Black" "Brown" "Red"   "Blond"
## 
## $Eye
## [1] "Brown" "Blue"  "Hazel" "Green"
## 
## $Sex
## [1] "Male"   "Female"
adply(HairEyeColor, c(1,2), sum)
##     Hair   Eye  V1
## 1  Black Brown  68
## 2  Brown Brown 119
## 3    Red Brown  26
## 4  Blond Brown   7
## 5  Black  Blue  20
## 6  Brown  Blue  84
## 7    Red  Blue  17
## 8  Blond  Blue  94
## 9  Black Hazel  15
## 10 Brown Hazel  54
## 11   Red Hazel  14
## 12 Blond Hazel  10
## 13 Black Green   5
## 14 Brown Green  29
## 15   Red Green  14
## 16 Blond Green  16

Play around with replacing the margin vector with others, and convince yourself of why you get the output you do.

Data frame input: d*ply

syntax: d*ply(data, variables, fun)

Let's look a bit more at HairEyeColor:

hair_and_gender_counts = adply(HairEyeColor, c(1, 3), sum)
hair_and_gender_counts
##    Hair    Sex  V1
## 1 Black   Male  56
## 2 Brown   Male 143
## 3   Red   Male  34
## 4 Blond   Male  46
## 5 Black Female  52
## 6 Brown Female 143
## 7   Red Female  37
## 8 Blond Female  81

Suppose we want to know what fraction of people with each hair color are men, and it seems to us that split/apply/combine will be good for this task.

First ask:

First define a function that will take a subset of the data and find return to us the fraction of men:

get_fraction_male = function(data_subset) {
    male_subset = subset(data_subset, Sex == "Male")
    frac_male = sum(male_subset$V1) / sum(data_subset$V1)
}

Then split hair_and_gender_counts on the Hair variable and apply get_fraction_male to each data subset:

ddply(hair_and_gender_counts, .(Hair), get_fraction_male)
##    Hair        V1
## 1 Black 0.5185185
## 2 Brown 0.5000000
## 3   Red 0.4788732
## 4 Blond 0.3622047

List input: l*ply

syntax: l*ply(data, fun)

Example:

a_list = list(a = 1, b = "state", c = TRUE)
a_list
## $a
## [1] 1
## 
## $b
## [1] "state"
## 
## $c
## [1] TRUE
laply(a_list, typeof)
## [1] "double"    "character" "logical"

l*ply will also work on vectors, e.g.:

vec = 1:10
laply(vec, log)
##  [1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379 1.7917595 1.9459101
##  [8] 2.0794415 2.1972246 2.3025851

Output types

We said the options for output are arrays, data frames, and lists.

We've seen examples above, but let's look more systematically.

Data frame output: *dply

data(diamonds)
diamond_coefs = ddply(diamonds, .(color), function(data_subset) {
    return(coef(lm(log(price) ~ carat, data = data_subset)))
})
diamond_coefs
##   color (Intercept)    carat
## 1     D    6.048811 2.383864
## 2     E    6.034513 2.348335
## 3     F    6.088442 2.272790
## 4     G    6.109554 2.178489
## 5     H    6.180284 1.906300
## 6     I    6.175315 1.799199
## 7     J    6.254074 1.627947

Check on your own what happens when you replace .(color) with .(color, clarity). How is the output different than what we got using by last time? Is it better?

Array output: *aply

Here we split along one dimension and have one-dimensional output. Similar to what we've seen before with apply in base R.

X = matrix(1:6, nrow = 3, ncol = 2)
X
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
row_sums_x = aaply(X, 1, sum)
dim(row_sums_x)
## NULL
length(row_sums_x)
## [1] 3
row_sums_x
## 1 2 3 
## 5 7 9

Here we split along two dimensions and have one-dimensional output:

X_array = array(data = 1:12, dim = c(3, 2, 2))
dim(X_array)
## [1] 3 2 2
X_array
## , , 1
## 
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
## 
## , , 2
## 
##      [,1] [,2]
## [1,]    7   10
## [2,]    8   11
## [3,]    9   12
third_dim_sums_x = aaply(X_array, 1:2, sum)
dim(third_dim_sums_x)
## [1] 3 2
third_dim_sums_x
##    X2
## X1   1  2
##   1  8 14
##   2 10 16
##   3 12 18

Here we split along one dimension and have two-dimensional output:

nonsense_function = function(x) {
    out = c(x[1] * 2, x[2] * -1)
    return(out)
}
X
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
nonsense_applied_to_rows = aaply(X, 1, nonsense_function)
dim(nonsense_applied_to_rows)
## [1] 3 2
nonsense_applied_to_rows
##    
## X1  1  2
##   1 2 -4
##   2 4 -5
##   3 6 -6

I don't expect you to memorize this, just to know that we can use these functions to get consistently-shaped output and be able to look it up and figure out what it should be.

List output: *lply

diamond_lms = dlply(diamonds, .(color), function(data_subset) {
    lm(log(price) ~ carat, data = data_subset)
})

class(diamond_lms)
## [1] "list"
class(diamond_lms[[1]])
## [1] "lm"
names(diamond_lms)
## [1] "D" "E" "F" "G" "H" "I" "J"

Some special cases

Summing up