This is a simple demonstration of how to convert existing plyr code to use the dplyr package. It requires dplyr version 0.2 or greater, which implements the do() function used in place of dlply() and ldply() from the plyr package. For each example the plyr implementation is shown first, followed by the dplyr implementation; some care has been taken to make the outputs functionally equivalent.
dplyr is still in its early stages, so I would expect some of the verbosity of using do() to decrease over time. In particular, the pattern for converting a dlply() call with an anonymous function is very verbose and non-obvious:
plyr:  dlply(data, 'group', function(x) something(x$value))
dplyr: data %>% group_by(group) %>% do(res = (function(x) something(x$value))(.))
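One way to reduce that boilerplate (a sketch, not an official idiom) is to name the helper first so that do() only sees a simple call; group, value, and something are the same placeholders as above:
f <- function(x) something(x$value)
data %>% group_by(group) %>% do(res = f(.))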
library(plyr)
bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
head(bnames)
## year name percent sex
## 1 1880 John 0.08154 boy
## 2 1880 William 0.08051 boy
## 3 1880 James 0.05006 boy
## 4 1880 Charles 0.04517 boy
## 5 1880 George 0.04329 boy
## 6 1880 Frank 0.02738 boy
# Whole dataset transformations ---------------------------------------------
letter <- function(x, n = 1) {
if (n < 0) {
nc <- nchar(x)
n <- nc + n + 1
}
tolower(substr(x, n, n))
}
vowels <- function(x) {
nchar(gsub("[^aeiou]", "", x))
}
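# Quick check of the helpers (an illustrative example):
letter("Hadley", 1)   # "h"
letter("Hadley", -1)  # "y"
vowels("Hadley")      # 2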
bnames <- transform(bnames,
first = letter(name, 1),
last = letter(name, -1),
length = nchar(name),
vowels = vowels(name)
)
# Whole dataset summaries ----------------------------------------------------
summarise(bnames,
max_perc = max(percent),
min_perc = min(percent))
## max_perc min_perc
## 1 0.08154 2.6e-05
# Group-wise transformations ------------------------------------------------
# Want to calculate rank of each name in each year (per sex). This is easy if
# we have a single sex for a single year:
one <- subset(bnames, sex == "boy" & year == 2008)
one$rank <- rank(-one$percent, ties.method = "first")
# or
one <- transform(one, rank = rank(-percent, ties.method = "first"))
head(one)
## year name percent sex first last length vowels rank
## 128001 2008 Jacob 0.010355 boy j b 5 2 1
## 128002 2008 Michael 0.009437 boy m l 7 3 2
## 128003 2008 Ethan 0.009301 boy e n 5 1 3
## 128004 2008 Joshua 0.008799 boy j a 6 3 4
## 128005 2008 Daniel 0.008702 boy d l 6 3 5
## 128006 2008 Alexander 0.008566 boy a r 9 3 6
# Conceptually, if we want to perform this same task for every sex in every
# year, we need to split up the data, apply the transformation to every piece,
# and then join the pieces back together.
# This is what ddply does:
bnames <- ddply(bnames, c("sex", "year"), transform,
rank = rank(-percent, ties.method = "first"))
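# Under the hood this is plain split-apply-combine; a sketch of the same
# steps in base R (bnames_manual is just an illustrative name):
pieces <- split(bnames, list(bnames$sex, bnames$year), drop = TRUE)
pieces <- lapply(pieces, function(df) {
  df$rank <- rank(-df$percent, ties.method = "first")
  df
})
bnames_manual <- do.call(rbind, pieces)  # same data, different row order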
# Group-wise summaries -------------------------------------------------------
# Group-wise summaries are much more interesting!
head(ddply(bnames, c("name"), summarise, tot = sum(percent)), n = 10)
## name tot
## 1 Aaden 0.000442
## 2 Aaliyah 0.019748
## 3 Aarav 0.000101
## 4 Aaron 0.293097
## 5 Ab 0.000218
## 6 Abagail 0.001326
## 7 Abb 0.000137
## 8 Abbey 0.007409
## 9 Abbie 0.022896
## 10 Abbigail 0.003392
head(ddply(bnames, c("length"), summarise, tot = sum(percent)), n = 10)
## length tot
## 1 2 0.2315
## 2 3 7.2744
## 3 4 36.8475
## 4 5 57.7588
## 5 6 60.3609
## 6 7 44.3370
## 7 8 14.8416
## 8 9 7.4245
## 9 10 0.6562
## 10 11 1.0414
head(ddply(bnames, c("year", "sex"), summarise, tot = sum(percent)), n = 10)
## year sex tot
## 1 1880 boy 0.9307
## 2 1880 girl 0.9345
## 3 1881 boy 0.9304
## 4 1881 girl 0.9327
## 5 1882 boy 0.9275
## 6 1882 girl 0.9310
## 7 1883 boy 0.9288
## 8 1883 girl 0.9333
## 9 1884 boy 0.9273
## 10 1884 girl 0.9314
fl <- ddply(bnames, c("year", "sex", "first"), summarise, tot = sum(percent))
library(ggplot2)
qplot(year, tot, data = fl, geom = "line", colour = sex, facets = ~ first)
library(dplyr)
bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
head(bnames)
## year name percent sex
## 1 1880 John 0.08154 boy
## 2 1880 William 0.08051 boy
## 3 1880 James 0.05006 boy
## 4 1880 Charles 0.04517 boy
## 5 1880 George 0.04329 boy
## 6 1880 Frank 0.02738 boy
# Whole dataset transformations ---------------------------------------------
letter <- function(x, n = 1) {
if (n < 0) {
nc <- nchar(x)
n <- nc + n + 1
}
tolower(substr(x, n, n))
}
vowels <- function(x) {
nchar(gsub("[^aeiou]", "", x))
}
bnames <- mutate(bnames,
first = letter(name, 1),
last = letter(name, -1),
length = nchar(name),
vowels = vowels(name)
)
# Whole dataset summaries ----------------------------------------------------
summarise(bnames,
max_perc = max(percent),
min_perc = min(percent))
## max_perc min_perc
## 1 0.08154 2.6e-05
# Group-wise transformations ------------------------------------------------
# Want to calculate rank of each name in each year (per sex). This is easy if
# we have a single sex for a single year:
one <- filter(bnames, sex == "boy", year == 2008)
one$rank <- rank(-one$percent, ties.method = "first")
# or
one <- mutate(one, rank = rank(-percent, ties.method = "first"))
head(one)
## year name percent sex first last length vowels rank
## 1 2008 Jacob 0.010355 boy j b 5 2 1
## 2 2008 Michael 0.009437 boy m l 7 3 2
## 3 2008 Ethan 0.009301 boy e n 5 1 3
## 4 2008 Joshua 0.008799 boy j a 6 3 4
## 5 2008 Daniel 0.008702 boy d l 6 3 5
## 6 2008 Alexander 0.008566 boy a r 9 3 6
# Conceptually, if we want to perform this same task for every sex in every
# year, we need to split up the data, apply the transformation to every piece,
# and then join the pieces back together.
# This is what group_by() and mutate() do:
bnames <- bnames %>% group_by(sex, year) %>%
  mutate(rank = rank(-percent, ties.method = "first"))
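# or, using dplyr's window-rank helpers; row_number() matches
# ties.method = "first", so this is equivalent:
# bnames %>% group_by(sex, year) %>% mutate(rank = row_number(desc(percent)))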
# Group-wise summaries -------------------------------------------------------
# Group-wise summaries are much more interesting!
bnames %>% group_by(name) %>% summarise(tot = sum(percent))
## Source: local data frame [6,782 x 2]
##
## name tot
## 1 Aaden 0.000442
## 2 Aaliyah 0.019748
## 3 Aarav 0.000101
## 4 Aaron 0.293097
## 5 Ab 0.000218
## 6 Abagail 0.001326
## 7 Abb 0.000137
## 8 Abbey 0.007409
## 9 Abbie 0.022896
## 10 Abbigail 0.003392
## .. ... ...
bnames %>% group_by(length) %>% summarise(tot = sum(percent))
## Source: local data frame [10 x 2]
##
## length tot
## 1 2 0.2315
## 2 3 7.2744
## 3 4 36.8475
## 4 5 57.7588
## 5 6 60.3609
## 6 7 44.3370
## 7 8 14.8416
## 8 9 7.4245
## 9 10 0.6562
## 10 11 1.0414
bnames %>% group_by(year, sex) %>% summarise(tot = sum(percent))
## Source: local data frame [258 x 3]
## Groups: year
##
## year sex tot
## 1 1880 boy 0.9307
## 2 1880 girl 0.9345
## 3 1881 boy 0.9304
## 4 1881 girl 0.9327
## 5 1882 boy 0.9275
## 6 1882 girl 0.9310
## 7 1883 boy 0.9288
## 8 1883 girl 0.9333
## 9 1884 boy 0.9273
## 10 1884 girl 0.9314
## .. ... ... ...
fl <- bnames %>% group_by(year, sex, first) %>% summarise(tot = sum(percent))
library(ggplot2)
qplot(year, tot, data = fl, geom = "line", colour = sex, facets = ~ first)
library(plyr)
library(reshape)
library(ggplot2)
bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
# Focus on the last 60 years
recent <- subset(bnames, year >= 1950)
# Still a lot of names, so pull out those names which are both moderately
# popular (> 1 / 1000) and in the top 1000 for at least 30 years. In a real
# analysis you'd probably want to analyse more data.
per_name <- ddply(recent, c("sex", "name"), summarise,
years = length(name), percent_avg = mean(percent))
long <- subset(per_name, years >= 30 & percent_avg > 0.001)
bnames_long <- merge(recent, long[c("sex", "name")], by = c("sex", "name"))
# To cluster, we need to reshape the data so that each year forms a column.
# We'll have to do this a few times, so we'll create a function to do this
# specifically for this data. To learn more about how this works, see the
# documentation for the reshape package.
widen <- function(variable) {
as.matrix(cast(bnames_long, sex + name ~ year, fill = 0,
value = variable))
}
long$cluster1 <- kmeans(widen("percent"), 20)$cluster
bnames_cl <- merge(bnames_long, long, by = c("sex", "name"))
ggplot(bnames_cl, aes(year, percent)) +
geom_line(aes(group = interaction(sex, name))) +
facet_wrap(~ cluster1)
#head(dlply(long, "cluster1", function(df) as.character(df$name)))
# Hmmmm. Maybe we're clustering too much based on absolute size, and not
# on relative shape. Let's rescale percent to [0, 1]
library(mgcv)
smooth <- function(var, date) {
predict(gam(var ~ s(date)))
}
scale01 <- function(x) (x - min(x)) / diff(range(x))
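# scale01 maps the minimum to 0 and the maximum to 1, e.g.
# scale01(c(2, 4, 6, 10)) returns 0.00 0.25 0.50 1.00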
bnames_long <- ddply(bnames_long, c("sex", "name"), transform,
percent_std = scale01(percent),
percent_smo = scale01(smooth(percent, year))
)
long$cluster2 <- kmeans(widen("percent_std"), 20)$cluster
long$cluster3 <- kmeans(widen("percent_smo"), 20)$cluster
bnames_cl <- merge(bnames_long, long, by = c("sex", "name"))
qplot(year, percent_std, data = bnames_cl, group = interaction(name, sex), geom = "line") +
facet_wrap(~ cluster2)
qplot(year, percent_smo, data = bnames_cl, group = interaction(name, sex), geom = "line", colour = sex) +
facet_wrap(~ cluster2)
#dlply(long, "cluster2", function(df) as.character(df$name))
tab <- table(long[c("cluster2", "cluster3")])
library(e1071)
match <- matchClasses(tab)
## Cases in matched pairs: 67 %
print.table(tab[, match], zero.print = ".")
## cluster3
## cluster2 18 15 9 19 13 8 18 6 1 2 1 12 5 10 19 10 16 17 7 9
## 1 5 . . . . . 5 . . . . . . . . . . . . .
## 2 . 9 . . . . . . . . . . . . . . . . . .
## 3 . . 9 . . . . 1 . . . . . . . . 8 1 . 9
## 4 . . . 11 . . . . . . . . . . 11 . . . . .
## 5 . 2 . . 16 . . . . . . . . . . . 5 3 . .
## 6 . . . . . 27 . . . . . . . . . . . . . .
## 7 22 . . . . 2 22 . . . . . . . . . . . . .
## 8 . . . . . . . 16 . 4 . . . . . . . . . .
## 9 . . . . . 8 . . 11 . 11 . 1 . . . . . . .
## 10 . . . . . . . . . 18 . . . . . . . . 3 .
## 11 . . . . . . . . 8 . 8 . . . . . . . . .
## 12 . . . 3 . . . . . . . 11 . . 3 . . . . .
## 13 . . . . . . . . . . . . 9 . . . . . . .
## 14 . . . . . . . . 2 . 2 . 7 11 . 11 . . . .
## 15 . . . 11 . . . . . . . . . 6 11 6 . . . .
## 16 . . . 1 . . . . . . . . . 15 1 15 . . . .
## 17 . 1 . . . . . . . . . 7 . . . . 11 . . .
## 18 . . . . . . . . . . . . . . . . . 13 . .
## 19 . . . . . . . . . . . . . . . . . . 30 .
## 20 . . 5 . . . . . . . . . . . . . . . . 5
library(dplyr)
library(reshape2)
library(ggplot2)
bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
# Focus on the last 60 years
recent <- filter(bnames, year >= 1950)
# Still a lot of names, so pull out those names which are both moderately
# popular (> 1 / 1000) and in the top 1000 for at least 30 years. In a real
# analysis you'd probably want to analyse more data.
per_name <- recent %>% group_by(sex, name) %>%
  summarise(years = n(), percent_avg = mean(percent))
long <- filter(per_name, years >= 30 & percent_avg > 0.001)
bnames_long <- inner_join(recent, long[c("sex", "name")], by = c("sex", "name"))
# To cluster, we need to reshape the data so that each year forms a column.
# We'll have to do this a few times, so we'll create a function to do this
# specifically for this data. To learn more about how this works, see the
# documentation for the reshape2 package.
widen <- function(variable) {
select(dcast(bnames_long, sex + name ~ year, fill = 0,
value.var = variable), -sex, -name)
}
long$cluster1 <- kmeans(widen('percent'), 20)$cluster
bnames_cl <- inner_join(bnames_long, long, by = c("sex", "name"))
ggplot(bnames_cl, aes(year, percent)) +
geom_line(aes(group = interaction(sex, name))) +
facet_wrap(~ cluster1)
#long %>% group_by(cluster1) %>% do(names=.$name)
# Hmmmm. Maybe we're clustering too much based on absolute size, and not
# on relative shape. Let's rescale percent to [0, 1]
library(mgcv)
smooth <- function(var, date) {
predict(gam(var ~ s(date)))
}
scale01 <- function(x) (x - min(x)) / diff(range(x))
bnames_long <- bnames_long %>% group_by(sex, name) %>%
mutate(percent_std = scale01(percent),
percent_smo = scale01(smooth(percent, year)))
long$cluster2 <- kmeans(widen("percent_std"), 20)$cluster
long$cluster3 <- kmeans(widen("percent_smo"), 20)$cluster
bnames_cl <- inner_join(bnames_long, long, by = c("sex", "name"))
qplot(year, percent_std, data = bnames_cl, group = interaction(name, sex), geom = "line") +
facet_wrap(~ cluster2)
qplot(year, percent_smo, data = bnames_cl, group = interaction(name, sex), geom = "line", colour = sex) +
facet_wrap(~ cluster2)
#long %>% group_by(cluster2) %>% do(names=.$name)
tab <- table(long[c("cluster2", "cluster3")])
library(e1071)
match <- matchClasses(tab)
## Cases in matched pairs: 66.75 %
print.table(tab[, match], zero.print = ".")
## cluster3
## cluster2 16 18 20 1 4 10 14 2 2 5 15 11 7 14 2 13 12 9 8 6
## 1 15 . . . . . 2 . . 1 . . . 2 . . . 2 . .
## 2 . 21 . . . . . . . . . . . . . . . . . .
## 3 . . 35 . . . . . . . . . . . . . . . . .
## 4 . . . 9 . . . . . . . . . . . . . . . .
## 5 . . . 2 7 . . . . . . . . . . . . . . 7
## 6 . . . . . 8 . . . . 3 . . . . . . . . .
## 7 . . . . . . 9 . . 8 . . . 9 . . . . . .
## 8 . 7 . . . 1 . 8 8 . . . . . 8 . . . . .
## 9 . . . . . . . 11 11 . . . . . 11 . . . . .
## 10 . . . . . . . . . 4 . . 1 . . . 2 . . .
## 11 . 1 . . . . . 4 4 . 8 . . . 4 . . . . .
## 12 . . . . 6 . . . . . . 20 . . . . . . . .
## 13 . . . . . . . . . . . . 24 . . . . . 14 .
## 14 . . . . . . 15 . . . . . . 15 . 6 1 . . .
## 15 . . . . . 1 . 9 9 . 2 . . . 9 . . . . .
## 16 . . . . . . . . . . . 2 . . . 14 . . . .
## 17 . . . . . . . . . . . 1 . . . . 13 . . .
## 18 . . . . . . . . . 6 . . 4 . . . . 26 . .
## 19 5 . . . . . . . . . . . . . . . . . 5 .
## 20 . . . . . . . . . . . . . . . . . . . 6
library(plyr)
library(ggplot2)
# First need to create new variables that capture the number of years
# a player has played, and for each year, how far through their career
# they are.
# This is straightforward using the techniques we just learned
b2 <- ddply(baseball, "id", transform,
cyear = year - min(year) + 1)
b2 <- ddply(b2, "id", transform,
career = (cyear - 1) / max(cyear))
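# (Two separate ddply calls are needed because transform() evaluates all of
# its arguments against the original data frame, so `career` could not refer
# to the new `cyear` column within a single call.)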
head(b2)
## id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so
## 1 aaronha01 1954 1 ML1 NL 122 468 58 131 27 6 13 69 2 2 28 39
## 2 aaronha01 1955 1 ML1 NL 153 602 105 189 37 9 27 106 3 1 49 61
## 3 aaronha01 1956 1 ML1 NL 153 609 106 200 34 14 26 92 2 4 37 54
## 4 aaronha01 1957 1 ML1 NL 151 615 118 198 27 6 44 132 1 1 57 58
## 5 aaronha01 1958 1 ML1 NL 153 601 109 196 34 4 30 95 4 1 59 49
## 6 aaronha01 1959 1 ML1 NL 154 629 116 223 46 7 39 123 8 0 51 54
## ibb hbp sh sf gidp cyear career
## 1 NA 3 6 4 13 1 0.00000
## 2 5 3 7 4 20 2 0.04348
## 3 6 2 5 7 21 3 0.08696
## 4 15 0 0 3 13 4 0.13043
## 5 16 1 0 3 21 5 0.17391
## 6 17 4 0 9 19 6 0.21739
# Now what sort of model should we use?
bruth <- subset(b2, id == "ruthba01")
qplot(career, g, data = bruth, geom = "line")
qplot(career, g, data = b2, geom="boxplot", group = round_any(career, 0.05))
# Could we model that as two straight lines?
bruth$p <- (bruth$career - 0.5) * 100
mod <- lm(g ~ p + p:I(p > 0), data = bruth)
bruth$ghat <- predict(mod)
qplot(career, g, data = bruth, geom = "line") +
geom_line(aes(y = ghat), colour = "red")
# It doesn't look great, but it's a start
# Let's fit that model to every player
b2$p <- (b2$career - 0.5) * 100
models <- dlply(b2, "id", lm, formula = g ~ p + p:I(p > 0))
# Or a bit more explicitly
models <- dlply(b2, "id", function(df) lm(g ~ p + p:I(p > 0), data = df))
length(models)
## [1] 1228
onem <- models[[1]]
onem
##
## Call:
## lm(formula = g ~ p + p:I(p > 0), data = df)
##
## Coefficients:
## (Intercept) p p:I(p > 0)TRUE
## 162.600 0.413 -1.601
summary(onem)
##
## Call:
## lm(formula = g ~ p + p:I(p > 0), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.356 -2.935 0.478 4.339 23.478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 162.600 4.538 35.83 < 2e-16 ***
## p 0.413 0.167 2.47 0.023 *
## p:I(p > 0)TRUE -1.601 0.316 -5.07 5.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.9 on 20 degrees of freedom
## Multiple R-squared: 0.688, Adjusted R-squared: 0.656
## F-statistic: 22 on 2 and 20 DF, p-value: 8.84e-06
coefs <- coef(onem)
names(coefs) <- c("mid", "inc", "dec")
rsq <- summary(onem)$r.squared
get_coefs <- function(model) {
coefs <- coef(model)
names(coefs) <- c("mid", "inc", "dec")
c(coefs, rsq = summary(model)$r.squared)
}
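# Quick check against the first model (values match the summary above):
get_coefs(onem)  # mid ~ 162.600, inc ~ 0.413, dec ~ -1.601, rsq ~ 0.688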
coefs <- ldply(models, get_coefs)
qplot(rsq, data = coefs, geom = "histogram", binwidth = 0.02)
qplot(mid, data = coefs, geom = "histogram", binwidth = 5)
qplot(inc, data = coefs, geom = "histogram", binwidth = 0.2)
qplot(dec, data = coefs, geom = "histogram", binwidth = 0.2)
qplot(rsq, inc, data=coefs)
qplot(dec, inc, data=coefs, colour=rsq)
library(dplyr)
library(ggplot2)
data("baseball", package = "plyr")
# First need to create new variables that capture the number of years
# a player has played, and for each year, how far through their career
# they are.
# This is straightforward using the techniques we just learned
b2 <- tbl_df(baseball) %>%
group_by(id) %>%
mutate(cyear = year - min(year) + 1,
career = (cyear - 1) / max(cyear))
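# (Unlike transform(), mutate() lets later arguments use columns created
# earlier in the same call, so cyear and career fit in one step.)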
b2
## Source: local data frame [21,699 x 24]
## Groups: id
##
## id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so
## 1 ansonca01 1871 1 RC1 25 120 29 39 11 3 0 16 6 2 2 1
## 2 forceda01 1871 1 WS3 32 162 45 45 9 4 0 29 8 0 4 0
## 3 mathebo01 1871 1 FW1 19 89 15 24 3 1 0 10 2 1 2 0
## 4 startjo01 1871 1 NY2 33 161 35 58 5 1 1 34 4 2 3 0
## 5 suttoez01 1871 1 CL1 29 128 35 45 3 7 3 23 3 1 1 0
## 6 whitede01 1871 1 CL1 29 146 40 47 6 5 1 21 2 2 4 1
## 7 yorkto01 1871 1 TRO 29 145 36 37 5 7 2 23 2 2 9 1
## 8 ansonca01 1872 1 PH1 46 217 60 90 10 7 0 50 6 6 16 3
## 9 burdoja01 1872 1 BR2 37 174 26 46 3 0 0 15 0 1 1 1
## 10 forceda01 1872 1 TRO 25 130 40 53 11 0 0 16 2 2 1 0
## .. ... ... ... ... .. .. ... .. .. ... ... .. ... .. .. .. ..
## Variables not shown: ibb (int), hbp (int), sh (int), sf (int), gidp (int),
## cyear (dbl), career (dbl)
# Now what sort of model should we use?
bruth <- filter(b2, id == "ruthba01")
qplot(career, g, data = bruth, geom = "line")
qplot(career, g, data = b2, geom="boxplot", group = plyr::round_any(career, 0.05))
# Could we model that as two straight lines?
bruth$p <- (bruth$career - 0.5) * 100
mod <- lm(g ~ p + p:I(p > 0), data = bruth)
bruth$ghat <- predict(mod)
qplot(career, g, data = bruth, geom = "line") +
geom_line(aes(y = ghat), colour = "red")
# It doesn't look great, but it's a start
# Let's fit that model to every player
b2$p <- (b2$career - 0.5) * 100
models <- b2 %>% do(model = lm(g ~ p + p:I(p > 0), data = .))
# Or a bit more explicitly
models <- b2 %>% do(model = (function(df) lm(g ~ p + p:I(p > 0), data = df))(.))
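# A named helper avoids the anonymous-function dance (a sketch; fit_games
# is just an illustrative name):
fit_games <- function(df) lm(g ~ p + p:I(p > 0), data = df)
models <- b2 %>% do(model = fit_games(.))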
nrow(models)
## [1] 1228
onem <- models[[1, 'model']]
onem
##
## Call:
## lm(formula = g ~ p + p:I(p > 0), data = df)
##
## Coefficients:
## (Intercept) p p:I(p > 0)TRUE
## 162.600 0.413 -1.601
summary(onem)
##
## Call:
## lm(formula = g ~ p + p:I(p > 0), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.356 -2.935 0.478 4.339 23.478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 162.600 4.538 35.83 < 2e-16 ***
## p 0.413 0.167 2.47 0.023 *
## p:I(p > 0)TRUE -1.601 0.316 -5.07 5.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.9 on 20 degrees of freedom
## Multiple R-squared: 0.688, Adjusted R-squared: 0.656
## F-statistic: 22 on 2 and 20 DF, p-value: 8.84e-06
coefs <- as.list(coef(onem))
names(coefs) <- c("mid", "inc", "dec")
rsq <- summary(onem)$r.squared
get_coefs <- function(model) {
coefs <- as.list(coef(model))
names(coefs) <- c("mid", "inc", "dec")
data.frame(coefs, rsq = summary(model)$r.squared)
}
coefs <- models %>% do(get_coefs(.$model))
qplot(rsq, data = coefs, geom = "histogram", binwidth = 0.02)
qplot(mid, data = coefs, geom = "histogram", binwidth = 5)
qplot(inc, data = coefs, geom = "histogram", binwidth = 0.2)
qplot(dec, data = coefs, geom = "histogram", binwidth = 0.2)
qplot(rsq, inc, data=coefs)
qplot(dec, inc, data=coefs, colour=rsq)
library(ggplot2)
library(plyr)
options(na.action = "na.exclude")
# Helper functions ----------------------------------------------------------
deseas <- function(var, month) {
resid(lm(var ~ factor(month))) + mean(var, na.rm = TRUE)
}
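# A tiny illustration of deseas() on made-up data with a strong two-"month"
# cycle: the cycle is removed but the overall mean is kept.
# deseas(c(10, 20, 11, 21, 12, 22), rep(1:2, 3)) returns 15 15 16 16 17 17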
# Explore multiple cities ----------------------------------------------------
tx <- read.csv("../data/tx-house-sales.csv")
# We know from our exploration of Houston data that many of the series
# have strong seasonal components. It's a good idea to check that's true for
# all cities. We'll start with sales.
qplot(date, sales, data = tx, geom = "line", group = city)
# Hmmmm. Problem! There's a large variation in the number of sales between
# cities. The seasonal pattern does look pretty constant though.
# First possible solution, just remove the seasonal effect as we did for a
# single city, but applied to multiple cities (using model as a tool)
tx <- ddply(tx, "city", transform,
sales_ds = deseas(sales, month))
qplot(date, sales_ds, data = tx, geom = "line", group = city)
# It works, but we don't gain much insight into what's going on.
# Let's fit the models, and actually look at them this time
models <- dlply(tx, "city", function(df)
lm(sales ~ factor(month), data = df))
models[[1]]
##
## Call:
## lm(formula = sales ~ factor(month), data = df)
##
## Coefficients:
## (Intercept) factor(month)2 factor(month)3 factor(month)4
## 91.8 27.0 53.9 57.6
## factor(month)5 factor(month)6 factor(month)7 factor(month)8
## 74.4 84.8 80.2 84.1
## factor(month)9 factor(month)10 factor(month)11 factor(month)12
## 45.3 42.4 27.5 29.4
coef(models[[1]])
## (Intercept) factor(month)2 factor(month)3 factor(month)4
## 91.80 27.00 53.90 57.60
## factor(month)5 factor(month)6 factor(month)7 factor(month)8
## 74.42 84.76 80.20 84.09
## factor(month)9 factor(month)10 factor(month)11 factor(month)12
## 45.31 42.42 27.53 29.42
# To extract the coefficients, we want to go from a list to data frame
# Notice how plyr remembers the city names that we originally broke the
# models up by.
(ldply(models, coef))[1:5, 1:3]
## city (Intercept) factor(month)2
## 1 Abilene 91.8 27.0
## 2 Amarillo 161.9 46.8
## 3 Arlington 302.2 62.8
## 4 Austin 1258.8 216.1
## 5 Bay Area 313.5 91.7
# Two problems with the model:
# * Coefficients aren't comparable, because of varying sizes
# Solution: log-transform to convert to ratios
# * Coefficients not in useful form for plotting
# Solution: create data frame ourselves
qplot(date, log10(sales), data = tx, geom = "line", group = city)
models2 <- dlply(tx, "city", function(df)
lm(log10(sales) ~ factor(month), data = df))
coef2 <- ldply(models2, function(mod) {
data.frame(
month = 1:12,
effect = c(0, coef(mod)[-1]),
intercept = coef(mod)[1])
})
# Pretty consistent pattern, although there are a few outliers
qplot(month, effect, data = coef2, group = city, geom = "line")
# More interpretable if we back-transform - can now interpret as ratios
qplot(month, 10 ^ effect, data = coef2, group = city, geom = "line")
# What are the outliers?
qplot(month, 10 ^ effect, data = coef2, geom = "line") + facet_wrap(~ city)
# They are small cities. Hmmmmm
# Have a look at the distributions
qplot(effect, data = coef2, binwidth = 0.05) + facet_wrap(~ month)
# Single model ----------------------------------------------------
mod <- lm(log10(sales) ~ city + factor(month), data = tx)
tx$sales2 <- 10 ^ resid(mod)
qplot(date, sales2, data = tx, geom = "line", group = city)
# Now we're starting to get somewhere! Can see general pattern, although
# there are a few outliers. Look at cities individually to identify:
last_plot() + facet_wrap(~ city)
# Some problem cities:
# * Bryan-College Station: has a different seasonal pattern (Texas A&M?)
# * Similarly with San Marcos (a lot of missing data)
# * Palestine: massive increase beginning 2007
# Can resolve seasonal problems by fitting a separate seasonal pattern to each
# city (Challenge: how is this different to the individual models we fit
# before?) But probably better to use a more sophisticated model (e.g. a
# mixed-effects model).
mod2 <- lm(log10(sales) ~ city:factor(month), data = tx)
tx$sales3 <- 10 ^ resid(mod2)
qplot(date, sales3, data = tx, geom = "line") + facet_wrap(~ city)
# Further exploration
qplot(date, sales2, data = tx, geom = "line", group = city, alpha = I(0.2))
last_plot() + geom_smooth(aes(group = 1))
# Could smooth individual cities - again just using model as a tool
library(mgcv)
smooth <- function(var, date) {
predict(gam(var ~ s(date)))
}
tx <- ddply(tx, "city", transform,
sales2_sm = smooth(sales2, date))
qplot(date, sales2_sm, data = tx, geom = "line", group = city)
# Another approach -----------------------------------------------------------
# Essence of most cities is seasonal term plus long term smooth trend. We
# could fit this model to each city, and then look for models which don't
# fit well.
library(splines)
models3 <- dlply(tx, "city", function(df) {
lm(log10(sales) ~ factor(month) + ns(date, 3), data = df)
})
# Extract rsquared from each model
rsq <- function(mod) c(rsq = summary(mod)$r.squared)
quality <- ldply(models3, rsq)
qplot(rsq, city, data = quality)
qplot(rsq, reorder(city, rsq), data = quality)
quality$poor <- quality$rsq < 0.7
tx2 <- merge(tx, quality, by = "city")
# The cities don't look particularly different
qplot(date, log10(sales), data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
# But we should probably look at the residuals & predictions
mfit <- ldply(models3, function(mod) {
data.frame(resid = resid(mod), pred = predict(mod))
})
tx2 <- cbind(tx2, mfit[, -1])
qplot(date, pred, data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
qplot(date, resid, data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
library(ggplot2)
library(dplyr)
options(na.action = "na.exclude")
# Helper functions ----------------------------------------------------------
deseas <- function(var, month) {
resid(lm(var ~ factor(month))) + mean(var, na.rm = TRUE)
}
# Explore multiple cities ----------------------------------------------------
tx <- read.csv("../data/tx-house-sales.csv")
# We know from our exploration of Houston data that many of the series
# have strong seasonal components. It's a good idea to check that's true for
# all cities. We'll start with sales.
qplot(date, sales, data = tx, geom = "line", group = city)
# Hmmmm. Problem! There's a large variation in the number of sales between
# cities. The seasonal pattern does look pretty constant though.
# First possible solution, just remove the seasonal effect as we did for a
# single city, but applied to multiple cities (using model as a tool)
tx_city <- tx %>% group_by(city)
tx <- tx_city %>% mutate(sales_ds = deseas(sales, month))
qplot(date, sales_ds, data = tx, geom = "line", group = city)
# It works, but we don't gain much insight into what's going on.
# Let's fit the models, and actually look at them this time
models <- tx %>% group_by(city) %>%
  do(model = lm(sales ~ factor(month), data = .))
models[[1, 'model']]
##
## Call:
## lm(formula = sales ~ factor(month), data = .)
##
## Coefficients:
## (Intercept) factor(month)2 factor(month)3 factor(month)4
## 91.8 27.0 53.9 57.6
## factor(month)5 factor(month)6 factor(month)7 factor(month)8
## 74.4 84.8 80.2 84.1
## factor(month)9 factor(month)10 factor(month)11 factor(month)12
## 45.3 42.4 27.5 29.4
coef(models[[1, 'model']])
## (Intercept) factor(month)2 factor(month)3 factor(month)4
## 91.80 27.00 53.90 57.60
## factor(month)5 factor(month)6 factor(month)7 factor(month)8
## 74.42 84.76 80.20 84.09
## factor(month)9 factor(month)10 factor(month)11 factor(month)12
## 45.31 42.42 27.53 29.42
# To extract the coefficients, we want to go from a list to a data frame.
# Notice how dplyr remembers the city names that we originally broke the
# models up by.
(models %>% group_by(city) %>% do(data.frame(t(coef(.[[1, 'model']])), check.names=FALSE)))[1:5, 1:3]
## Source: local data frame [5 x 3]
## Groups:
##
## city (Intercept) factor(month)2
## 1 Abilene 91.8 27.0
## 2 Amarillo 161.9 46.8
## 3 Arlington 302.2 62.8
## 4 Austin 1258.8 216.1
## 5 Bay Area 313.5 91.7
# Two problems with the model:
# * Coefficients aren't comparable, because of varying sizes
# Solution: log-transform to convert to ratios
# * Coefficients not in useful form for plotting
# Solution: create data frame ourselves
qplot(date, log10(sales), data = tx, geom = "line", group = city)
models2 <- tx_city %>% do(model=lm(log10(sales) ~ factor(month), data = .))
coef2 <- models2 %>% group_by(city) %>% do((function(row) {
  mod <- row[['model']][[1]]
data.frame(
month = 1:12,
effect = c(0, coef(mod)[-1]),
intercept = coef(mod)[1])
})(.))
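# The same result with a named helper is easier to read (a sketch;
# month_effects is just an illustrative name):
month_effects <- function(mod) {
  data.frame(month = 1:12,
             effect = c(0, coef(mod)[-1]),
             intercept = coef(mod)[1])
}
coef2 <- models2 %>% group_by(city) %>% do(month_effects(.$model[[1]]))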
# Pretty consistent pattern, although there are a few outliers
qplot(month, effect, data = coef2, group = city, geom = "line")
# More interpretable if we back-transform - can now interpret as ratios
qplot(month, 10 ^ effect, data = coef2, group = city, geom = "line")
# What are the outliers?
qplot(month, 10 ^ effect, data = coef2, geom = "line") + facet_wrap(~ city)
# They are small cities. Hmmmmm
# Have a look at the distributions
qplot(effect, data = coef2, binwidth = 0.05) + facet_wrap(~ month)
# Single model ----------------------------------------------------
mod <- lm(log10(sales) ~ city + factor(month), data = tx)
tx$sales2 <- 10 ^ resid(mod)
qplot(date, sales2, data = tx, geom = "line", group = city)
# Now we're starting to get somewhere! Can see general pattern, although
# there are a few outliers. Look at cities individually to identify:
last_plot() + facet_wrap(~ city)
# Some problem cities:
# * Bryan-College Station: has a different seasonal pattern (Texas A&M?)
# * Similarly with San Marcos (a lot of missing data)
# * Palestine: massive increase beginning 2007
# Can resolve seasonal problems by fitting a separate seasonal pattern to each
# city (Challenge: how is this different to the individual models we fit
# before?) But probably better to use a more sophisticated model (e.g. a
# mixed-effects model).
mod2 <- lm(log10(sales) ~ city:factor(month), data = tx)
tx$sales3 <- 10 ^ resid(mod2)
qplot(date, sales3, data = tx, geom = "line") + facet_wrap(~ city)
# Further exploration
qplot(date, sales2, data = tx, geom = "line", group = city, alpha = I(0.2))
last_plot() + geom_smooth(aes(group = 1))
# Could smooth individual cities - again just using model as a tool
library(mgcv)
smooth <- function(var, date) {
predict(gam(var ~ s(date)))
}
tx <- tx %>% group_by(city) %>% mutate(sales2_sm = as.vector(smooth(sales2, date)))
qplot(date, sales2_sm, data = tx, geom = "line", group = city)
# Another approach -----------------------------------------------------------
# Essence of most cities is seasonal term plus long term smooth trend. We
# could fit this model to each city, and then look for models which don't
# fit well.
library(splines)
models3 <- tx %>% group_by(city) %>% do(model=lm(log10(sales) ~ factor(month) + ns(date, 3), data = .))
# Extract rsquared from each model
rsq <- function(mod) c(rsq = summary(mod[[1]])$r.squared)
quality <- models3 %>% group_by(city) %>% summarise(rsq=rsq(model))
qplot(rsq, city, data = quality)
qplot(rsq, reorder(city, rsq), data = quality)
quality$poor <- quality$rsq < 0.7
tx2 <- inner_join(tx, quality, by = "city")
# The cities don't look particularly different
qplot(date, log10(sales), data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
# But we should probably look at the residuals & predictions
mfit <- models3 %>% do((function(mod) {
data.frame(resid = resid(mod$model), pred = predict(mod$model))
})(.))
tx2 <- cbind(tx2, mfit)
qplot(date, pred, data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
qplot(date, resid, data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
Author: Jim Hester | Created: 2014 May 18 09:33:36 AM | Last Modified: 2014 May 22 10:15:22 AM