Introduction

This is a simple demonstration of how to convert existing plyr code to use the dplyr package. This requires a dplyr version greater than 0.2, which implements the do() function which is used in place of dlply() and ldply() from the plyr package. For each example the plyr implementation is on the left, the dplyr implementation is on the right. Some care has been taken to make the outputs functionally equivalent.

Dplyr is still in the early stages, so I would expect some of the verboseness in using do() to decrease over time. In particular the pattern to convert a dlply call of an anonymous function is very verbose and non-obvious.

Ex. Plyr: dlply(data, 'group', function(x) something(x$value)) Dplyr: data %>% group_by(group) %>% do(res=(function(x) something(x$value) )(.))

Baby Names-Explore Example

Download Dataset
library(plyr)

bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
head(bnames)
##   year    name percent sex
## 1 1880    John 0.08154 boy
## 2 1880 William 0.08051 boy
## 3 1880   James 0.05006 boy
## 4 1880 Charles 0.04517 boy
## 5 1880  George 0.04329 boy
## 6 1880   Frank 0.02738 boy
# Whole dataset transformations ---------------------------------------------
letter <- function(x, n = 1) {
  if (n < 0) {
    nc <- nchar(x)
    n <- nc + n + 1
  }
  tolower(substr(x, n, n))
}
vowels <- function(x) {
  nchar(gsub("[^aeiou]", "", x))
}

bnames <- transform(bnames,
  first = letter(name, 1),
  last = letter(name, -1),
  length = nchar(name),
  vowels = vowels(name)
)

# Whole dataset summaries ----------------------------------------------------

summarise(bnames,
  max_perc = max(percent),
  min_perc = min(percent))
##   max_perc min_perc
## 1  0.08154  2.6e-05
# Group-wise transformations  ------------------------------------------------

# Want to calculate rank of each name in each year (per sex).  This is easy if
# we have a single sex for a single year:
one <- subset(bnames, sex == "boy" & year == 2008)
one$rank <- rank(-one$percent, ties.method = "first")
# or
one <- transform(one, rank = rank(-percent, ties.method = "first"))
head(one)
##        year      name  percent sex first last length vowels rank
## 128001 2008     Jacob 0.010355 boy     j    b      5      2    1
## 128002 2008   Michael 0.009437 boy     m    l      7      3    2
## 128003 2008     Ethan 0.009301 boy     e    n      5      1    3
## 128004 2008    Joshua 0.008799 boy     j    a      6      3    4
## 128005 2008    Daniel 0.008702 boy     d    l      6      3    5
## 128006 2008 Alexander 0.008566 boy     a    r      9      3    6
# Conceptually if we want to perform this same task for every sex in every 
# year, we need to split up the data, apply the transformation to every piece
# and then join the pieces back together

# This is what ddply does
bnames <- ddply(bnames, c("sex", "year"), transform, 
  rank = rank(-percent, ties.method = "first"))

# Group-wise summaries -------------------------------------------------------

# Group-wise summaries are much more interesting!

head(ddply(bnames, c("name"), summarise, tot = sum(percent)), n = 10)
##        name      tot
## 1     Aaden 0.000442
## 2   Aaliyah 0.019748
## 3     Aarav 0.000101
## 4     Aaron 0.293097
## 5        Ab 0.000218
## 6   Abagail 0.001326
## 7       Abb 0.000137
## 8     Abbey 0.007409
## 9     Abbie 0.022896
## 10 Abbigail 0.003392
head(ddply(bnames, c("length"), summarise, tot = sum(percent)), n = 10)
##    length     tot
## 1       2  0.2315
## 2       3  7.2744
## 3       4 36.8475
## 4       5 57.7588
## 5       6 60.3609
## 6       7 44.3370
## 7       8 14.8416
## 8       9  7.4245
## 9      10  0.6562
## 10     11  1.0414
head(ddply(bnames, c("year", "sex"), summarise, tot = sum(percent)), n = 10)
##    year  sex    tot
## 1  1880  boy 0.9307
## 2  1880 girl 0.9345
## 3  1881  boy 0.9304
## 4  1881 girl 0.9327
## 5  1882  boy 0.9275
## 6  1882 girl 0.9310
## 7  1883  boy 0.9288
## 8  1883 girl 0.9333
## 9  1884  boy 0.9273
## 10 1884 girl 0.9314
fl <- ddply(bnames, c("year", "sex", "first"), summarise, tot = sum(percent))
library(ggplot2)
qplot(year, tot, data = fl, geom = "line", colour = sex, facets = ~ first)
library(dplyr)

bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
head(bnames)
##   year    name percent sex
## 1 1880    John 0.08154 boy
## 2 1880 William 0.08051 boy
## 3 1880   James 0.05006 boy
## 4 1880 Charles 0.04517 boy
## 5 1880  George 0.04329 boy
## 6 1880   Frank 0.02738 boy
# Whole dataset transformations ---------------------------------------------
letter <- function(x, n = 1) {
  if (n < 0) {
    nc <- nchar(x)
    n <- nc + n + 1
  }
  tolower(substr(x, n, n))
}
vowels <- function(x) {
  nchar(gsub("[^aeiou]", "", x))
}

bnames <- mutate(bnames,
  first = letter(name, 1),
  last = letter(name, -1),
  length = nchar(name),
  vowels = vowels(name)
)

# Whole dataset summaries ----------------------------------------------------

summarise(bnames,
  max_perc = max(percent),
  min_perc = min(percent))
##   max_perc min_perc
## 1  0.08154  2.6e-05
# Group-wise transformations  ------------------------------------------------

# Want to calculate rank of each name in each year (per sex).  This is easy if
# we have a single sex for a single year:
one <- filter(bnames, sex == "boy", year == 2008)
one$rank <- rank(-one$percent, ties.method = "first")
# or
one <- mutate(one, rank = rank(-percent, ties.method = "first"))
head(one)
##   year      name  percent sex first last length vowels rank
## 1 2008     Jacob 0.010355 boy     j    b      5      2    1
## 2 2008   Michael 0.009437 boy     m    l      7      3    2
## 3 2008     Ethan 0.009301 boy     e    n      5      1    3
## 4 2008    Joshua 0.008799 boy     j    a      6      3    4
## 5 2008    Daniel 0.008702 boy     d    l      6      3    5
## 6 2008 Alexander 0.008566 boy     a    r      9      3    6
# Conceptually if we want to perform this same task for every sex in every 
# year, we need to split up the data, apply the transformation to every piece
# and then join the pieces back together

# This is what group_by and mutate do
bnames <- bnames %>% group_by(sex, year) %>% mutate(rank = rank(-percent, ties.method = "first"))

# Group-wise summaries -------------------------------------------------------

# Group-wise summaries are much more interesting!

bnames %>% group_by(name) %>% summarise(tot = sum(percent))
## Source: local data frame [6,782 x 2]
## 
##        name      tot
## 1     Aaden 0.000442
## 2   Aaliyah 0.019748
## 3     Aarav 0.000101
## 4     Aaron 0.293097
## 5        Ab 0.000218
## 6   Abagail 0.001326
## 7       Abb 0.000137
## 8     Abbey 0.007409
## 9     Abbie 0.022896
## 10 Abbigail 0.003392
## ..      ...      ...
bnames %>% group_by(length) %>% summarise(tot = sum(percent))
## Source: local data frame [10 x 2]
## 
##    length     tot
## 1       2  0.2315
## 2       3  7.2744
## 3       4 36.8475
## 4       5 57.7588
## 5       6 60.3609
## 6       7 44.3370
## 7       8 14.8416
## 8       9  7.4245
## 9      10  0.6562
## 10     11  1.0414
bnames %>% group_by(year, sex) %>% summarise(tot = sum(percent))
## Source: local data frame [258 x 3]
## Groups: year
## 
##    year  sex    tot
## 1  1880  boy 0.9307
## 2  1880 girl 0.9345
## 3  1881  boy 0.9304
## 4  1881 girl 0.9327
## 5  1882  boy 0.9275
## 6  1882 girl 0.9310
## 7  1883  boy 0.9288
## 8  1883 girl 0.9333
## 9  1884  boy 0.9273
## 10 1884 girl 0.9314
## ..  ...  ...    ...
fl <- bnames %>% group_by(year, sex, first) %>% summarise(tot = sum(percent))
library(ggplot2)
qplot(year, tot, data = fl, geom = "line", colour = sex, facets = ~ first)

Baby Names-Cluster Example

Download Dataset
library(plyr)
library(reshape)
library(ggplot2)

bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)

# Focus on the last 60 years
recent <- subset(bnames, year >= 1950)

# Still a lot of names, so pull out those names which are both moderately
# popular (> 1 / 1000) and in top 1000 for at least 30 years.  In a real
# analysis you'd probably want to analyse more data.
per_name <- ddply(recent, c("sex", "name"), summarise, 
  years = length(name), percent_avg = mean(percent))
long <- subset(per_name, years >= 30 & percent_avg > 0.001)
bnames_long <- merge(recent, long[c("sex", "name")], by = c("sex", "name"))

# To cluster, we need to reshape the data so that each year forms a column.
# We'll have to do this a few times, so we'll create a function to do this
# specifically for this data.  To learn more about how this works, see the 
# documentation for the reshape package.
widen <- function(variable) {
  as.matrix(cast(bnames_long, sex + name ~ year, fill = 0,
    value = variable))
}

long$cluster1 <- kmeans(widen("percent"), 20)$cluster
bnames_cl <- merge(bnames_long, long, by = c("sex", "name"))
ggplot(bnames_cl, aes(year, percent)) +
  geom_line(aes(group = interaction(sex, name))) +
  facet_wrap(~ cluster1)
#head(dlply(long, "cluster1", function(df) as.character(df$name)))

# Hmmmm.  Maybe be clustering too much based on absolute size, and not
# on relative shape.  Lets rescale percent to 0,1

library(mgcv)
smooth <- function(var, date) {
  predict(gam(var ~ s(date)))
}
scale01 <- function(x) (x - min(x)) / diff(range(x))

bnames_long <- ddply(bnames_long, c("sex", "name"), transform,
  percent_std = scale01(percent),
  percent_smo = scale01(smooth(percent, year))
)

long$cluster2 <- kmeans(widen("percent_std"), 20)$cluster
long$cluster3 <- kmeans(widen("percent_smo"), 20)$cluster

bnames_cl <- merge(bnames_long, long, by = c("sex", "name"))
qplot(year, percent_std, data = bnames_cl, group = interaction(name, sex), geom = "line") + 
  facet_wrap(~ cluster2)
qplot(year, percent_smo, data = bnames_cl, group = interaction(name, sex), geom = "line", colour = sex) + 
  facet_wrap(~ cluster2)
#dlply(long, "cluster2", function(df) as.character(df$name))

tab <- table(long[c("cluster2", "cluster3")])
library(e1071)
match <- matchClasses(tab)
## Cases in matched pairs: 67 %
print.table(tab[, match], zero.print = ".")
##         cluster3
## cluster2 18 15  9 19 13  8 18  6  1  2  1 12  5 10 19 10 16 17  7  9
##       1   5  .  .  .  .  .  5  .  .  .  .  .  .  .  .  .  .  .  .  .
##       2   .  9  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
##       3   .  .  9  .  .  .  .  1  .  .  .  .  .  .  .  .  8  1  .  9
##       4   .  .  . 11  .  .  .  .  .  .  .  .  .  . 11  .  .  .  .  .
##       5   .  2  .  . 16  .  .  .  .  .  .  .  .  .  .  .  5  3  .  .
##       6   .  .  .  .  . 27  .  .  .  .  .  .  .  .  .  .  .  .  .  .
##       7  22  .  .  .  .  2 22  .  .  .  .  .  .  .  .  .  .  .  .  .
##       8   .  .  .  .  .  .  . 16  .  4  .  .  .  .  .  .  .  .  .  .
##       9   .  .  .  .  .  8  .  . 11  . 11  .  1  .  .  .  .  .  .  .
##       10  .  .  .  .  .  .  .  .  . 18  .  .  .  .  .  .  .  .  3  .
##       11  .  .  .  .  .  .  .  .  8  .  8  .  .  .  .  .  .  .  .  .
##       12  .  .  .  3  .  .  .  .  .  .  . 11  .  .  3  .  .  .  .  .
##       13  .  .  .  .  .  .  .  .  .  .  .  .  9  .  .  .  .  .  .  .
##       14  .  .  .  .  .  .  .  .  2  .  2  .  7 11  . 11  .  .  .  .
##       15  .  .  . 11  .  .  .  .  .  .  .  .  .  6 11  6  .  .  .  .
##       16  .  .  .  1  .  .  .  .  .  .  .  .  . 15  1 15  .  .  .  .
##       17  .  1  .  .  .  .  .  .  .  .  .  7  .  .  .  . 11  .  .  .
##       18  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 13  .  .
##       19  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 30  .
##       20  .  .  5  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  5
library(dplyr)
library(reshape2)
library(ggplot2)

bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)

# Focus on the last 60 years
recent <- filter(bnames, year >= 1950)

# Still a lot of names, so pull out those names which are both moderately
# popular (> 1 / 1000) and in top 1000 for at least 30 years.  In a real
# analysis you'd probably want to analyse more data.

per_name <- recent %>% group_by(sex, name) %>% summarise(years = length(name), percent_avg = mean(percent))
long <- filter(per_name, years >= 30 & percent_avg > 0.001)
bnames_long <- inner_join(recent, long[c("sex", "name")], by = c("sex", "name"))

# To cluster, we need to reshape the data so that each year forms a column.
# We'll have to do this a few times, so we'll create a function to do this
# specifically for this data.  To learn more about how this works, see the 
# documentation for the reshape package.
widen <- function(variable) {
  select(dcast(bnames_long, sex + name ~ year, fill = 0,
    value.var = variable), -sex, -name)
}

long$cluster1 <- kmeans(widen('percent'), 20)$cluster

bnames_cl <- inner_join(bnames_long, long, by = c("sex", "name"))
ggplot(bnames_cl, aes(year, percent)) +
  geom_line(aes(group = interaction(sex, name))) +
  facet_wrap(~ cluster1)
#long %>% group_by(cluster1) %>% do(names=.$name)

# Hmmmm.  Maybe be clustering too much based on absolute size, and not
# on relative shape.  Lets rescale percent to 0,1

library(mgcv)
smooth <- function(var, date) {
  predict(gam(var ~ s(date)))
}
scale01 <- function(x) (x - min(x)) / diff(range(x))

bnames_long <- bnames_long %>% group_by(sex, name) %>%
  mutate(percent_std = scale01(percent),
         percent_smo = scale01(smooth(percent, year)))

long$cluster2 <- kmeans(widen("percent_std"), 20)$cluster
long$cluster3 <- kmeans(widen("percent_smo"), 20)$cluster

bnames_cl <- inner_join(bnames_long, long, by = c("sex", "name"))
qplot(year, percent_std, data = bnames_cl, group = interaction(name, sex), geom = "line") + 
  facet_wrap(~ cluster2)