This is a simple demonstration of how to convert existing plyr code to use the dplyr package. It requires dplyr version 0.2 or greater, which implements the do() function used in place of dlply() and ldply() from the plyr package. For each example the plyr implementation is shown first, followed by the dplyr implementation; some care has been taken to make the outputs functionally equivalent.
dplyr is still in its early stages, so I would expect some of the verbosity of using do() to decrease over time. In particular, the pattern for converting a dlply() call with an anonymous function is very verbose and non-obvious:
plyr:  dlply(data, 'group', function(x) something(x$value))
dplyr: data %>% group_by(group) %>% do(res = (function(x) something(x$value))(.))
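One way to reduce that boilerplate (a sketch, not an official idiom) is to name the helper first so that do() only sees a simple call; group, value, and something are the same placeholders as above:
f <- function(x) something(x$value)
data %>% group_by(group) %>% do(res = f(.))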
library(plyr)
bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
head(bnames)
## year name percent sex
## 1 1880 John 0.08154 boy
## 2 1880 William 0.08051 boy
## 3 1880 James 0.05006 boy
## 4 1880 Charles 0.04517 boy
## 5 1880 George 0.04329 boy
## 6 1880 Frank 0.02738 boy
# Whole dataset transformations ---------------------------------------------
letter <- function(x, n = 1) {
if (n < 0) {
nc <- nchar(x)
n <- nc + n + 1
}
tolower(substr(x, n, n))
}
vowels <- function(x) {
nchar(gsub("[^aeiou]", "", x))
}
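# Quick check of the helpers (an illustrative example):
letter("Hadley", 1)   # "h"
letter("Hadley", -1)  # "y"
vowels("Hadley")      # 2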
bnames <- transform(bnames,
first = letter(name, 1),
last = letter(name, -1),
length = nchar(name),
vowels = vowels(name)
)
# Whole dataset summaries ----------------------------------------------------
summarise(bnames,
max_perc = max(percent),
min_perc = min(percent))
## max_perc min_perc
## 1 0.08154 2.6e-05
# Group-wise transformations ------------------------------------------------
# Want to calculate rank of each name in each year (per sex). This is easy if
# we have a single sex for a single year:
one <- subset(bnames, sex == "boy" & year == 2008)
one$rank <- rank(-one$percent, ties.method = "first")
# or
one <- transform(one, rank = rank(-percent, ties.method = "first"))
head(one)
## year name percent sex first last length vowels rank
## 128001 2008 Jacob 0.010355 boy j b 5 2 1
## 128002 2008 Michael 0.009437 boy m l 7 3 2
## 128003 2008 Ethan 0.009301 boy e n 5 1 3
## 128004 2008 Joshua 0.008799 boy j a 6 3 4
## 128005 2008 Daniel 0.008702 boy d l 6 3 5
## 128006 2008 Alexander 0.008566 boy a r 9 3 6
# Conceptually, if we want to perform this same task for every sex in every
# year, we need to split up the data, apply the transformation to every piece,
# and then join the pieces back together.
# This is what ddply does:
bnames <- ddply(bnames, c("sex", "year"), transform,
rank = rank(-percent, ties.method = "first"))
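# Under the hood this is plain split-apply-combine; a sketch of the same
# steps in base R (bnames_manual is just an illustrative name):
pieces <- split(bnames, list(bnames$sex, bnames$year), drop = TRUE)
pieces <- lapply(pieces, function(df) {
  df$rank <- rank(-df$percent, ties.method = "first")
  df
})
bnames_manual <- do.call(rbind, pieces)  # same data, different row order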
# Group-wise summaries -------------------------------------------------------
# Group-wise summaries are much more interesting!
head(ddply(bnames, c("name"), summarise, tot = sum(percent)), n = 10)
## name tot
## 1 Aaden 0.000442
## 2 Aaliyah 0.019748
## 3 Aarav 0.000101
## 4 Aaron 0.293097
## 5 Ab 0.000218
## 6 Abagail 0.001326
## 7 Abb 0.000137
## 8 Abbey 0.007409
## 9 Abbie 0.022896
## 10 Abbigail 0.003392
head(ddply(bnames, c("length"), summarise, tot = sum(percent)), n = 10)
## length tot
## 1 2 0.2315
## 2 3 7.2744
## 3 4 36.8475
## 4 5 57.7588
## 5 6 60.3609
## 6 7 44.3370
## 7 8 14.8416
## 8 9 7.4245
## 9 10 0.6562
## 10 11 1.0414
head(ddply(bnames, c("year", "sex"), summarise, tot = sum(percent)), n = 10)
## year sex tot
## 1 1880 boy 0.9307
## 2 1880 girl 0.9345
## 3 1881 boy 0.9304
## 4 1881 girl 0.9327
## 5 1882 boy 0.9275
## 6 1882 girl 0.9310
## 7 1883 boy 0.9288
## 8 1883 girl 0.9333
## 9 1884 boy 0.9273
## 10 1884 girl 0.9314
fl <- ddply(bnames, c("year", "sex", "first"), summarise, tot = sum(percent))
library(ggplot2)
qplot(year, tot, data = fl, geom = "line", colour = sex, facets = ~ first)
library(dplyr)
bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
head(bnames)
## year name percent sex
## 1 1880 John 0.08154 boy
## 2 1880 William 0.08051 boy
## 3 1880 James 0.05006 boy
## 4 1880 Charles 0.04517 boy
## 5 1880 George 0.04329 boy
## 6 1880 Frank 0.02738 boy
# Whole dataset transformations ---------------------------------------------
letter <- function(x, n = 1) {
if (n < 0) {
nc <- nchar(x)
n <- nc + n + 1
}
tolower(substr(x, n, n))
}
vowels <- function(x) {
nchar(gsub("[^aeiou]", "", x))
}
bnames <- mutate(bnames,
first = letter(name, 1),
last = letter(name, -1),
length = nchar(name),
vowels = vowels(name)
)
# Whole dataset summaries ----------------------------------------------------
summarise(bnames,
max_perc = max(percent),
min_perc = min(percent))
## max_perc min_perc
## 1 0.08154 2.6e-05
# Group-wise transformations ------------------------------------------------
# Want to calculate rank of each name in each year (per sex). This is easy if
# we have a single sex for a single year:
one <- filter(bnames, sex == "boy", year == 2008)
one$rank <- rank(-one$percent, ties.method = "first")
# or
one <- mutate(one, rank = rank(-percent, ties.method = "first"))
head(one)
## year name percent sex first last length vowels rank
## 1 2008 Jacob 0.010355 boy j b 5 2 1
## 2 2008 Michael 0.009437 boy m l 7 3 2
## 3 2008 Ethan 0.009301 boy e n 5 1 3
## 4 2008 Joshua 0.008799 boy j a 6 3 4
## 5 2008 Daniel 0.008702 boy d l 6 3 5
## 6 2008 Alexander 0.008566 boy a r 9 3 6
# Conceptually, if we want to perform this same task for every sex in every
# year, we need to split up the data, apply the transformation to every piece,
# and then join the pieces back together.
# This is what group_by() and mutate() do:
bnames <- bnames %>% group_by(sex, year) %>%
  mutate(rank = rank(-percent, ties.method = "first"))
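# or, using dplyr's window-rank helpers; row_number() matches
# ties.method = "first", so this is equivalent:
# bnames %>% group_by(sex, year) %>% mutate(rank = row_number(desc(percent)))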
# Group-wise summaries -------------------------------------------------------
# Group-wise summaries are much more interesting!
bnames %>% group_by(name) %>% summarise(tot = sum(percent))
## Source: local data frame [6,782 x 2]
##
## name tot
## 1 Aaden 0.000442
## 2 Aaliyah 0.019748
## 3 Aarav 0.000101
## 4 Aaron 0.293097
## 5 Ab 0.000218
## 6 Abagail 0.001326
## 7 Abb 0.000137
## 8 Abbey 0.007409
## 9 Abbie 0.022896
## 10 Abbigail 0.003392
## .. ... ...
bnames %>% group_by(length) %>% summarise(tot = sum(percent))
## Source: local data frame [10 x 2]
##
## length tot
## 1 2 0.2315
## 2 3 7.2744
## 3 4 36.8475
## 4 5 57.7588
## 5 6 60.3609
## 6 7 44.3370
## 7 8 14.8416
## 8 9 7.4245
## 9 10 0.6562
## 10 11 1.0414
bnames %>% group_by(year, sex) %>% summarise(tot = sum(percent))
## Source: local data frame [258 x 3]
## Groups: year
##
## year sex tot
## 1 1880 boy 0.9307
## 2 1880 girl 0.9345
## 3 1881 boy 0.9304
## 4 1881 girl 0.9327
## 5 1882 boy 0.9275
## 6 1882 girl 0.9310
## 7 1883 boy 0.9288
## 8 1883 girl 0.9333
## 9 1884 boy 0.9273
## 10 1884 girl 0.9314
## .. ... ... ...
fl <- bnames %>% group_by(year, sex, first) %>% summarise(tot = sum(percent))
library(ggplot2)
qplot(year, tot, data = fl, geom = "line", colour = sex, facets = ~ first)
library(plyr)
library(reshape)
library(ggplot2)
bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
# Focus on the last 60 years
recent <- subset(bnames, year >= 1950)
# Still a lot of names, so pull out those names which are both moderately
# popular (> 1 / 1000) and in the top 1000 for at least 30 years. In a real
# analysis you'd probably want to analyse more data.
per_name <- ddply(recent, c("sex", "name"), summarise,
years = length(name), percent_avg = mean(percent))
long <- subset(per_name, years >= 30 & percent_avg > 0.001)
bnames_long <- merge(recent, long[c("sex", "name")], by = c("sex", "name"))
# To cluster, we need to reshape the data so that each year forms a column.
# We'll have to do this a few times, so we'll create a function to do this
# specifically for this data. To learn more about how this works, see the
# documentation for the reshape package.
widen <- function(variable) {
as.matrix(cast(bnames_long, sex + name ~ year, fill = 0,
value = variable))
}
long$cluster1 <- kmeans(widen("percent"), 20)$cluster
bnames_cl <- merge(bnames_long, long, by = c("sex", "name"))
ggplot(bnames_cl, aes(year, percent)) +
geom_line(aes(group = interaction(sex, name))) +
facet_wrap(~ cluster1)
#head(dlply(long, "cluster1", function(df) as.character(df$name)))
# Hmmmm. Maybe we're clustering too much based on absolute size, and not
# on relative shape. Let's rescale percent to [0, 1]
library(mgcv)
smooth <- function(var, date) {
predict(gam(var ~ s(date)))
}
scale01 <- function(x) (x - min(x)) / diff(range(x))
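# scale01 maps the minimum to 0 and the maximum to 1, e.g.
# scale01(c(2, 4, 6, 10)) returns 0.00 0.25 0.50 1.00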
bnames_long <- ddply(bnames_long, c("sex", "name"), transform,
percent_std = scale01(percent),
percent_smo = scale01(smooth(percent, year))
)
long$cluster2 <- kmeans(widen("percent_std"), 20)$cluster
long$cluster3 <- kmeans(widen("percent_smo"), 20)$cluster
bnames_cl <- merge(bnames_long, long, by = c("sex", "name"))
qplot(year, percent_std, data = bnames_cl, group = interaction(name, sex), geom = "line") +
facet_wrap(~ cluster2)
qplot(year, percent_smo, data = bnames_cl, group = interaction(name, sex), geom = "line", colour = sex) +
facet_wrap(~ cluster2)
#dlply(long, "cluster2", function(df) as.character(df$name))
tab <- table(long[c("cluster2", "cluster3")])
library(e1071)
match <- matchClasses(tab)
## Cases in matched pairs: 67 %
print.table(tab[, match], zero.print = ".")
## cluster3
## cluster2 18 15 9 19 13 8 18 6 1 2 1 12 5 10 19 10 16 17 7 9
## 1 5 . . . . . 5 . . . . . . . . . . . . .
## 2 . 9 . . . . . . . . . . . . . . . . . .
## 3 . . 9 . . . . 1 . . . . . . . . 8 1 . 9
## 4 . . . 11 . . . . . . . . . . 11 . . . . .
## 5 . 2 . . 16 . . . . . . . . . . . 5 3 . .
## 6 . . . . . 27 . . . . . . . . . . . . . .
## 7 22 . . . . 2 22 . . . . . . . . . . . . .
## 8 . . . . . . . 16 . 4 . . . . . . . . . .
## 9 . . . . . 8 . . 11 . 11 . 1 . . . . . . .
## 10 . . . . . . . . . 18 . . . . . . . . 3 .
## 11 . . . . . . . . 8 . 8 . . . . . . . . .
## 12 . . . 3 . . . . . . . 11 . . 3 . . . . .
## 13 . . . . . . . . . . . . 9 . . . . . . .
## 14 . . . . . . . . 2 . 2 . 7 11 . 11 . . . .
## 15 . . . 11 . . . . . . . . . 6 11 6 . . . .
## 16 . . . 1 . . . . . . . . . 15 1 15 . . . .
## 17 . 1 . . . . . . . . . 7 . . . . 11 . . .
## 18 . . . . . . . . . . . . . . . . . 13 . .
## 19 . . . . . . . . . . . . . . . . . . 30 .
## 20 . . 5 . . . . . . . . . . . . . . . . 5
library(dplyr)
library(reshape2)
library(ggplot2)
bnames <- read.csv("../data/bnames.csv", stringsAsFactors = FALSE)
# Focus on the last 60 years
recent <- filter(bnames, year >= 1950)
# Still a lot of names, so pull out those names which are both moderately
# popular (> 1 / 1000) and in the top 1000 for at least 30 years. In a real
# analysis you'd probably want to analyse more data.
per_name <- recent %>% group_by(sex, name) %>%
  summarise(years = n(), percent_avg = mean(percent))
long <- filter(per_name, years >= 30 & percent_avg > 0.001)
bnames_long <- inner_join(recent, long[c("sex", "name")], by = c("sex", "name"))
# To cluster, we need to reshape the data so that each year forms a column.
# We'll have to do this a few times, so we'll create a function to do this
# specifically for this data. To learn more about how this works, see the
# documentation for the reshape2 package.
widen <- function(variable) {
select(dcast(bnames_long, sex + name ~ year, fill = 0,
value.var = variable), -sex, -name)
}
long$cluster1 <- kmeans(widen('percent'), 20)$cluster
bnames_cl <- inner_join(bnames_long, long, by = c("sex", "name"))
ggplot(bnames_cl, aes(year, percent)) +
geom_line(aes(group = interaction(sex, name))) +
facet_wrap(~ cluster1)
#long %>% group_by(cluster1) %>% do(names=.$name)
# Hmmmm. Maybe we're clustering too much based on absolute size, and not
# on relative shape. Let's rescale percent to [0, 1]
library(mgcv)
smooth <- function(var, date) {
predict(gam(var ~ s(date)))
}
scale01 <- function(x) (x - min(x)) / diff(range(x))
bnames_long <- bnames_long %>% group_by(sex, name) %>%
mutate(percent_std = scale01(percent),
percent_smo = scale01(smooth(percent, year)))
long$cluster2 <- kmeans(widen("percent_std"), 20)$cluster
long$cluster3 <- kmeans(widen("percent_smo"), 20)$cluster
bnames_cl <- inner_join(bnames_long, long, by = c("sex", "name"))
qplot(year, percent_std, data = bnames_cl, group = interaction(name, sex), geom = "line") +
facet_wrap(~ cluster2)
qplot(year, percent_smo, data = bnames_cl, group = interaction(name, sex), geom = "line", colour = sex) +
facet_wrap(~ cluster2)
#long %>% group_by(cluster2) %>% do(names=.$name)
tab <- table(long[c("cluster2", "cluster3")])
library(e1071)
match <- matchClasses(tab)
## Cases in matched pairs: 66.75 %
print.table(tab[, match], zero.print = ".")
## cluster3
## cluster2 16 18 20 1 4 10 14 2 2 5 15 11 7 14 2 13 12 9 8 6
## 1 15 . . . . . 2 . . 1 . . . 2 . . . 2 . .
## 2 . 21 . . . . . . . . . . . . . . . . . .
## 3 . . 35 . . . . . . . . . . . . . . . . .
## 4 . . . 9 . . . . . . . . . . . . . . . .
## 5 . . . 2 7 . . . . . . . . . . . . . . 7
## 6 . . . . . 8 . . . . 3 . . . . . . . . .
## 7 . . . . . . 9 . . 8 . . . 9 . . . . . .
## 8 . 7 . . . 1 . 8 8 . . . . . 8 . . . . .
## 9 . . . . . . . 11 11 . . . . . 11 . . . . .
## 10 . . . . . . . . . 4 . . 1 . . . 2 . . .
## 11 . 1 . . . . . 4 4 . 8 . . . 4 . . . . .
## 12 . . . . 6 . . . . . . 20 . . . . . . . .
## 13 . . . . . . . . . . . . 24 . . . . . 14 .
## 14 . . . . . . 15 . . . . . . 15 . 6 1 . . .
## 15 . . . . . 1 . 9 9 . 2 . . . 9 . . . . .
## 16 . . . . . . . . . . . 2 . . . 14 . . . .
## 17 . . . . . . . . . . . 1 . . . . 13 . . .
## 18 . . . . . . . . . 6 . . 4 . . . . 26 . .
## 19 5 . . . . . . . . . . . . . . . . . 5 .
## 20 . . . . . . . . . . . . . . . . . . . 6
library(plyr)
library(ggplot2)
# First need to create new variables that capture the number of years
# a player has played, and for each year, how far through their career
# they are.
# This is straightforward using the techniques we just learned
b2 <- ddply(baseball, "id", transform,
cyear = year - min(year) + 1)
b2 <- ddply(b2, "id", transform,
career = (cyear - 1) / max(cyear))
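# (Two separate ddply calls are needed because transform() evaluates all of
# its arguments against the original data frame, so `career` could not refer
# to the new `cyear` column within a single call.)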
head(b2)
## id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so
## 1 aaronha01 1954 1 ML1 NL 122 468 58 131 27 6 13 69 2 2 28 39
## 2 aaronha01 1955 1 ML1 NL 153 602 105 189 37 9 27 106 3 1 49 61
## 3 aaronha01 1956 1 ML1 NL 153 609 106 200 34 14 26 92 2 4 37 54
## 4 aaronha01 1957 1 ML1 NL 151 615 118 198 27 6 44 132 1 1 57 58
## 5 aaronha01 1958 1 ML1 NL 153 601 109 196 34 4 30 95 4 1 59 49
## 6 aaronha01 1959 1 ML1 NL 154 629 116 223 46 7 39 123 8 0 51 54
## ibb hbp sh sf gidp cyear career
## 1 NA 3 6 4 13 1 0.00000
## 2 5 3 7 4 20 2 0.04348
## 3 6 2 5 7 21 3 0.08696
## 4 15 0 0 3 13 4 0.13043
## 5 16 1 0 3 21 5 0.17391
## 6 17 4 0 9 19 6 0.21739
# Now what sort of model should we use?
bruth <- subset(b2, id == "ruthba01")
qplot(career, g, data = bruth, geom = "line")
qplot(career, g, data = b2, geom="boxplot", group = round_any(career, 0.05))
# Could we model that as two straight lines?
bruth$p <- (bruth$career - 0.5) * 100
mod <- lm(g ~ p + p:I(p > 0), data = bruth)
bruth$ghat <- predict(mod)
qplot(career, g, data = bruth, geom = "line") +
geom_line(aes(y = ghat), colour = "red")
# It doesn't look great, but it's a start
# Let's fit that model to every player
b2$p <- (b2$career - 0.5) * 100
models <- dlply(b2, "id", lm, formula = g ~ p + p:I(p > 0))
# Or a bit more explicitly
models <- dlply(b2, "id", function(df) lm(g ~ p + p:I(p > 0), data = df))
length(models)
## [1] 1228
onem <- models[[1]]
onem
##
## Call:
## lm(formula = g ~ p + p:I(p > 0), data = df)
##
## Coefficients:
## (Intercept) p p:I(p > 0)TRUE
## 162.600 0.413 -1.601
summary(onem)
##
## Call:
## lm(formula = g ~ p + p:I(p > 0), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.356 -2.935 0.478 4.339 23.478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 162.600 4.538 35.83 < 2e-16 ***
## p 0.413 0.167 2.47 0.023 *
## p:I(p > 0)TRUE -1.601 0.316 -5.07 5.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.9 on 20 degrees of freedom
## Multiple R-squared: 0.688, Adjusted R-squared: 0.656
## F-statistic: 22 on 2 and 20 DF, p-value: 8.84e-06
coefs <- coef(onem)
names(coefs) <- c("mid", "inc", "dec")
rsq <- summary(onem)$r.squared
get_coefs <- function(model) {
coefs <- coef(model)
names(coefs) <- c("mid", "inc", "dec")
c(coefs, rsq = summary(model)$r.squared)
}
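# Quick check against the first model (values match the summary above):
get_coefs(onem)  # mid ~ 162.600, inc ~ 0.413, dec ~ -1.601, rsq ~ 0.688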
coefs <- ldply(models, get_coefs)
qplot(rsq, data = coefs, geom = "histogram", binwidth = 0.02)
qplot(mid, data = coefs, geom = "histogram", binwidth = 5)
qplot(inc, data = coefs, geom = "histogram", binwidth = 0.2)
qplot(dec, data = coefs, geom = "histogram", binwidth = 0.2)
qplot(rsq, inc, data=coefs)
qplot(dec, inc, data=coefs, colour=rsq)
library(dplyr)
library(ggplot2)
data("baseball", package = "plyr")
# First need to create new variables that capture the number of years
# a player has played, and for each year, how far through their career
# they are.
# This is straightforward using the techniques we just learned
b2 <- tbl_df(baseball) %>%
group_by(id) %>%
mutate(cyear = year - min(year) + 1,
career = (cyear - 1) / max(cyear))
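# (Unlike transform(), mutate() lets later arguments use columns created
# earlier in the same call, so cyear and career fit in one step.)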
b2
## Source: local data frame [21,699 x 24]
## Groups: id
##
## id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so
## 1 ansonca01 1871 1 RC1 25 120 29 39 11 3 0 16 6 2 2 1
## 2 forceda01 1871 1 WS3 32 162 45 45 9 4 0 29 8 0 4 0
## 3 mathebo01 1871 1 FW1 19 89 15 24 3 1 0 10 2 1 2 0
## 4 startjo01 1871 1 NY2 33 161 35 58 5 1 1 34 4 2 3 0
## 5 suttoez01 1871 1 CL1 29 128 35 45 3 7 3 23 3 1 1 0
## 6 whitede01 1871 1 CL1 29 146 40 47 6 5 1 21 2 2 4 1
## 7 yorkto01 1871 1 TRO 29 145 36 37 5 7 2 23 2 2 9 1
## 8 ansonca01 1872 1 PH1 46 217 60 90 10 7 0 50 6 6 16 3
## 9 burdoja01 1872 1 BR2 37 174 26 46 3 0 0 15 0 1 1 1
## 10 forceda01 1872 1 TRO 25 130 40 53 11 0 0 16 2 2 1 0
## .. ... ... ... ... .. .. ... .. .. ... ... .. ... .. .. .. ..
## Variables not shown: ibb (int), hbp (int), sh (int), sf (int), gidp (int),
## cyear (dbl), career (dbl)
# Now what sort of model should we use?
bruth <- filter(b2, id == "ruthba01")
qplot(career, g, data = bruth, geom = "line")
qplot(career, g, data = b2, geom="boxplot", group = plyr::round_any(career, 0.05))
# Could we model that as two straight lines?
bruth$p <- (bruth$career - 0.5) * 100
mod <- lm(g ~ p + p:I(p > 0), data = bruth)
bruth$ghat <- predict(mod)
qplot(career, g, data = bruth, geom = "line") +
geom_line(aes(y = ghat), colour = "red")
# It doesn't look great, but it's a start
# Let's fit that model to every player
b2$p <- (b2$career - 0.5) * 100
models <- b2 %>% do(model = lm(g ~ p + p:I(p > 0), data = .))
# Or a bit more explicitly
models <- b2 %>% do(model = (function(df) lm(g ~ p + p:I(p > 0), data = df))(.))
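# A named helper avoids the anonymous-function dance (a sketch; fit_games
# is just an illustrative name):
fit_games <- function(df) lm(g ~ p + p:I(p > 0), data = df)
models <- b2 %>% do(model = fit_games(.))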
nrow(models)
## [1] 1228
onem <- models[[1, 'model']]
onem
##
## Call:
## lm(formula = g ~ p + p:I(p > 0), data = df)
##
## Coefficients:
## (Intercept) p p:I(p > 0)TRUE
## 162.600 0.413 -1.601
summary(onem)
##
## Call:
## lm(formula = g ~ p + p:I(p > 0), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.356 -2.935 0.478 4.339 23.478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 162.600 4.538 35.83 < 2e-16 ***
## p 0.413 0.167 2.47 0.023 *
## p:I(p > 0)TRUE -1.601 0.316 -5.07 5.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.9 on 20 degrees of freedom
## Multiple R-squared: 0.688, Adjusted R-squared: 0.656
## F-statistic: 22 on 2 and 20 DF, p-value: 8.84e-06
coefs <- as.list(coef(onem))
names(coefs) <- c("mid", "inc", "dec")
rsq <- summary(onem)$r.squared
get_coefs <- function(model) {
coefs <- as.list(coef(model))
names(coefs) <- c("mid", "inc", "dec")
data.frame(coefs, rsq = summary(model)$r.squared)
}
coefs <- models %>% do(get_coefs(.$model))
qplot(rsq, data = coefs, geom = "histogram", binwidth = 0.02)
qplot(mid, data = coefs, geom = "histogram", binwidth = 5)
qplot(inc, data = coefs, geom = "histogram", binwidth = 0.2)
qplot(dec, data = coefs, geom = "histogram", binwidth = 0.2)
qplot(rsq, inc, data=coefs)
qplot(dec, inc, data=coefs, colour=rsq)
library(ggplot2)
library(plyr)
options(na.action = "na.exclude")
# Helper functions ----------------------------------------------------------
deseas <- function(var, month) {
resid(lm(var ~ factor(month))) + mean(var, na.rm = TRUE)
}
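# A tiny illustration of deseas() on made-up data with a strong two-"month"
# cycle: the cycle is removed but the overall mean is kept.
# deseas(c(10, 20, 11, 21, 12, 22), rep(1:2, 3)) returns 15 15 16 16 17 17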
# Explore multiple cities ----------------------------------------------------
tx <- read.csv("../data/tx-house-sales.csv")
# We know from our exploration of Houston data that many of the series
# have strong seasonal components. It's a good idea to check that's true for
# all cities. We'll start with sales.
qplot(date, sales, data = tx, geom = "line", group = city)
# Hmmmm. Problem! There's a large variation in the number of sales between
# cities. The seasonal pattern does look pretty constant though.
# First possible solution, just remove the seasonal effect as we did for a
# single city, but applied to multiple cities (using model as a tool)
tx <- ddply(tx, "city", transform,
sales_ds = deseas(sales, month))
qplot(date, sales_ds, data = tx, geom = "line", group = city)
# It works, but we don't gain much insight into what's going on.
# Let's fit the models, and actually look at them this time
models <- dlply(tx, "city", function(df)
lm(sales ~ factor(month), data = df))
models[[1]]
##
## Call:
## lm(formula = sales ~ factor(month), data = df)
##
## Coefficients:
## (Intercept) factor(month)2 factor(month)3 factor(month)4
## 91.8 27.0 53.9 57.6
## factor(month)5 factor(month)6 factor(month)7 factor(month)8
## 74.4 84.8 80.2 84.1
## factor(month)9 factor(month)10 factor(month)11 factor(month)12
## 45.3 42.4 27.5 29.4
coef(models[[1]])
## (Intercept) factor(month)2 factor(month)3 factor(month)4
## 91.80 27.00 53.90 57.60
## factor(month)5 factor(month)6 factor(month)7 factor(month)8
## 74.42 84.76 80.20 84.09
## factor(month)9 factor(month)10 factor(month)11 factor(month)12
## 45.31 42.42 27.53 29.42
# To extract the coefficients, we want to go from a list to data frame
# Notice how plyr remembers the city names that we originally broke the
# models up by.
(ldply(models, coef))[1:5, 1:3]
## city (Intercept) factor(month)2
## 1 Abilene 91.8 27.0
## 2 Amarillo 161.9 46.8
## 3 Arlington 302.2 62.8
## 4 Austin 1258.8 216.1
## 5 Bay Area 313.5 91.7
# Two problems with the model:
# * Coefficients aren't comparable, because of varying sizes
# Solution: log-transform to convert to ratios
# * Coefficients not in useful form for plotting
# Solution: create data frame ourselves
qplot(date, log10(sales), data = tx, geom = "line", group = city)
models2 <- dlply(tx, "city", function(df)
lm(log10(sales) ~ factor(month), data = df))
coef2 <- ldply(models2, function(mod) {
data.frame(
month = 1:12,
effect = c(0, coef(mod)[-1]),
intercept = coef(mod)[1])
})
# Pretty consistent pattern, although there are a few outliers
qplot(month, effect, data = coef2, group = city, geom = "line")
# More interpretable if we back-transform - can now interpret as ratios
qplot(month, 10 ^ effect, data = coef2, group = city, geom = "line")
# What are the outliers?
qplot(month, 10 ^ effect, data = coef2, geom = "line") + facet_wrap(~ city)
# They are small cities. Hmmmmm
# Have a look at the distributions
qplot(effect, data = coef2, binwidth = 0.05) + facet_wrap(~ month)
# Single model ----------------------------------------------------
mod <- lm(log10(sales) ~ city + factor(month), data = tx)
tx$sales2 <- 10 ^ resid(mod)
qplot(date, sales2, data = tx, geom = "line", group = city)
# Now we're starting to get somewhere! Can see general pattern, although
# there are a few outliers. Look at cities individually to identify:
last_plot() + facet_wrap(~ city)
# Some problem cities:
# * Bryan-College Station: has a different seasonal pattern (Texas A&M?)
# * Similarly with San Marcos (a lot of missing data)
# * Palestine: massive increase beginning 2007
# Can resolve seasonal problems by fitting a separate seasonal pattern to each
# city (Challenge: how is this different to the individual models we fit
# before?) But probably better to use a more sophisticated model (e.g. a
# mixed-effects model).
mod2 <- lm(log10(sales) ~ city:factor(month), data = tx)
tx$sales3 <- 10 ^ resid(mod2)
qplot(date, sales3, data = tx, geom = "line") + facet_wrap(~ city)
# Further exploration
qplot(date, sales2, data = tx, geom = "line", group = city, alpha = I(0.2))
last_plot() + geom_smooth(aes(group = 1))
# Could smooth individual cities - again just using model as a tool
library(mgcv)
smooth <- function(var, date) {
predict(gam(var ~ s(date)))
}
tx <- ddply(tx, "city", transform,
sales2_sm = smooth(sales2, date))
qplot(date, sales2_sm, data = tx, geom = "line", group = city)
# Another approach -----------------------------------------------------------
# Essence of most cities is seasonal term plus long term smooth trend. We
# could fit this model to each city, and then look for models which don't
# fit well.
library(splines)
models3 <- dlply(tx, "city", function(df) {
lm(log10(sales) ~ factor(month) + ns(date, 3), data = df)
})
# Extract rsquared from each model
rsq <- function(mod) c(rsq = summary(mod)$r.squared)
quality <- ldply(models3, rsq)
qplot(rsq, city, data = quality)
qplot(rsq, reorder(city, rsq), data = quality)
quality$poor <- quality$rsq < 0.7
tx2 <- merge(tx, quality, by = "city")
# The cities don't look particularly different
qplot(date, log10(sales), data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
# But we should probably look at the residuals & predictions
mfit <- ldply(models3, function(mod) {
data.frame(resid = resid(mod), pred = predict(mod))
})
tx2 <- cbind(tx2, mfit[, -1])
qplot(date, pred, data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
qplot(date, resid, data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
library(ggplot2)
library(dplyr)
options(na.action = "na.exclude")
# Helper functions ----------------------------------------------------------
deseas <- function(var, month) {
resid(lm(var ~ factor(month))) + mean(var, na.rm = TRUE)
}
# Explore multiple cities ----------------------------------------------------
tx <- read.csv("../data/tx-house-sales.csv")
# We know from our exploration of Houston data that many of the series
# have strong seasonal components. It's a good idea to check that's true for
# all cities. We'll start with sales.
qplot(date, sales, data = tx, geom = "line", group = city)
# Hmmmm. Problem! There's a large variation in the number of sales between
# cities. The seasonal pattern does look pretty constant though.
# First possible solution, just remove the seasonal effect as we did for a
# single city, but applied to multiple cities (using model as a tool)
tx_city <- tx %>% group_by(city)
tx <- tx_city %>% mutate(sales_ds = deseas(sales, month))
qplot(date, sales_ds, data = tx, geom = "line", group = city)
# It works, but we don't gain much insight into what's going on.
# Let's fit the models, and actually look at them this time
models <- tx %>% group_by(city) %>%
  do(model = lm(sales ~ factor(month), data = .))
models[[1, 'model']]
##
## Call:
## lm(formula = sales ~ factor(month), data = .)
##
## Coefficients:
## (Intercept) factor(month)2 factor(month)3 factor(month)4
## 91.8 27.0 53.9 57.6
## factor(month)5 factor(month)6 factor(month)7 factor(month)8
## 74.4 84.8 80.2 84.1
## factor(month)9 factor(month)10 factor(month)11 factor(month)12
## 45.3 42.4 27.5 29.4
coef(models[[1, 'model']])
## (Intercept) factor(month)2 factor(month)3 factor(month)4
## 91.80 27.00 53.90 57.60
## factor(month)5 factor(month)6 factor(month)7 factor(month)8
## 74.42 84.76 80.20 84.09
## factor(month)9 factor(month)10 factor(month)11 factor(month)12
## 45.31 42.42 27.53 29.42
# To extract the coefficients, we want to go from a list to a data frame.
# Notice how dplyr remembers the city names that we originally broke the
# models up by.
(models %>% group_by(city) %>% do(data.frame(t(coef(.[[1, 'model']])), check.names=FALSE)))[1:5, 1:3]
## Source: local data frame [5 x 3]
## Groups:
##
## city (Intercept) factor(month)2
## 1 Abilene 91.8 27.0
## 2 Amarillo 161.9 46.8
## 3 Arlington 302.2 62.8
## 4 Austin 1258.8 216.1
## 5 Bay Area 313.5 91.7
# Two problems with the model:
# * Coefficients aren't comparable, because of varying sizes
# Solution: log-transform to convert to ratios
# * Coefficients not in useful form for plotting
# Solution: create data frame ourselves
qplot(date, log10(sales), data = tx, geom = "line", group = city)
models2 <- tx_city %>% do(model=lm(log10(sales) ~ factor(month), data = .))
coef2 <- models2 %>% group_by(city) %>% do((function(row) {
  mod <- row[['model']][[1]]
data.frame(
month = 1:12,
effect = c(0, coef(mod)[-1]),
intercept = coef(mod)[1])
})(.))
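# The same result with a named helper is easier to read (a sketch;
# month_effects is just an illustrative name):
month_effects <- function(mod) {
  data.frame(month = 1:12,
             effect = c(0, coef(mod)[-1]),
             intercept = coef(mod)[1])
}
coef2 <- models2 %>% group_by(city) %>% do(month_effects(.$model[[1]]))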
# Pretty consistent pattern, although there are a few outliers
qplot(month, effect, data = coef2, group = city, geom = "line")
# More interpretable if we back-transform - can now interpret as ratios
qplot(month, 10 ^ effect, data = coef2, group = city, geom = "line")
# What are the outliers?
qplot(month, 10 ^ effect, data = coef2, geom = "line") + facet_wrap(~ city)
# They are small cities. Hmmmmm
# Have a look at the distributions
qplot(effect, data = coef2, binwidth = 0.05) + facet_wrap(~ month)
# Single model ----------------------------------------------------
mod <- lm(log10(sales) ~ city + factor(month), data = tx)
tx$sales2 <- 10 ^ resid(mod)
qplot(date, sales2, data = tx, geom = "line", group = city)
# Now we're starting to get somewhere! Can see general pattern, although
# there are a few outliers. Look at cities individually to identify:
last_plot() + facet_wrap(~ city)
# Some problem cities:
# * Bryan-College Station: has a different seasonal pattern (Texas A&M?)
# * Similarly with San Marcos (a lot of missing data)
# * Palestine: massive increase beginning 2007
# Can resolve seasonal problems by fitting a separate seasonal pattern to each
# city (Challenge: how is this different to the individual models we fit
# before?) But probably better to use a more sophisticated model (e.g. a
# mixed-effects model).
mod2 <- lm(log10(sales) ~ city:factor(month), data = tx)
tx$sales3 <- 10 ^ resid(mod2)
qplot(date, sales3, data = tx, geom = "line") + facet_wrap(~ city)
# Further exploration
qplot(date, sales2, data = tx, geom = "line", group = city, alpha = I(0.2))
last_plot() + geom_smooth(aes(group = 1))
# Could smooth individual cities - again just using model as a tool
library(mgcv)
smooth <- function(var, date) {
predict(gam(var ~ s(date)))
}
tx <- tx %>% group_by(city) %>% mutate(sales2_sm = as.vector(smooth(sales2, date)))
qplot(date, sales2_sm, data = tx, geom = "line", group = city)
# Another approach -----------------------------------------------------------
# Essence of most cities is seasonal term plus long term smooth trend. We
# could fit this model to each city, and then look for models which don't
# fit well.
library(splines)
models3 <- tx %>% group_by(city) %>% do(model=lm(log10(sales) ~ factor(month) + ns(date, 3), data = .))
# Extract rsquared from each model
rsq <- function(mod) c(rsq = summary(mod[[1]])$r.squared)
quality <- models3 %>% group_by(city) %>% summarise(rsq=rsq(model))
qplot(rsq, city, data = quality)
qplot(rsq, reorder(city, rsq), data = quality)
quality$poor <- quality$rsq < 0.7
tx2 <- inner_join(tx, quality, by = "city")
# The cities don't look particularly different
qplot(date, log10(sales), data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
# But we should probably look at the residuals & predictions
mfit <- models3 %>% do((function(mod) {
data.frame(resid = resid(mod$model), pred = predict(mod$model))
})(.))
tx2 <- cbind(tx2, mfit)
qplot(date, pred, data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
qplot(date, resid, data = tx2, geom = "line", colour = poor) +
facet_wrap(~ city) + theme(legend.position = "none")
Author: Jim Hester | Created: 2014 May 18 09:33:36 AM | Last Modified: 2014 May 22 10:15:22 AM