In the world of data science in R, the battle between dplyr and datatable is real. Here I compare their performance against base r commands for some common tasks. Who will be the winner on speed and simplicity?

Make random datasets

set.seed(71)
size1 <- 4*10^6
size2 <- size1 * 0.1
df1 <- data.frame(id=paste0("SERVICE_", 1:size1), value=rnorm(size1), stringsAsFactors=FALSE)
df2 <- data.frame(id=paste0("SERVICE_", sample(1:size1, size2)), value=rnorm(size2), stringsAsFactors=FALSE)

dt1 <- data.table(df1)
dt2 <- data.table(df2)

# mtcars data
M <- data.table(mtcars)
m <- as_tibble(mtcars)

Group by sum

microbenchmark(
  times=100L,
  M[, sum(hp), by=cyl],
  mtcars %>% group_by(cyl) %>% summarise(sum(hp)),
  aggregate(x=mtcars$hp, by=list(cyl=mtcars$cyl), FUN=sum),
  aggregate(hp ~ cyl, data=mtcars, FUN=sum),
  tapply(X=mtcars$hp, INDEX=mtcars$cyl, FUN=sum),
  plyr::ddply(mtcars, 'cyl', plyr::summarise, sum(hp))
) 
## Unit: microseconds
##                                                              expr      min
##                                            M[, sum(hp), by = cyl]  984.308
##                   mtcars %>% group_by(cyl) %>% summarise(sum(hp)) 1395.253
##  aggregate(x = mtcars$hp, by = list(cyl = mtcars$cyl), FUN = sum)  953.034
##                     aggregate(hp ~ cyl, data = mtcars, FUN = sum) 1210.410
##              tapply(X = mtcars$hp, INDEX = mtcars$cyl, FUN = sum)  207.599
##              plyr::ddply(mtcars, "cyl", plyr::summarise, sum(hp)) 2580.687
##         lq      mean   median       uq       max neval
##  1451.6960 2065.5977 1655.470 1912.101 20003.058   100
##  1965.7315 2795.7730 2171.311 2528.985 17896.043   100
##  1141.5000 1342.8060 1252.845 1335.886  3748.162   100
##  1374.5885 1582.7563 1485.008 1619.644  3317.021   100
##   257.8225  297.2278  285.102  310.905  1260.213   100
##  2907.2670 3935.3080 3197.389 3669.062 30506.079   100

Conclusion:

  • tapply is wicked fast on this small dataset.
  • how does this scale as mtcars increase in size?

Only look at data.table, apply, and dplyr.

https://www.r-bloggers.com/using-apply-sapply-lapply-in-r/

Double group by, sum of two column

M[order(cyl, vs), lapply(.SD, sum, na.rm = T), by=.(cyl, vs), .SDcols=c('mpg', 'disp')]
##    cyl vs   mpg   disp
## 1:   4  0  26.0  120.3
## 2:   4  1 267.3 1036.2
## 3:   6  0  61.7  465.0
## 4:   6  1  76.5  818.2
## 5:   8  0 211.4 4943.4
M[order(cyl, vs), .(sum_mpg=sum(mpg), sum_disp=sum(disp)), by=.(cyl, vs)]
##    cyl vs sum_mpg sum_disp
## 1:   4  0    26.0    120.3
## 2:   4  1   267.3   1036.2
## 3:   6  0    61.7    465.0
## 4:   6  1    76.5    818.2
## 5:   8  0   211.4   4943.4
m %>% group_by(cyl, vs) %>% select(mpg, disp) %>% summarise_all(.funs=sum)
## Adding missing grouping variables: `cyl`, `vs`
## # A tibble: 5 x 4
## # Groups:   cyl [?]
##     cyl    vs   mpg  disp
##   <dbl> <dbl> <dbl> <dbl>
## 1     4     0  26    120.
## 2     4     1 267.  1036.
## 3     6     0  61.7  465 
## 4     6     1  76.5  818.
## 5     8     0 211.  4943.
m %>% group_by(cyl, vs) %>% summarise(sum(mpg), sum(disp))
## # A tibble: 5 x 4
## # Groups:   cyl [?]
##     cyl    vs `sum(mpg)` `sum(disp)`
##   <dbl> <dbl>      <dbl>       <dbl>
## 1     4     0       26          120.
## 2     4     1      267.        1036.
## 3     6     0       61.7        465 
## 4     6     1       76.5        818.
## 5     8     0      211.        4943.
aggregate(cbind(mpg, disp) ~ cyl + vs, FUN=sum, data = mtcars)
##   cyl vs   mpg   disp
## 1   4  0  26.0  120.3
## 2   6  0  61.7  465.0
## 3   8  0 211.4 4943.4
## 4   4  1 267.3 1036.2
## 5   6  1  76.5  818.2
with(mtcars, aggregate(list(sum_mpg=mpg, sum_disp=disp), by = list(cyl=cyl, vs=vs), FUN=sum))
##   cyl vs sum_mpg sum_disp
## 1   4  0    26.0    120.3
## 2   6  0    61.7    465.0
## 3   8  0   211.4   4943.4
## 4   4  1   267.3   1036.2
## 5   6  1    76.5    818.2
# two variables groups, tapply breaks down into three-by-two (not pretty)
with(mtcars, tapply(X=mpg, INDEX=list(cyl, vs), FUN=sum))
##       0     1
## 4  26.0 267.3
## 6  61.7  76.5
## 8 211.4    NA
with(mtcars, tapply(X=disp, INDEX=list(cyl, vs), FUN=sum))
##        0      1
## 4  120.3 1036.2
## 6  465.0  818.2
## 8 4943.4     NA
# You could combine them using lapply/sapply. 
with(mtcars, lapply(list(disp=disp, mpg=mpg), function(x) tapply(X=x, INDEX=list(cyl=cyl, vs=vs), FUN=sum)))
## $disp
##    vs
## cyl      0      1
##   4  120.3 1036.2
##   6  465.0  818.2
##   8 4943.4     NA
## 
## $mpg
##    vs
## cyl     0     1
##   4  26.0 267.3
##   6  61.7  76.5
##   8 211.4    NA
with(mtcars, sapply(list(disp=disp, mpg=mpg), function(x) tapply(X=x, INDEX=list(cyl=cyl, vs=vs), FUN=sum)))
##        disp   mpg
## [1,]  120.3  26.0
## [2,]  465.0  61.7
## [3,] 4943.4 211.4
## [4,] 1036.2 267.3
## [5,]  818.2  76.5
## [6,]     NA    NA
# Plyr
plyr::ddply(mtcars, plyr::.(cyl, vs), plyr::summarise, sum_mpg = sum(mpg), sum_disp=sum(disp))
##   cyl vs sum_mpg sum_disp
## 1   4  0    26.0    120.3
## 2   4  1   267.3   1036.2
## 3   6  0    61.7    465.0
## 4   6  1    76.5    818.2
## 5   8  0   211.4   4943.4
dt <- function() M[order(cyl, vs), .(sum_mpg=sum(mpg), sum_disp=sum(disp)), by=c('cyl', 'vs')]
dp <- function() m %>% group_by(cyl, vs) %>% summarise(sum(mpg), sum(disp))
ag <- function() aggregate(cbind(mpg, disp) ~ cyl + vs, FUN=sum, data = mtcars)
la <- function() with(mtcars, lapply(list(disp=disp, mpg=mpg), function(x) tapply(X=x, INDEX=list(cyl=cyl, vs=vs), FUN=sum)))
sa <- function() with(mtcars, sapply(list(disp=disp, mpg=mpg), function(x) tapply(X=x, INDEX=list(cyl=cyl, vs=vs), FUN=sum)))
pl <- function()  plyr::ddply(mtcars, plyr::.(cyl, vs), plyr::summarise, sum_mpg = sum(mpg), sum_disp=sum(disp))

# Compare speeds of top approaches
microbenchmark(
  times=100L,
  dt(),
  dp(),
  ag(),
  la(),
  sa(),
  pl()
)
## Unit: microseconds
##  expr      min        lq      mean   median       uq       max neval
##  dt() 1840.965 2365.6115 3625.6795 2955.488 3683.574 29689.690   100
##  dp() 2194.468 2361.0175 3018.3744 2573.593 3057.261 24208.547   100
##  ag() 1882.167 2076.0035 2538.4617 2264.648 2643.394 13054.272   100
##  la()  578.740  696.0965  843.8617  744.078  834.230  6411.912   100
##  sa()  603.055  720.3955  911.7546  778.282  873.011  6693.510   100
##  pl() 3527.191 3918.8410 4696.1208 4272.321 4886.168 24423.534   100

Conclusion:

  • Tapply breaks down when you start introducing additional groups. Not easy to read.
  • For tapply, it seems you ahve to do two statements, or you could use lapply (which returns a list), or you could use sapply and feed lapply.
  • sapply removes the labels, so you’d have to keep track of the indexing yourself.
  • plyr seems like Hadley’s early work of love (2011). But he re-worked the package into dplyr, which is now 100x easier to use.

Filter and summarize

Get the mean of one and sum of the other by group

M[cyl > 4, .(summ=sum(mpg), sumd=sum(disp)), by=.(cyl, vs)]
##    cyl vs  summ   sumd
## 1:   6  0  61.7  465.0
## 2:   6  1  76.5  818.2
## 3:   8  0 211.4 4943.4
mtcars %>% filter(cyl > 4) %>% group_by(cyl, vs) %>% summarise(sum(mpg), sum(disp))
## # A tibble: 3 x 4
## # Groups:   cyl [?]
##     cyl    vs `sum(mpg)` `sum(disp)`
##   <dbl> <dbl>      <dbl>       <dbl>
## 1     6     0       61.7        465 
## 2     6     1       76.5        818.
## 3     8     0      211.        4943.
aggregate(cbind(mpg, disp) ~ cyl + vs, FUN=sum, data = mtcars, subset=(cyl > 4))
##   cyl vs   mpg   disp
## 1   6  0  61.7  465.0
## 2   8  0 211.4 4943.4
## 3   6  1  76.5  818.2
with(mtcars[which(mtcars$cyl > 4),], lapply(list(mpg, disp), function(x) tapply(x, INDEX=list(cyl, vs), sum)))
## [[1]]
##       0    1
## 6  61.7 76.5
## 8 211.4   NA
## 
## [[2]]
##        0     1
## 6  465.0 818.2
## 8 4943.4    NA

Compare dplyr vs datatable to do strings of tasks:

  1. filter/subset
  2. mutate
  3. join (merge)
  4. group by
  5. summarize
  6. order
  7. reshape