# Dplyr vs Datatable

In the world of data science in R, the battle between `dplyr`

and `datatable`

is real. Here I compare their performance against base r commands for some common tasks. Who will be the winner on speed and simplicity?

# Make random datasets

```
set.seed(71)
size1 <- 4*10^6
size2 <- size1 * 0.1
df1 <- data.frame(id=paste0("SERVICE_", 1:size1), value=rnorm(size1), stringsAsFactors=FALSE)
df2 <- data.frame(id=paste0("SERVICE_", sample(1:size1, size2)), value=rnorm(size2), stringsAsFactors=FALSE)
dt1 <- data.table(df1)
dt2 <- data.table(df2)
# mtcars data
M <- data.table(mtcars)
m <- as_tibble(mtcars)
```

Group by sum

```
microbenchmark(
times=100L,
M[, sum(hp), by=cyl],
mtcars %>% group_by(cyl) %>% summarise(sum(hp)),
aggregate(x=mtcars$hp, by=list(cyl=mtcars$cyl), FUN=sum),
aggregate(hp ~ cyl, data=mtcars, FUN=sum),
tapply(X=mtcars$hp, INDEX=mtcars$cyl, FUN=sum),
plyr::ddply(mtcars, 'cyl', plyr::summarise, sum(hp))
)
```

```
## Unit: microseconds
## expr min
## M[, sum(hp), by = cyl] 984.308
## mtcars %>% group_by(cyl) %>% summarise(sum(hp)) 1395.253
## aggregate(x = mtcars$hp, by = list(cyl = mtcars$cyl), FUN = sum) 953.034
## aggregate(hp ~ cyl, data = mtcars, FUN = sum) 1210.410
## tapply(X = mtcars$hp, INDEX = mtcars$cyl, FUN = sum) 207.599
## plyr::ddply(mtcars, "cyl", plyr::summarise, sum(hp)) 2580.687
## lq mean median uq max neval
## 1451.6960 2065.5977 1655.470 1912.101 20003.058 100
## 1965.7315 2795.7730 2171.311 2528.985 17896.043 100
## 1141.5000 1342.8060 1252.845 1335.886 3748.162 100
## 1374.5885 1582.7563 1485.008 1619.644 3317.021 100
## 257.8225 297.2278 285.102 310.905 1260.213 100
## 2907.2670 3935.3080 3197.389 3669.062 30506.079 100
```

Conclusion:

- tapply is wicked fast on this small dataset.
- how does this scale as mtcars increase in size?

# Only look at data.table, apply, and dplyr.

https://www.r-bloggers.com/using-apply-sapply-lapply-in-r/

Double group by, sum of two column

`M[order(cyl, vs), lapply(.SD, sum, na.rm = T), by=.(cyl, vs), .SDcols=c('mpg', 'disp')]`

```
## cyl vs mpg disp
## 1: 4 0 26.0 120.3
## 2: 4 1 267.3 1036.2
## 3: 6 0 61.7 465.0
## 4: 6 1 76.5 818.2
## 5: 8 0 211.4 4943.4
```

`M[order(cyl, vs), .(sum_mpg=sum(mpg), sum_disp=sum(disp)), by=.(cyl, vs)]`

```
## cyl vs sum_mpg sum_disp
## 1: 4 0 26.0 120.3
## 2: 4 1 267.3 1036.2
## 3: 6 0 61.7 465.0
## 4: 6 1 76.5 818.2
## 5: 8 0 211.4 4943.4
```

`m %>% group_by(cyl, vs) %>% select(mpg, disp) %>% summarise_all(.funs=sum)`

`## Adding missing grouping variables: `cyl`, `vs``

```
## # A tibble: 5 x 4
## # Groups: cyl [?]
## cyl vs mpg disp
## <dbl> <dbl> <dbl> <dbl>
## 1 4 0 26 120.
## 2 4 1 267. 1036.
## 3 6 0 61.7 465
## 4 6 1 76.5 818.
## 5 8 0 211. 4943.
```

`m %>% group_by(cyl, vs) %>% summarise(sum(mpg), sum(disp))`

```
## # A tibble: 5 x 4
## # Groups: cyl [?]
## cyl vs `sum(mpg)` `sum(disp)`
## <dbl> <dbl> <dbl> <dbl>
## 1 4 0 26 120.
## 2 4 1 267. 1036.
## 3 6 0 61.7 465
## 4 6 1 76.5 818.
## 5 8 0 211. 4943.
```

`aggregate(cbind(mpg, disp) ~ cyl + vs, FUN=sum, data = mtcars)`

```
## cyl vs mpg disp
## 1 4 0 26.0 120.3
## 2 6 0 61.7 465.0
## 3 8 0 211.4 4943.4
## 4 4 1 267.3 1036.2
## 5 6 1 76.5 818.2
```

`with(mtcars, aggregate(list(sum_mpg=mpg, sum_disp=disp), by = list(cyl=cyl, vs=vs), FUN=sum))`

```
## cyl vs sum_mpg sum_disp
## 1 4 0 26.0 120.3
## 2 6 0 61.7 465.0
## 3 8 0 211.4 4943.4
## 4 4 1 267.3 1036.2
## 5 6 1 76.5 818.2
```

```
# two variables groups, tapply breaks down into three-by-two (not pretty)
with(mtcars, tapply(X=mpg, INDEX=list(cyl, vs), FUN=sum))
```

```
## 0 1
## 4 26.0 267.3
## 6 61.7 76.5
## 8 211.4 NA
```

`with(mtcars, tapply(X=disp, INDEX=list(cyl, vs), FUN=sum))`

```
## 0 1
## 4 120.3 1036.2
## 6 465.0 818.2
## 8 4943.4 NA
```

```
# You could combine them using lapply/sapply.
with(mtcars, lapply(list(disp=disp, mpg=mpg), function(x) tapply(X=x, INDEX=list(cyl=cyl, vs=vs), FUN=sum)))
```

```
## $disp
## vs
## cyl 0 1
## 4 120.3 1036.2
## 6 465.0 818.2
## 8 4943.4 NA
##
## $mpg
## vs
## cyl 0 1
## 4 26.0 267.3
## 6 61.7 76.5
## 8 211.4 NA
```

`with(mtcars, sapply(list(disp=disp, mpg=mpg), function(x) tapply(X=x, INDEX=list(cyl=cyl, vs=vs), FUN=sum)))`

```
## disp mpg
## [1,] 120.3 26.0
## [2,] 465.0 61.7
## [3,] 4943.4 211.4
## [4,] 1036.2 267.3
## [5,] 818.2 76.5
## [6,] NA NA
```

```
# Plyr
plyr::ddply(mtcars, plyr::.(cyl, vs), plyr::summarise, sum_mpg = sum(mpg), sum_disp=sum(disp))
```

```
## cyl vs sum_mpg sum_disp
## 1 4 0 26.0 120.3
## 2 4 1 267.3 1036.2
## 3 6 0 61.7 465.0
## 4 6 1 76.5 818.2
## 5 8 0 211.4 4943.4
```

```
dt <- function() M[order(cyl, vs), .(sum_mpg=sum(mpg), sum_disp=sum(disp)), by=c('cyl', 'vs')]
dp <- function() m %>% group_by(cyl, vs) %>% summarise(sum(mpg), sum(disp))
ag <- function() aggregate(cbind(mpg, disp) ~ cyl + vs, FUN=sum, data = mtcars)
la <- function() with(mtcars, lapply(list(disp=disp, mpg=mpg), function(x) tapply(X=x, INDEX=list(cyl=cyl, vs=vs), FUN=sum)))
sa <- function() with(mtcars, sapply(list(disp=disp, mpg=mpg), function(x) tapply(X=x, INDEX=list(cyl=cyl, vs=vs), FUN=sum)))
pl <- function() plyr::ddply(mtcars, plyr::.(cyl, vs), plyr::summarise, sum_mpg = sum(mpg), sum_disp=sum(disp))
# Compare speeds of top approaches
microbenchmark(
times=100L,
dt(),
dp(),
ag(),
la(),
sa(),
pl()
)
```

```
## Unit: microseconds
## expr min lq mean median uq max neval
## dt() 1840.965 2365.6115 3625.6795 2955.488 3683.574 29689.690 100
## dp() 2194.468 2361.0175 3018.3744 2573.593 3057.261 24208.547 100
## ag() 1882.167 2076.0035 2538.4617 2264.648 2643.394 13054.272 100
## la() 578.740 696.0965 843.8617 744.078 834.230 6411.912 100
## sa() 603.055 720.3955 911.7546 778.282 873.011 6693.510 100
## pl() 3527.191 3918.8410 4696.1208 4272.321 4886.168 24423.534 100
```

Conclusion:

- Tapply breaks down when you start introducing additional groups. Not easy to read.
- For tapply, it seems you ahve to do two statements, or you could use lapply (which returns a list), or you could use sapply and feed lapply.
- sapply removes the labels, so you’d have to keep track of the indexing yourself.
- plyr seems like Hadley’s early work of love (2011). But he re-worked the package into
`dplyr`

, which is now 100x easier to use.

# Filter and summarize

Get the mean of one and sum of the other by group

`M[cyl > 4, .(summ=sum(mpg), sumd=sum(disp)), by=.(cyl, vs)]`

```
## cyl vs summ sumd
## 1: 6 0 61.7 465.0
## 2: 6 1 76.5 818.2
## 3: 8 0 211.4 4943.4
```

`mtcars %>% filter(cyl > 4) %>% group_by(cyl, vs) %>% summarise(sum(mpg), sum(disp))`

```
## # A tibble: 3 x 4
## # Groups: cyl [?]
## cyl vs `sum(mpg)` `sum(disp)`
## <dbl> <dbl> <dbl> <dbl>
## 1 6 0 61.7 465
## 2 6 1 76.5 818.
## 3 8 0 211. 4943.
```

`aggregate(cbind(mpg, disp) ~ cyl + vs, FUN=sum, data = mtcars, subset=(cyl > 4))`

```
## cyl vs mpg disp
## 1 6 0 61.7 465.0
## 2 8 0 211.4 4943.4
## 3 6 1 76.5 818.2
```

`with(mtcars[which(mtcars$cyl > 4),], lapply(list(mpg, disp), function(x) tapply(x, INDEX=list(cyl, vs), sum)))`

```
## [[1]]
## 0 1
## 6 61.7 76.5
## 8 211.4 NA
##
## [[2]]
## 0 1
## 6 465.0 818.2
## 8 4943.4 NA
```

# Compare dplyr vs datatable to do strings of tasks:

- filter/subset
- mutate
- join (merge)
- group by
- summarize
- order
- reshape