Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@

6. By-reference sub-assignments of strings to factor columns now _actually_ match the levels in UTF-8 when required and now don't result in invalid factors being created, [#7648](https://github.com/Rdatatable/data.table/issues/7648), amending a previous incomplete fix to [#6886](https://github.com/Rdatatable/data.table/issues/6886) in v1.17.2. Thanks @BASS-JN for the report and @aitap for the fix.

7. Grouping operations whose `j` is a single-element constant `list()` expression, e.g. `DT[, list(1), by=g]`, are now optimized to avoid per-group allocation overhead, [#712](https://github.com/Rdatatable/data.table/issues/712). Thanks @macrakis for the report and @ben-schwen for the fix.

### Notes

1. {data.table} now depends on R 3.5.0 (2018).
Expand Down
16 changes: 16 additions & 0 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,16 @@ replace_dot_alias = function(e) {
list(jsub=jsub, jvnames=jvnames, funi=funi+1L)
}

# Unwrap a single-element list() call in j so that the bare element is used
# instead of a list() wrapper, avoiding per-group allocation overhead, #712
# e.g., list(1) -> 1
# Only applies when jsub is a call to list() with exactly one argument and that
# argument passes is_constantish() (defined elsewhere; exact acceptance rules not shown here)
# return the unwrapped argument, or NULL for no optimization possible
# NOTE(review): list(NULL) would also come back as NULL, i.e. indistinguishable from 'no optimization'
.optimize_constant_list = function(jsub) {
  unwrappable = jsub %iscall% "list" && length(jsub) == 2L && is_constantish(jsub[[2L]])
  if (unwrappable) jsub[[2L]] else NULL
}

# Optimize .SD subsetting patterns like .SD[1], head(.SD), first(.SD)
# return NULL for no optimization possible
.optimize_sd_subset = function(jsub, sdvars, SDenv, envir) {
Expand Down Expand Up @@ -505,6 +515,12 @@ replace_dot_alias = function(e) {
return(list(GForce=FALSE, jsub=jsub, jvnames=jvnames))
}

# Step 0: Unwrap constant list() to avoid per-group allocation, #712
# .optimize_constant_list() returns NULL when jsub cannot be unwrapped; jsub is then left untouched
if (!is.null(unwrapped_consts <- .optimize_constant_list(jsub))) {
  # deparse() returns one string per output line, so a long expression yields several strings;
  # presumably the sprintf-style formatting behind catf would then emit the message once per string.
  # Cap both the original and the unwrapped expression to a single line so one message is printed.
  if (verbose) catf("Optimized j from '%s' to bare constant '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L), deparse(unwrapped_consts, width.cutoff=200L, nlines=1L))
  jsub = unwrapped_consts
}

# Step 1: Apply lapply(.SD) optimization
lapply_result = .optimize_lapply(jsub, jvnames, sdvars, SDenv, verbose, envir)
jsub = lapply_result$jsub
Expand Down
9 changes: 8 additions & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -15433,7 +15433,7 @@ test(2069.25, forder(data.table(a=3:1), ord), 3:1)
test(2069.26, data.table(c='1')[ , expression(1), by=c], error="j evaluates to type 'expression'")
test(2069.27, data.table(c='1', d=2)[ , d := .(NULL), by=c], error='RHS of := is NULL during grouped assignment')
test(2069.28, data.table(c='1', d=2)[ , c(a='b'), by=c, verbose=TRUE], output='j appears to be a named vector')
test(2069.29, data.table(c = '1', d = 2)[ , .(a = c(nm='b')), by = c, verbose = TRUE], output = 'Column 1 of j is a named vector')
test(2069.29, data.table(c = '1', d = 2)[ , .(a = c(nm1='b'), b = c(nm2='b')), by = c, verbose = TRUE], output = 'Column 1 of j is a named vector') # two columns are used because a single-column list() is already unwrapped by the constant list() optimization
DT <- data.table(a = rep(1:3, each = 4), b = LETTERS[1:4], z = 0:3 + (4:1)*1i)
test(2069.30, DT[, .SD[3,], by=b], DT[9:12, .(b, a, z)])
DT = data.table(x=1:4,y=1:2,lgl=TRUE,key=c('x', 'y'))
Expand Down Expand Up @@ -21559,3 +21559,10 @@ xenv_empty = new.env()
test(2366.5, tables(env=xenv_empty, depth=1L), invisible(data.table(NULL)))
test(2366.6, tables(env=xenv_empty), invisible(data.table(NULL)))
rm(xenv_empty)

# dt[, j=list(var), by] is slower than dt[, j=var, by], #712
# each test checks that the single-element list() form gives the same result as the bare form
# and that verbose output reports the rewrite of j
dt = data.table(x=rep(1:3, 2L), y=1L)
test(2367.1, dt[, .(1), by=x, verbose=TRUE], dt[, 1, by=x], output="Optimized j from.*to bare constant") # .() alias with a literal, grouped by a column
# keyed table joined to itself, grouped with by=.EACHI
dt = data.table(x=1:5, key="x")
test(2367.2, dt[dt, list(1), by=.EACHI, verbose=TRUE], dt[dt, 1, by=.EACHI], output="Optimized j from.*to bare constant") # literal inside list()
test(2367.3, dt[dt, list(x), by=.EACHI, verbose=TRUE], dt[dt, x, by=.EACHI], output="Optimized j from.*to bare constant") # bare column symbol inside list()
Loading