library(data.table)
Create data table
# it is similar with data frame
set.seed(999)
dt = data.table(x = sample(c("a", "b", "c", "d"), 10, replace = TRUE), y = rnorm(10))
dt
x y
1: b -0.5660
2: c -1.8787
3: a -1.2668
4: d -0.9677
5: d -1.1210
6: a 1.3255
7: c 0.1340
8: a 0.9387
9: b 0.1725
10: c 0.9577
dt[2] # 2nd row
x y
1: c -1.879
dt[, y] # y column (as vector)
[1] -0.5660 -1.8787 -1.2668 -0.9677 -1.1210 1.3255 0.1340 0.9387
[9] 0.1725 0.9577
dt[, list(y)] # y column (as data.table)
y
1: -0.5660
2: -1.8787
3: -1.2668
4: -0.9677
5: -1.1210
6: 1.3255
7: 0.1340
8: 0.9387
9: 0.1725
10: 0.9577
# convert existing data.frame objects to data.table.
dt.cars = data.table(cars)
head(dt.cars)
speed dist
1: 4 2
2: 4 10
3: 7 4
4: 7 22
5: 8 16
6: 9 10
list out all data.tables in memory
tables() # The result of tables() is itself a data.table
NAME NROW MB COLS KEY
[1,] dt 10 1 x,y
[2,] dt.cars 50 1 speed,dist
Total: 2MB
Keys
A key consists of one or more columns of rownames, which may be integer, factor, character or some other class, not simply character. The rows are sorted by the key. A data.table can have at most one key, but duplicate key values are allowed.
# use data.frame syntax in a data.table
dt[2, ] # the second row of df
x y
1: c -1.879
dt[dt$x == "a", ] # all rows with first column is 'a'
x y
1: a -1.2668
2: a 1.3255
3: a 0.9387
# data.table unique key
setkey(dt, x) # set x column as key
dt["a", ] # all rows with first column is 'a', The comma is optional.
x y
1: a -1.2668
2: a 1.3255
3: a 0.9387
dt["a"]
x y
1: a -1.2668
2: a 1.3255
3: a 0.9387
# By default all the rows in the group are returned The mult argument allows
# only the first or last row of the group to be returned
dt["a", mult = "first"]
x y
1: a -1.267
dt["a", mult = "last"]
x y
1: a 0.9387
binary search (faster)
The vector scan is linear, but the binary search is O(log n).
df2 <- data.frame(x = sample(LETTERS, 1e+07, replace = T), y = sample(letters,
1e+07, replace = T), z = rnorm(1e+07))
system.time(ans1 <- df2[df2$x == "R" & df2$y == "h", ]) # 'vector scan'
user system elapsed
5.81 0.28 6.11
dt2 <- data.table(df2)
setkey(dt2, x, y)
system.time(ans2 <- dt2[J("R", "h")]) # binary search, faster
user system elapsed
0.02 0.00 0.01
identical(ans1$z, ans2$z)
[1] TRUE
Fast grouping
system.time(sum1 <- dt2[, sum(z), by = x])
user system elapsed
0.21 0.05 0.25
head(sum1)
x V1
1: A -211.43
2: B -60.01
3: C -723.30
4: D 392.90
5: E -1251.52
6: F 92.21
dt2[, sum(z), by = list(x == "A")] # by expression
x V1
1: TRUE -211.4
2: FALSE -6875.4
system.time(sum2 <- dt2[, sum(z), by = "x,y"])
user system elapsed
0.14 0.04 0.19
head(sum2)
x y V1
1: A a 147.617
2: A b 2.296
3: A c 203.767
4: A d 77.195
5: A e -16.550
6: A f 58.810
Fast time series join (or a rolling join)
set.seed(9999)
dt3 = data.table(x = sample(letters, 10, replace = T), y = sample(1:20, 10,
replace = T), z = sample(1:99, 10, replace = T))
setkey(dt3, x)
dt3["o"] # join to 1st column of key
x y z
1: o NA NA
dt3[J("o")] # same. J() stands for Join, an alias for list()
x y z
1: o NA NA
dt3[!"o"] # all rows other than 'o'
x y z
1: f 16 55
2: f 19 34
3: r 1 98
4: r 9 34
5: t 3 35
6: u 4 85
7: v 19 39
8: v 2 7
9: w 17 33
10: z 17 45
dt3[!2:4] # all rows other than 2:4
x y z
1: f 16 55
2: t 3 35
3: u 4 85
4: v 19 39
5: v 2 7
6: w 17 33
7: z 17 45
setkey(dt3, x, y)
dt3[J("o", 3)] # join to 2 columns
x y z
1: o 3 NA
dt3[J("o", 3:6)] # join 4 rows (1 missing)
x y z
1: o 3 NA
2: o 4 NA
3: o 5 NA
4: o 6 NA
dt3[!J("o", 3)] # multiple join
x y z
1: f 16 55
2: f 19 34
3: r 1 98
4: r 9 34
5: t 3 35
6: u 4 85
7: v 2 7
8: v 19 39
9: w 17 33
10: z 17 45
dt3[, sum(z), by = x][order(-V1)] # ordering results
x V1
1: r 132
2: f 89
3: u 85
4: v 46
5: z 45
6: t 35
7: w 33
Learn more
vignette("datatable-intro")
example(data.table)