Sparse Matrix and Dummy Variables

Why sparse matrix?

  • XGBoost only works with matrices that contain all numeric variables; consequently, we need to one-hot encode our data. (UC Business Analytics R Programming Guide)
  • caret::preProcess can impute missing values with bagged regression trees (Yevhen Vasylenko), which requires every variable to be numeric.
  • There are different ways to do this in R.
library(tidyverse)
# Toy data set used throughout: three factors (a, b, d) and one integer
# column (c).
#
# `d` is built as an explicit factor with fixed levels. Since R 4.0.0,
# data.frame() no longer converts character vectors to factors, so a bare
# sample() of strings would be stored as `chr`. Fixing the levels also
# guarantees that the X/Y/Z dummy columns exist even when the random draw
# happens to miss one of the values.
#
# NOTE: `d` is drawn at random without a seed, so the values in the d
# columns of the printed outputs differ from run to run.
d_levels <- c("X", "Y", "Z")
dd <- data.frame(
  a = gl(3, 4),
  b = gl(4, 1, 12),
  c = 1:12,
  d = factor(sample(d_levels, 12, replace = TRUE), levels = d_levels)
)
str(dd)
## 'data.frame':    12 obs. of  4 variables:
##  $ a: Factor w/ 3 levels "1","2","3": 1 1 1 1 2 2 2 2 3 3 ...
##  $ b: Factor w/ 4 levels "1","2","3","4": 1 2 3 4 1 2 3 4 1 2 ...
##  $ c: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ d: Factor w/ 3 levels "X","Y","Z": 1 1 3 2 1 1 3 1 3 1 ...

The above data frame contains 4 columns (variables): three factors (a, b, d) and one integer column (c). Several R packages provide similar functions for generating sparse matrices or dummy variables.

base::model.matrix

# Expand every column of dd into a design matrix. With the default formula
# the "(Intercept)" column is kept and the first level of each factor is
# dropped.
mm0 <- model.matrix(~ ., data = dd)
mm0
##    (Intercept) a2 a3 b2 b3 b4  c dY dZ
## 1            1  0  0  0  0  0  1  0  0
## 2            1  0  0  1  0  0  2  0  0
## 3            1  0  0  0  1  0  3  0  1
## 4            1  0  0  0  0  1  4  1  0
## 5            1  1  0  0  0  0  5  0  0
## 6            1  1  0  1  0  0  6  0  0
## 7            1  1  0  0  1  0  7  0  1
## 8            1  1  0  0  0  1  8  0  0
## 9            1  0  1  0  0  0  9  0  1
## 10           1  0  1  1  0  0 10  0  0
## 11           1  0  1  0  1  0 11  0  0
## 12           1  0  1  0  0  1 12  0  1
## attr(,"assign")
## [1] 0 1 1 2 2 2 3 4 4
## attr(,"contrasts")
## attr(,"contrasts")$a
## [1] "contr.treatment"
## 
## attr(,"contrasts")$b
## [1] "contr.treatment"
## 
## attr(,"contrasts")$d
## [1] "contr.treatment"
# Same expansion without the intercept: the first factor (a) now keeps all
# of its levels, while b and d still drop their first level.
mm1 <- model.matrix(~ . - 1, data = dd)
mm1
##    a1 a2 a3 b2 b3 b4  c dY dZ
## 1   1  0  0  0  0  0  1  0  0
## 2   1  0  0  1  0  0  2  0  0
## 3   1  0  0  0  1  0  3  0  1
## 4   1  0  0  0  0  1  4  1  0
## 5   0  1  0  0  0  0  5  0  0
## 6   0  1  0  1  0  0  6  0  0
## 7   0  1  0  0  1  0  7  0  1
## 8   0  1  0  0  0  1  8  0  0
## 9   0  0  1  0  0  0  9  0  1
## 10  0  0  1  1  0  0 10  0  0
## 11  0  0  1  0  1  0 11  0  0
## 12  0  0  1  0  0  1 12  0  1
## attr(,"assign")
## [1] 1 1 1 2 2 2 3 4 4
## attr(,"contrasts")
## attr(,"contrasts")$a
## [1] "contr.treatment"
## 
## attr(,"contrasts")$b
## [1] "contr.treatment"
## 
## attr(,"contrasts")$d
## [1] "contr.treatment"
# Restrict the formula to a single column (intercept kept).
mm2 <- model.matrix(~ a, data = dd)
head(mm2)
##   (Intercept) a2 a3
## 1           1  0  0
## 2           1  0  0
## 3           1  0  0
## 4           1  0  0
## 5           1  1  0
## 6           1  1  0
# Single column without the intercept: one indicator column per level of a.
mm3 <- model.matrix(~ a - 1, data = dd)
head(mm3)
##   a1 a2 a3
## 1  1  0  0
## 2  1  0  0
## 3  1  0  0
## 4  1  0  0
## 5  0  1  0
## 6  0  1  0

Matrix::sparse.model.matrix

# No-intercept expansion of all columns, stored as a sparse matrix.
sm1 <- Matrix::sparse.model.matrix(~ . - 1, data = dd)
str(sm1) # class 'dgCMatrix'
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:38] 0 1 2 3 4 5 6 7 8 9 ...
##   ..@ p       : int [1:10] 0 4 8 12 15 18 21 33 34 38
##   ..@ Dim     : int [1:2] 12 9
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:12] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:9] "a1" "a2" "a3" "b2" ...
##   ..@ x       : num [1:38] 1 1 1 1 1 1 1 1 1 1 ...
##   ..@ factors : list()
# Sparse indicator matrix for factor a only; zeros are printed as ".".
sm2 <- Matrix::sparse.model.matrix(~ a - 1, data = dd)
sm2
## 12 x 3 sparse Matrix of class "dgCMatrix"
##    a1 a2 a3
## 1   1  .  .
## 2   1  .  .
## 3   1  .  .
## 4   1  .  .
## 5   .  1  .
## 6   .  1  .
## 7   .  1  .
## 8   .  1  .
## 9   .  .  1
## 10  .  .  1
## 11  .  .  1
## 12  .  .  1

MatrixModels::model.Matrix

model.Matrix is a simple wrapper around the traditional model.matrix; by default (sparse = FALSE) it returns a ddenseModelMatrix object, and with sparse = TRUE it returns a sparse dsparseModelMatrix instead.

# No-intercept expansion of all columns via MatrixModels (dense result).
rm1 <- MatrixModels::model.Matrix(~ . - 1, data = dd)
str(rm1) # class 'ddenseModelMatrix'
## Formal class 'ddenseModelMatrix' [package "MatrixModels"] with 6 slots
##   ..@ x        : num [1:108] 1 1 1 1 0 0 0 0 0 0 ...
##   ..@ Dim      : int [1:2] 12 9
##   ..@ Dimnames :List of 2
##   .. ..$ : chr [1:12] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:9] "a1" "a2" "a3" "b2" ...
##   ..@ factors  : list()
##   ..@ assign   : int [1:9] 1 1 1 2 2 2 3 4 4
##   ..@ contrasts:List of 3
##   .. ..$ a: chr "contr.treatment"
##   .. ..$ b: chr "contr.treatment"
##   .. ..$ d: chr "contr.treatment"
# Dense model matrix holding only the indicator columns of factor a.
rm2 <- MatrixModels::model.Matrix(~ a - 1, data = dd)
rm2
## 12 x 3 Matrix of class "ddenseModelMatrix"
##    a1 a2 a3
## 1   1  0  0
## 2   1  0  0
## 3   1  0  0
## 4   1  0  0
## 5   0  1  0
## 6   0  1  0
## 7   0  1  0
## 8   0  1  0
## 9   0  0  1
## 10  0  0  1
## 11  0  0  1
## 12  0  0  1

caret::dummyVars

library(caret)
# Build the encoder object from the formula; the encoding itself is applied
# to data later through predict().
dummy.vars <- dummyVars(~ ., data = dd)
str(dummy.vars) # a list of class "dummyVars"
## List of 9
##  $ call      : language dummyVars.default(formula = ~., data = dd)
##  $ form      :Class 'formula'  language ~.
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  $ vars      : chr [1:4] "a" "b" "c" "d"
##  $ facVars   : chr [1:3] "a" "b" "d"
##  $ lvls      :List of 3
##   ..$ a: chr [1:3] "1" "2" "3"
##   ..$ b: chr [1:4] "1" "2" "3" "4"
##   ..$ d: chr [1:3] "X" "Y" "Z"
##  $ sep       : chr "."
##  $ terms     :Classes 'terms', 'formula'  language ~a + b + c + d
##   .. ..- attr(*, "variables")= language list(a, b, c, d)
##   .. ..- attr(*, "factors")= int [1:4, 1:4] 1 0 0 0 0 1 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:4] "a" "b" "c" "d"
##   .. .. .. ..$ : chr [1:4] "a" "b" "c" "d"
##   .. ..- attr(*, "term.labels")= chr [1:4] "a" "b" "c" "d"
##   .. ..- attr(*, "order")= int [1:4] 1 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 0
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(a, b, c, d)
##   .. ..- attr(*, "dataClasses")= Named chr [1:4] "factor" "factor" "numeric" "factor"
##   .. .. ..- attr(*, "names")= chr [1:4] "a" "b" "c" "d"
##  $ levelsOnly: logi FALSE
##  $ fullRank  : logi FALSE
##  - attr(*, "class")= chr "dummyVars"
# Apply the encoder to dd. The result is a numeric matrix with one column
# per factor level (no level dropped) plus the untouched column c.
dd.dummy <- predict(dummy.vars, newdata = dd)
dd.dummy
##    a.1 a.2 a.3 b.1 b.2 b.3 b.4  c d.X d.Y d.Z
## 1    1   0   0   1   0   0   0  1   1   0   0
## 2    1   0   0   0   1   0   0  2   1   0   0
## 3    1   0   0   0   0   1   0  3   0   0   1
## 4    1   0   0   0   0   0   1  4   0   1   0
## 5    0   1   0   1   0   0   0  5   1   0   0
## 6    0   1   0   0   1   0   0  6   1   0   0
## 7    0   1   0   0   0   1   0  7   0   0   1
## 8    0   1   0   0   0   0   1  8   1   0   0
## 9    0   0   1   1   0   0   0  9   0   0   1
## 10   0   0   1   0   1   0   0 10   1   0   0
## 11   0   0   1   0   0   1   0 11   1   0   0
## 12   0   0   1   0   0   0   1 12   0   0   1

base::model.matrix (full dummy encoding, matching the caret::dummyVars result)

# Encode each factor on its own without an intercept so that no level is
# dropped, then bind the pieces column-wise in the original column order
# with the integer column c kept as-is.
mm.dummy <- do.call(
  cbind,
  list(
    model.matrix(~ a - 1, data = dd),
    model.matrix(~ b - 1, data = dd),
    c = dd[["c"]],
    model.matrix(~ d - 1, data = dd)
  )
)
mm.dummy
##    a1 a2 a3 b1 b2 b3 b4  c dX dY dZ
## 1   1  0  0  1  0  0  0  1  1  0  0
## 2   1  0  0  0  1  0  0  2  1  0  0
## 3   1  0  0  0  0  1  0  3  0  0  1
## 4   1  0  0  0  0  0  1  4  0  1  0
## 5   0  1  0  1  0  0  0  5  1  0  0
## 6   0  1  0  0  1  0  0  6  1  0  0
## 7   0  1  0  0  0  1  0  7  0  0  1
## 8   0  1  0  0  0  0  1  8  1  0  0
## 9   0  0  1  1  0  0  0  9  0  0  1
## 10  0  0  1  0  1  0  0 10  1  0  0
## 11  0  0  1  0  0  1  0 11  1  0  0
## 12  0  0  1  0  0  0  1 12  0  0  1

Related