# tests/test-hashing.R

if (require(RUnit)) {
  library(FeatureHashing)
  
  # Reference table: feature name -> expected hash value modulo 2^32.
  mapping_value <- c(
    PlantQn1 = 3789462177,
    PlantQn2 = 4122940517,
    PlantQn3 = 1079927366,
    uptake = 1505155248,
    TypeMississippi = 4103768016,
    Treatmentchilled = 1576910802,
    PlantMn1 = 248868694,
    PlantMn2 = 2189134401,
    PlantMn3 = 1321560276,
    PlantQc1 = 2636986885,
    PlantQc2 = 1980993114,
    PlantQc3 = 3588767725,
    Treatmentnonchilled = 3873367263,
    PlantMc1 = 3437882550,
    PlantMc2 = 1125161513,
    PlantMc3 = 875000041,
    conc = 1178743966,
    TypeQuebec = 1791688646
  )

  # hashed.value() output is reduced modulo 2^32 before it is compared with
  # the reference table, element by element.
  observed.hash <- hashed.value(names(mapping_value)) %% 2^32
  checkTrue(all(observed.hash == mapping_value),
            "Unexpected hashing result by hashed.value")
  
  # Hash all CO2 columns (main effects only) into 2^10 buckets and keep the
  # name -> hash mapping that is attached to the result.
  m <- hashed.model.matrix(
    ~ ., CO2,
    hash.size = 2^10,
    create.mapping = TRUE,
    transpose = TRUE,
    is.dgCMatrix = FALSE,
    signed.hash = TRUE
  )
  mapping <- as.list(attr(m, "mapping"))

  # Every feature must land in its own 1-based bucket.
  bucket <- unlist(mapping) %% 2^10 + 1
  checkTrue(anyDuplicated(bucket) == 0L,
            "Unexpected collision of hashing example")
  
  # For each CO2 column, build the per-row feature name used by the checks
  # below: the bare column name for numeric columns, and the column name
  # pasted with the row's value for every other column.
  name.candidate <- lapply(seq_along(CO2), function(i) {
    # is.numeric() replaces the fragile comparison against class(x)[1].
    if (is.numeric(CO2[[i]])) {
      rep(names(CO2)[i], nrow(CO2))
    } else {
      paste0(names(CO2)[i], CO2[[i]])
    }
  })
  
  # Cross-check every mapped feature against the hashed matrix `m`: row
  # hashed.value(name) %% 2^10 + 1 must hold hash.sign(name) (multiplied by
  # the column values when the source column is numeric) at the positions
  # where the feature occurs, and zero at every other position.
  feature.names <- names(mapping)
  # Loop-invariant: hash all names once instead of once per iteration.
  feature.row <- hashed.value(feature.names) %% 2^10 + 1
  feature.sign <- hash.sign(feature.names)

  for (index in seq_along(mapping)) {
    i <- feature.row[index]
    name <- feature.names[index]
    # Index of the CO2 column whose candidate names contain this feature.
    col.i <- which(vapply(name.candidate, function(x) name %in% x, logical(1)))

    X <- name.candidate[[col.i]]
    # Positions at which this feature is present.
    j <- which(name == X)
    value <- feature.sign[index]
    if (is.numeric(CO2[[col.i]])) {
      value <- value * CO2[[col.i]]
    }

    x <- m[i,]
    checkTrue(all(x[j] == value), "Inconsistent hash value between hashed.value, hash.sign and hashed.model.matrix")
    checkTrue(all(x[-j] == 0), "Inconsistent hash value between hashed.value, hash.sign and hashed.model.matrix")
  }
  
  # Hash CO2 with all pairwise interactions into 2^10 buckets and compare the
  # resulting name -> bucket mapping with a table of reference hash values.
  m <- hashed.model.matrix(~ .^2, CO2, hash.size = 2^10, create.mapping = TRUE,
                           transpose = TRUE, is.dgCMatrix = FALSE, signed.hash = TRUE)
  mapping_value <- hash.mapping(m)

  # Reference hash value for every main effect and pairwise interaction.
  # Each name is written exactly once, next to its value.
  mapping_value.expected <- c(
    PlantQc1 = 2636986885,
    PlantQn1 = 3789462177,
    PlantQc2 = 1980993114,
    PlantQn2 = 4122940517,
    "PlantMc3:conc" = 3739801583,
    "PlantMc2:conc" = 1395604525,
    "PlantMn3:conc" = 3235311896,
    "PlantMn2:conc" = 540436913,
    "PlantQc1:TypeQuebec" = 925725157,
    "TypeQuebec:conc" = 2746149015,
    "PlantQn1:TypeQuebec" = 4237186241,
    PlantQc3 = 3588767725,
    "PlantQn1:conc" = 2288578128,
    "Treatmentchilled:conc" = 255906650,
    PlantQn3 = 1079927366,
    "PlantQc1:conc" = 2251098928,
    TypeQuebec = 1791688646,
    "PlantMc3:Treatmentchilled" = 3891992855,
    "PlantMn2:Treatmentnonchilled" = 1166626593,
    "TypeQuebec:Treatmentchilled" = 2941354564,
    "PlantMn1:Treatmentnonchilled" = 3205838849,
    Treatmentnonchilled = 3873367263,
    Treatmentchilled = 1576910802,
    "PlantMc2:uptake" = 175028363,
    "PlantMc1:uptake" = 4172577474,
    "PlantMn3:uptake" = 4053818018,
    "TypeQuebec:uptake" = 3178360588,
    "PlantQc3:uptake" = 3937745018,
    uptake = 1505155248,
    "PlantMc3:TypeMississippi" = 1611851048,
    "PlantMn3:TypeMississippi" = 2164201561,
    PlantMc1 = 3437882550,
    PlantMn1 = 248868694,
    PlantMc2 = 1125161513,
    PlantMn2 = 2189134401,
    PlantMc3 = 875000041,
    PlantMn3 = 1321560276,
    "PlantQn3:conc" = 3023324759,
    "TypeMississippi:conc" = 1821057777,
    "PlantQc2:conc" = 714503009,
    "PlantQc3:conc" = 1657455447,
    "PlantQn2:TypeQuebec" = 2781903365,
    "PlantQc2:TypeQuebec" = 51079300,
    "PlantQn3:TypeQuebec" = 862836424,
    "PlantQn2:conc" = 900931883,
    "PlantQc3:TypeQuebec" = 510730956,
    "TypeMississippi:Treatmentchilled" = 2281473014,
    "PlantMn3:Treatmentnonchilled" = 1193828828,
    "PlantQc1:Treatmentchilled" = 4048766167,
    "PlantQc2:Treatmentchilled" = 2534699902,
    "PlantQn2:Treatmentnonchilled" = 1386368423,
    "PlantQn1:Treatmentnonchilled" = 160472416,
    "PlantMc3:uptake" = 712540968,
    "PlantQn2:uptake" = 974267571,
    "Treatmentnonchilled:uptake" = 48404478,
    "PlantQn1:uptake" = 3757368420,
    "conc:uptake" = 375043273,
    "PlantMc1:conc" = 3318696701,
    conc = 1178743966,
    "Treatmentnonchilled:conc" = 4015153694,
    "PlantMn1:conc" = 1646616462,
    "PlantMc2:Treatmentchilled" = 2148207650,
    "PlantMc1:Treatmentchilled" = 3227113198,
    "PlantQc3:Treatmentchilled" = 3565755962,
    "TypeQuebec:Treatmentnonchilled" = 1973117262,
    "TypeMississippi:Treatmentnonchilled" = 4231588217,
    "PlantQn3:Treatmentnonchilled" = 1411718622,
    "PlantMn2:uptake" = 342355816,
    "TypeMississippi:uptake" = 31699673,
    "PlantQn3:uptake" = 4247950021,
    "PlantQc2:uptake" = 3035200074,
    "Treatmentchilled:uptake" = 3235544148,
    "PlantQc1:uptake" = 946686067,
    "PlantMn1:uptake" = 3186749149,
    "PlantMc2:TypeMississippi" = 3984179893,
    "PlantMc1:TypeMississippi" = 517052919,
    "PlantMn2:TypeMississippi" = 2154821378,
    "PlantMn1:TypeMississippi" = 3688281232,
    TypeMississippi = 4103768016
  )

  # Convert the reference hashes to 1-based bucket indices for hash.size 2^10;
  # the arithmetic keeps the names, which all.equal() compares as well.
  mapping_value.expected <- mapping_value.expected %% 2^10 + 1
  checkTrue(isTRUE(all.equal(mapping_value[names(mapping_value.expected)],
                             mapping_value.expected)),
            "Unexpected hashing result of interaction term")
  
  # Pairwise-interaction model hashed into a 32-bucket table.
  m2 <- hashed.model.matrix(
    ~ . ^ 2,
    data = CO2,
    hash.size = 32,
    create.mapping = TRUE,
    transpose = TRUE,
    is.dgCMatrix = FALSE,
    signed.hash = TRUE
  )

  # At least one stored index in the `i` slot must be non-zero.
  checkTrue(any(m2@i != 0),
            "All hashed indices created by hashed.model.matrix are zero")

  # Multiplying by a vector of ones must yield more than one non-zero entry.
  nonzero.count <- sum(m2 %*% rep(1, ncol(m2)) != 0)
  checkTrue(nonzero.count > 1,
            "Incorrect hashed matrix created by hashed.model.matrix")
  
  # Recompute the raw hash of every mapped name so that the bucket indices
  # stored in the mapping of `m2` can be verified against hashed.value().
  mapping <- hash.mapping(m2)
  mapping.raw <- hashed.value(names(mapping))
  names(mapping.raw) <- names(mapping)
  is.interaction <- grepl(":", names(mapping), fixed = TRUE)

  # Main effects: the bucket is the name's own hash reduced to 32 buckets
  # (1-based).
  checkTrue(all(mapping[!is.interaction] == mapping.raw[!is.interaction] %% 32 + 1),
            "The hashing result of main effect is not expected!")

  # Splitting every name into single characters must not produce any NA.
  checkTrue(!anyNA(unlist(lapply(names(mapping), strsplit, ""))),
            "Non-ascii name occurs!")

  # Interactions: convert the raw hash of each component with intToRaw(),
  # concatenate the bytes, hash the resulting string again and compare the
  # 1-based bucket with the stored mapping.
  for (i in which(is.interaction)) {
    name <- names(mapping)[i]
    key <- strsplit(name, ":", fixed = TRUE)[[1]]
    input <- unlist(lapply(mapping.raw[key], intToRaw), use.names = FALSE)
    r1 <- hashed.value(rawToChar(input))
    checkTrue(r1 %% 32 + 1 == mapping[name],
              "The hashing result of interaction is not expected!")
  }
  
  # Check handling of a formula variable that is not a column of the data:
  # `PlAnT` differs in case from the CO2 column `Plant`, so the call is
  # expected to fail with a std::invalid_argument condition whose message
  # names the missing column. Any other error is re-raised unchanged, and a
  # call that raises no error at all is reported as a failure instead of
  # passing silently.
  missing.column.rejected <- tryCatch({
    hashed.model.matrix(~ PlAnT, CO2, 8, signed.hash = TRUE,
                        transpose = TRUE, is.dgCMatrix = FALSE)
    FALSE
  }, error = function(e) {
    if (!inherits(e, "std::invalid_argument")) stop(e)
    if (conditionMessage(e) != "Failed to find the column:PlAnT") stop(e)
    TRUE
  })
  checkTrue(missing.column.rejected,
            "hashed.model.matrix did not raise an error for a missing column")
  
  # Three-way interaction. No hash.size is passed, so the function's default
  # applies; the check below reduces the recomputed hashes modulo 2^18.
  m <- hashed.model.matrix(~ Plant:Type:Treatment, CO2,
                           create.mapping = TRUE, signed.hash = TRUE)
  map <- hash.mapping(m)

  # Keep only the entries whose name has the form a:b:c.
  is.three.way <- grepl("\\w+:\\w+:\\w+", names(map))
  map <- map[is.three.way]

  expected.bucket <- hashed.interaction.value(names(map)) %% (2^18) + 1
  checkTrue(all(expected.bucket == map),
            "The hashed.interaction.value gives inconsistent result of hashed.model.matrix")
}

# Try the FeatureHashing package in your browser
#
# Any scripts or data that you put into this service are public.
#
# FeatureHashing documentation built on Oct. 31, 2022, 1:06 a.m.