# How long does it take to compare attributes?
library(microbenchmark)
# Build two integer vectors holding independent permutations of 1..1,000,000.
# Both get the extra S3 class "foo" and the same one-letter names, so their
# attributes are equal even though their values differ.
y <- sample(seq_len(1000000))
class(y) <- c("foo", class(y))
names(y) <- sample(letters, length(y), replace = TRUE)

z <- sample(seq_len(1000000))
class(z) <- c("foo", class(z))
names(z) <- names(y)

# Plain character vectors holding the names, used to time a bare
# character-vector comparison without going through attributes().
k <- names(y)
r <- names(z)

# Time the different ways of comparing (parts of) the attributes.
# The "cheap mode" variant switches off the optional leniencies of identical()
# (num.eq, single.NA, attrib.as.set, ignore.bytecode); they are spelled out by
# name with FALSE rather than passed positionally as the reassignable `F`.
print(microbenchmark(
  "base::identical on attributes" = identical(attributes(y), attributes(z)),
  "base::identical on attributes with cheap mode" = identical(
    attributes(y), attributes(z),
    num.eq = FALSE, single.NA = FALSE,
    attrib.as.set = FALSE, ignore.bytecode = FALSE
  ),
  "base::identical on 1 name" = identical(
    .subset2(names(y), 1), .subset2(names(z), 1)
  ),
  "1 million chars" = identical(k, r),
  "get attributes" = attributes(y)
))
# Build two pairs of million-element vectors that agree everywhere except the
# last position, so identical() cannot decide from type or length alone.
w1 <- sample(seq_len(1000000))
z1 <- w1
# Use an integer literal here: assigning the double -1 would promote z1 from
# integer to double, and identical(w1, z1) would then be FALSE purely because
# the types differ, which is not comparable with the character case below.
z1[1000000] <- -1L

w2 <- sample(letters, 1000000, replace = TRUE)
z2 <- w2
z2[1000000] <- "no"

# Compare the cost of identical() on integer versus character vectors.
print(microbenchmark(
  "1 million numerics" = identical(w1, z1),
  "1 million chars" = identical(w2, z2)
))
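# Lesson: Comparing characters is *really* slow.
# Caveat: the "1 million numerics" timing recorded below came from comparing an
# integer vector with a double one (assigning -1 promoted z1 to double), so
# identical() could return FALSE on the type mismatch alone; it is not a
# like-for-like measurement against the character case.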
# Unit: nanoseconds
# expr min lq median uq max neval
# base::identical on attributes 3232790 3468022.5 3744298.0 4768320.5 7028641 100
# base::identical on attributes with cheap mode 3255716 3437544.5 3640882.5 4782078.0 13316575 100
# base::identical on 1 name 7229 11389.0 48779.5 57523.5 90371 100
# 1 million chars 3221980 3542708.0 3888966.0 4686419.0 9568459 100
# get attributes 342 1606.5 4148.0 5723.5 10847 100
# Unit: nanoseconds
# expr min lq median uq max neval
# 1 million numerics 828 1401.5 6138.5 7940 30903 100
# 1 million chars 3239020 3385422.5 3571055.5 4357100 6963236 100
#
# What if two things are equal except for attributes and class?
# We need some ways to check for that.
# Strip every attribute (names, class, dim, ...) from `y` and return the
# resulting bare object. The caller's object is left untouched because the
# assignment below only modifies the function's local copy.
unat <- function(y) {
  attributes(y) <- NULL
  y
}
# TODO (RK): `x` is not defined anywhere above; work out what it was before
# re-running the commented-out benchmark below.
#microbenchmark(all.equal(x,y), identical(x, y), identical(head(x,-1), head(y,-1)),
# identical(unat(x), unat(y)))
# Unit: nanoseconds
# expr min lq median uq max neval
# all.equal(x, y) 100956 225546.0 263571.5 293590.5 662754 100
# identical(x, y) 901 3331.5 7947.5 10311.0 25113 100
# identical(head(x, -1), head(y, -1)) 12927407 16399260.0 23043983.0 28436341.5 218761241 100
# identical(unat(x), unat(y)) 1835558 2348825.5 2511917.0 2936794.5 22979307 100
# For character comparison, it takes 4x longer
# identical(unat(x), unat(y)) 8352153 9224829.0 10499011.0 14747340.0 213376807 100
#
# Maybe a good strategy then is to first sample 100 indices and see
# if they match before doing the full comparison.
# Draw 100 distinct positions (without replacement) from 1..1,000,000 to use
# as spot-check indices.
smp <- sample.int(1000000, size = 100)
#microbenchmark(all.equal(unclass(x), unclass(y), check.attributes = FALSE),
# for(i in smp) identical(.subset2(x, i), .subset2(y, i)))
# Unit: microseconds
# expr min lq median uq max neval
# all.equal(unclass(x), unclass(y), check.attributes = FALSE) 31587.659 37940.179 44013.4875 53668.0470 239773.415 100
# for (i in smp) identical(.subset2(x, i), .subset2(y, i)) 146.883 196.441 228.7455 300.8225 500.595 100
# identical(x[smp], y[smp]) 7.482 11.211 43.934 55.687 200.682 100
# It seems like temporarily removing their attributes may be necessary.
# It seems
# Once we know that two things are identical
# we need to create a patch from one onto the other. It seems like the fastest way
# to "drop" any
# (Web-page residue, not part of the script:)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.