test-drandomForest.R
In randomForest.ddR: Distributed 'randomForest' for Big Data using 'ddR' API

nInst = 2
errorRate <- function(a,b)
{
	if(is.dframe(a) | is.darray(a))
		a <- collect(a)
	if(is.dframe(b) | is.darray(b))
		b <- collect(b)
	if(is.data.frame(a))
		a <- a[,1]
	if(is.data.frame(b))
		b <- b[,1]
	if(is.factor(a))
		a <- as.character(a)
	if(is.factor(b))
		b <- as.character(b)
	sum(a!=b)/length(a)
}

rSquared <- function(a,b)
{
	if(is.dframe(a) | is.darray(a))
		a <- collect(a)
	if(is.dframe(b) | is.darray(b))
		b <- collect(b)
	if(is.data.frame(a))
		a <- a[,1]
	if(is.data.frame(b))
		b <- b[,1]
	mean_a = mean(a)
	a <- sapply(a,function(x,mean_a) (x-mean_a)^2, mean_a)
	b <- sapply(b,function(x,mean_a) (x-mean_a)^2, mean_a)
	sum(b)/sum(a)
}


########## Tests for formula interface ##########
# The input is combination of a formula and a data.frame or dframe

### Tests for data.frame ###
context("Checking the formula interface of drandomForest with data.frame")

test_that("the inputes are validated", {
    expect_error(drandomForest(Species ~ ., data=iris, nExecutor = nInst, ntree=-1))
    expect_error(drandomForest(Species ~ ., airquality, nExecutor = nInst))
    expect_error(drandomForest(Ozone ~ ., data=airquality, nExecutor=nInst)) # airquality contains NA
    expect_error(drandomForest(Ozone ~ ., data=airquality, nExecutor=nInst, na.action=na.pass))
})

test_that("the results are returned correctly", {
    hRF <- drandomForest(Species ~ ., data=iris, nExecutor = nInst)
    rRF <- randomForest(Species ~ ., data=iris)

    # availability of some outputs
    expect_false(is.null(hRF$call))
    expect_false(is.null(hRF$importance))
    expect_false(is.null(hRF$terms))

    # the value of some outputs
    expect_identical(hRF$ntree, rRF$ntree)
    expect_identical(hRF$type, rRF$type)
    expect_identical(hRF$mtry, rRF$mtry)
    expect_identical(hRF$classes, rRF$classes)
    expect_is(hRF, "drandomForest")
    expect_is(hRF, "randomForest")

    # prediction accuracy
    hpredicted <- predict(hRF, iris[-5])
    rpredicted <- predict(rRF, iris[-5])
    herrRate <- errorRate(iris[5], hpredicted)
    rerrRate <- errorRate(iris[5], rpredicted)

    expect_equivalent(herrRate, rerrRate)

    # complete mode
    expect_warning(hRFc <- drandomForest(Species ~ ., data=iris, nExecutor = nInst, completeModel=TRUE, proximity=TRUE, xtest=iris[-5], ytest=iris[5]))
    expect_false(is.null(hRFc$predicted))
    expect_false(is.null(hRFc$votes))
    expect_false(is.null(hRFc$oob.times))
    expect_false(is.null(hRFc$y))
    expect_false(is.null(hRFc$proximity))
    expect_false(is.null(hRFc$test))

    # handling missed data
    hRFm <- drandomForest(Ozone ~ ., data=airquality, nExecutor = nInst, na.action=na.omit, completeModel=TRUE)
    rRFm <- randomForest(Ozone ~ ., data=airquality, na.action=na.omit)
    
    expect_identical(length(hRFm$y), length(rRFm$y))
})

### Tests for dframe ###
context("Checking the formula interface of drandomForest with dframe")
    Diris <- as.dframe(iris)
    Xiris <- as.dframe(iris[-5])
    Yiris <- as.dframe(iris[5])
    Dairquality <- as.dframe(airquality)

test_that("the inputes are validated", {
    expect_error(drandomForest(Species ~ ., data=Diris, nExecutor = nInst, ntree=-1))
    expect_error(drandomForest(Species ~ ., Dairquality, nExecutor = nInst))
    expect_error(drandomForest(Ozone ~ ., data=Dairquality, nExecutor=nInst))
    expect_error(drandomForest(Ozone ~ ., data=Dairquality, nExecutor=nInst, na.action=na.pass))
})

test_that("the results are returned correctly", {
    hRF <- drandomForest(Species ~ ., data=Diris, nExecutor = nInst)
    rRF <- randomForest(Species ~ ., data=iris)

    # availability of some outputs
    expect_false(is.null(hRF$call))
    expect_false(is.null(hRF$importance))
    expect_false(is.null(hRF$terms))

    # the value of some outputs
    expect_identical(hRF$ntree, rRF$ntree)
    expect_identical(hRF$type, rRF$type)
    expect_identical(hRF$mtry, rRF$mtry)
    expect_identical(hRF$classes, rRF$classes)
    expect_is(hRF, "drandomForest")
    expect_is(hRF, "randomForest")

    # prediction accuracy
    hpredicted <- predict(hRF, Xiris)
    rpredicted <- predict(rRF, iris[-5])
    herrRate <- errorRate(Yiris, hpredicted)
    rerrRate <- errorRate(iris[5], rpredicted)

    expect_equivalent(herrRate, rerrRate)

    # complete mode
    expect_warning(hRFc <- drandomForest(Species ~ ., data=Diris, nExecutor = nInst, completeModel=TRUE, proximity=TRUE))
    expect_false(is.null(hRFc$predicted))
    expect_false(is.null(hRFc$votes))
    expect_false(is.null(hRFc$oob.times))
    expect_false(is.null(hRFc$y))
    expect_false(is.null(hRFc$proximity))

    # handling missed data
    hRFm <- drandomForest(Ozone ~ ., data=Dairquality, nExecutor = nInst, na.action=na.omit, completeModel=TRUE)
    rRFm <- randomForest(Ozone ~ ., data=airquality, na.action=na.omit)
    
    expect_identical(length(hRFm$y), length(rRFm$y))
})

########## Tests for default interface ##########
# There are 2 main different scenarios for the inputs that are not available through formula: the input data is an ordinary R object, or it is darray

context("Checking the default interface of drandomForest with ordibary R objects")

cleandata <- na.omit(airquality)
x <- cleandata[-1]
y <- cleandata[1]

test_that("the inputes are validated", {
    expect_error(drandomForest(x=x, y=y, nExecutor = nInst, ntree=-1))
    expect_error(drandomForest(x=airquality[-1], y=airquality[1], nExecutor = nInst)) # airquality contains NA
    expect_error(drandomForest(x=x, y=y, nExecutor=100, ntree=50))
})

test_that("the results are returned correctly for regression", {
    hRF <- drandomForest(x=x, y=y, nExecutor = nInst, ntree=5000)
    rRF <- randomForest(Ozone ~ ., data=airquality, na.action=na.omit, ntree=5000)

    # availability of some outputs
    expect_false(is.null(hRF$call))
    expect_false(is.null(hRF$importance))
    expect_false(is.null(hRF$mse))
    expect_false(is.null(hRF$rsq))

    # the value of some outputs
    expect_identical(hRF$ntree, rRF$ntree)
    expect_identical(hRF$type, rRF$type)
    expect_identical(hRF$mtry, rRF$mtry)
    expect_is(hRF, "drandomForest")
    expect_is(hRF, "randomForest")

    # prediction accuracy
    hpredicted <- predict(hRF, x)
    rpredicted <- predict(rRF, x)
    hrsq <- rSquared(y, hpredicted)
    rrsq <- rSquared(y, rpredicted)

    expect_true( abs(hrsq - rrsq) < 0.01)

    # complete mode
    expect_warning(hRFc <- drandomForest(x=x, y=y, nExecutor = nInst, completeModel=TRUE, proximity=TRUE, xtest=x, ytest=y))
    expect_false(is.null(hRFc$predicted))
    expect_false(is.null(hRFc$oob.times))
    expect_false(is.null(hRFc$y))
    expect_false(is.null(hRFc$proximity))
    expect_false(is.null(hRFc$test))

})

test_that("the results are returned correctly for unsupervised", {
    expect_warning(hRFc <- drandomForest(x=x, nExecutor = nInst, completeModel=TRUE, proximity=TRUE, ntree=501, mtry=3))

    # availability of some outputs
    expect_false(is.null(hRFc$call))
    expect_false(is.null(hRFc$importance))
    expect_false(is.null(hRFc$proximity))

    # the value of some outputs
    expect_identical(hRFc$ntree, 501)
    expect_identical(hRFc$type, "unsupervised")
    expect_identical(hRFc$mtry, 3)
    expect_is(hRFc, "drandomForest")
    expect_is(hRFc, "randomForest")
})

context("Checking the default interface of drandomForest with darray")

X <- as.darray(data.matrix(x))
Y <- as.darray(data.matrix(y))

test_that("the inputes are validated", {
    expect_error(drandomForest(x=X, y=Y, nExecutor = nInst, ntree=-1))
    expect_error(drandomForest(x=X, y=Y, nExecutor=100, ntree=50))
})

test_that("the results are returned correctly", {
    hRF <- drandomForest(x=X, y=Y, nExecutor = nInst, ntree=5000)
    rRF <- randomForest(Ozone ~ ., data=airquality, na.action=na.omit, ntree=5000)

    # availability of some outputs
    expect_false(is.null(hRF$call))
    expect_false(is.null(hRF$importance))
    expect_false(is.null(hRF$mse))
    expect_false(is.null(hRF$rsq))

    # the value of some outputs
    expect_identical(hRF$ntree, rRF$ntree)
    expect_identical(hRF$type, rRF$type)
    expect_identical(hRF$mtry, rRF$mtry)
    expect_is(hRF, "drandomForest")
    expect_is(hRF, "randomForest")

    # prediction accuracy
    hpredicted <- predict(hRF, x)
    rpredicted <- predict(rRF, x)
    hrsq <- rSquared(y, hpredicted)
    rrsq <- rSquared(y, rpredicted)

    expect_true( abs(hrsq - rrsq) < 0.01)

    # complete mode
    expect_warning(hRFc <- drandomForest(x=x, y=y, nExecutor = nInst, completeModel=TRUE, proximity=TRUE, xtest=x, ytest=y))
    expect_false(is.null(hRFc$predicted))
    expect_false(is.null(hRFc$oob.times))
    expect_false(is.null(hRFc$y))
    expect_false(is.null(hRFc$proximity))
    expect_false(is.null(hRFc$test))
})

test_that("the results are returned correctly for unsupervised", {
    expect_warning(hRFc <- drandomForest(x=X, nExecutor = nInst, completeModel=TRUE, proximity=TRUE, ntree=501, mtry=3))

    # availability of some outputs
    expect_false(is.null(hRFc$call))
    expect_false(is.null(hRFc$importance))
    expect_false(is.null(hRFc$proximity))

    # the value of some outputs
    expect_identical(hRFc$ntree, 501)
    expect_identical(hRFc$type, "unsupervised")
    expect_identical(hRFc$mtry, 3)
    expect_is(hRFc, "drandomForest")
    expect_is(hRFc, "randomForest")
})

context("Checking effectiveness of set.seed")

test_that("setting seed must make the output of the function deterministic", {

    generate_random_data <- function(x)
    {
	data.frame(matrix(sample.int(1e6, 50), nrow=10, ncol=5))
    }
    DFdata <- dmapply(generate_random_data, x = 1:nInst,
    	   output.type = "dframe", combine = "rbind", nparts = c(nInst,1))
    colnames(DFdata) <- c("X1", "X2", "X3", "X4", "X5")

    set.seed(100)
    model1 <- drandomForest(X1~., data=DFdata, nExecutor = nInst)

    set.seed(100)
    model2 <- drandomForest(X1~., data=DFdata, nExecutor = nInst)

    expect_true(all(model1$mse == model2$mse))
})

Any scripts or data that you put into this service are public.

randomForest.ddR documentation built on May 29, 2017, 2:15 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

randomForest.ddR
Distributed 'randomForest' for Big Data using 'ddR' API

tests/testthat/test-drandomForest.R
In randomForest.ddR: Distributed 'randomForest' for Big Data using 'ddR' API

Try the randomForest.ddR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

randomForest.ddR Distributed 'randomForest' for Big Data using 'ddR' API

tests/testthat/test-drandomForest.R In randomForest.ddR: Distributed 'randomForest' for Big Data using 'ddR' API

Try the randomForest.ddR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

randomForest.ddR
Distributed 'randomForest' for Big Data using 'ddR' API

tests/testthat/test-drandomForest.R
In randomForest.ddR: Distributed 'randomForest' for Big Data using 'ddR' API