machinelearnr: Package with various functions for preprocessing, clustering, and classification

library(machinelearnr)

#DONE find.best.number.of.trees 
#Test if the output is the expected type. 

#eval.classification.results #No need to test

#DONE RandomForestAutomaticMtryAndNtree 
#Test if the output is the expected type.


#DONE LOOCVPredictionsRandomForestAutomaticMtryAndNtree
#Test if the output is the expected type. 
##Expected number of objects in list
###Expected number of observations in first object


#DONE RandomForestClassificationGiniMatrixForPheatmap
#Test if output is what you would get if you did things manually. 

#DONE RandomForestClassificationPercentileMatrixForPheatmap
##Test if output is what you would get if you did things manually. 


#DONE LOOCVRandomForestClassificationMatrixForPheatmap
##Test if output is what you would get if you did things manually. 



#-----------------------------------------------
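#New functions to add: cross-validation versions that let the caller
#specify the number of folds. If they work, running them with the
#number of folds equal to the number of observations should give the
#same results as the corresponding LOOCV function.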


#DONE CVPredictionsRandomForestAutomaticMtryAndNtree

#DONE CVRandomForestClassificationMatrixForPheatmap



#------------------------------------------------
# Tests begin here
#------------------------------------------------

test_that("find.best.number.of.trees works", {
  # Checks that find.best.number.of.trees() returns a single value when given
  # the error curve of a default random forest fitted on the example data.
  
  example.data <- GenerateExampleDataMachinelearnr()
  
  # Fixed seed so the fitted forest (and its error curve) is reproducible.
  set.seed(1)
  rf.result <- randomForest::randomForest(
    x = example.data[, c("x", "y", "a", "b")],
    y = example.data[, "actual"]
  )
  
  # Take the first column of the err.rate matrix (the OOB error after each
  # tree). The component is accessed by name rather than by list position so
  # the test does not depend on the internal ordering of the randomForest
  # object.
  error.oob <- rf.result$err.rate[, 1]
  
  result <- find.best.number.of.trees(error.oob)
  
  # The function should return exactly one value.
  expect_length(result, 1)
  
})


test_that("RandomForestAutomaticMtryAndNtree works", {
  # Checks the type of the returned model and that its final error rate on the
  # example data is low.
  
  example.data <- GenerateExampleDataMachinelearnr()
  
  rf.result <- RandomForestAutomaticMtryAndNtree(example.data, c("x", "y", "a", "b"), "actual", seed = 2)
  
  # Should result in a randomForest object. expect_s3_class() uses inheritance,
  # so the check still holds if the object carries additional classes.
  expect_s3_class(rf.result, "randomForest")
  
  # The error rate of the classification model should be very low because
  # the test data purposely has several features that can separate the target.
  # Row nrow(err.rate), column 1 is the error in the first column after the
  # last tree (the same element the previous linear index selected).
  final.error.rate <- rf.result$err.rate[nrow(rf.result$err.rate), 1]
  expect_lt(final.error.rate, 0.1)
  
})


test_that("LOOCVPredictionsRandomForestAutomaticMtryAndNtree works", {
  # Verifies the shape of the LOOCV output (predictions + feature table) and
  # that the predictions agree closely with the true labels.

  toy.data <- GenerateExampleDataMachinelearnr()
  feature.names <- c("x", "y", "a", "b")

  # Run the function with its console output captured and discarded.
  invisible(capture.output(
    loocv.output <- LOOCVPredictionsRandomForestAutomaticMtryAndNtree(
      toy.data,
      predictors.that.PCA.can.be.done.on = feature.names,
      predictors.that.should.not.PCA = NULL,
      should.PCA.be.used = FALSE,
      target.column.name = "actual",
      seed = 2,
      percentile.threshold.to.keep = 0.5
    )
  ))

  # The output is a list holding two objects.
  expect_length(loocv.output, 2)

  # Element 1: one predicted value per observation in the input data.
  loocv.predictions <- loocv.output[[1]]
  expect_length(loocv.predictions, nrow(toy.data))

  # Element 2: a table (of the features identified as important).
  expect_equal(class(loocv.output[[2]]), "table")

  # The example data is built so that its features separate the target well,
  # so predicted and actual labels should agree strongly (MCC above 0.8).
  agreement <- mltools::mcc(
    preds = as.integer(loocv.predictions),
    actuals = as.integer(toy.data$actual)
  )
  expect_true(agreement > 0.8)

})


test_that("RandomForestClassificationGiniMatrixForPheatmap works", {
  # Purpose: checks the three outputs of
  # RandomForestClassificationGiniMatrixForPheatmap() on the example data:
  #   [[1]] the matrix for pheatmap (feature importances plus an MCC row),
  #   [[2]] the list of subsetted data frames,
  #   [[3]] the list of predicted values per subset.
  # The matrix is compared against one rebuilt by hand, both with the default
  # random forest and with mtry/ntree optimization turned on.
  
  example.data <- GenerateExampleDataMachinelearnr()
  
  # Console output of the function is captured and discarded.
  invisible(capture.output(
    results <- RandomForestClassificationGiniMatrixForPheatmap(input.data = example.data,
                                            factor.name.for.subsetting = "sep.xy.ab",
                                            name.of.predictors.to.use = c("x", "y", "a", "b"),
                                            target.column.name = "actual",
                                            seed = 2)
  ))
  
  matrix.for.pheatmap <- results[[1]]
  
  #The resulting matrix should have 5 rows (4 rows for features and 1 row for MCC)
  expect_equal(dim(matrix.for.pheatmap)[1], 5)
  
  #The resulting matrix should have 2 columns. 1 column for each level of the factor
  #used for subsetting
  expect_equal(dim(matrix.for.pheatmap)[2], 2)
  
  #The last row of the matrix should hold the MCC value
  expect_equal(row.names(matrix.for.pheatmap)[[length(row.names(matrix.for.pheatmap))]], "MCC.val")
  
  #The MCC values for each subset should be very high because the testing data
  #was created in a way where this is true.
  MCC_val_row <- matrix.for.pheatmap[dim(matrix.for.pheatmap)[1],]
  expect_equal(MCC_val_row[[1]]>0.8, TRUE)
  expect_equal(MCC_val_row[[2]]>0.8, TRUE)
  
  #-------------------------------------------------------------------------
  #See if the results from this function are equal to the results
  #if pheatmap matrix is created manually. No mtry and ntree optimization
  #-------------------------------------------------------------------------
  
  #Subset 1/2/3
  
  subset.123 <- subset(example.data, example.data[,"sep.xy.ab"]=="1/2/3")
  # Re-create the target factor from its numeric codes.
  # NOTE(review): presumably done to drop the levels absent from this subset,
  # mirroring what the function under test does -- confirm.
  subset.123$actual <- as.factor(as.numeric(subset.123$actual))
  
  # Same seed as passed to the function under test so the forests can match.
  set.seed(2)
  rf.result.subset.123 <- randomForest::randomForest(x=subset.123[,c("x", "y", "a", "b")], y=subset.123[,"actual"])
  
  importance.values.from.123 <- rf.result.subset.123$importance
  predicted <- rf.result.subset.123$predicted
  actual <- subset.123[,"actual"]
  # The value is bound to the name MCC.val as well: rbind() below labels the
  # new row after the variable passed to it, and the expected matrix has a row
  # named "MCC.val".
  MCC.val <- MCC.val.123 <- mltools::mcc(preds=predicted, actuals=actual)
  
  #Subset 4/5
  
  subset.45 <- subset(example.data, example.data[,"sep.xy.ab"]=="4/5")
  subset.45$actual <- as.factor(as.numeric(subset.45$actual))
  
  set.seed(2)
  rf.result.subset.45 <- randomForest::randomForest(x=subset.45[,c("x", "y", "a", "b")], y=subset.45[,"actual"])
  
  importance.values.from.45 <- rf.result.subset.45$importance
  predicted <- rf.result.subset.45$predicted
  actual <- subset.45[,"actual"]
  MCC.val.45 <- mltools::mcc(preds=predicted, actuals=actual)

  
  #Make the matrix so that it imitates the pheatmap output:
  #importance rows on top, MCC row at the bottom, one column per subset.
  column.123 <- rbind(importance.values.from.123, MCC.val)
  column.45 <- rbind(importance.values.from.45, MCC.val.45)
  
  combined.result <- cbind(column.123, column.45)
  
  #Change column names
  colnames(combined.result)[1] <- "sep.xy.ab 1/2/3"
  colnames(combined.result)[2] <- "sep.xy.ab 4/5"
  
  expect_equal(matrix.for.pheatmap, combined.result)
  
  #-------------------------------------------------------------------------
  #See if the results from this function are equal to the results
  #if pheatmap matrix is created manually. With mtry and ntree optimization
  #-------------------------------------------------------------------------
  
  #Subset 1/2/3
  
  subset.123 <- subset(example.data, example.data[,"sep.xy.ab"]=="1/2/3")
  subset.123$actual <- as.factor(as.numeric(subset.123$actual))
  
  # NOTE(review): seed = 2 is also passed to the function below, so this
  # set.seed() call may be redundant -- verify before removing.
  set.seed(2)
  rf.result.subset.123 <- RandomForestAutomaticMtryAndNtree(subset.123, c("x", "y", "a", "b"), "actual", seed=2)
  
  importance.values.from.123 <- rf.result.subset.123$importance
  predicted <- rf.result.subset.123$predicted
  actual <- subset.123[,"actual"]
  # Bound to MCC.val again so that rbind() names the row "MCC.val".
  MCC.val <- MCC.val.123 <- mltools::mcc(preds=predicted, actuals=actual)
  
  #Subset 4/5
  
  subset.45 <- subset(example.data, example.data[,"sep.xy.ab"]=="4/5")
  subset.45$actual <- as.factor(as.numeric(subset.45$actual))
  
  set.seed(2)
  rf.result.subset.45 <- RandomForestAutomaticMtryAndNtree(subset.45, c("x", "y", "a", "b"), "actual", seed=2)
  
  importance.values.from.45 <- rf.result.subset.45$importance
  predicted <- rf.result.subset.45$predicted
  actual <- subset.45[,"actual"]
  MCC.val.45 <- mltools::mcc(preds=predicted, actuals=actual)
  
  
  #Make the matrix so that it imitates the pheatmap output
  column.123 <- rbind(importance.values.from.123, MCC.val)
  column.45 <- rbind(importance.values.from.45, MCC.val.45)
  
  combined.result <- cbind(column.123, column.45)
  
  #Change column names
  colnames(combined.result)[1] <- "sep.xy.ab 1/2/3"
  colnames(combined.result)[2] <- "sep.xy.ab 4/5"
  
  #Toggle TRUE for mtry and ntree optimization
  invisible(capture.output(
    results <- RandomForestClassificationGiniMatrixForPheatmap(input.data = example.data,
                                                                           factor.name.for.subsetting = "sep.xy.ab",
                                                                           name.of.predictors.to.use = c("x", "y", "a", "b"),
                                                                           target.column.name = "actual",
                                                                           seed = 2,
                                                                           should.mtry.and.ntree.be.optimized = TRUE)
  ))
  
  matrix.for.pheatmap <- results[[1]]
  
  expect_equal(matrix.for.pheatmap, combined.result)
  
  #-------------------------------------------------------------------------
  # See if the subsetted data frames are returned as the second output
  #-------------------------------------------------------------------------
  
  invisible(capture.output(
    results <- RandomForestClassificationGiniMatrixForPheatmap(input.data = example.data,
                                                               factor.name.for.subsetting = "sep.xy.ab",
                                                               name.of.predictors.to.use = c("x", "y", "a", "b"),
                                                               target.column.name = "actual",
                                                               seed = 2,
                                                               should.mtry.and.ntree.be.optimized = FALSE)
  ))
  
  subsetted.dataframes <- results[[2]]
  
  #Should contain two dataframes
  expect_equal(length(subsetted.dataframes), 2)
  
  #First dataframe should be subset 123
  expect_equal(subsetted.dataframes[[1]], subset.123)
  
  #Second dataframe should be subset 45
  expect_equal(subsetted.dataframes[[2]], subset.45)
  
  #-------------------------------------------------------------------------
  # See if the predicted values make sense. 
  #-------------------------------------------------------------------------
  
  matrix.for.pheatmap <- results[[1]]
  predicted.values <- results[[3]]
  
  #Should have two vectors of predicted values
  expect_equal(length(predicted.values), 2)
  
  #The first vector should contain the values 1, 2, 3 (in order of first appearance)
  expect_equal(unique(predicted.values[[1]]), c("1", "2", "3"))
  
  #The second vector should contain the values 4 and 5 (in order of first appearance)
  expect_equal(unique(predicted.values[[2]]), c("4", "5"))
  
  #The MCC recomputed from the predicted values should equal the MCC row
  #(row 5) of the matrix.
  # NOTE(review): the hard-coded ranges assume rows 1-22 of example.data form
  # the 1/2/3 subset and rows 23-36 the 4/5 subset -- TODO confirm against
  # GenerateExampleDataMachinelearnr().
  first.vec.actual <- as.character(example.data$actual[1:22])
  first.vec.predicted <- as.character(predicted.values[[1]])
  first.vec.MCC <- mltools::mcc(preds = first.vec.predicted, actuals = first.vec.actual)
  
  second.vec.actual <- as.character(example.data$actual[23:36])
  second.vec.predicted <- as.character(predicted.values[[2]])
  second.vec.MCC <- mltools::mcc(preds = second.vec.predicted, actuals = second.vec.actual)
  
  expect_equal(first.vec.MCC, matrix.for.pheatmap[5, 1])
  expect_equal(second.vec.MCC, matrix.for.pheatmap[5, 2])

})


test_that("RandomForestClassificationPercentileMatrixForPheatmap works", {
  # Compares the percentile matrix against the Gini matrix produced with the
  # same settings: the MCC embedded in each column name must match the Gini
  # MCC row, and the feature ranking per subset must be the expected one.

  toy.data <- GenerateExampleDataMachinelearnr()
  feature.names <- c("x", "y", "a", "b")

  # Percentile version first, then the Gini version, both run quietly.
  invisible(capture.output(
    percentile.output <- RandomForestClassificationPercentileMatrixForPheatmap(
      input.data = toy.data,
      factor.name.for.subsetting = "sep.xy.ab",
      name.of.predictors.to.use = feature.names,
      target.column.name = "actual",
      seed = 2,
      should.mtry.and.ntree.be.optimized = FALSE
    )
  ))
  pct <- percentile.output[[1]]

  invisible(capture.output(
    gini.output <- RandomForestClassificationGiniMatrixForPheatmap(
      input.data = toy.data,
      factor.name.for.subsetting = "sep.xy.ab",
      name.of.predictors.to.use = feature.names,
      target.column.name = "actual",
      seed = 2,
      should.mtry.and.ntree.be.optimized = FALSE
    )
  ))
  gini <- gini.output[[1]]

  # The MCC is read from the column name: it is the 6th space-separated token.
  mcc.from.column.name <- function(column.index) {
    name.tokens <- strsplit(colnames(pct)[[column.index]], " ")[[1]]
    as.numeric(name.tokens[[6]])
  }

  # Column-name MCC must agree with row 5 (the MCC row) of the Gini matrix.
  # A tolerance is used for the comparison with the value parsed from text.
  expect_equal(mcc.from.column.name(1), as.numeric(gini[5, 1]), tolerance = 0.01)
  expect_equal(mcc.from.column.name(2), as.numeric(gini[5, 2]), tolerance = 0.01)

  # First column: x (row 1) and y (row 2) should each have a larger percentile
  # than both a (row 3) and b (row 4).
  for (strong.row in 1:2) {
    expect_true(pct[strong.row, 1] > pct[3, 1] & pct[strong.row, 1] > pct[4, 1])
  }

  # Second column: a (row 3) and b (row 4) should each have a larger
  # percentile than both x (row 1) and y (row 2).
  for (strong.row in 3:4) {
    expect_true(pct[strong.row, 2] > pct[1, 2] & pct[strong.row, 2] > pct[2, 2])
  }

})

#LOOCVRandomForestClassificationMatrixForPheatmap
#Expected output is a matrix. 

test_that("LOOCVRandomForestClassificationMatrixForPheatmap works", {
  # Checks the three outputs of the LOOCV matrix function: the percentile
  # matrix (with MCC in the column names), the subsetted data frames, and the
  # predicted values per subset.

  toy.data <- GenerateExampleDataMachinelearnr()

  invisible(capture.output(
    loocv.output <- LOOCVRandomForestClassificationMatrixForPheatmap(
      input.data = toy.data,
      factor.name.for.subsetting = "sep.xy.ab",
      name.of.predictors.to.use = c("x", "y", "a", "b"),
      target.column.name = "actual",
      seed = 2,
      should.mtry.and.ntree.be.optimized = FALSE,
      percentile.threshold.to.keep = 0.5
    )
  ))

  pct <- loocv.output[[1]]

  # First column: x (row 1) and y (row 2) should each have a larger percentile
  # than both a (row 3) and b (row 4).
  for (strong.row in 1:2) {
    expect_true(pct[strong.row, 1] > pct[3, 1] & pct[strong.row, 1] > pct[4, 1])
  }

  # Second column: a (row 3) and b (row 4) should each have a larger
  # percentile than both x (row 1) and y (row 2).
  for (strong.row in 3:4) {
    expect_true(pct[strong.row, 2] > pct[1, 2] & pct[strong.row, 2] > pct[2, 2])
  }

  # The MCC stored as the 6th space-separated token of each column name
  # should be very large.
  for (column.index in 1:2) {
    name.tokens <- strsplit(colnames(pct)[[column.index]], " ")[[1]]
    expect_true(as.numeric(name.tokens[[6]]) > 0.8)
  }

  # Second output: one subsetted data frame per factor level (two in total).
  expect_length(loocv.output[[2]], 2)

  # Third output: one vector of predicted values per subset.
  predictions.by.subset <- loocv.output[[3]]
  expect_length(predictions.by.subset, 2)

  # The first vector should contain the values 1, 2, 3.
  expect_equal(unique(predictions.by.subset[[1]]), c("1", "2", "3"))

  # The second vector should contain the values 4 and 5.
  expect_equal(unique(predictions.by.subset[[2]]), c("4", "5"))

})



test_that("CVPredictionsRandomForest works", {
  # Checks that CVPredictionsRandomForest() reproduces the LOOCV function when
  # the number of folds equals the number of observations, that several fold
  # counts run without error, and how row order affects two-fold performance.

  toy.data <- GenerateExampleDataMachinelearnr()
  feature.names <- c("x", "y", "a", "b")

  # Helper: run CVPredictionsRandomForest quietly with the settings shared by
  # every call in this test; only the data and the fold count vary.
  run.cv.quietly <- function(cv.data, folds) {
    invisible(capture.output(
      cv.output <- CVPredictionsRandomForest(
        inputted.data = cv.data,
        name.of.predictors.to.use = feature.names,
        target.column.name = "actual",
        seed = 2,
        percentile.threshold.to.keep = 0.5,
        number.of.folds = folds
      )
    ))
    cv.output
  }

  # Helper: MCC between the predictions (first output element) and the
  # "actual" column of the data they were made for, both coerced to numeric.
  numeric.mcc <- function(cv.output, cv.data) {
    mltools::mcc(
      preds = as.numeric(cv.output[[1]]),
      actuals = as.numeric(cv.data[, "actual"])
    )
  }

  invisible(capture.output(
    loocv.output <- LOOCVPredictionsRandomForestAutomaticMtryAndNtree(
      inputted.data = toy.data,
      predictors.that.should.not.PCA = feature.names,
      predictors.that.PCA.can.be.done.on = NULL,
      should.PCA.be.used = FALSE,
      target.column.name = "actual",
      seed = 2,
      should.mtry.and.ntree.be.optimized = FALSE,
      percentile.threshold.to.keep = 0.5
    )
  ))

  every.row.a.fold <- run.cv.quietly(toy.data, nrow(toy.data))

  # CVPredictionsRandomForest should match
  # LOOCVPredictionsRandomForestAutomaticMtryAndNtree when
  # 1. the number of folds equals the number of observations,
  # 2. PCA is not used,
  # 3. the default mtry and ntree are used.
  expect_equal(loocv.output[[1]], every.row.a.fold[[1]])
  expect_equal(loocv.output[[2]], every.row.a.fold[[2]])

  # 10, 4 and 3 folds: only checking that no error is raised.
  run.cv.quietly(toy.data, 10)
  run.cv.quietly(toy.data, 4)
  run.cv.quietly(toy.data, 3)

  # 2 folds on the data in its original order.
  ordered.two.fold <- run.cv.quietly(toy.data, 2)
  mcc.ordered <- numeric.mcc(ordered.two.fold, toy.data)

  # Shuffling the rows should improve two-fold performance.
  # CVPredictionsRandomForest() should not shuffle on its own, and the example
  # data was created sequentially: the first half is about predicting
  # categories 1/2/3 with x/y and the second half about predicting categories
  # 4/5 with a/b. Two-fold CV on the unshuffled data should therefore perform
  # very poorly.
  set.seed(1)
  shuffled.data <- toy.data[sample(nrow(toy.data)), ]
  shuffled.two.fold <- run.cv.quietly(shuffled.data, 2)
  mcc.shuffled <- numeric.mcc(shuffled.two.fold, shuffled.data)

  expect_true(mcc.shuffled > mcc.ordered)

  #---------------------------------
  # Illustrates why stratified cross-validation matters
  #---------------------------------

  # What happens if the training fold does not contain the same target classes
  # as the testing fold? Drop rows so the data set is perfectly halved by the
  # target: the first fold only has targets 1/2/3, the second only 4/5.
  halved.data <- toy.data[-c(1, 2, 3, 8, 9, 10, 14, 15), ]
  halved.two.fold <- run.cv.quietly(halved.data, 2)

  # The MCC should be very bad.
  mcc.halved <- numeric.mcc(halved.two.fold, halved.data)
  expect_true(mcc.halved < 0.2)

})




test_that("CVRandomForestClassificationMatrixForPheatmap works", {
  # Checks that the CV matrix function reproduces the LOOCV matrix function,
  # that several fold counts run, that unshuffled two-fold CV fails, and that
  # the second and third outputs each hold one entry per subset.
  
  example.data <- GenerateExampleDataMachinelearnr()
  
  # Shuffling is intentionally left disabled here; it is applied further down.
  # set.seed(1)
  # example.data <- example.data[sample(nrow(example.data)),]
  
  # Helper: run CVRandomForestClassificationMatrixForPheatmap quietly with the
  # settings shared by every call in this test; only the data and the fold
  # count vary.
  run.cv.quietly <- function(cv.data, folds) {
    invisible(capture.output(
      cv.output <- CVRandomForestClassificationMatrixForPheatmap(
        input.data = cv.data,
        factor.name.for.subsetting = "sep.xy.ab",
        name.of.predictors.to.use = c("x", "y", "a", "b"),
        target.column.name = "actual",
        seed = 2,
        percentile.threshold.to.keep = 0.5,
        number.of.folds = folds
      )
    ))
    cv.output
  }
  
  invisible(capture.output(
    result.LOOCV <- LOOCVRandomForestClassificationMatrixForPheatmap(
      input.data = example.data,
      factor.name.for.subsetting = "sep.xy.ab",
      name.of.predictors.to.use = c("x", "y", "a", "b"),
      target.column.name = "actual",
      seed = 2,
      should.mtry.and.ntree.be.optimized = FALSE,
      percentile.threshold.to.keep = 0.5
    )
  ))
  
  # NOTE(review): -1 is passed as the fold count; presumably the function
  # treats it as "one fold per observation" within each subset -- confirm
  # against the function's documentation.
  result.CV <- run.cv.quietly(example.data, -1)
  
  #Results from CVRandomForestClassificationMatrixForPheatmap should be equivalent to results
  #from LOOCVRandomForestClassificationMatrixForPheatmap if
  #1. Number of folds is equal to number of observations.
  #2. PCA is not used.
  #3. Default mtry and ntree is used.
  expect_equal(result.LOOCV, result.CV)
  
  #Check if 10, 4 and 3 fold work. Should not get any errors.
  run.cv.quietly(example.data, 10)
  run.cv.quietly(example.data, 4)
  run.cv.quietly(example.data, 3)
  
  #Should expect an error because subsetting and then doing two-fold CV will cause
  #the two folds of the 4/5 subsetted data to each have only one target value (4 or 5).
  expect_error(run.cv.quietly(example.data, 2))
  
  #Error should be gone after shuffling the data
  set.seed(1)
  example.data.shuffled <- example.data[sample(nrow(example.data)),]
  result.two.fold.CV <- run.cv.quietly(example.data.shuffled, 2)
  
  #Check if the function outputs a list of two data frames as its second output
  expect_length(result.two.fold.CV[[2]], 2)
  
  #Check if the function outputs a list of two vectors as its third output.
  #(This previously re-checked the second output by mistake.)
  expect_length(result.two.fold.CV[[3]], 2)
  
  #TODO: check that the predicted values in result.two.fold.CV[[3]] can be used
  #to recalculate the MCC values reported by the function. The original test
  #only extracted the two prediction vectors without asserting anything.
  
})


test_that("eval.classification.results works", {
  # Feeds the predictions of a default random forest into
  # eval.classification.results() and checks the shape of what comes back.

  toy.data <- GenerateExampleDataMachinelearnr()

  # Fixed seed so the forest and its predictions are reproducible.
  set.seed(1)
  forest <- randomForest::randomForest(
    x = toy.data[, c("x", "y", "a", "b")],
    y = toy.data[, "actual"]
  )

  metrics <- eval.classification.results(
    actual = as.character(toy.data$actual),
    predicted = as.character(forest$predicted)
  )

  # The output should be a list with four objects.
  expect_length(metrics, 4)

  # The second object should be a table.
  expect_equal(class(metrics[[2]]), "table")

  # The third object should be a Momocs::classification_metrics() object whose
  # $accuracy equals one minus the OOB error rate of the forest once all of
  # its trees have been grown (last row, first column of err.rate).
  error.curve <- forest$err.rate
  final.oob.error <- as.numeric(error.curve[nrow(error.curve), 1])
  expect_equal(metrics[[3]]$accuracy, 1 - final.oob.error)

})
yhhc2/machinelearnr documentation built on Dec. 23, 2021, 7:19 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
yhhc2/machinelearnr
Package with various functions for preprocessing, clustering, and classification

tests/testthat/test-classification.R
In yhhc2/machinelearnr: Package with various functions for preprocessing, clustering, and classification

R Package Documentation

Browse R Packages

We want your feedback!

yhhc2/machinelearnr Package with various functions for preprocessing, clustering, and classification

tests/testthat/test-classification.R In yhhc2/machinelearnr: Package with various functions for preprocessing, clustering, and classification

R Package Documentation

Browse R Packages

We want your feedback!

yhhc2/machinelearnr
Package with various functions for preprocessing, clustering, and classification

tests/testthat/test-classification.R
In yhhc2/machinelearnr: Package with various functions for preprocessing, clustering, and classification