library(RUnit)
#----------------------------------------------------------------------------------------------------
runTests <- function()
{
test_splitNames()
test_eliminateDuplicateRows()
test_expand.single.row.when.needed()
test_do.it.all()
test_double.check()
} # runTests
#----------------------------------------------------------------------------------------------------
if(!exists("test.tbl"))
load("test.tbl.RDAta")
if(!exists("roots.tsv"))
read.table("roots.tsv", header=TRUE, sep="\t")
#----------------------------------------------------------------------------------------------------
eliminateDuplicateRows <- function(tbl)
{
last.column <- ncol(tbl)
mtx <- as.matrix(tbl[, 3:last.column])
variances.of.just.the.number.columns.after.making.a.matrix <- apply(mtx, 1, var)
tbl$variance <- variances.of.just.the.number.columns.after.making.a.matrix
# adding the column "variance" to the data frame "tbl"
# used the apply function to get the variance for each row in "tbl"
tbl.sorted <- tbl[order(tbl$Alias, tbl$variance, decreasing= TRUE), ]
# sorted the table in decreasing order of the "Alias" & "variance" columns.
# sorting the data frame this way allows us see the duplicate Alias names and the variance associated
# By having it in decreasing order, we can then delete the duplicates with the lowest variance
# Get rid of duplicates
tbl.sorted.nodupes <- tbl.sorted[-which(duplicated(tbl.sorted$Alias)), ]
# deleting the duplicates (with the lower variance) from the Alias column
# setting these changes in another version of the data frame, named "tbl.sorted.nodupes"
rownames(tbl.sorted.nodupes) <- tbl.sorted.nodupes$Alias
# Setting the Alias column names to now be the rownames of the data frame (it was previously in numerical order)
tbl.sorted.nodupes <- tbl.sorted.nodupes[, -c(1:2)]
#deleting columns 1 & 2 from the data frame: the "id" column & "Alias" column.
#In the previous step, the Alias column was used for the rownames, so it is not unnecessary to have the column
variance.column <- grep("variance", colnames(tbl.sorted.nodupes))
#search for the column name with "variance". set it as variance.column
tbl.sorted.nodupes <- tbl.sorted.nodupes[, -variance.column]
#delete variance column
return(tbl.sorted.nodupes)
#result is an organized table with no duplicate rows
} # eliminateDuplicateRows
#----------------------------------------------------------------------------------------------------
test_eliminateDuplicateRows <- function()
#create test for eliminateDuplicateRows
{
message(sprintf("--- test_eliminateDuplicateRows"))
#print message stated
there.are.duplicates <- any(duplicated(test.tbl$Alias))
#set a variable stating that there are duplicates in the Alias column on the test.tbl
checkTrue(there.are.duplicates)
#check if there are duplicates
tbl.fixed <- eliminateDuplicateRows(test.tbl)
#set a variable showing the result table
checkTrue(nrow(tbl.fixed) < nrow(test.tbl))
#check if there are less rows in the result table than than the original table
checkTrue(ncol(tbl.fixed) < ncol(test.tbl))
#check if there are less columns in the result table than the original table
checkEquals((length(grep(";", test.tbl$Alias))), (length(grep(";", rownames(tbl.fixed)))))
#check if the # of times ";" is found in the orginal table is the same as the result table
#in our first original table (the smaller table before the entire root.tbl is used), there should be 15
} # test_loadData
#----------------------------------------------------------------------------------------------------
# create function splitNames
old.splitNames <- function(string)
{
if(!grepl(";", string))
{return(string)}
#if there isnt a ";" in the parameter, just simply return the the parameter
else if(grepl(";", string)){
#if there is a ";" in our input, do the following:
singleOrfNames <- unlist(strsplit(string, ";"))
#split the rownames by ";", this alone will print a list, so "unlist" is used to print out characters/strings
return(singleOrfNames)}
}
#---------------------------------------------------------------------------------------------------
splitNames <- function(names)
{
unlist(strsplit(names, ";"))
} # splitNames
#---------------------------------------------------------------------------------------------------
test_splitNames <- function()
{
message(sprintf("--- test_splitNames"))
checkEquals(splitNames("abc"), "abc")
checkEquals(splitNames("abcd;efgh"), c("abcd", "efgh"))
checkEquals(splitNames(c("abc", "abcd;efgh")), c("abc","abcd", "efgh"))
} # test_splitNames
#---------------------------------------------------------------------------------------------------
test_old.splitNames <- function()
{
message(sprintf("--- test_old.splitNames"))
checkEquals(old.splitNames("abc"), "abc")
checkEquals(old.splitNames(""), "")
#checking for no splits
result <- old.splitNames("abcd;efgh")
checkEquals(result, c("abcd", "efgh"))
#checking for 1 splits
result.2 <- old.splitNames("abcd;efgh;ijkl")
checkEquals(result.2, c("abcd", "efgh", "ijkl"))
#checking for 2 splits
result.3 <- old.splitNames("abcd;efgh;ijkl;mnop")
checkEquals(result.3, c("abcd", "efgh", "ijkl", "mnop"))
#checking for 3 splits
result.4 <- old.splitNames("abcd;efgh;ijkl;mnop;qrst")
checkEquals(result.4, c("abcd", "efgh", "ijkl", "mnop", "qrst"))
#checking for 4 splits
} # test_splitNames
#----------------------------------------------------------------------------------------------------
#create function splitNamesDoubleRows by using previously made splitNames function
tbl <- eliminateDuplicateRows(test.tbl)
splitNamesRepeatRows <- function(tbl)
{
mtx <- as.matrix(numRepeatRows)
numRepeatRows <- list(apply(mtx, 1, length))
x <- lapply(1:nrow(tbl), function(i) splitNames(tbl[i,]))
# (1:nrow(tbl)) creates a sequence/list to loop over by making a vector of integers calculated from the # of rows in "tbl"
# in this case, tbl 400 rows so integers 1-25
# "i" allows any variable to be used in the function (mainly loops); is simply used as an index
# lapply takes the vector of integers calculated from the # of rows in tbl & the function as inputs so the function is applied to each element in the vector
# sets it to a variable "x" to make life easier
tbl.combined <- do.call(rbind, x)
}
#----------------------------------------------------------------------------------------------------
test_splitNamesRepeatRows <- function()
{
message(sprintf("--- test_splitNamesRepeatRows"))
stopifnot(exists("test.tbl"))
tbl.fixed <- eliminateDuplicateRows(test.tbl)
tbl.tiny <- tbl.fixed[1:6, 1:5]
} # test_splitNamesRepeatRows
#----------------------------------------------------------------------------------------------------
expand.single.row.when.needed <- function(x)
{
stopifnot(nrow(x)==1)
new.row.names <- splitNames(rownames(x[,,drop=FALSE]))
total.row.count <- length(new.row.names)
if(total.row.count == 1)
return(x)
# we get here only if total.row.count > 1
tbl.expanded <- matrix(rep(t(x), total.row.count), ncol = ncol(x) , byrow = TRUE )
colnames(tbl.expanded) <- colnames(x)
rownames(tbl.expanded) <- new.row.names
return(tbl.expanded)
} # expand.single.row.when.needed
#----------------------------------------------------------------------------------------------------
test_expand.single.row.when.needed <- function()
{
message(sprintf("--- test_expand.single.row.when.needed"))
stopifnot(exists("test.tbl"))
tbl.fixed <- eliminateDuplicateRows(test.tbl)
tbl.tiny <- as.matrix(tbl.fixed[1:6, 1:5])
# row 2 of tbl.tiny has a simple single orf name, ATMG00840
tbl.expanded2 <- expand.single.row.when.needed(tbl.tiny[2,,drop=FALSE])
checkEquals(dim(tbl.expanded2), c(1, 5))
# row 1 has a double orf name: ATMG00850;AT2G07682
tbl.expanded.1 <- expand.single.row.when.needed(tbl.tiny[1,,drop=FALSE])
checkEquals(dim(tbl.expanded.1), c(2, 5))
checkEquals(rownames(tbl.expanded.1), c("ATMG00850", "AT2G07682"))
# do a bunch of checks here
checkEquals(rownames(tbl.expanded.1), splitNames(rownames(tbl.tiny[1,,drop=FALSE])))
checkEquals(rownames(tbl.expanded2), splitNames(rownames(tbl.tiny[2,,drop=FALSE])))
# now for a more complete and realistic test, iterate over the whole of the matrix,
# row by row, ucalling expand.single.row.when.needed on each row. since apply (an
# otherwise sensible solution) drops row names, let's try an old-fashioned for loop
result <- list()
for(i in 1:nrow(tbl.tiny)){
result[[i]] <- expand.single.row.when.needed(tbl.tiny[i,,drop=FALSE])
}
mtx.expanded <- do.call(rbind, result)
checkEquals(dim(mtx.expanded), c(8, 5))
expected.rownames <- unlist(strsplit(rownames(tbl.tiny), ";"))
checkEquals(rownames(mtx.expanded), expected.rownames)
} # test_expand.single.row.when.needed
#----------------------------------------------------------------------------------------------------
# given the name of a tab-delimited file:
# - read it
# - eliminate duplicate rows using variance
# - mv Alias column to be row names
# - some rows are ;-separated orfs: duplicate them: do this one row at a time, aggregate and rbind
# - support testing and incremental development using optional paramater: numberOfRows to use
do.it.all <- function(data_file, numberOfRows=-1)
{
tbl.Raw <- read.table(data_file, header=TRUE, sep="\t", nrows=numberOfRows)
#reads file
tbl.Organized <- eliminateDuplicateRows(tbl.Raw)
#removes dupes; makes alias column the row names
result <- list()
for(i in 1:nrow(tbl.Organized)){
result[[i]] <- expand.single.row.when.needed(tbl.Organized[i,,drop=FALSE])
}
tbl.Expanded <- do.call(rbind, result)
#expands table
return(tbl.Expanded)
} # do.it.all
#----------------------------------------------------------------------------------------------------
test_do.it.all <- function()
{
message(sprintf("--- test_do.it.all"))
filename <- "roots.tsv"
tbl.small <- read.table(filename, header=TRUE, sep="\t", nrows=350)
tbl.small.organized <- eliminateDuplicateRows((tbl.small))
tbl.good <- do.it.all("roots.tsv", numberOfRows = 350)
checkTrue(nrow(tbl.small.organized) < nrow(tbl.good))
checkEquals(dim(tbl.good), c(212,1939))
checkEquals((grep(";", rownames(tbl.good))), integer(0))
tbl.rownames <- splitNames(rownames(eliminateDuplicateRows(tbl.small)))
checkEquals(rownames(tbl.good), tbl.rownames)
# now test each stage with the full 17770 row table freshly read from disk
tbl.final <- do.it.all(filename)
checkEquals(dim(tbl.final), c(18617, 1939))
originalRootData <- read.table(filen, header=TRUE, sep="\t")
checkEquals(nrow(originalRootData), 17770)
}
#----------------------------------------------------------------------------------------------------
test_double.check <- function()
{
message(sprintf("---test_double.check"))
originalRootData <- read.table("roots.tsv", header=TRUE, sep="\t")
checkEquals(nrow(originalRootData), 17770)
dim_originalRootData <- dim(originalRootData)
organizedRootData <- eliminateDuplicateRows(originalRootData)
# eliminate just 144
checkEquals(dim(organizedRootData), c(17626, 1939))
dim_organizedRootData <- dim(organizedRootData)
entireRoot <- do.it.all("roots.tsv", numberOfRows = -1)
dim_entireRootData <- dim(entireRoot)
# [1] stands for the rows
# [2] stands for the columns
checkTrue(dim_organizedRootData[1] < dim_originalRootData[1])
#as expected from removing duplicate rows
checkEquals(dim_organizedRootData[2], dim_originalRootData[2] - 2 )
#as expected from removing columns "ID" and "Alias"
checkEquals(dim_entireRootData[2], dim_organizedRootData[2])
#no columns were removed/added so it should be the same
checkTrue(dim_entireRootData[1] > dim_organizedRootData[1])
#as expected from expanding the rows with mulitple orf names
}#test_double.check
#----------------------------------------------------------------------------------------------------
if(!interactive())
runTests()
#---------------------------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.