####################################################################################################
####################################################################################################
#### Experiment: measure hit rates with increasing number of variables ordered by DEG p-value ####
###################################################################################################
##### define the file to store memory Image for " the Number of DEG Ordered" test #####
image.dir <- file.path( project.parameters$global$dir$memoryImages, parameters$recountID)
dir.create(image.dir, showWarnings = FALSE, recursive = TRUE)
image.file <- file.path(image.dir, paste(sep = "", "train_test_no._of_DEG_ordered_",parameters$recountID , ".Rdata"))
## For debug: reset the parameteres for all the study cases
## This is used to re-run the analyses on each study case after having changed some parameters in the yaml-specific configuration file
reload.parameters <- TRUE
if (reload.parameters) {
project.parameters <- yaml.load_file(configFile)
project.parameters <- initParallelComputing(project.parameters)
if(exists("studyCases")) {
for (recountID in names(studyCases)) {
parameters <- initRecountID(recountID, project.parameters)
studyCases[[recountID]]$parameters <- parameters
for (dataSetName in grep("_sorted", names(studyCases[[recountID]]),value = TRUE)) {
if(dataSetName == "log2norm_edgeR_sorted" ){
var.numbers <- c( 10, 40, 80, 200, 400, 1000,
seq(from=3000, to=nrow(studyCases[[recountID]]$dataSetName$dataTable )-1, by = 70000), nrow(studyCases[[recountID]]$dataSetName$dataTable))
studyCases[[recountID]]$parameters$var.numbers <- var.numbers
studyCases[[recountID]]$dataSetName$parameters <- parameters
} else if (dataSetName =="log2norm_DESeq2_sorted" ) {
var.numbers <- c( 10, 40, 80, 200, 400, 1000,
seq(from=3000, to=nrow(studyCases[[recountID]]$dataSetName$dataTable )-1, by = 70000), nrow(studyCases[[recountID]]$dataSetName$dataTable))
studyCases[[recountID]]$parameters$var.numbers <- var.numbers
studyCases[[recountID]]$dataSetName$parameters <- parameters
} # end if dataset is included in DEGdataset
} # end for over dataSetNames
# print (studyCases[[recountID]]$parameters$dir$tablesDetail)
}
} # end of
}
## Default for quick test without iterating over all cases
permute <- FALSE
#DEG.object <- DataTableWithDEG(studyCases$filtered)
if (project.parameters$global$compute) {
message.with.time("Train/test all computations with constant training proportion :")
train.test.results.all.DEG.ordered.per.classifier <- list()
##### Loop over recountID ######
# for (recountID in selectedRecountIDs) {
parameters <- studyCases[[recountID]]$parameters
DEGdataset <- studyCases[[recountID]]$datasetsForTest$DEGdataSets
for (classifier in project.parameters$global$classifiers) {
train.test.results.DEG <- list()
#### Associate all computation with permuted and not permuted class lables ####
for (permute in project.parameters$global$permute) {
for (dataSetName in c("log2norm_DESeq2_sorted","log2norm_edgeR_sorted")) {
DEG <- DEGdataset[[dataSetName]]
# DEG <- DEG.object$DEGdataSetsType[[dataset]]
# v <- 1
for(v in 1:length(studyCases[[recountID]]$parameters$var.numbers)){
varnb <- studyCases[[recountID]]$parameters$var.numbers[v]
## For the time being we do this experiment only with log2 normalised counts
## since we saw that it improves the result with all variables
# data.type <- paste(dataset$dataType, dataset$method, sep = "_")
# data.table <- na.omit( as.data.frame(get(data.type)[["orderedDataTable"]]))
if (dataSetName == "DESeq2"){
selected.DEG.names <- rownames( DEGdataset$orderedDataTable[1:varnb,])
} else{
selected.DEG.names <- rownames( DEG$orderedDataTable[1:varnb,])
}
## Make sure that we select gene names present in the selected data type
## (some genes may be filtered out or technical reasons)
valid.DEG.names <- selected.DEG.names[selected.DEG.names %in% rownames(DEG$orderedDataTable)]
DEG.object <- DEG$orderedDataTable[valid.DEG.names,]
DEGdataset$dataTable <- DEG.object
#DEG$DEG.object <- DEG.object
#DataTableWithTrainTestSets(DEG$DEG.object )
## dim(counts)
#### Define experiment prefix ####
variable.type <- paste(sep="_", "DEG-",dataset, "top", varnb, "var")
exp.prefix <- paste(sep="_", project.parameters$global$classifiers, studyCases[[recountID]]$ID , variable.type)
if (permute) {
exp.prefix <- paste(sep="_", exp.prefix,project.parameters$global$perm.prefix)
}
message (format(Sys.time(), "%Y-%m-%d_%H%M%S"), "\t", paste("Experiment prefix: ", exp.prefix, "DEG-Method", dataset))
train.test.results.DEG[[exp.prefix]] <-
IterateTrainingTesting(DEGdataset,
classifier=classifier,
permute = permute)
} # end of for loop over nb.variables
} # end of for loop over DEG.methods
} # end of for loop over permuted lables
#### Print the results of the effect of the number of DEG ordered on the efficiancy of each classifier ####
ErrorRateBoxPlot(experimentList = train.test.results.DEG,
classifier = classifier,
main = paste( "impact of number of variables sorted according DEG", "\n",
studyCases[[recountID]]$ID,";",
project.parameters$global$iteration,
"iterations;",
"DEG.data.type;" ,
"ordered_variables",sep = ""))
train.test.results.all.DEG.ordered.per.classifier[[recountID]][[classifier]] <- train.test.results.DEG
} # end loop over classifiers
#### Save an image of the results to enable reloading them withouht recomputing everything ####
if (project.parameters$global$save.image) {
save.image(file = image.file)
}
##### if compution not required, you can load the image file without any computations ####
# } # end iteration over recountID
} # end of compute condition
# else {
# # message.with.time("Skipping train/test analysis with DEG-ordered top variables")
# # reload previous results if exist
# if (file.exists(image.file)) {
# message ("Reloading memory image ", image.file)
# load(image.file)
# } else {
# stop("Cannot reload memory image file ", image.file)
# }
#
# }
## For each experiment, the results were stored in an element of the list train.test.results.
## Test the list of experiments done.
names(train.test.results.DEG)
#################################################################################################################
#### results for the imapct of the number of variables (genes) sorted according to DEG ####
#################################################################################################################
# ErrorRateBoxPlot(experimentList = train.test.results.DEG,
# classifier=classifier,
#
# variable.type = "ordered_variables_2",
# main = paste( "impact of number of variables sorted according DEG", "\n",
# parameters$recountID,";",
# parameters$iterations,
# "iterations;",
# "DEG.data.type;" ,
# "ordered_variables",sep = ""))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.