R/functions.private.R

Defines functions plotErrorNumGenes extractGeneLabels SV.dif assignment.conditions queryResultCheck configurePlotOutput plotGeNetClassifierReturn remove.redundancy correlation.net interaction.net svmCrossVal cvSplitSamples difMean PEB iqr.filter create.hypothesis cvNumbers

Documented in plotGeNetClassifierReturn

## I N D E X ##
# cvNumbers
# create.hypothesis
# iqr.filter
# PEB
#   calculateOrder
#   minimumDiff
#   gene.select
# difMean
# linear.SVM
# interaction.net
# correlation.net
# remove.redundancy
# plotGeNetClassifierReturn
# configurePlotOutput
# queryResultCheck
#
# assignment.conditions
# SV.dif
# extractGeneLabels
# plotErrorNumGenes

############################### PRIVATE FUNCTIONS ##############################


## Estima los parametros de cross validation (numero de vueltas del bucle interno y externo) en base al numero de elemtos de cada clase
## Si el numero de elementos por clase es menor que 7, no hay suficientes elementos para realizar el bucle externo: numCV.extern=1
cvNumbers <- function(numElemClass)
{
  numElemClass <- min(numElemClass)
  if (numElemClass <=6)
  {
    numCV.intern <- numElemClass
    numCV.extern <- 1                # NO hacer la externa (no hay suficientes)
  }else{
    numCV.extern <- 5
    numCV.intern <- numElemClass-2
    if (numCV.intern > 8)
      numCV.intern <- 8
  }
  return (c(numCV.intern, numCV.extern))
}

#Funcion Crear hipotesis
# Ejemplo de uso: hipotesis <- create.hypothesis (datosCell$TIPO)
# datosCell es un ExpressionSet, $TIPO la variable utilizada para clasificar
create.hypothesis <- function(classLabels)
{
    classLabels <- factor(classLabels) #Transformed in case it wasn't a factor or the levels weren't updated
    numSamples <- length(classLabels)
    classNames <- levels(classLabels)
    numClasses <- length(classNames)

    # Hipotesis nula (todos iguales)
    hypothesis <- paste(rep(1,numSamples), sep=",", collapse=",")

    if (numClasses == 1){
        warning("There is only one class. Only the null hypothesis was created.", immediate. = TRUE)
        names(hypothesis)<- c("Null Hipothesis")
        return (hypothesis);
    }

    # Add hypothesis for each class
    for (i in classNames)
    {
         newHypothesis <- c()
         for (j in 1:numSamples)
         {
            if (classLabels[j] == i) newHypothesis <- c(newHypothesis, 2)
            else  newHypothesis <- c(newHypothesis, 1)
         }
         newHypothesis <- paste(newHypothesis, sep=",", collapse=",")
         hypothesis <- c(hypothesis, newHypothesis)
         if (numClasses ==2 ) # Only one hypothesis needed. The another one is the complementary.
         {
            names(hypothesis)<- c("Null Hipothesis", "BothClasses")
            return (hypothesis)
         }
    }
    names(hypothesis)<- c("Null Hipothesis", classNames)
    return (hypothesis)
}

## filtrado IRQ
iqr.filter <- function(eset, percentage = 0.50)
{
    if(is.integer(eset[1,1]))        # Transforms in case of integer. !is.numeric doesnt work (it is true even if it is numeric)
    {
        newEset<-cbind(apply(eset,2,as.numeric))
        colnames(newEset)<-colnames(eset)
        rownames(newEset)<-rownames(eset)
        eset<-newEset
    }

    if(percentage>0)
    {
        num <- dim(eset)[[2]]
        lowQ <- rowQ(eset, floor(0.25*num))
        upQ <- rowQ(eset, ceiling(0.75*num))
        iqrs <- upQ - lowQ
        giqrs <- iqrs > stats::quantile(iqrs, probs=percentage)
    }else{
        giqrs <- 1:nrow(eset)
    }
    return(giqrs)
}

## Calculo de las probabilidades posteriores de los genes para cada hipotesis
## Recibe como argumento una matriz con los datos de expresion, el conjunto de hipotesis, el numero de clases,
## el numero de elementos de cada clase y un booleano indicando si se van a devolver solo los genes sobreexpresados (TRUE) o tanto los genes soberexpresados como reprimidos (FALSE)

# En nuestro programa la opcion onlyOverExpressed no se utiliza
PEB <- function(eset, sampleLabels, labelsOrder=NULL,  nullHiphothesisFilter=0.95, untie="bestRank")
{
    hypothesis     <- create.hypothesis (sampleLabels)

    numClasses <- length(hypothesis) -1
    if(numClasses == 1) numClasses <- 2

    postProb <- gene.select(eset, hypothesis)
    colnames(postProb) <- names(hypothesis)
    postProb <- postProb[postProb[,"Null Hipothesis"] < nullHiphothesisFilter,, drop=FALSE]
    if(dim(postProb)[1] == 0) stop("The current genes don't differentiate the classes.")
    # Si hay un solo gen y 1 sola clase puede dar error...

    if(!is.null(labelsOrder))  postProb <-postProb[,c("Null Hipothesis", labelsOrder)]

    genesOrder <- calculateOrder(eset, sampleLabels, postProb, untie=untie)

    # Returns the order and the posterior probabilities
    ret <- new("GenesRanking",  postProb=postProb, numGenesClass=genesOrder$nGenes , ord=genesOrder$ord)
    return(ret)
}


# calculate order
calculateOrder <- function (eset, sampleLabels, postProb,untie)        #untie="bestRank" / "bestPostProb"
{
    numClasses <- ncol(postProb) -1
    ranking <- apply(-postProb[,-1,drop=FALSE], 2, rank, ties.method="min")

    # Desempatar:
    empates <- NULL
    if(numClasses > 2)
    {
        tab<- apply(ranking, 2, table)
        if(is.matrix(tab))
        {
            temp<-NULL
            for(i in 1:dim(tab)[2]) temp[[i]]<-tab[,i]
            names(temp) <- colnames(tab)
            tab <- temp
        }
        empates <- lapply(tab, function(x)   names(which(x>1))) #if (any(x>1))
    }else                             empates <- list(BothClasses=names(which(table(ranking)>1)))

    if(!is.null(names(empates)))
    {
        for (cl in names(empates))
        {
            if(length(empates[[cl]])>0)
            {
                meanDiffs <- lapply( empates[[cl]], function(x) { minimumDiff (rownames(ranking)[which(ranking[,cl] == x)], cl, eset, sampleLabels)})
                names(meanDiffs)  <- empates[[cl]]

                meanDiffRanks <- lapply(meanDiffs, function(x) {rank(-x)-1}) # rank(-meanDiff)-1
                for(rnk in names(meanDiffRanks))
                {
                    ranking[names(meanDiffRanks[[rnk]]),cl] <- as.numeric(rnk) + meanDiffRanks[[rnk]]   # (without drop=FALSE)
                }
            }
        }
    }

    # New order:
    ord <- apply(ranking, 2, order)

    # Remove repeated genes from ord:     Keep genes only in the class with best rank
    if(numClasses > 2 )
    {
        if(untie == "bestRank")    # The gene is kept in the class with best rank. (if there is a tie, maybe the postProbs should be compared, but not likely and would take longer)
        {
            for(gen in 1:dim(ord)[1])
            {
                geneRanks <- which(ord == ord[gen,1], arr.ind=TRUE)
                rmv <- geneRanks[-which(geneRanks[,1] == min(geneRanks[,1]), arr.ind=TRUE)[1],,drop=FALSE]

                ord[rmv] <- 0
                # if (dim(rmv)[1] >0)
                # {
                    # for(i in 1:dim(rmv)[1]) ord[ rmv[i,1],rmv[i,2]]<-0
                # }
            }
        }
        if(untie== "bestPostProb")  # The gene is kept in the class with best postProb.
        {
            for(gen in 1:dim(ord)[1])
            {
                geneRanks <- which(ord == ord[gen,1], arr.ind=TRUE)
                genePostProb <- postProb[ord[gen,1],-1]
                rmv <- geneRanks[-which(geneRanks[,2]==which(genePostProb == max(genePostProb))),,drop=FALSE]
                ord[rmv] <- 0
            }
        }

        nGenes<-apply(ord,2,function(x) { sum(x!=0)})
        # Reorder the matrix
        tmp<- matrix(ncol=ncol(ord), nrow=max(nGenes))
        colnames(tmp) <- colnames(ord)
        for(cl in names(nGenes))
        {
            if(nGenes[cl]>0) tmp[1:nGenes[cl],cl]<-   ord[which(ord[,cl]!=0),cl]
        }
        ord <- tmp
    } else
    {
        nGenes<-dim(ord)[1]
        names(nGenes)<-colnames(ord)
    }
    return(list(ord=ord, nGenesClass=nGenes))
}
## Funcion para la seleccion de genes
## Recibe como argumentos un objeto exprSet y la lista de hipotesis que se quieren contrastar
## devuelve un objeto ''ebarraysPostProb'' con la probabilidad posterior para cada una de las hipotesis para cada gen
## Un objeto de tipo ebarraysPostProb is essentially a matrix of probabilities with number of rows given by the number of
##genes in 'data' and as many columns as the number of hypotheses for the fit. It additionally contains a slot `hypotheses'
##containing these hypotheses.
gene.select <- function (eset, hypothesis){
    pattern <- ebPatterns(hypothesis)
    ggOVR.em.out <- emfit(eset, family = "GG", hypotheses = pattern, num.iter = 10)
    return(round(postprob(ggOVR.em.out, eset)$pattern,12))
}

# Returns the value of the difference between the mean expression of the given class, with the mean expression of the closest class
# This is used for untie the ranking
# minimumDiff (c("ENSG00000178445", "ENSG00000175097"), "ALL", eset, sampleLabels)
minimumDiff <- function (genesDiff, genesClass, eset, sampleLabels)
{
    if(any(!colnames(eset) %in% names(sampleLabels))) stop("The expressionSet doesn't contain the requested samples.")
    if(any(!genesDiff %in% rownames(eset))) stop("The expressionSet doesn't contain the requested genes.")
    if(genesClass %in% "BothClasses") genesClass <- levels(sampleLabels)[1]
    if(any(!genesClass %in% levels(sampleLabels))) stop("The genes' class is not one of the provided sample labels.")
    minDiff <- rep(0, length(genesDiff))
    names(minDiff)  <- genesDiff

    classes <-  levels(sampleLabels)
    numClasses <- length(classes)
    genesClass <- which(classes == genesClass)
    if(length(classes)==2) genesClass <- 1
    for( gen in genesDiff)
    {
        classExprsMean <- rep(0, numClasses)
        names(classExprsMean) <- classes

        for(cl in classes)
        {
            classExprsMean[cl] <- mean(eset[gen, names(sampleLabels) [which(sampleLabels == cl)]])
        }
        #minDiff[gen] <- min(abs(classExprsMean[genesClass] - classExprsMean[which(classExprsMean == max(classExprsMean[-genesClass]))]), abs(classExprsMean[genesClass] - classExprsMean[which(classExprsMean == min(classExprsMean[-genesClass]))]))
        minDiff[gen] <- min(abs(classExprsMean[genesClass] - classExprsMean[-genesClass]))
    }
    return(minDiff)
}


## Mean difference for each gene between a class and the other classes (calculated for all classes for each gene, returns a matrix)
# Used for knowing if the gene is up- or down-regulated
difMean <- function(eset, sampleLabels)
{
  classes <- levels(sampleLabels)
  if(length(classes)==2) classes <- classes[2]   # First class is refference
  dif <- sapply(classes, function(cl)
  {
      samplesClass <- sampleLabels %in% cl
      apply(eset[,samplesClass],1,mean) - apply(eset[,-samplesClass],1, mean)
  })
  # if two classes:
  #if(ncol(dif)==1) colnames(dif) <- "BothClasses"
  return(dif)
}

# Splits the samples into several train and test groups for cross-validation
cvSplitSamples <- function(numCV.extern, sampleLabels)
{
    samples <- split(names(sampleLabels), sampleLabels)

    testSamples <- vector("list", numCV.extern)
    trainSamples <- vector("list", numCV.extern)

    for(clSamples in samples)
    {
        clSamples <- sample(clSamples) # shuffle
        numSamples <- length(clSamples)

        # Split samples:
        numSamplesCV <- rep(floor(numSamples/numCV.extern), numCV.extern)
        if(numSamples%%numCV.extern!=0) numSamplesCV[0:(numSamples-(numSamplesCV[1]*numCV.extern))] <- numSamplesCV[1]+1

        for(i in 1:length(numSamplesCV))
        {
            ini <- sum(numSamplesCV[0:(i-1)])+1
            end <- sum(numSamplesCV[1:i])
            testSamples[[i]] <- c(testSamples[[i]], clSamples[ini:end])
            trainSamples[[i]] <- c(trainSamples[[i]], clSamples[!clSamples %in% testSamples[[i]]])
        }
    }

    cvSamples <- list(train=trainSamples, test=testSamples)
    return(cvSamples)
}


## Funcion para construir el clasificador. LLeva a cabo la Cross validation
## Recibe como argumentos un objeto exprSet, una lista con la clase a la que pertenece cada muestra, el numero de clases,
## num iters de cross validation
## Devuelve una lista con el clasificador construido, la matriz de confusion y el error
## resultado$clasificador, resultado$matriz.confusion, resultado$error

svmCrossVal <- function(trainEset, trainSampleLabels, cvInternSamples, minProbAssignCoeff=1, minDiffAssignCoeff=0.8, kernel="linear", ...)
{
  if(!is.matrix(trainEset)) trainEset <- rbind(NULL,trainEset)
  numClasses <- length(levels(trainSampleLabels))

  mxcf <- matrix(data=0,  nrow=numClasses, ncol=numClasses)
   colnames(mxcf) <- levels(trainSampleLabels)
   rownames(mxcf) <- levels(trainSampleLabels)
   
  for( i in 1:length(cvInternSamples$train))
  {
    # SVM - TRAINING  (phase 1)
    svmClassif  <-  svm(
      x=t(trainEset[,cvInternSamples$train[[i]], drop=FALSE]),
      y=trainSampleLabels[cvInternSamples$train[[i]]],
      kernel=kernel, probability=TRUE, ...) # cost=1

        # In case there are only two classes, svmClassif$SV doesn't contain the gene names
        if (is.null(dimnames(svmClassif$SV)) && numClasses ==2)
        {
            dimnames(svmClassif$SV)<- vector("list",2)
            colnames(svmClassif$SV) <- rownames(trainEset)
        }

        predictTestSamples  <- queryGeNetClassifier(svmClassif, as.matrix(trainEset[,cvInternSamples$test[[i]], drop=FALSE]), verbose=FALSE, minProbAssignCoeff=minProbAssignCoeff, minDiffAssignCoeff=minDiffAssignCoeff)$class
        predictTestSamples  <- predictTestSamples[predictTestSamples!="NotAssigned"] # Ignorar los NA
        predictTestSamples <- factor(predictTestSamples, levels=levels(trainSampleLabels))

    mxcf  <-  mxcf + table(predictTestSamples, trainSampleLabels[names(predictTestSamples)])
  }
  error.mxcf <- (1-(sum(diag(mxcf))/sum( mxcf )))

  numSamples <- table(trainSampleLabels[unlist(cvInternSamples$test)])
  sensitivity.class <- diag(mxcf)/numSamples
  sensitivity.global <- sum(diag(mxcf))/sum(numSamples)

  result <- list(classifier=svmClassif, confMatrix=mxcf, error=error.mxcf, sensitClass=sensitivity.class, sensitGlobal=sensitivity.global)
  return(result)
}


# si lo voy a hacer para todas las clases juntas pero luego separando por las interacciones dentro de la misma clase hay que pasarle los genes en columnas para cada clase en lugar de una lista con todos los genes...
# eset <- as.data.frame(exprs(rmaFiltrado))
# si devolvemos el geneSymbol en lugar del ensg hay que utilizar gnes.human.Annotation
# genes deberia se el equivalente a rankENSG de manera que si es una matriz asignemos a cada gen su clase y si es un vector pues todos directamente

## Devuelve la red de interaccion de los genes pasados como parametro
interaction.net <- function(eset, genes, lp, method="clr", estimator="mi.empirical", threshold=0.5)
{
    # require(infotheo) # required by minet
    net <- cbind(gene1=NULL, class1=character(0), gene2=NULL, class2=character(0), relation=character(0), value=numeric(0))

    #Comprobacion de parametros
    if(is(eset, "ExpressionSet")) eset <- exprs(eset) else if (!is.matrix(eset)) {warning("The first argument (eset) should be an expression matrix or an ExpressionSet.", immediate. = TRUE)
                                return(net)    }
    if(is.vector(genes)) genes <- as.matrix(genes)
    if(!is.matrix(genes))         {warning("The argument 'genes' should be a matrix or a vector.", immediate. = TRUE)
                                return(net)    }
    if(dim(genes)[1]>dim(eset)[1] || sum(which(dim(genes)==0))) {warning("The argument 'genes' can't be neither empty nor have more genes than the available.")
                                                return(net)}
    if(!is.numeric(lp))         {warning("The argument 'lp' should be numeric.", immediate. = TRUE)
                                return(net)    }
        if (sum(lp!=0)==0) return(net)
        if(dim(genes)[2]!=length(lp)){warning("The dimensions of the given genes and lp don't match.", immediate. = TRUE)
                                return(net)    }
        if(sum(lp>dim(genes)[1])>0) {warning("The number of genes in lp is bigger than the available genes.", immediate. = TRUE)
                                lp[1:length(lp)]<-dim(genes)[1]
                                }
    #Calculos
    ensg<-c()
    classes <- c()
    for(i in 1:length(lp))
    {
        if(lp[i]!=0)
        {
            ensg <- c(ensg, genes[1:lp[i],i])
            classes <- c(classes, rep(names(lp[i]), times=lp[i]))
        }
    }
    datos <- eset[as.vector(ensg),]
    if(!is.matrix(datos))
    {
        datos<- rbind(NULL, datos)
        rownames(datos)<- as.vector(ensg)
    }
    if(any(datos == -Inf)) stop("Expression matrix contains invalid values (-Inf).")
    names(classes) <- as.character(ensg)

    mi <- minet(t(datos), method=method, estimator=estimator, disc="equalwidth", nbins=sqrt(nrow(datos)))
    tmp <- which(mi > threshold, arr.ind=TRUE)
    tmp <- tmp[which(tmp[,1] < tmp[,2]),,drop=FALSE]
    if(!is.matrix(class(tmp))) tmp<- rbind(NULL,tmp)

    net <- cbind(gene1=rownames(tmp), class1=classes[rownames(tmp)], gene2=colnames(mi)[tmp[,2]], class2=classes[colnames(mi)[tmp[,2]]], relation=rep(paste("Interaction -",method), times=nrow(tmp)), value=as.numeric(apply(tmp,1,function(x)mi[x[1],x[2]])))
    #rownames(net) <- apply(net[,c("gene1","gene2")], 1, function(x) paste(x[1],x[2],sep="_"))
    rownames(net) <- NULL
    return(net)
}

## Devuelve la red de correlaciones de los genes pasados como parametro
correlation.net <- function(eset, genes, lp, method="pearson", threshold=0.8)
{
    net <- cbind(gene1=NULL, class1=character(0), gene2=NULL, class2=character(0), relation=character(0), value=numeric(0))

    # Check arguments (Choose warning+ret  // stop)
    if(is(eset, "ExpressionSet"))
    {
        eset <- exprs(eset)
    }else
    {
        if (!is.matrix(eset))
        {
            stop("The first argument should be an expression matrix.", immediate. = TRUE)
            return(net)
        }
    }
    if(is.vector(genes)) genes <- as.matrix(genes)
    if(!is.matrix(genes))
    {
        stop("The argument 'genes' should be a matrix or a vector.", immediate. = TRUE)
        return(net)
    }
    if(dim(genes)[1]>dim(eset)[1] || sum(which(dim(genes)==0)))
    {   warning("The argument 'genes' can't be neither empty nor have more genes than the available.")
        return(net)
    }
    if(!is.numeric(lp))
    {
        stop("The argument 'lp' should be numeric.", immediate. = TRUE)
        return(net)
    }
    if (sum(lp!=0)==0) return(net)
    if(dim(genes)[2]!=length(lp))
    {
        stop("The dimensions of the given genes and lp don't match.", immediate. = TRUE)
        return(net)
    }
    if(sum(lp>dim(genes)[1])>0)
    {
        warning("The number of genes in lp is bigger than the available genes.", immediate. = TRUE)
        lp[1:length(lp)] <- dim(genes)[1]
    }

    # Calculo de la correlacion
    ensg<-c()
    classes <- c()
    for(i in 1:length(lp)){
      if(lp[i]!=0)
      {
        ensg <- c(ensg, genes[1:lp[i],i])
        classes <- c(classes, rep(names(lp[i]), times=lp[i]))
      }
    }
    datos <- eset[as.vector(ensg),]
    names(classes) <- as.character(ensg)

    pearsoncor <- stats::cor(t(datos), method=method)
    tmp <- which(pearsoncor > threshold, arr.ind=TRUE)
    tmp <- tmp[which(tmp[,1] < tmp[,2]),,drop=FALSE] #Eliminamos los redundantes porque la matriz es simetrica
    if(!is.matrix(class(tmp))) tmp<- rbind(NULL,tmp)
    net <- cbind(gene1=colnames(pearsoncor)[tmp[,1]], class1=classes[colnames(pearsoncor)[tmp[,1]]], gene2=colnames(pearsoncor)[tmp[,2]], class2=classes[colnames(pearsoncor)[tmp[,2]]], relation=rep(paste("Correlation -",method), times=nrow(tmp)), value=as.numeric(apply(tmp,1,function(x)pearsoncor[x[1],x[2]])))
    #rownames(net) <- apply(net[,c("gene1","gene2")], 1, function(x) paste(x[1],x[2],sep="_"))
    rownames(net) <- NULL
    return(net)
}

## Elimina los genes correlacionados o que interactuan de la lista total de genes
remove.redundancy <- function(eset, genes, lp, net, relation=NULL)
{
    #Comprobacion de parametros
    if(is(eset, "ExpressionSet")) eset <- exprs(eset) else if (!is.matrix(eset))      stop("The first argument should be an expression matrix or an expressionset.")
    if(is.vector(genes)) genes <- as.matrix(genes)
    if(!is.matrix(genes))     stop("The argument 'genes' should be a matrix.")
    if(!is.numeric(lp))     stop("The argument 'lp' should be numeric.")
            if(dim(genes)[2]!=length(lp)){warning("The dimensions of the given genes and lp don't match.", immediate. = TRUE)
                                return(net)    }
    if(!class(net) == "GenesNetwork" && !is.list(net)) stop("The argument 'net' should be the GenesNetwork or a list of GenesNetwork returned by correlation.net or interaction.net.")
    #if (is.matrix(net) && (sum(colnames(net) %in% c("gene1", "class1", "gene2", "class2", "relation", "value")) !=6 ))     stop("The argument 'net' should be the matrix or a list of matrixes returned by correlation.net or interaction.net.")
    if (is.list(net)&!is.null(net))
    {
        netList<- net
        net <- NULL
        i <- 1
        for( cl in 1:length(netList))
        {
            if(!is.null(netList[[cl]]))
            {
                subnet <- netList[[cl]]@edges
                if(!is.matrix(subnet) || (sum(!colnames(subnet) %in% c("gene1", "class1", "gene2", "class2", "relation", "value")) >0 ) )     stop("The argument 'net' should be the GenesNetwork or a list of GenesNetwork returned by correlation.net or interaction.net.")
                if(sum(colnames(subnet) %in% c("class1", "class2")) ==0)
                {
                    classNameRep <- rep(names(netList)[i],dim(subnet)[1])
                    subnet<- cbind(subnet, class1=classNameRep, class2=classNameRep)
                }
                net<- rbind(net,subnet)
            }
            i <- i+1
        }
    }else{
        if(is.null(net)) { return(net) }
        net <- net@edges
    }

    #Filter matrix to remove only the relation requested
    if (!is.null(relation))
    {
        if (relation != "Correlation - pearson" && relation != "Interaction - clr") warning ("The relation was not recognized. Check if the results were what you expected.", immediate=TRUE)
        net <- net[which(net[,"relation"]==relation),]
    }

    #Calculos
    nonRedundantGenes <- list()
    removedGenes <- list()

    for( j in 1:length(lp))
    {
        tmp <- genes[1:lp[j],j]
        tmp <- tmp[which(tmp!="NA")]
        i <- 1
        while(i<length(tmp))     # No cambiar por for, length(tmp) cambia en el bucle
        {
          if (tmp[i] %in% net[,c("gene1","gene2")])
          {
            ind <- which(net[,c("gene1","gene2"), drop=FALSE]==tmp[i] & net[,"class1"]==names(lp[j]) & net[,"class2"]==names(lp[j]), arr.ind=TRUE) # drop=FALSE added. check it works!
            redundant <- unique(c(tmp[i], as.vector(net[ind[,1],c("gene1","gene2")])))[-1]
            if( length(which(tmp %in% redundant)) != 0)
            {
              tmp <- tmp[-which(tmp %in% redundant)]
              removedGenes[[names(lp[j])]] <- unique(c(removedGenes[[names(lp[j])]], redundant))
            }
          }
          i <- i+1
        }
        nonRedundantGenes[[names(lp[j])]] <- tmp
    }
    ret <- list(nonRedundantGenes=nonRedundantGenes, removedGenes=removedGenes)
    return(ret)
}

# Used in main. Also exported as plot() for class Global Return
plotGeNetClassifierReturn <- function( geNetClassifierReturn, fileName=NULL, lpThreshold=0.95, numGenesLpPlot=1000, numGenesNetworkPlot=100, geneLabels=NULL, verbose=TRUE)
{
  if(class(geNetClassifierReturn)[1] != "GeNetClassifierReturn") stop("")
  if(!is.null(fileName) && !is.character(fileName))  stop("The plots name is not a valid name.")
  if(!is.numeric(lpThreshold) || (lpThreshold>=1 || lpThreshold <0)) stop("Lp threshold should be a probability (a number between 0 and 1).")
  if(!is.numeric(numGenesLpPlot)) stop("The number of genes to plot in the posterior probability plot should be a number .")
  if(!is.numeric(numGenesNetworkPlot) || numGenesNetworkPlot< 2) stop("The number of genes to plot in the network should be a number higher than two.")
  if(!is.logical(verbose)) verbose <- TRUE

  ####### LP plot
  if(("genesRanking" %in% names(geNetClassifierReturn)) && any(numGenes(geNetClassifierReturn@genesRanking)>0))
  {
    if(!is.null(fileName))
    {
      pdf(paste(fileName,"_postProb.pdf",sep=""))
    }
    calculateGenesRanking(precalcGenesRanking = geNetClassifierReturn@genesRanking, lpThreshold=lpThreshold, numGenesPlot=numGenesLpPlot, plotLp=TRUE, returnRanking=FALSE, verbose=FALSE)
    if(!is.null(fileName))
    {
      dev.off()
    }
  }

  ####### NETWORK plot
  if(("genesNetwork" %in% names(geNetClassifierReturn)) && length(geNetClassifierReturn@genesNetwork)) if ("igraph" %in% rownames(utils::installed.packages()))
  {
    # Extract requested network
    topGenes <- getRanking(getTopRanking(geNetClassifierReturn@genesRanking, numGenesClass=numGenesNetworkPlot), showGeneID=TRUE, showGeneLabels=FALSE)$geneID
    netTopGenes <- getSubNetwork(geNetClassifierReturn@genesNetwork, topGenes, showWarnings=FALSE)
    if(is.null(fileName))
    {
      plotType <-"dynamic"
      nwFileName <- ""
      plotOnlyConnectedNodesNetwork <- FALSE
    }else{
      plotType <-"pdf"
      nwFileName <- paste(fileName,"_network_top",numGenesNetworkPlot,"Genes.pdf",sep="")
      plotOnlyConnectedNodesNetwork <- TRUE
    }
    plotNetwork(genesNetwork=netTopGenes,  classificationGenes=geNetClassifierReturn@classificationGenes, genesRanking=getTopRanking(geNetClassifierReturn@genesRanking, numGenesNetworkPlot), plotAllNodesNetwork=TRUE, plotOnlyConnectedNodesNetwork=plotOnlyConnectedNodesNetwork,  plotClassifcationGenesNetwork=FALSE, genesInfo=NULL, geneLabels=geneLabels, returniGraphs=FALSE, plotType=plotType, labelSize=0.3, fileName=nwFileName, verbose=FALSE)
  }

  ####### DISCRIMINANT POWER plot
  if (is.null(fileName))
  {
    dpFileName<- NULL
    x11()
  }
  else dpFileName <- paste(fileName,"_discriminantPower.pdf",sep="")

  if(("classifier" %in% names(geNetClassifierReturn)) && length(geNetClassifierReturn@classifier)) plotDiscriminantPower(geNetClassifierReturn@classifier, classificationGenes=geNetClassifierReturn@classificationGenes, fileName=dpFileName, returnTable=FALSE, verbose=FALSE)

  if (verbose && !is.null(fileName)) { message(paste("The plots were saved in ",getwd()," with the prefix '",fileName,"'.",sep="")); utils::flush.console()}
}

# Opens the pdf file name, or, if null, divides the plot window for the appropiate number of plots
# Returns the number of genes for which the screen was prepared. Check if it is lower than provided to show warning.
# After the plot is drawn, the device the devide should be closed:
    # if (!is.null(fileName))
    # {
        # dev.off()
        # if (verbose){ message(paste("The SV plot was saved as ",getwd(),"/",fileName," (PDF file)",sep="")); flush.console()}
    # }
configurePlotOutput <- function(nGenes, fileName=NULL)
{

  if (!is.null(fileName))
  {
    if(!is.character(fileName)) stop("The file name is not valid.")
    pdf(fileName)
  }else
  {
    if(nGenes>20)
    {
      nGenes <- 20
      #    warning(paste("Up to ",nGenes," genes will be shown. To plot more genes specify a PDF output file name.",sep=""))
    }
    if(nGenes<=20)
    {
      #Columns and rows of the plot
      if(nGenes == 4) rows <-2         # 2x2
      else if (nGenes <= 5) rows <- 1
      else if (nGenes <= 10) rows <- 2
      else if (nGenes <= 15 || nGenes == 18) rows <- 3
      else rows <- 4

      cols <-ceiling(nGenes/rows)

      # Split plot window:
      par(mfcol=c(rows,cols))
    }
  }
  return(nGenes)
}



# Comprobar y concatenar si hay varias queries
# globalQueryResult <- queryResultCheck(queryResult)
queryResultCheck <- function(queryResult)
{
  if((!is.list(queryResult)) || (length(queryResult)<2)) stop('The argument should be the result from executing queryGeNetClassifier.')
  else if (!is.factor(queryResult$class) || !is.matrix(queryResult$probabilities))  stop('The argument should be the result from executing queryGeNetClassifier.')

  # Eliminar los $call de la prediccion
  temp<- queryResult
  queryResult<- NULL
  for(i in 1:length(temp))
  {
      if(is.null(temp[i]$call)) queryResult<- c(queryResult, temp[i])
  }

  # Concatenar
  if(length(queryResult)==2) globalQueryResult <- queryResult
  if(length(queryResult)>2)
  {
    globalQueryResult <- c(queryResult[1], queryResult[2])

    numQueryResults <- length(queryResult)/2
    for (i in 2:numQueryResults)
    {
      #Comprobar q coinciden etiquetas de probabilities
      # Probabilities: Tienen q ser exactamente las mismas (en numero y orden)
      if(dim(globalQueryResult$probabilities)[1] !=  dim(queryResult[i*2]$probabilities)[1])
      {
        warning("The classes of the predictors don't match.", immediate. = TRUE)
        return (list(stats=NULL, countNotAssigned=NULL,statsLegend=NULL))
      }
      else
      {
        if (sum(rownames(globalQueryResult$probabilities) != rownames(queryResult[i*2]$probabilities))!=0) #Si las dimensiones son distintas da error
        {
          warning("The classes of the predictors don't match.", immediate. = TRUE)
          return (list(stats=NULL, countNotAssigned=NULL,statsLegend=NULL))
        }
      }

      # Unir las clases (puede que los niveles no coincidan si solo se hayan asignado algunas clases en cada resultado concreto)
      globalNames<- names(globalQueryResult$class)
      globalQueryResult$class <- factor(c(as.character(globalQueryResult$class),as.character(queryResult[(i*2)-1]$class)))
      names(globalQueryResult$class) <- c(globalNames, names(queryResult[(i*2)-1]$class))

      # Unir las probabilidades
      globalQueryResult$probabilities <- cbind(globalQueryResult$probabilities,queryResult[i*2]$probabilities)
    }
  }

  return(globalQueryResult)
}

## Establece los valores minimos de probabilidad de una muestra para que sea asignada a una determinada clase
# Para 2 clases, requiere al menos un 75%
assignment.conditions <- function(prob, minProb, minDiff)
{
  if(length(prob)==2) minProb <- 0

  ord <- order(prob,decreasing=TRUE)
  ret <- names(prob)[ord[1]]

  if(prob[ord[1]]-prob[ord[2]]<=minDiff)
  {
    ret <- "NotAssigned"
  }
  if(prob[ord[1]]<=minProb)
  {
    ret <- "NotAssigned"
  }

  return(ret)
}

## Calcula la diferencia para un gen entre la suma de los valores SV para la clase que diferencian y la siguiente clase mas cercana a ese valor (valor discriminante...)
SV.dif <- function(classifier, gene, originalGeneNames=NULL, correctedAlpha=FALSE)
{
    numClasses <- length(classifier$levels)

    if(!gene %in% colnames(classifier$SV))
    {
        m <- matrix(0)    #Transformed into dataframe colnames to make sure they match the $SV
        colnames(m) <- gene
        m <- data.frame(m)
        #gene <- gsub("^X","",colnames(data.frame(m)))

        colnames(classifier$SV)[which(colnames(classifier$SV)%in% colnames(m))] <- gene
        #colnames(classifier$SV)<-  gsub("^X","",colnames(classifier$SV))    #If the genenames start by number -->DataFrame adds an X to the name
    }

    # same number of elements in plot as the number of support vectors, nrow=max(classifier$nSV)
    tmp <- matrix(ncol=numClasses, nrow=max(classifier$nSV))
    colnames(tmp) <- classifier$levels

    # Si se corrige alfa: Sumar los valores absolutos de los coeficientes y multiplicar por el del gen
    if(correctedAlpha)
    {
        alpha <- apply(classifier$coefs,1,function(x)sum(abs(x)))
        mult <- classifier$SV[,gene]*alpha
    }else
    {
        mult <- classifier$SV
    }

    ind <- 1
    for(i in 1:numClasses)
    {
        # mult para los SV de la clase. Rellenado con la media en las clases que tienen menos.
        if(correctedAlpha)
        {
          tmp[,i] <- mean(mult[ind:(ind+classifier$nSV[i]-1)])
          tmp[1:classifier$nSV[i],i] <- mult[ind:(ind+classifier$nSV[i]-1)]
        } else
        {
          tmp[,i] <- mean(mult[ind:(ind+classifier$nSV[i]-1),gene])
          tmp[1:classifier$nSV[i],i] <- mult[ind:(ind+classifier$nSV[i]-1),gene]
        }
        ind <- ind+classifier$nSV[i]
    }

    # Separate negative/positive
    neg <- tmp
    pos <- tmp
    for(i in 1:numClasses)
    {
        neg[which(neg[,i]>0),i] <- 0
        pos[which(pos[,i]<0),i] <- 0
    }

    # Max and min
    sum_pos <- apply(pos,2,sum)
    maxim <- which(sum_pos == max(sum_pos), arr.ind=TRUE)[1]
    sum_neg <- apply(neg,2,sum)
    minim <- which(sum_neg == min(sum_neg), arr.ind=TRUE)[1]

    # up regulated
    if (max(sum_pos[-maxim]) != 0){         sig <- max(sum_pos[-maxim])     #Si el siguiente valor no es cero
    }else                                     sig <- max(sum_neg[-maxim])        #Si no hay mas "positivas" se busca la negativa
    up_reg <- max(sum_pos)-sig

    # down regulated
    if (min(sum_neg[-minim]) !=0){             sig <- min(sum_neg[-minim])
    }else                                     sig <- min(sum_pos[-minim])
    dw_reg <- min(sum_neg)-sig

    if(numClasses>2)
    {
        # Choose biggest absolute value (The most diferent class' DP)
        if(up_reg < abs(dw_reg)){     discriminantPower <- dw_reg
        }else                                         discriminantPower <- up_reg
    }else
    {
        # Choose smallest absolute value (Taking into account the most likely to be mistaken)
        if(up_reg > abs(dw_reg)){     discriminantPower <- dw_reg
        }else                                         discriminantPower <- up_reg
    }

    # Find save the class for wich it was calculated
    dpClass <- classifier$levels[ifelse(discriminantPower>0,which(sum_pos == max(sum_pos)),which(sum_neg == min(sum_neg)))]

    ret<- list(discriminantPower=discriminantPower, discrPwClass=dpClass ,positive=pos, negative=neg)
    return(ret)
}


# Checks the given annotation format and extracts the desired gene names from it
# Returns: Vector (name=geneID, content=geneShowName), or NULL
# genesAnnotation: Data frame or matrix containing the expressionSet "geneID" as first column or rowname, and the desired "geneName" (symbol) to show in plots and tables
#
extractGeneLabels <- function(genesAnnotation, geneList=NULL)
{
    if((is.null(genesAnnotation)) || (length(genesAnnotation)==0))
    {
        if(is.null(genesAnnotation))             ret <- NULL
        if(length(genesAnnotation)==0)     ret <- genesAnnotation
    }else
    {
        if (is.matrix(genesAnnotation) || is.data.frame(genesAnnotation))
        {
            if(all(dim(genesAnnotation)>1)) stop("geneSymbols is not valid. Please provide only ONE value per gene. ")
            if(dim(genesAnnotation)[1]<dim(genesAnnotation)[2]) genesAnnotation <- t(genesAnnotation)  # Ordered as column
            if(dim(genesAnnotation)[2]>1)
            {
                if(!any(rownames(genesAnnotation)!=genesAnnotation[,1]))  genesAnnotation <- as.matrix(genesAnnotation[,2])        # The rows are named and the first column contain the geneID
                if(is.null(rownames(genesAnnotation)))
                {
                    rownames(genesAnnotation) <- genesAnnotation[,1]    # First column assumed to contain the geneID
                    genesAnnotation <- as.matrix(genesAnnotation[,2])    # Second column assumed to contain the gene show name (symbol)
                }
            }
        }else
        {
            if (is.factor(genesAnnotation) || is.character(genesAnnotation))
            {
                if(length(genesAnnotation) > 0) genesAnnotation <- as.matrix(genesAnnotation) # Transform into matrix
            }else stop("The genesAnnotation should be either a matrix, a dataframe or a vector.")
        }

        if(!is.null(geneList))
        {
            missingGenes <- !geneList %in% rownames(genesAnnotation)
            if(all(missingGenes))
            {
                stop("The given gene IDs are not available in the genes Annotation.")
            }
            if(any(missingGenes))
            {
                warning("Some of the given gene IDs are not available in the genes Annotation. Their alias will not be shown.", immediate.=TRUE)
            }
        }else { geneList <- rownames(genesAnnotation) }

        if(!is.null(rownames(genesAnnotation)))
        {
            ret <- as.character(genesAnnotation[geneList,]) # Returns vector
            names(ret) <- geneList
        }else ret <- genesAnnotation
    }
    return(ret)
}

    # ARREGLAR O ELIMINAR
        plotErrorNumGenes <- function(numGenesTPglobal, numGenesClass, numTrainGenes, numGenesClassGE, plotsName, plotCVgE=FALSE)
        {
            if(!is.null(numGenesTPglobal) && (!is.vector(numGenesTPglobal) && !is.matrix(numGenesTPglobal)))  stop("numGenesTPglobal should be a numeric vector.")
            numClasses <- ncol(numGenesClass)

            pdf(plotsName)

            # buildClassifier=TRUE
            if(!is.null(numGenesTPglobal))
            {
                ######
                # Plot of errorNumGenes (all in one)
                if((length(numGenesTPglobal)>3 && length(numGenesTPglobal)<10) && ("RColorBrewer" %in% rownames(utils::installed.packages())))
                {
                    colores<- RColorBrewer::brewer.pal(length(numGenesTPglobal),"Dark2")
                }else colores <- grDevices::rainbow(length(numGenesTPglobal))
                maxError <- round(1-min(sapply(numGenesTPglobal, function(x){min(x[,numClasses+1])})), 1)
                maxGenes <- ceiling(max(sapply(numGenesTPglobal, function(x){max(apply(x[,1:numClasses, drop=FALSE],1,sum))}))/20)*20
                numGenesTPglobal <- lapply(numGenesTPglobal, function(x)  { cbind(x, totalGenes=apply(x[,1:numClasses,drop=FALSE],1,sum), error=1-x[,numClasses+1])})

                plot.new( )
                plot.window(xlim=c(0, maxGenes), ylim=c(0,maxError))
                title( main="Gene-selection iterations",  xlab="Total number of genes", ylab="Error rate") #sub="(Error  vs Total gene number)",
                axis(1)
                axis(2)
                for(iter in 1:length(numGenesTPglobal))
                {
                    errorNumGenes <- numGenesTPglobal[[iter]][,c("totalGenes", "error"), drop=FALSE]
                    #maxError <- max(errorNumGenes[,"error"])
                    minError <- which(errorNumGenes == min(errorNumGenes[,"error"]), arr.ind=TRUE)
                    graphics::lines(errorNumGenes, type="l", col=colores[iter], lwd=2)
                    graphics::points(errorNumGenes[minError[1],1], errorNumGenes[minError[1],2], type="b", pch=16, col=colores[iter])
                    graphics::text(maxGenes*0.60,maxError-(iter*(maxError*0.05)), paste("Error ",round(errorNumGenes[minError[1],2],2), ": ",errorNumGenes[minError[1],1]," genes",sep=""), col=colores[iter], pos=4)
                }

                ######
                # Gene selection (barplot) # Classifier
                barplot(numGenesClass, beside=TRUE, main="Number of genes selected in each iteration", density=50, col=colores, sub=paste("Total number of selected genes: ", sum(numTrainGenes["Classifier",])), ylim=c(0, max(numGenesClass)), border=colores, xlab="Classes", ylab="Number of genes")

                numTrainGenesPlot <-numGenesClass
                for(cl in 1:dim(numTrainGenes)[2])
                {
                    numTrainGenesPlot[-which(numTrainGenesPlot[,cl] == numTrainGenes["Classifier",cl])[1],cl]<- 0
                }
                graphics::barplot(numTrainGenesPlot, beside=TRUE,add=TRUE, col=colores, border=colores)

                ######
                # Plot max genes recorridos
                maxGenesChecked <- NULL
                for(i in 1:length(numGenesTPglobal)) {
                    maxGenesChecked <- rbind(maxGenesChecked, numGenesTPglobal[[i]][dim(numGenesTPglobal[[i]])[1],1:numClasses])
                }
                graphics::barplot(maxGenesChecked, beside=TRUE, main="Total number of genes explored in each iteration", col=colores, density=50, xlab="Classes", ylab="Number of genes")
                graphics::barplot(numGenesClass, beside=TRUE,add=TRUE, col=colores)
            }

            # GeneralizationError
            if(!is.null(numGenesClassGE) && plotCVgE)
            {
                ######
                # Gene selection (barplot) # GE
                nIters <- dim(numGenesClassGE[[1]])[1]
                if((length(numGenesClassGE)==5) && ("RColorBrewer" %in% rownames(utils::installed.packages())))
                {
                    colores<- RColorBrewer::brewer.pal(9,"Purples")[-c(1,2,3,9)]
                }else colores <- grDevices::rainbow(length(numGenesClassGE))

                numGenesClassGEbinded <-NULL
                for(i in 1:5) numGenesClassGEbinded<-rbind(numGenesClassGEbinded, numGenesClassGE[[i]]) # 5 = GE loops
                colsCV <- NULL
                for(colCVIter in colores) colsCV<-c(colsCV, rep(colCVIter, nIters))

                maxY <- ifelse(is.null(numGenesTPglobal), max(numGenesClassGEbinded), max(max(numGenesClassGEbinded), max(numGenesClass))) # Misma escala
                graphics::barplot(numGenesClassGEbinded, beside=TRUE, main="Genes selected in the 5-fold CV for Generalization Error", density=50, col=colsCV, ylim=c(0, maxY), border=colsCV, xlab="Classes", ylab="Number of genes")

                # Selected genes
                numTrainGenesPlot <-numGenesClassGEbinded
                ind <- 0
                indexes <- NULL
                for(iter in 1:5)
                {
                    indexes <- rbind(indexes, cbind(rep(0, numClasses), 1:numClasses))
                    for(cl in 1:numClasses)
                    {
                        indexes[(iter-1)*numClasses+cl, 1] <- ind + which(numTrainGenesPlot[(ind+1):(ind+nIters), cl] == numTrainGenes[iter,cl])[1]
                    }
                    ind <- ind+nIters
                }

                # Hacer 0...
                temp <- matrix(nrow=nrow(numTrainGenesPlot), ncol=ncol(numTrainGenesPlot), data=0)
                temp[indexes] <- numTrainGenesPlot[indexes]
                numTrainGenesPlot <- temp

                graphics::barplot(numTrainGenesPlot, beside=TRUE,add=TRUE, col=colsCV, border=colsCV)
            }

            dev.off()
        }

Try the geNetClassifier package in your browser

Any scripts or data that you put into this service are public.

geNetClassifier documentation built on Nov. 8, 2020, 4:53 p.m.