R/cbaf-automatedStatistics.R

Defines functions automatedStatistics

Documented in automatedStatistics

#' @title Perform the requested statistics for various studies / subgroups of a
#' study.
#'
#' @description This function calculates frequency percentage, frequency ratio,
#' mean value and median value of samples greather than specific cutoff in the
#' selected study / subgroups of the study. Furthermore, it can looks for the
#' five genes that contain the highest values in each study / study subgroup.
#' It uses the data generated by obtainOneStudy()/obtainMultipleStudies()
#' function.
#'
#' @details
#' \tabular{lllll}{
#' Package: \tab cbaf \cr
#' Type: \tab Package \cr
#' Version: \tab 1.12.1 \cr
#' Date: \tab 2020-12-07 \cr
#' License: \tab Artistic-2.0 \cr
#' }
#'
#'
#'
#' @importFrom stats median na.omit
#'
#' @importFrom BiocFileCache bfcnew bfcquery bfcpath
#'
#' @importFrom utils head setTxtProgressBar txtProgressBar
#'
#'
#'
#' @include cbaf-obtainOneStudy.R cbaf-obtainMultipleStudies.R
#'
#'
#'
#'
#' @usage automatedStatistics(submissionName, obtainedDataType =
#'   "multiple studies", calculate = c("frequencyPercentage", "frequencyRatio",
#'   "meanValue"), topGenes = TRUE, cutoff=NULL, round=TRUE)
#'
#'
#'
#' @param submissionName a character string containing name of interest. It is
#' used for naming the process.
#'
#' @param obtainedDataType a character string that specifies the type of input
#' data produced by the previous function. Two options are availabe:
#' \code{"single study"} for obtainOneStudy() and \code{"multiple studies"} for
#' obtainMultipleStudies(). The function uses obtainedDataType and
#' submissionName to construct the name of the BiocFileCach object and then
#' finds the appropriate data inside it. Default value is multiple studies`.
#'
#' @param calculate a character vector that containes the statistical procedures
#' users prefer the function to compute. The complete results can be obtained
#' by \code{c("frequencyPercentage", "frequencyRatio", "meanValue",
#' "medianValue")}. This will tell the function to compute the following:
#' \code{"frequencyPercentage"}, which is the percentge of samples having the
#' value greather than specific cutoff divided by the total sample size for
#' every study / study subgroup;
#' \code{"frequency ratio"}, which shows the number of selected samples divided
#' by the total number of samples that give the frequency percentage for every
#' study / study subgroup. It shows the selected and total sample sizes.;
#' \code{"Mean Value"}, that contains mean value of selected samples for each
#' study;
#' \code{"Median Value"}, which shows the median value of selected samples for
#' each study.
#' The default input is \code{calculate = c("frequencyPercentage",
#' "frequencyRatio", "meanValue")}.
#'
#' @param topGenes a logical value that, if set as TRUE, causes the function to
#' create three data.frame that contain the five top genes for each cancer. To
#' get all the three data.frames, "frequencyPercentage", "meanValue" and
#' "MedianValue" must have been included for \code{calculate}.
#'
#' @param cutoff a number used to limit samples to those that are greather than
#' this number (cutoff). The default value for methylation data is \code{0.6}
#' while gene expression studies use default value of \code{2}. For methylation
#' studies, it is \code{average of relevant locations}, for the rest, it is
#' \code{"log z-score"}. To change the cutoff to any desired number, change the
#' option to \code{cutoff = desiredNumber} in which desiredNumber is the number
#' of interest.
#'
#' @param round a logical value that, if set to be \code{TRUE}, will force the
#' function to round all the calculated values to two decimal places. The
#' default value is \code{TRUE}.
#'
#'
#'
#' @return A new section in the BiocFileCache object that was created by one of
#' the obtainOneStudy() or obtainMultipleStudies() functions. It contains a list
#'  that contains some or all of the following statistical measurements for
#'  every gene group, based on what user has chosen: \code{Frequency.Percentage}
#'  , \code{Top.Genes.of.Frequency.Percentage}, \code{Frequency.Ratio},
#'  \code{Mean.Value}, \code{Top.Genes.of.Mean.Value}, \code{Median},
#'  \code{Top.Genes.of.Median}.
#'
#'
#'
#' @examples
#' genes <- list(K.demethylases = c("KDM1A", "KDM1B", "KDM2A", "KDM2B", "KDM3A",
#'  "KDM3B", "JMJD1C", "KDM4A"), K.methyltransferases = c("SUV39H1", "SUV39H2",
#'  "EHMT1", "EHMT2", "SETDB1", "SETDB2", "KMT2A", "KMT2A"))
#'
#' obtainOneStudy(genes, "test", "Breast Invasive Carcinoma (TCGA, Cell 2015)",
#' "RNA-Seq", desiredCaseList = c(3,4))
#'
#' automatedStatistics("test", obtainedDataType = "single study", calculate =
#' c("frequencyPercentage", "frequencyRatio"))
#'
#' @author Arman Shahrisa, \email{shahrisa.arman@hotmail.com} [maintainer,
#' copyright holder]
#' @author Maryam Tahmasebi Birgani, \email{tahmasebi-ma@ajums.ac.ir}
#'
#' @export



################################################################################
################################################################################
############### Automatically calculate statistical measurements ###############
################################################################################
################################################################################

automatedStatistics<- function(

  submissionName,

  obtainedDataType = "multiple studies",

  calculate = c("frequencyPercentage", "frequencyRatio", "meanValue"),

  topGenes = TRUE,

  cutoff = NULL,

  round = TRUE

  ){

  ##############################################################################
  ########## Prerequisites

  # Check submissionName

  if(!is.character(submissionName)){

    stop("'submissionName' must be entered as a character string for naming the process")

  }



  # Obtain the unprocessed data list

  if(obtainedDataType == "multiple studies"){

    previousParamName <- "Parameters for obtainMultipleStudies()"

    paramDeterminant <- "ObtainMultipleStudies"

    databaseType <- "Obtained data for multiple studies"

  } else if(obtainedDataType == "single study"){

    previousParamName <- "Parameters for obtainOneStudy()"

    paramDeterminant <- "ObtainOneStudy"

    databaseType <- "Obtained data for single study"

  } else{

    stop("'obtainedDataType' must be entered as either 'multiple studies' or 'single study'.")

  }



  # Check calculate

  if(is.character(calculate)){

    if(!any(calculate %in% c("frequencyPercentage",

                             "frequencyRatio",

                             "meanValue",

                             "medianValue"))

       ){

      stop("'calculate' must contain at least one of the following: 'frequencyPercentage', 'frequencyRatio', 'meanValue' and 'medianValue'.")

    }

  }else{

    stop("'calculate' must be a character string.")

  }



  # Check topGenes

  if(!is.logical(topGenes)){

    stop("'topGenes' can only accept logical values: TRUE or FALSE .")

  }



  # Check cutoff

  if(!is.null(cutoff) & !is.numeric(cutoff)){

    stop("'cutoff' must be set as NULL or a be a numeric value.")

  }



  # Check round

  if(!is.logical(round)){

    stop("'round' can only accept logical values: TRUE or FALSE .")

  }





  ##############################################################################
  ########## Decide whether function should stops now!

  # Check wheather the requested data exists

  database <- system.file("extdata", submissionName, package="cbaf")

  if(!dir.exists(database)){

    stop("Please run one of the obtainSingleStudy() or obtainMultipleStudies() functions first")

  } else if(dir.exists(database)){

    bfc <- BiocFileCache(

      file.path(system.file("extdata", package = "cbaf"), submissionName),

      ask = FALSE

      )

    if(!nrow(bfcquery(bfc, previousParamName)) == 1){

      stop("Please run one of the obtainSingleStudy() or obtainMultipleStudies() functions first")

    }

  }



  # obtain parameters for prevous function

  previousFunctionParam <-

    readRDS(bfcpath(bfc, bfcquery(bfc, c(previousParamName))$rid))


  # fetch an old parameter from the previous function

  desiredTechnique <- previousFunctionParam$desiredTechnique



  # setting the value for cutoff

  if(desiredTechnique == "methylation"){

    cutoff.phrase <- "average of relevant locations cutoff"

    if(is.null(cutoff)){

      cutoff <- 0.8

    }

  } else{

    cutoff.phrase <- "log z-score cutoff"

    if(is.null(cutoff)){

      cutoff <- 0.5

    }

  }





  # Store the new parameteres

  newParameters <-list()

  newParameters$submissionName <- submissionName

  newParameters$obtainedDataType <- obtainedDataType

  newParameters$calculate <- calculate

  newParameters$cutoff <- cutoff

  newParameters$round <- round

  newParameters$topGenes <- topGenes

  newParameters$desiredTechnique <- desiredTechnique





  # Check wheather the requested data exists

  number.of.rows.parameters <-

    nrow(bfcquery(bfc, "Parameters for automatedStatistics()"))


  if(number.of.rows.parameters == 1){

    oldParameters <-

      readRDS(bfcpath(

        bfc,

        bfcquery(bfc, c("Parameters for automatedStatistics()"))$rid)

    )

    # Check whether the previous function is skipped

    if(previousFunctionParam$lastRunStatus == "skipped"){

      if(identical(oldParameters[-8], newParameters) |

         submissionName %in% c("test", "test2")){

        continue <- FALSE

        # Store the last parameter

        newParameters$lastRunStatus <- "skipped"

        oldParamAutomatedStatistics <- newParameters

        saveRDS(

          oldParamAutomatedStatistics,

          file=bfc[[bfcquery(bfc, "Parameters for automatedStatistics()")$rid]]

          )

        if(submissionName %in% c("test", "test2")){

          message("--- 'test' and 'test2' databases contain sample data and therefore, are not changable. Please use a different submission name. ---")

        }

        message("--- Function 'automatedStatistics()' was skipped: the requested data already exist ---")

      } else{

        continue <- TRUE

      }

    } else{

      continue <- TRUE

    }

  } else{

    continue <- TRUE

  }





  if(continue){

    # Getting the source data

    sourceDataList <- readRDS(bfcpath(bfc, bfcquery(bfc, databaseType)$rid))

    number.of.gene.groups <- sourceDataList[[1]]

    if(!is.list(sourceDataList)){

      stop("Input database must be a list.")

    }



    ############################################################################
    ########## Set the function ready to work

    # creating output fortmat

    processedList <- list()

    # temporarily Inactive

    # options(stringsAsFactors = FALSE)



    # Report

    message("***", "Performing the requested statistical analyses for ", submissionName, "***")



    # Create a progressbar

    total.number <- length(sourceDataList)*length(number.of.gene.groups)

    automatedStatisticsProgressBar <-

      txtProgressBar(min = 0, max = total.number , style = 3)

    ExtA <- 0



    ############################################################################
    ########## Core segment

    # calculating the first 'for' loop for different gene groups

    for(gg in seq_along(sourceDataList)){

      geneNumber <- ncol(sourceDataList[[gg]][[1]])

      temList <- list()



      # Creating empty lists for iterations

      if("frequencyPercentage" %in% calculate){

        Frequency.Percentage <- vector("list", length(number.of.gene.groups))

        if(topGenes){

          Top.Genes.of.Frequency.Percentage <-

            vector("list", length(number.of.gene.groups))

        }

      }


      if("frequencyRatio" %in% calculate){

        Frequency.Ratio <- vector("list", length(number.of.gene.groups))

      }


      if("meanValue" %in% calculate){

        Mean.Value <- vector("list", length(number.of.gene.groups))

        if(topGenes){

          Top.Genes.of.Mean.Value <-

            vector("list", length(number.of.gene.groups))

        }

      }


      if("medianValue" %in% calculate){

        Median.Value <- vector("list", length(number.of.gene.groups))

        if(topGenes){

          Top.Genes.of.Median.Value <-

            vector("list", length(number.of.gene.groups))

        }

      }





      for(cs in seq_along(number.of.gene.groups)){

        # start working on one study

        source.data.subset <- sourceDataList[[gg]][[cs]]

        source.data.subset.name <- names(sourceDataList[[gg]])[cs]

        genes.involved <- colnames(sourceDataList[[gg]][[cs]])





        # Creating and filling the empty matrix with frequency.percentage data

        if("frequencyPercentage" %in% calculate){

          # creating empty matrix

          frequency.percentage.for.a.subset <-

            matrix(, nrow = 1, ncol = geneNumber)


          dimnames(frequency.percentage.for.a.subset) <-

            list(source.data.subset.name, genes.involved)



          # calculate frequency percentage

          for(fp in seq_len(geneNumber)){

            # Subset a column

            a.column <- source.data.subset[,fp]

            a.column.with.absolute.values <- abs(a.column)



            # General statements for core statistics conditions

            frequency <-

              mean(

                as.vector(a.column.with.absolute.values >= cutoff),

                na.rm=TRUE

                )


            mean.with.cutoff.minus.NA <-

              mean(

                as.vector(a.column)[a.column.with.absolute.values >= cutoff],

                na.rm=TRUE

                )


            mean.is.not.na <- !is.na(mean(as.vector(a.column)))

            mean.is.nan.with.cutoff <- is.nan(mean.with.cutoff.minus.NA)

            number.of.not.nan.members <- length((a.column)[!is.nan(a.column)])

            all.members.are.infinite <- all(!is.finite(a.column))



            # Check all members are under cutoff

            if(mean.is.not.na & mean.is.nan.with.cutoff){

              frequency.percentage.for.a.subset[1, fp] <- 0

              # Check all members are NaN

            } else if(number.of.not.nan.members == 0 &

                      all.members.are.infinite &

                      mean.is.nan.with.cutoff){

              frequency.percentage.for.a.subset[1, fp] <- NaN

              # Check all members are NA

            } else if(number.of.not.nan.members > 0 &

                      all.members.are.infinite &

                      mean.is.nan.with.cutoff){

              frequency.percentage.for.a.subset[1, fp] <- NA

              # Mean is bigger than 0

            } else if(mean.with.cutoff.minus.NA > 0 & !mean.is.nan.with.cutoff){

              frequency.percentage.for.a.subset[1, fp] <- 100*frequency

              # Mean is smaller than 0

            } else if(mean.with.cutoff.minus.NA < 0 & !mean.is.nan.with.cutoff){

              frequency.percentage.for.a.subset[1, fp] <- -100*frequency

            }

          }



          # Merging calculations

          if(round){

            Frequency.Percentage[[cs]] <-

              round(frequency.percentage.for.a.subset, digits = 2)

          }else{

            Frequency.Percentage[[cs]] <- frequency.percentage.for.a.subset

          }





          if(topGenes){

            # Check if manual naming is requested

            pre.topGenes <- frequency.percentage.for.a.subset

            # Removing NaN and NA

            pre.topGenes[is.nan(pre.topGenes) | is.na(pre.topGenes)] <- 0



            # Finding the top 5 values

            topGenes.values <-

              head(unique(sort(pre.topGenes, decreasing = TRUE)), n = 5)

            # Creating empty list for iterations

            complete.top.list <- vector("list", length(topGenes.values))


            for(topV in seq_along(topGenes.values)){

              topGene.name <-

                colnames(pre.topGenes)[pre.topGenes %in% topGenes.values[topV]]

              # check whether ttwo or more genes have the same rank

              if(length(topGene.name) > 1){

                topGene.name <- paste(topGene.name, collapse = ", ")

              }

              # rounding

              if(round){

                complete.top <- data.frame(

                  topGene = topGene.name,

                  topValue = round(topGenes.values[topV], digits = 2)

                  , stringsAsFactors = FALSE

                             )

              } else{

                complete.top <- data.frame(

                  topGene = topGene.name,

                  topValue = topGenes.values[topV],

                  stringsAsFactors = FALSE

                             )

              }

              # correcting column names

              colnames(complete.top) <- c(paste(topV, "th ", "Gene", sep=""),

                                          paste(topV, "th ", "Value", sep=""))

              # complete list

              complete.top.list[[topV]] <- complete.top

            }

            # Merge list to give post.topGenes

            post.topGenes <- do.call("cbind", complete.top.list)

            # correcting rowname

            rownames(post.topGenes) <- source.data.subset.name


            # fixing the problem caused by more thank one gene with same rank

            if(length(topGenes.values) < 5){

              # Repeat unit

              fix.dataframe <- data.frame(

                topGene = "-",

                topValue = "-",

                stringsAsFactors = FALSE

                           )

              # number of new units

              newUnits <- 5 - length(topGenes.values)

              # finding current number of units

              oldUnits <- length(topGenes.values)

              for(empty in seq_len(newUnits)){

                colnames(fix.dataframe) <-

                  c(paste(oldUnits + empty, "th ", "Gene", sep=""),

                    paste(oldUnits + empty, "th ", "Value", sep=""))

                post.topGenes <- cbind(post.topGenes, fix.dataframe)

              }

            }

            # assigning the value to the second level list

            Top.Genes.of.Frequency.Percentage[[cs]] <- post.topGenes

          }

        }










        # Creating and filling the empty matrix with frequency.ratio data

        if("frequencyRatio" %in% calculate){

          # creating empty matrix

          frequency.ratio.for.a.subset <- matrix(, nrow = 1, ncol = geneNumber)

          dimnames(frequency.ratio.for.a.subset) <-

            list(source.data.subset.name, genes.involved)



          # calculate frequency ratio

          for(fr in seq_len(geneNumber)){

            # Subset a column

            a.column <- source.data.subset[,fr]

            a.column.with.absolute.values <- abs(a.column)



            # General statements for core statistics conditions

            frequency <-

              mean(

                as.vector(a.column.with.absolute.values >= cutoff),

                na.rm=TRUE

              )


            mean.with.cutoff.minus.NA <-

              mean(

                as.vector(a.column)[a.column.with.absolute.values >= cutoff],

                na.rm=TRUE

                )


            mean.is.not.na <- !is.na(mean(as.vector(a.column)))

            mean.is.nan.with.cutoff <- is.nan(mean.with.cutoff.minus.NA)

            number.of.not.nan.members <- length((a.column)[!is.nan(a.column)])

            all.members.are.infinite <- all(!is.finite(a.column))



            # Check all members are under cutoff

            if(mean.is.not.na & mean.is.nan.with.cutoff){

              frequency.ratio.for.a.subset[1, fr] <-

                paste0("0 out of ", as.character(length(as.vector(a.column))))

              # Check all members are NaN

            } else if (number.of.not.nan.members == 0 &

                       all.members.are.infinite &

                       mean.is.nan.with.cutoff){

              frequency.ratio.for.a.subset[1, fr] <- NaN

              # Check all members are NA

            } else if (number.of.not.nan.members > 0 &

                       all.members.are.infinite &

                       mean.is.nan.with.cutoff){

              frequency.ratio.for.a.subset[1, fr] <- NA

              # Mean is number

            } else if (!mean.is.nan.with.cutoff){

              frequency.ratio.for.a.subset[1, fr] <-

                paste(as.character(length(na.omit(as.vector(a.column)[

                  a.column.with.absolute.values >= cutoff]))), " out of ",

                  as.character(length(as.vector(a.column))), sep="")

            }

          }



          # Merging calculations

          Frequency.Ratio[[cs]] <- frequency.ratio.for.a.subset

        }










        # Creating and filling the empty matrix with mean.value data

        if("meanValue" %in% calculate){

          # creating empty matrix

          mean.value.for.a.subset <- matrix(, nrow = 1, ncol = geneNumber)

          dimnames(mean.value.for.a.subset) <-

            list(source.data.subset.name, genes.involved)



          # calculate Mean value

          for(mv in seq_len(geneNumber)){

            # Subset a column

            a.column <- source.data.subset[,mv]

            a.column.with.absolute.values <- abs(a.column)



            # General statements for core statistics conditions

            frequency <-

              mean(

                as.vector(a.column.with.absolute.values >= cutoff),

                na.rm=TRUE

                )


            mean.with.cutoff.minus.NA <-

              mean(

                as.vector(a.column)[a.column.with.absolute.values >= cutoff],

                na.rm=TRUE

                )

            mean.is.not.na <- !is.na(mean(as.vector(a.column)))

            mean.is.nan.with.cutoff <- is.nan(mean.with.cutoff.minus.NA)

            number.of.not.nan.members <- length((a.column)[!is.nan(a.column)])

            all.members.are.infinite <- all(!is.finite(a.column))



            # Check all members are under cutoff

            if(mean.is.not.na & mean.is.nan.with.cutoff){

              mean.value.for.a.subset[1, mv] <- 0

              # Check all members are NaN

            } else if (number.of.not.nan.members == 0 &

                       all.members.are.infinite &

                       mean.is.nan.with.cutoff){

              mean.value.for.a.subset[1, mv] <- NaN

              # Check all members are NA

            } else if (number.of.not.nan.members > 0 &

                       all.members.are.infinite &

                       mean.is.nan.with.cutoff){

              mean.value.for.a.subset[1, mv] <- NA

              # Mean is number

            } else if (!mean.is.nan.with.cutoff){

              mean.value.for.a.subset[1, mv] <-

                mean(

                  as.vector(a.column)[a.column.with.absolute.values >= cutoff],

                  na.rm=TRUE

                )

            }

          }



          # Merging calculations

          if(round){

            Mean.Value[[cs]] <- round(mean.value.for.a.subset, digits = 2)

          }else{

            Mean.Value[[cs]] <- mean.value.for.a.subset

          }





          if(topGenes){

            # Check if manual naming is requested

            pre.topGenes <- mean.value.for.a.subset

            # Removing NaN and NA

            pre.topGenes[is.nan(pre.topGenes) | is.na(pre.topGenes)] <- 0



            # Finding the top 5 values

            topGenes.values <-

              head(unique(sort(pre.topGenes, decreasing = TRUE)), n = 5)

            # Creating empty list for iterations

            complete.top.list <- vector("list", length(topGenes.values))


            for(topV in seq_along(topGenes.values)){

              topGene.name <-

                colnames(pre.topGenes)[pre.topGenes %in% topGenes.values[topV]]

              # check whether ttwo or more genes have the same rank

              if(length(topGene.name) > 1){

                topGene.name <- paste(topGene.name, collapse = ", ")

              }

              # rounding

              if(round){

                complete.top <- data.frame(

                  topGene = topGene.name,

                  topValue = round(topGenes.values[topV], digits = 2)

                  , stringsAsFactors = FALSE

                )

              } else{

                complete.top <- data.frame(

                  topGene = topGene.name,

                  topValue = topGenes.values[topV],

                  stringsAsFactors = FALSE

                )

              }

              # correcting column names

              colnames(complete.top) <- c(paste(topV, "th ", "Gene", sep=""),

                                          paste(topV, "th ", "Value", sep=""))

              # complete list

              complete.top.list[[topV]] <- complete.top

            }

            # Merge list to give post.topGenes

            post.topGenes <- do.call("cbind", complete.top.list)


            # correcting rowname

            rownames(post.topGenes) <- source.data.subset.name


            # fixing the problem caused by more thank one gene with same rank

            if(length(topGenes.values) < 5){

              # Repeat unit

              fix.dataframe <- data.frame(

                  topGene = "-",

                  topValue = "-",

                  stringsAsFactors = FALSE

                  )

              # number of new units

              newUnits <- 5 - length(topGenes.values)

              # finding current number of units

              oldUnits <- length(topGenes.values)

              for(empty in seq_len(newUnits)){

                colnames(fix.dataframe) <-

                  c(paste(oldUnits + empty, "th ", "Gene", sep=""),

                    paste(oldUnits + empty, "th ", "Value", sep=""))

                post.topGenes <- cbind(post.topGenes, fix.dataframe)

              }

            }

            # assigning the value to the second level list

            Top.Genes.of.Mean.Value[[cs]] <- post.topGenes

          }

        }










        # Creating and filling the empty matrix with median.value data

        if("medianValue" %in% calculate){

          # creating empty matrix

          median.value.for.a.subset <- matrix(, nrow = 1, ncol = geneNumber)

          dimnames(median.value.for.a.subset) <-

            list(source.data.subset.name, genes.involved)



          # calculate median value

          for(mdv in seq_len(geneNumber)){

            # Subset a column

            a.column <- source.data.subset[,mdv]

            a.column.with.absolute.values <- abs(a.column)



            # General statements for core statistics conditions

            frequency <-

              mean(

                as.vector(a.column.with.absolute.values >= cutoff),

                na.rm=TRUE

                )


            mean.with.cutoff.minus.NA <-

              mean(

                as.vector(a.column)[a.column.with.absolute.values >= cutoff],

                na.rm=TRUE

                )


            mean.is.not.na <- !is.na(mean(as.vector(a.column)))

            mean.is.nan.with.cutoff <- is.nan(mean.with.cutoff.minus.NA)

            number.of.not.nan.members <- length((a.column)[!is.nan(a.column)])

            all.members.are.infinite <- all(!is.finite(a.column))



            # Check all members are under cutoff

            if(mean.is.not.na & mean.is.nan.with.cutoff){

              median.value.for.a.subset[1, mdv] <- 0

              # Check all members are NaN

            } else if (number.of.not.nan.members == 0 &

                       all.members.are.infinite &

                       mean.is.nan.with.cutoff){

              median.value.for.a.subset[1, mdv] <- NaN

              # Check all members are NA

            } else if (number.of.not.nan.members > 0 &

                       all.members.are.infinite &

                       mean.is.nan.with.cutoff){

              median.value.for.a.subset[1, mdv] <- NA

              # Mean is number

            } else if (!mean.is.nan.with.cutoff){

              median.value.for.a.subset[1, mdv] <-

                median(

                  as.vector(a.column)[a.column.with.absolute.values >= cutoff],

                  na.rm=TRUE

                  )

            }

          }



          # Merging calculations

          if(round){

            Median.Value[[cs]] <- round(median.value.for.a.subset, digits = 2)

          }else{

            Median.Value[[cs]] <- median.value.for.a.subset

          }





          if(topGenes){

            # Check if manual naming is requested

            pre.topGenes <- median.value.for.a.subset

            # Removing NaN and NA

            pre.topGenes[is.nan(pre.topGenes) | is.na(pre.topGenes)] <- 0



            # Finding the top 5 values

            topGenes.values <-

              head(unique(sort(pre.topGenes, decreasing = TRUE)), n = 5)

            # Creating empty list for iterations

            complete.top.list <- vector("list", length(topGenes.values))


            for(topV in seq_along(topGenes.values)){

              topGene.name <-

                colnames(pre.topGenes)[pre.topGenes %in% topGenes.values[topV]]

              # check whether ttwo or more genes have the same rank

              if(length(topGene.name) > 1){

                topGene.name <- paste(topGene.name, collapse = ", ")

              }

              # rounding

              if(round){

                complete.top <- data.frame(

                  topGene = topGene.name,

                  topValue = round(topGenes.values[topV], digits = 2)

                  , stringsAsFactors = FALSE

                )

              } else{

                complete.top <- data.frame(

                  topGene = topGene.name,

                  topValue = topGenes.values[topV],

                  stringsAsFactors = FALSE

                )

              }

              # correcting column names

              colnames(complete.top) <- c(paste(topV, "th ", "Gene", sep=""),

                                          paste(topV, "th ", "Value", sep=""))

              # complete list

              complete.top.list[[topV]] <- complete.top


            }

            # Merge list to give post.topGenes

            post.topGenes <- do.call("cbind", complete.top.list)

            # correcting rowname

            rownames(post.topGenes) <- source.data.subset.name


            # fixing the problem caused by more thank one gene with same rank

            if(length(topGenes.values) < 5){

              # Repeat unit

              fix.dataframe <- data.frame(

                topGene = "-",

                topValue = "-",

                stringsAsFactors = FALSE

              )

              # number of new units

              newUnits <- 5 - length(topGenes.values)

              # finding current number of units

              oldUnits <- length(topGenes.values)

              for(empty in seq_len(newUnits)){

                colnames(fix.dataframe) <-

                  c(paste(oldUnits + empty, "th ", "Gene", sep=""),

                    paste(oldUnits + empty, "th ", "Value", sep=""))

                post.topGenes <- cbind(post.topGenes, fix.dataframe)

              }

            }

            # assigning the value to the second level list

            Top.Genes.of.Median.Value[[cs]] <- post.topGenes

          }

        }

        # Update progressbar

        ExtA <- ExtA + 1

        setTxtProgressBar(automatedStatisticsProgressBar, ExtA)

      }

      # assign the statistics list fot a subgroup of processedList

      if("frequencyPercentage" %in% calculate){

        temList$Frequency.Percentage <- do.call("rbind", Frequency.Percentage)

        if(topGenes){

          temList$Top.Genes.of.Frequency.Percentage <-

            do.call("rbind", Top.Genes.of.Frequency.Percentage)

        }

      }


      if("frequencyRatio" %in% calculate){

        temList$Frequency.Ratio <- do.call("rbind", Frequency.Ratio)

      }


      if("meanValue" %in% calculate){

        temList$Mean.Value <- do.call("rbind", Mean.Value)

        if(topGenes){

          temList$Top.Genes.of.Mean.Value <-

            do.call("rbind", Top.Genes.of.Mean.Value)

        }

      }


      if("medianValue" %in% calculate){

        temList$Median.Value <- do.call("rbind", Median.Value)

        if(topGenes){

          temList$Top.Genes.of.Median.Value <-

            do.call("rbind", Top.Genes.of.Median.Value)

        }

      }





      processedList[[gg]] <- temList

      names(processedList)[gg] <- names(sourceDataList)[gg]

    }

    # close progressbar

    close(automatedStatisticsProgressBar)



    # Store the prepared Data

    number.of.rows.calculated.data <-

      nrow(bfcquery(bfc, "Calculated statistics"))

    if(number.of.rows.calculated.data == 0){

      saveRDS(

        processedList,

        file=bfcnew(bfc, "Calculated statistics", ext="RDS")

        )

    } else if(number.of.rows.calculated.data == 1){

      saveRDS(

        processedList,

        file=bfc[[bfcquery(bfc, "Calculated statistics")$rid]]

        )

    }



    # Store the last parameter

    newParameters$lastRunStatus <- "succeeded"

    oldParamAutomatedStatistics <- newParameters


    # Store the parameters for this run

    if(number.of.rows.parameters == 0){

      saveRDS(

        oldParamAutomatedStatistics,

        file=bfcnew(bfc, "Parameters for automatedStatistics()", ext="RDS")

        )

    } else if(number.of.rows.parameters == 1){

      saveRDS(

        oldParamAutomatedStatistics,

        file=bfc[[bfcquery(bfc, "Parameters for automatedStatistics()")$rid]]

        )

    }

  }

}

Try the cbaf package in your browser

Any scripts or data that you put into this service are public.

cbaf documentation built on Dec. 9, 2020, 2:02 a.m.