R/testDataset.R

Defines functions recommendMethod columnChecks columnLevels testDataset

Documented in columnChecks columnLevels recommendMethod testDataset

## Copyright © 2012-2014 EMBL - European Bioinformatics Institute
##
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
##
##     http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##------------------------------------------------------------------------------
## testDataset.R contains testDataset and modelFormula functions and constructs
## the PhenTestResult class object
##------------------------------------------------------------------------------
## testDataset function performs the following checks for dependent variable:
## 1. "depVariable" column should present in the dataset
## 2. "depVariable" should be numeric for Mixed Model (MM) framework,
## otherwise performs Fisher Exact Test (FE)
## 3. Each one genotype level should have more than one "depVariable" level
## (variability) for MM framework, otherwise recommends FE framework.
## MM framework consists of two functions: startModel and finalModel.
## callAll argument set to TRUE (default) indicates to call both functions
## FET framework consists of only one function FisherExactTest that creates
## count matrix (matrices) and perform test(s).
##------------------------------------------------------------------------------
testDataset <-  function(phenList = NULL,
												 depVariable = NULL,
												 equation = "withWeight",
												 outputMessages = TRUE,
												 pThreshold = 0.05,
												 method = "MM",
												 modelWeight = NULL,
												 callAll = TRUE,
												 keepList = NULL,
												 dataPointsThreshold = 4,
												 RR_naturalVariation = 95,
												 RR_controlPointsThreshold = 60,
												 transformValues = FALSE,
												 useUnfiltered = FALSE,
												 threshold = 10 ^ -18,
												 check     = 1,
												 upper     = 5)
{
  stop_message <- ""
  upper   = max(upper, 2) # HAMED 1st Feb 2019
  transformationRequired <- FALSE
  lambdaValue <- NA
  scaleShift <- NA
  transformationCode <- 0
  columnOfBatch <- NULL
  columnOfWeight <- NULL
  columnOfInterestBatchAdjusted <- NULL
  columnOfInterestBatchWeightAdjusted <- NULL
  # RR and FE special case - unfiltered dataset
  if ((method == "RR" || method == "FE") && useUnfiltered) {
    phenList <- new(
      "PhenList",
      datasetPL = phenList@datasetUNF,
      refGenotype = refGenotype(phenList),
      testGenotype = testGenotype(phenList),
      hemiGenotype = hemiGenotype(phenList)
    )
  }
  ## CHECK ARGUMENTS

  # 1
  if (is.null(phenList) || !is(phenList, "PhenList")) {
    stop_message <-
      paste(stop_message,
            "Error:\nPlease create and specify PhenList object.\n",
            sep = "")
  }

  # 2
  if (is.null(depVariable)) {
    stop_message <-
      paste(
        stop_message,
        "Error:\nPlease specify dependent variable, ",
        "for example: depVariable='Lean.Mass'.\n",
        sep = ""
      )
  }

  # 3
  if (!(equation %in% c("withWeight", "withoutWeight")) &&
      method %in% c("MM", "TF"))
    stop_message <-
    paste(
      stop_message,
      "Error:\nPlease define equation you ",
      "would like to use from the following options: 'withWeight', 'withoutWeight'.\n",
      sep = ""
    )

  # 4  Checks for provided significance values
  if (!is.null(keepList)) {
    ## Stop function if there are no enough needed input parameters
    if ((length(keepList[keepList == TRUE]) + length(keepList[keepList ==
                                                              FALSE])) != 5)
      stop_message <- paste(
        stop_message,
        "Error:\nPlease define the values for 'keepList' list ",
        "where for each effect/part of the model TRUE/FALSE value defines to ",
        "keep it in the model or not, for instance: ",
        "'keepList=c(keepBatch=TRUE,keepEqualVariance=TRUE,keepWeight=FALSE, ",
        "keepSex=TRUE,keepInteraction=TRUE)'.\n",
        sep = ""
      )

  }

  # 5
  if (!(method %in% c("MM", "FE", "RR", "TF", "LR"))) {
    stop_message <-
      paste(
        stop_message,
        "Error:\nMethod define in the 'method' argument '",
        method,
        "' is not supported.\nAt the moment we are supporting 'MM' ",
        "value for Mixed Model framework, 'FE' value for Fisher Exact Test framework",
        "'RR' for Reference Ranges Plus framework and 'TF' value for Time as Fixed Effect framework.\n",
        sep = ""
      )

  }

  # 6
  # Transformation performed only for MM and TF (linear regression methods for continuous data))
  if (method %in% c("RR", "FE", "LR")) {
    transformValues <- FALSE
  }

  # 7
  if (dataPointsThreshold < 2) {
    dataPointsThreshold <- 2
    if (outputMessages)
      message("Warning:\nData points threshold is set to 2 (minimal value).\n")
  }

  # RR_naturalVariation=95, RR_controlPointsThreshold=60
  if (RR_naturalVariation < 60) {
    RR_naturalVariation <- 60
    if (outputMessages)
      message("Warning:\nNatural variation threshold is set to 60 (minimal value).\n")
  }

  if (RR_controlPointsThreshold < 40) {
    RR_controlPointsThreshold <- 40
    if (outputMessages)
      message("Warning:\nControl points threshold is set to 40 (minimal value).\n")
  }

  # 7
  if (nchar(stop_message) == 0) {
    # create small data frame with values to analyse
    columnOfInterest <- getColumn(phenList, depVariable)
    # Presence
    if (is.null(columnOfInterest))
      stop_message <-
        paste(
          "Error:\nDependent variable column '",
          depVariable,
          "' is missed in the dataset.\n",
          sep = ""
        )
  }

  ## STOP CHECK ARGUMENTS

  # Creates a subset with analysable variable
  if (nchar(stop_message) == 0) {
    checkDepV <-
      columnChecks(getDataset(phenList), depVariable, dataPointsThreshold)
    # TRANSFORMATION
    if (transformValues && checkDepV[2] && checkDepV[3]) {
      # check for transformation
      transformationVector <-
        determiningLambda(phenList, depVariable, equation)
      transformationRequired <-
        as.logical(transformationVector[[5]])
      lambdaValue <- as.numeric(transformationVector[[4]])
      transformationCode <- as.numeric(transformationVector[[7]])
      if (!is.na(transformationVector[[6]])) {
        scaleShift <- as.numeric(transformationVector[[6]])
      }
      else {
        scaleShift <- 0
      }
      if (transformationRequired) {
        columnOfInterestOriginal <- columnOfInterest
        columnOfInterest <-
          round(performTransformation(columnOfInterest, lambdaValue, scaleShift),
                digits = 6)
      }
      if (batchIn(phenList)) {
        # Adjusted for batch depVariable values WITHOUT transformation!
        columnOfInterestBatchAdjusted <-
          getColumnBatchAdjusted(phenList, depVariable)
      }
      if (weightIn(phenList)) {
        # Adjusted for weight and batch if present depVariable values WITHOUT transformation!
        columnOfInterestBatchWeightAdjusted <-
          getColumnWeightBatchAdjusted(phenList, depVariable)
      }
    }
    else {
      columnOfInterestOriginal <- columnOfInterest
      # Transformation for LR -> 0/1
    }


    columnOfSex <- getColumn(phenList, "Sex")
    columnOfGenotype <- getColumn(phenList, "Genotype")
    if (batchIn(phenList)) {
      columnOfBatch <- getColumn(phenList, "Batch")
    }

    if (weightIn(phenList)) {
      columnOfWeight <- getColumn(phenList, "Weight")
    }
    # Get rid of factors
    if (class(columnOfInterest) == "factor") {
      columnOfInterest <- as.character(columnOfInterest)
      tryCatch({
        # try to convert into numbers
        columnOfInterest <-
          as.numeric(columnOfInterest)
      },
      warning = function(war) {
        # convert into characters
        columnOfInterest <-
          as.character(columnOfInterest)
      },
      error = function(err) {
        # convert into characters
        columnOfInterest <-
          as.character(columnOfInterest)
      })
    }

    if (!is.null(columnOfBatch) &&
        !is.null(columnOfWeight)) {
      datasetToAnalyse <-
        data.frame(
          columnOfInterest,
          columnOfSex,
          columnOfGenotype,
          columnOfBatch,
          columnOfWeight
        )
      colnames(datasetToAnalyse) <-
        c(depVariable, "Sex", "Genotype", "Batch", "Weight")
    }
    else if (!is.null(columnOfBatch)) {
      datasetToAnalyse <-
        data.frame(columnOfInterest,
                   columnOfSex,
                   columnOfGenotype,
                   columnOfBatch)
      colnames(datasetToAnalyse) <-
        c(depVariable, "Sex", "Genotype", "Batch")
    }
    else if (!is.null(columnOfWeight)) {
      datasetToAnalyse <-
        data.frame(columnOfInterest,
                   columnOfSex,
                   columnOfGenotype,
                   columnOfWeight)
      colnames(datasetToAnalyse) <-
        c(depVariable, "Sex", "Genotype", "Weight")
    }
    else {
      datasetToAnalyse <-
        data.frame(columnOfInterest, columnOfSex, columnOfGenotype)
      colnames(datasetToAnalyse) <-
        c(depVariable, "Sex", "Genotype")
    }
    if (transformValues && transformationRequired) {
      columnNameOriginal <- paste(depVariable, "_original", sep = "")
      datasetToAnalyse[, columnNameOriginal] <-
        columnOfInterestOriginal
    }
    if (!is.null(columnOfInterestBatchAdjusted)) {
      columnNameBatchAdjusted <-
        paste(depVariable, "_batch_adjusted", sep = "")
      datasetToAnalyse[, columnNameBatchAdjusted] <-
        columnOfInterestBatchAdjusted
    }
    if (!is.null(columnOfInterestBatchWeightAdjusted)) {
      columnNameBatchWeightAdjusted <-
        paste(depVariable, "_batch_weight_adjusted", sep = "")
      datasetToAnalyse[, columnNameBatchWeightAdjusted] <-
        columnOfInterestBatchWeightAdjusted
    }

    # HAMEd 21-05-2018 fixed the issue of only including Response/Genotype/Weight/Sex/Batch
    if (sum(!names(phenList@datasetPL) %in% names(datasetToAnalyse)) > 0) {
    	datasetToAnalyse = data.frame(datasetToAnalyse, phenList@datasetPL[, !names(phenList@datasetPL) %in% names(datasetToAnalyse)])
    }

    phenListToAnalyse <-
      new(
        "PhenList",
        datasetPL = datasetToAnalyse,
        refGenotype = refGenotype(phenList),
        testGenotype = testGenotype(phenList),
        hemiGenotype = hemiGenotype(phenList)
      )

    x <- datasetToAnalyse

    checkDepV <- columnChecks(x, depVariable, dataPointsThreshold)

    checkDepVLevels <- columnLevels(x, depVariable)

    checkWeight <- columnChecks(x, "Weight", dataPointsThreshold)

    # MM checks
    # Go first since there are switches between MM to FE
    if (method %in% c("MM", "TF")) {
      # Numeric
      if (!checkDepV[2]) {
        method <- "FE"
        transformValues <- FALSE
        if (outputMessages)
          message(
            paste(
              "Warning:\nDependent variable '",
              depVariable,
              "' is not numeric. Fisher Exact Test will be used for the ",
              "analysis of this dependent variable.\n",
              sep = ""
            )
          )
      }
      else{
        # Variability - the ratio of different values to all values in the column
        if (checkDepVLevels[1] > 0)
          variability <-
            checkDepVLevels[2] / checkDepVLevels[1]
        else
          variability <- 0
        # where checkDepVLevels[2] contains number of levels and checkDepVLevels[1] contains number of data points
        # One level only
        if (checkDepVLevels[2] == 1 ||
            checkDepVLevels[2] == 0) {
          if (transformValues && transformationRequired) {
            stop_message <-
              paste(
                stop_message,
                "Error:\nNo variability in dependent variable '",
                depVariable,
                "'. Try with argument transformValues set to FALSE.\n",
                sep = ""
              )
          }
          else {
            stop_message <-
              paste(
                stop_message,
                "Error:\nNo variability in dependent variable '",
                depVariable,
                "'.\n",
                sep = ""
              )
          }
        }
        else  if (variability < 0.005) {
          stop_message <-
            paste(
              stop_message,
              "Error:\nInsufficient variability in dependent variable '",
              depVariable,
              "' for MM/TF framework. Fisher Exact Test can be better way to do the analysis.\n",
              sep = ""
            )
        }
        # Data points - number of data points in the depVariable column for genotype/sex combinations
        else if (!checkDepV[3])
          stop_message <-
            paste(
              stop_message,
              "Error:\nNot enough data points in dependent variable '",
              depVariable,
              "' for genotype/sex combinations to allow the application of MM/TF framework. ",
              "Threshold used: ",
              dataPointsThreshold,
              ".\n",
              sep = ""
            )

        #Weight checks
        if (equation == "withWeight") {
          if (!checkWeight[1]) {
            if (outputMessages)
              message(
                paste(
                  "Warning:\nWeight column is not present in dataset.\n",
                  "Equation 'withWeight' can't be used in such a case and has been ",
                  "replaced to 'withoutWeight'.\n",
                  sep = ""
                )
              )
            equation <- "withoutWeight"

          }
          else{
            #CHECKS FOR WEIGHT
            # Equality to depVariable
            columnOfInterest <-
              na.omit(columnOfInterest)
            columnOfWeight <- na.omit(columnOfWeight)

            if (length(columnOfInterest) == length(columnOfWeight))

              if (sum(columnOfInterest - columnOfWeight) == 0) {
                if (outputMessages)
                  message(
                    paste(
                      "Warning:\nWeight and dependent variable values seemed to be equivalent. ",
                      "Equation 'withWeight' can't be used in such a case and ",
                      "has been replaced to 'withoutWeight'.\n",
                      sep = ""
                    )
                  )
                equation <- "withoutWeight"
              }

            if (!checkWeight[2]) {
              if (outputMessages)
                message(
                  paste(
                    "Warning:\nWeight column is not numeric.\n",
                    "Equation 'withWeight' can't be used in such a case and has been ",
                    "replaced to 'withoutWeight'.\n",
                    sep = ""
                  )
                )
              equation <- "withoutWeight"
            }

            if (!checkWeight[3]) {
              if (outputMessages)
                message(
                  paste(
                    "Warning:\nWeight column does not have enough data points ",
                    "for for genotype/sex combinations.\n",
                    "Equation 'withWeight' can't be used in such a case and has been ",
                    "replaced to 'withoutWeight'.\n",
                    sep = ""
                  )
                )
              equation <- "withoutWeight"
            }

          }
        }
      }
    }

    # TF checks
    if (method == "TF" && nchar(stop_message) == 0) {
      # Lists possible combinations
      Genotype_levels <- levels(factor(x$Genotype))
      Batch_levels <- levels(factor(x$Batch))
      # upper: HAMED 1st Feb 2019
      if (length(Batch_levels) < 2 || length(Batch_levels) > upper) {
        stop_message <-
        	# upper: HAMED 1st Feb 2019
          paste(
            stop_message,
            "Error:\n'TF' framework requires from 2 to ",upper," batch levels. ",
            "There is/are '",
            length(Batch_levels),
            "' batch level(s) in the dataset.\n",
            sep = ""
          )
      }
      TFDataPoints <- TRUE
      for (i in 1:length(Batch_levels)) {
        BatchSubset <- subset(x, x$Batch == Batch_levels[i])
        for (j in 1:length(Genotype_levels)) {
          GenotypeBatchSubset <- subset(BatchSubset,
                                        BatchSubset$Genotype == Genotype_levels[j])
          columnOfInterestSubset <-
            na.omit(GenotypeBatchSubset[, c(depVariable)])
          if (length(columnOfInterestSubset) == 0) {
            TFDataPoints <- FALSE
          }
        }

      }
      if (!TFDataPoints)
        stop_message <-
        paste(
          stop_message,
          "Error:\n'TF' framework requires data points for at least one sex ",
          "in all genotype/batch level combinations.\n",
          sep = ""
        )
    }

    # FE checks
    if (method == "FE") {
      if (checkDepVLevels[2] == 0)
        stop_message <-
          paste(
            "Error:\nInsufficient data in the dependent variable '",
            depVariable,
            "' to allow the application of Fisher Exact test framework.\n",
            sep = ""
          )
      if (checkDepVLevels[2] > 10)
        stop_message <-
          paste(
            "Error:\nPackage supports up to 10 levels ",
            "in dependent variable in FE framework. ",
            "The variable '",
            depVariable,
            "' has more than 10 levels.\n",
            sep = ""
          )
    }

    # LR checks
    if (method == "LR") {
      if (checkDepVLevels[2] == 0)
        stop_message <-
          paste(
            "Error:\nInsufficient data in the dependent variable '",
            depVariable,
            "' to allow the application of Logistic Regression framework.\n",
            sep = ""
          )
      if (checkDepVLevels[2] != 2) {
        stop_message <-
          paste(
            "Error:\nLogistic regression is applicable only if there are two levels ",
            "in dependent variable. ",
            "The variable '",
            depVariable,
            "' has less or more than 2 levels.\n",
            sep = ""
          )
      }
      else {
        depVariableLevels <- levels(factor(na.omit(columnOfInterest)))
        if (!(depVariableLevels[1] == 0 &&
              depVariableLevels[2] == 1))
          stop_message <-
            paste(
              "Error:\nLogistic regression is applicable only if there are two levels ",
              "in dependent variable: 0/1\n",
              sep = ""
            )
      }
    }

    # RR checks
    if (method == "RR") {
      # Numeric
      if (!checkDepV[2]) {
        method <- "FE"
        if (outputMessages)
          message(
            paste(
              "Warning:\nDependent variable '",
              depVariable,
              "' is not numeric. Fisher Exact Test will be used for the ",
              "analysis of this dependent variable.\n",
              sep = ""
            )
          )
      }
      else{
        if (checkDepVLevels[2] == 0)
          stop_message <-
            paste(
              "Error:\nInsufficient data in the dependent variable '",
              depVariable,
              "' to allow the application of RR plus framework.\n",
              sep = ""
            )
        # Number of control data


        controlSubset <-
          subset(x, x$Genotype == refGenotype(phenList))
        columnOfInterestSubset <-
          na.omit(controlSubset[, c(depVariable)])

        Sex_levels <- levels(factor(x$Sex))

        controlNotEnough <- FALSE
        errorText <- ""

        for (j in 1:length(Sex_levels)) {
          GenotypeSexSubset <- subset(controlSubset,
                                      controlSubset$Sex == Sex_levels[j])

          columnOfInterestSubset <-
            na.omit(GenotypeSexSubset[, c(depVariable)])

          errorText <-
            paste(errorText,
                  Sex_levels[j],
                  " - ",
                  length(columnOfInterestSubset),
                  ", ",
                  sep = "")

          if (length(columnOfInterestSubset) < RR_controlPointsThreshold) {
            controlNotEnough <- TRUE
          }
        }
        errorText <-
          substr(errorText, 1, nchar(errorText) - 2)


        if (controlNotEnough)
          stop_message <-
          paste(
            "Error:\nInsufficient data in the dependent variable '",
            depVariable,
            "' control subset (",
            errorText,
            ") to allow the application of RR plus framework.",
            "\nThe threshold is ",
            RR_controlPointsThreshold,
            " datapoints. \n",
            sep = ""
          )
      }
    }

  }

  ## STOP DATASET'S CHECKS


  # If problems are deteckted
  if (nchar(stop_message) > 0) {
    if (outputMessages)  {
      message(stop_message)
      opt <- options(show.error.messages = FALSE)
      on.exit(options(opt))
      stop()
    }
    else {
      stop()
    }
  }



  # RUN FRAMEWORK
  if (outputMessages)
    message(paste(
      "Information:\nDependent variable: '",
      depVariable,
      "'.\n",
      sep = ""
    ))

  ## Mixed Models framework
  if (method == "MM")    {
    if (callAll) {
      if (outputMessages)
        message(
          paste(
            "Information:\nPerform all MM framework stages: startModel and finalModel.\n",
            sep = ""
          )
        )
    }



    if (outputMessages)
      message(paste("Information:\nMethod: Mixed Model framework.\n", sep =
                      ""))

    result <-
      startModel(
        phenList =phenListToAnalyse,
        depVariable = depVariable,
        equation = equation,
        outputMessages = outputMessages,
        pThreshold = pThreshold,
        keepList = keepList,
        modelWeight = modelWeight,
        threshold = threshold ,
        check = check
      )

    ## Perform all framework methods
    if (callAll && is(result, "PhenTestResult")) {
    	result <-
    		finalModel(
    			phenTestResult = result,
    			outputMessages = outputMessages,
    			modelWeight = modelWeight
    		)
    }

  }
  else if (method == "TF") {
    if (callAll) {
      if (outputMessages)
        message(
          paste(
            "Information:\nPerform all TF framework stages: startTFModel and finalTFModel.\n",
            sep = ""
          )
        )
    }



    if (outputMessages)
      message(paste(
        "Information:\nMethod: Time as Fixed Effect framework.\n",
        sep = ""
      ))

    result <-
      startTFModel(phenListToAnalyse,
                   depVariable,
                   equation,
                   outputMessages,
                   pThreshold,
                   keepList)

    ## Perform all framework methods
    if (callAll && is(result, "PhenTestResult")) {
      result <- finalTFModel(result, outputMessages)
    }
  }
  else if (method == "FE") {
    ## Fisher Exact Test
    if (outputMessages)
      message(
        paste(
          "Information:\nMethod: Fisher Exact Test framework with ",
          ifelse(useUnfiltered, "unfiltered dataset.", "filtered dataset."),
          "\n",
          sep = ""
        )
      )
    result <-
      FisherExactTest(phenListToAnalyse, depVariable, outputMessages)
  }

  else if (method == "RR") {
    ## RR Plus
    if (outputMessages)
      message(
        paste(
          "Information:\nMethod: Reference Ranges Plus framework with ",
          ifelse(useUnfiltered, "unfiltered dataset.", "filtered dataset."),
          "\n",
          sep = ""
        )
      )
    result <-
      RRTest(
        phenListToAnalyse,
        depVariable,
        outputMessages,
        RR_naturalVariation,
        RR_controlPointsThreshold
      )
  }
  else if (method == "LR") {
    ## Logistic Regression
    if (callAll) {
      if (outputMessages)
        message(
          paste(
            "Information:\nPerform all LR framework stages: startLRModel and finalLRModel.\n",
            sep = ""
          )
        )
    }

    if (outputMessages)
      message(paste(
        "Information:\nMethod: Logistic Regression framework.\n",
        sep = ""
      ))

    result <-
      startLRModel(
        phenListToAnalyse,
        depVariable,
        outputMessages = TRUE,
        pThreshold = 0.05#,
        #threshold = threshold
      )

    ## Perform all framework methods
    if (callAll && is(result, "PhenTestResult")) {
      result <- finalLRModel(result, outputMessages)
    }
  }

  result@transformationCode <- transformationCode
  if (transformValues && transformationRequired) {
    # Code = 2 or 3
    result@transformationRequired <-
      as.logical(transformationRequired)
    result@lambdaValue <- lambdaValue
    result@scaleShift <- scaleShift
  }
  else {
    result@transformationRequired <- FALSE
    if (transformValues) {
      # Code = 1 or 4
      result@lambdaValue <- lambdaValue
      result@scaleShift <- scaleShift
    }
  }



  # result@callAll =  callAll
  # result@dataPointsThreshold = dataPointsThreshold
  # result@RR_naturalVariation = RR_naturalVariation
  # result@RR_controlPointsThreshold = RR_controlPointsThreshold
  # result@transformValues = transformValues
  # result@useUnfiltered = useUnfiltered




  return(result)
}
##------------------------------------------------------------------------------
## Returns values after null removing procedure: No of data points, No of levels,
## No of Genotype/Sex combinations, No of data points for each one combination
columnLevels <- function(dataset, columnName) {
  columnOfInterest <- na.omit(dataset[, c(columnName)])


  values <- c(length(columnOfInterest))

  #Test for the data points quantity for Genotype/sex combinations
  Genotype_levels <- levels(factor(dataset$Genotype))
  Sex_levels <- levels(factor(dataset$Sex))
  values <- append(values, length(levels(factor(columnOfInterest))))

  values <-
    append(values, length(Genotype_levels) * length(Sex_levels))

  for (i in 1:length(Genotype_levels)) {
    GenotypeSubset <-
      subset(dataset, dataset$Genotype == Genotype_levels[i])
    for (j in 1:length(Sex_levels)) {
      GenotypeSexSubset <- subset(GenotypeSubset,
                                  GenotypeSubset$Sex == Sex_levels[j])

      columnOfInterestSubset <-
        na.omit(GenotypeSexSubset[, c(columnName)])

      values <- append(values, length(columnOfInterestSubset))

    }
  }
  return (values)
}
##------------------------------------------------------------------------------
## Checks the column for eligibility, returns values:
## presence of column, all data are numeric, No of levels passed checks (number of data points for each genotype/sex
## combination is at least equals to threshold)
columnChecks <-
  function(dataset,
           columnName,
           dataPointsThreshold = 4) {
    presence <- TRUE
    numeric <- FALSE
    levelsCheck <- 0
    variabilityThreshold <- 10
    # Test: dependent variable presence
    if (!(columnName %in% colnames(dataset))) {
      presence <- FALSE
    }
    else {
      columnOfInterest <- na.omit(dataset[, c(columnName)])

      if (all(sapply(columnOfInterest, is.numeric))) {
        numeric <- TRUE
      }

      dataPointsSummary <- columnLevels(dataset, columnName)

      NoCombinations <- dataPointsSummary[3]
      #message(dataPointsSummary[3])
      variabilityThreshold <- NoCombinations
      #if (NoCombinations==4)
      #variabilityThreshold <- 3

      for (i in 1:NoCombinations) {
        if (dataPointsSummary[3 + i] >= dataPointsThreshold)
          levelsCheck <- levelsCheck + 1

      }

    }

    values <-
      c(presence, numeric, (levelsCheck >= variabilityThreshold))

    return (values)
  }

##------------------------------------------------------------------------------
recommendMethod <- function(phenList       = NULL,
                            depVariable    = NULL,
                            outputMessages = TRUE,
														upper          = 5)
{
  # evaluate    dataPointsThreshold, RR_controlPointsThreshold
  stop_message <- ""
  # upper: HAMED 1st Feb 2019
  upper = max(upper,2)
  dataPointsThreshold <- 4
  RR_controlPointsThreshold <- 40
  suggestedFramework <- "NO"

  ## CHECK ARGUMENTS

  # 1
  if (is.null(phenList) || !is(phenList, "PhenList")) {
    stop_message <-
      paste(stop_message,
            "Error:\nPlease create and specify PhenList object.\n",
            sep = "")
  }

  # 2
  if (is.null(depVariable)) {
    stop_message <-
      paste(
        stop_message,
        "Error:\nPlease specify dependent variable, ",
        "for example: depVariable='Lean.Mass'.\n",
        sep = ""
      )
  }
  else {
    columnOfInterest <- getColumn(phenList, depVariable)
  }

  # stop if there is something wrong with the arguments
  if (nchar(stop_message) == 0) {
    x <- getDataset(phenList)
    checkDepV <- columnChecks(x, depVariable, dataPointsThreshold)

    # Presence
    if (!checkDepV[1])
      stop_message <- paste(
        "Error:\nDependent variable column '",
        depVariable,
        "' is missed in the dataset.\n",
        sep = ""
      )
  }


  # If problems are deteckted
  if (nchar(stop_message) > 0) {
    if (outputMessages)  {
      message(stop_message)
      opt <- options(show.error.messages = FALSE)
      on.exit(options(opt))
      stop()
    }
    else {
      stop(stop_message)
    }
  }


  ## STOP CHECK ARGUMENTS



  checkDepVLevels <- columnLevels(x, depVariable)



  # check for FE
  if (checkDepVLevels[2] > 0 && checkDepVLevels[2] <= 10) {
    suggestedFramework <- "FE"
  }

  # check for LR
  if (checkDepVLevels[2] == 2) {
    suggestedFramework <- paste(suggestedFramework, ", LR", sep = "")
  }


  if (checkDepV[2]) {
    # NUMERIC

    # VARIABILITY
    variabilityPass <- TRUE

    # Variability - the ratio of different values to all values in the column
    if (checkDepVLevels[1] > 0)
      variability <-
        checkDepVLevels[2] / checkDepVLevels[1]
    else
      variability <- 0
    # where checkDepVLevels[2] contains number of levels and checkDepVLevels[1] contains number of data points
    # One level only
    if (checkDepVLevels[2] == 1 || checkDepVLevels[2] == 0) {
      variabilityPass <- FALSE
    }
    else
      if (variability < 0.005) {
        variabilityPass <- FALSE
      }
    # Data points - number of data points in the depVariable column for genotype/sex combinations
    else if (!checkDepV[3])
      variabilityPass <- FALSE
    # VARIABILITY


    # check for TF
    if ('Batch' %in% colnames(x) && variabilityPass) {
      phenListTF <-
        TFDataset(
          phenList,
          depVariable,
          outputMessages  = FALSE,
          forDecisionTree = FALSE,
          upper           = upper
        )
      xTF <- getDataset(phenListTF)

      # check for batches - shoud be from 2 to upper (see the input parameters) batches
      # upper: HAMED 1st Feb 2019
      if (length(levels(factor(xTF$Batch))) >= 2 &&
          length(levels(factor(xTF$Batch))) <= upper) {
        TF <- TRUE
        Genotype_levels <- levels(factor(x$Genotype))
        Batch_levels <- levels(factor(x$Batch))
        # check for data points in all genotype/batch level combinations (records at least in on Sex)
        for (i in 1:length(Batch_levels)) {
          BatchSubset <- subset(x, x$Batch == Batch_levels[i])
          # Genotype loop
          for (j in 1:length(Genotype_levels)) {
            GenotypeBatchSubset <- subset(BatchSubset,
                                          BatchSubset$Genotype == Genotype_levels[j])
            columnOfInterestSubset <-
              na.omit(GenotypeBatchSubset[, c(depVariable)])
            if (length(columnOfInterestSubset) == 0) {
              TF <- FALSE
            }
          }
        }
        if (suggestedFramework == "NO") {
          suggestedFramework <- "TF"
        }
        else {
          suggestedFramework <- paste(suggestedFramework, ", TF", sep = "")
        }
      }

    }

    # check for MM
    if (variabilityPass) {
      if (suggestedFramework == "NO") {
        suggestedFramework <- "MM"
      }
      else {
        suggestedFramework <- paste(suggestedFramework, ", MM", sep = "")
      }
    }

    # checks for RR
    controlNotEnough <- FALSE
    if (checkDepVLevels[2] != 0) {
      controlSubset <- subset(x, x$Genotype == refGenotype(phenList))
      columnOfInterestSubset <-
        na.omit(controlSubset[, c(depVariable)])
      Sex_levels <-
        levels(factor(x$Sex))
      for (j in 1:length(Sex_levels)) {
        GenotypeSexSubset <- subset(controlSubset,
                                    controlSubset$Sex == Sex_levels[j])
        columnOfInterestSubset <-
          na.omit(GenotypeSexSubset[, c(depVariable)])
        if (length(columnOfInterestSubset) < RR_controlPointsThreshold) {
          controlNotEnough <- TRUE
        }
      }
    }
    else {
      controlNotEnough <- TRUE
    }

    if (!controlNotEnough) {
      if (suggestedFramework == "NO") {
        suggestedFramework <- "RR"
      }
      else {
        suggestedFramework <- paste(suggestedFramework, " and RR", sep = "")
      }
    }

  }

  return(suggestedFramework)

}
##------------------------------------------------------------------------------"

Try the PhenStat package in your browser

Any scripts or data that you put into this service are public.

PhenStat documentation built on Nov. 8, 2020, 8:13 p.m.