R/GPrank.R

Defines functions .filterTargets .getProcessedData generateModels .formModel GPRankTFs GPRankTargets GPLearn

Documented in generateModels GPLearn GPRankTargets GPRankTFs

GPLearn <- function(preprocData, TF = NULL, targets = NULL,
                    useGpdisim = !is.null(TF), randomize = FALSE,
                    addPriors = FALSE, fixedParams = FALSE,
                    initParams = NULL, initialZero = TRUE,
                    fixComps = NULL, dontOptimise = FALSE,
                    allowNegativeSensitivities = FALSE, quiet = FALSE,
                    gpsimOptions = NULL, allArgs = NULL) {

  if (!is.null(allArgs)) {
    for (i in seq(along=allArgs))
      assign(names(allArgs)[[i]], allArgs[[i]])
  }

  if (class(preprocData) != "ExpressionTimeSeries")
    stop("preprocData should be an instance of ExpressionTimeSeries. Please use processData or processRawData to create one.")

  if (is.null(targets))
    stop("no target genes specified, unable to define a model")

  if (is.list(targets))
    targets <- unlist(targets)
  
  if (useGpdisim)
    genes <- c(TF, targets)
  else
    genes <- targets

  if (class(try(preprocData[genes,], silent=TRUE)) == "try-error") {
    if (! all(TF %in% featureNames(preprocData))) {
      stop("unknown TF")
    } else {
      stop("unknown target gene")
    }
  }

  # The preprocessed data is searched for the data of the specified genes.
  newData <- .getProcessedData(preprocData[genes,])
  y <- newData$y
  yvar <- newData$yvar
  times <- newData$times

  Nrep <- length(y)

  options <- list(includeNoise=0, optimiser="SCG")

  if (any(yvar[[1]] == 0))
    options$includeNoise = 1
  
  if (!is.null(gpsimOptions)) {
    for (i in seq(along=gpsimOptions))
      options[[names(gpsimOptions)[[i]]]] <- gpsimOptions[[i]]
  }
  
  if (addPriors)
    options$addPriors <- TRUE

  if (!initialZero && useGpdisim)
    options$timeSkew <- 1000.0
  #options$gaussianInitial <- TRUE

  if (useGpdisim) {
    Ngenes <- length(genes) - 1
    options$fix$names <- "di_variance"
    options$fix$value <- expTransform(c(1), "xtoa")
  }
  else {
    Ngenes <- length(genes)
    if (allowNegativeSensitivities) {
      options$fix$names <- "sim1_sensitivity"
      options$fix$value <- 1
    }
    else {
      options$fix$names <- "sim1_variance"
      options$fix$value <- expTransform(c(1), "xtoa")
    }
  }
  Ntf <- 1

  if (initialZero && !useGpdisim) {
    options$proteinPrior <- list(values=array(0), times=array(0))

    ## Set the variance of the latent function to 1.
    options$fix$names <- append(options$fix$names, "rbf1_variance")
    options$fix$value <- append(options$fix$value, expTransform(1, "xtoa"))
  }
  
  # fixing first output sensitivity to fix the scaling
  if (fixedParams && !is.null(initParams)) {
    if (is.list(initParams))
      initParams <- unlist(initParams)
    if (!is.null(names(initParams))) {
      options$fix$names <- names(initParams)
      options$fix$value <- initParams
    }
    else {
      I <- which(!is.na(initParams))
      for (k in 1:length(I)) {
        options$fix$index[k+1] <- I[k]
        options$fix$value[k+1] <- initParams[I[k]]
      }
    }      
  }

  if (! is.null(fixComps))
    options$fixedBlocks <- fixComps

  if (allowNegativeSensitivities)
    options$isNegativeS <- TRUE
  
  # initializing the model
  if (useGpdisim)
    model <- list(type="cgpdisim")
  else
    model <- list(type="cgpsim")

  if (length(annotation(preprocData)) > 0)
    dataAnnotation <- annotation(preprocData)
  else
    dataAnnotation <- NULL

  for ( i in seq(length=Nrep) ) {
    #repNames <- names(model$comp)
    if (useGpdisim) {
      model$comp[[i]] <- gpdisimCreate(Ngenes, Ntf, times, t(y[[i]]), t(yvar[[i]]), options, genes = featureNames(preprocData[genes,]), annotation=dataAnnotation)
    }
    else {
      model$comp[[i]] <- gpsimCreate(Ngenes, Ntf, times, t(y[[i]]), t(yvar[[i]]), options, genes = featureNames(preprocData[genes,]), annotation=dataAnnotation)
    }
    #names(model$comp) <- c(repNames, paste("rep", i, sep=""))
    #if (fixedParams) {
    #  model$comp[[i]]$kern <- multiKernFixBlocks(model$comp[[i]]$kern, fixComps)
    #}
  }

  optOptions <- optimiDefaultOptions()

  optOptions$maxit <- 3000
  optOptions$optimiser <- "SCG"
  if (quiet)
    optOptions$display <- FALSE

  if (randomize) {
    a <- modelExtractParam(model)
    #I <- a==0
    n <- length(a)
    a <- array(rnorm(n), dim = c(1, n))
    #a[I] <- 0
    model <- modelExpandParam(model, a)
  }

  if (!is.null(initParams)) {
    if (!is.list(initParams) && all(is.finite(initParams))
        && is.null(names(initParams)))
      model <- modelExpandParam(model, initParams)
    else if (!is.null(names(initParams))) {
      params <- modelExtractParam(model, only.values=FALSE)
      for (i in seq(along=initParams)) {
        j <- grep(names(initParams)[i], names(params))
        if (length(j) > 0)
          params[j] <- initParams[i]
        else
          warning(paste("Ignoring invalid initial parameter specification:", initParams$names[i]))
      }
      model <- modelExpandParam(model, params)
    }
  }

  if (!dontOptimise) {
    if (useGpdisim) {
      message(paste(c("Optimising the model.\nTF: ", TF, "\nTargets: ", paste(targets, collapse=' '), "\n"), sep=" "))
    } else {
      message(paste(c("Optimising the model.\nTargets: ", paste(genes, collapse=' '), "\n"), sep=" "))
    }
    model <- modelOptimise(model, optOptions)
  }

  model <- modelUpdateProcesses(model)

  model$allArgs <- list(TF=TF, targets=targets, useGpdisim=useGpdisim,
                        randomize=randomize, addPriors=addPriors,
                        fixedParams=fixedParams, initParams=initParams,
                        initialZero=initialZero, fixComps=fixComps,
                        dontOptimise=dontOptimise, gpsimOptions=gpsimOptions)

  return (new("GPModel", model))
}



GPRankTargets <- function(preprocData, TF = NULL, knownTargets = NULL,
                          testTargets = NULL, filterLimit = 1.8,
                          returnModels = FALSE, options = NULL,
                          scoreSaveFile = NULL,
                          datasetName = "", experimentSet = "") {

  if (class(preprocData) != "ExpressionTimeSeries")
    stop("preprocData should be an instance of ExpressionTimeSeries. Please use processData or processRawData to create one.")

  if (is.null(testTargets))
    testTargets <- featureNames(preprocData)

  if (is.list(testTargets))
    testTargets <- unlist(testTargets)

  if (is.list(knownTargets))
    knownTargets <- unlist(knownTargets)
  
  if (!is.null(TF) && !all(TF %in% featureNames(preprocData))) {
    stop("unknown TF")
  }
  
  if (!all(knownTargets %in% featureNames(preprocData))) {
    stop("unknown genes in knownTargets")
  }
  
  if (class(try(preprocData[testTargets,], silent=TRUE)) == "try-error") {
    stop("unknown genes in testTargets")
  }
  
  if ('var.exprs' %in% assayDataElementNames(preprocData))
    testTargets <- .filterTargets(preprocData[testTargets,], filterLimit)
  else
    testTargets <- featureNames(preprocData[testTargets,])

  if (length(testTargets) < 1)
    stop("No test targets passed filtering")
  
  # The variable useGpdisim determines whether GPDISIM is used in creating the 
  # models. GPSIM (default) is used if no TF has been specified.
  useGpdisim = !is.null(TF)

  if (!useGpdisim && is.null(knownTargets))
    stop("There are no known targets for GPSIM.")

  if (useGpdisim)
    numberOfKnownGenes <- length(knownTargets) + 1
  else
    numberOfKnownGenes <- length(knownTargets)

  logLikelihoods <- rep(NA, length.out=length(testTargets))
  baselogLikelihoods <- rep(NA, length.out=length(testTargets))
  modelParams <- list()
  modelArgs <- list()
  if (returnModels)
    rankedModels <- list()

  genes <- list()

  if (!is.null(options)) {
    allArgs <- options
    allArgs$useGpdisim <- useGpdisim
  }
  else
    allArgs <- list(useGpdisim=useGpdisim)

  if (!is.null(knownTargets) && length(knownTargets) > 0) {
    baselineModel <- .formModel(preprocData, TF, knownTargets, allArgs=allArgs)
    baselineParameters <- modelExtractParam(baselineModel$model,
                                            only.values=FALSE)
    sharedModel <- list(ll=baselineModel$ll,
                        params=baselineModel$params,
                        args=modelStruct(baselineModel$model)$allArgs)

    allArgs$fixedParams <- TRUE
    allArgs$initParams <- baselineParameters
    J <- grep('Basal', names(allArgs$initParams))
    names(allArgs$initParams)[J] <-
      paste(names(allArgs$initParams)[J], '$', sep='')
    allArgs$fixComps <- 1:numberOfKnownGenes
  }
  else {
    baselineModel <- NULL
  }

  for (i in seq(along=testTargets)) {
    returnData <- .formModel(preprocData, TF, knownTargets,
                            testTargets[i], allArgs = allArgs)

    if (!is.finite(returnData$ll)) {
      logLikelihoods[i] <- NA
      modelParams[[i]] <- NA
      modelArgs[[i]] <- NA
      if (returnModels)
        rankedModels[[i]] <- NA
    }
    else {
      logLikelihoods[i] <- returnData$ll
      modelParams[[i]] <- returnData$params
      modelArgs[[i]] <- modelStruct(returnData$model)$allArgs
      if (returnModels)
        rankedModels[[i]] <- returnData$model
    }
    genes[[i]] <- testTargets[[i]]

    testdata <- preprocData[testTargets[i],]
    testdata$experiments <- rep(1, length(testdata$experiments))
    newData <- .getProcessedData(testdata)
    baselogLikelihoods[i] <- .baselineOptimise(newData$y[[1]], newData$yvar[[1]], list(includeNoise=(any(newData$yvar[[1]]==0))))

    if (!is.null(scoreSaveFile)) {
      scoreList <- new("scoreList", params = modelParams,
                       loglikelihoods = logLikelihoods,
                       baseloglikelihoods = baselogLikelihoods,
                       genes = genes, modelArgs = modelArgs,
                       knownTargets = knownTargets, TF = TF,
                       sharedModel = sharedModel,
                       datasetName = datasetName,
                       experimentSet = experimentSet)
      save(scoreList, file=scoreSaveFile)
    }
  }

  scoreList <- new("scoreList", params = modelParams,
                   loglikelihoods = logLikelihoods,
                   baseloglikelihoods = baselogLikelihoods,
                   genes = genes, modelArgs = modelArgs,
                   knownTargets = knownTargets, TF = TF,
                   sharedModel = sharedModel,
                   datasetName = datasetName,
                   experimentSet = experimentSet)

  if (returnModels)
    return (list(scores=scoreList, models=rankedModels))
  else
    return (scoreList)
}



GPRankTFs <- function(preprocData, TFs, targets,
                      filterLimit = 1.8, 
                      returnModels = FALSE, options = NULL,
                      scoreSaveFile = NULL,
                      datasetName = "", experimentSet = "") {
  if (class(preprocData) != "ExpressionTimeSeries")
    stop("preprocData should be an instance of ExpressionTimeSeries. Please use processData or processRawData to create one.")

  if (is.null(targets)) stop("No targets specified.")

  if (is.list(targets))
    targets <- unlist(targets) 

  numberOfTargets <- length(targets)

  genes = c(TFs, targets)

  if (class(try(preprocData[TFs,], silent=TRUE)) == "try-error") {
    stop("unknown genes in TFs")
  }

  if (!all(targets %in% featureNames(preprocData))) {
    stop("unknown genes in targets")
  }
  
  ## Filtering the genes based on the calculated ratios. If the limit is 0, all genes are accepted.
  if ('var.exprs' %in% assayDataElementNames(preprocData))
    TFs <- .filterTargets(preprocData[TFs,], filterLimit)
  else
    TFs <- featureNames(preprocData[TFs,])

  if (length(TFs) < 1)
    stop("No TFs passed the filtering.")

  logLikelihoods <- rep(NA, length.out=length(TFs))
  modelParams <- list()
  modelArgs <- list()
  if (returnModels)
    rankedModels <- list()

  if (!is.null(options)) {
    allArgs <- options
    allArgs$useGpdisim <- TRUE
  }
  else
    allArgs <- list(useGpdisim=TRUE)

  numberOfTargets <- length(targets)

  genes <- list()

  for (i in 1:length(TFs)) {
    returnData <- .formModel(preprocData, TF = TFs[i], targets, allArgs=allArgs)
    if (!is.finite(returnData$ll)) {
      logLikelihoods[i] <- NA
      modelParams[[i]] <- NA
      modelArgs[[i]] <- NA
      if (returnModels)
        rankedModels[[i]] <- NA
    }
    else {
      logLikelihoods[i] <- returnData$ll
      modelParams[[i]] <- returnData$params
      modelArgs[[i]] <- modelStruct(returnData$model)$allArgs
      if (returnModels)
        rankedModels[[i]] <- returnData$model
    }
    genes[[i]] <- TFs[[i]]

    if (!is.null(scoreSaveFile)) {
      scoreList <- new("scoreList", params = modelParams,
                       loglikelihoods = logLikelihoods,
                       genes = genes, modelArgs = modelArgs,
                       knownTargets = targets, TF = '(see genes)',
                       datasetName = datasetName,
                       experimentSet = experimentSet)
      save(scoreList, file=scoreSaveFile)
    }
  }

  scoreList <- new("scoreList", params = modelParams,
                   loglikelihoods = logLikelihoods,
                   genes = genes, modelArgs = modelArgs,
                   knownTargets = targets, TF = '(see genes)',
                   datasetName = datasetName,
                   experimentSet = experimentSet)

  if (returnModels)
    return (list(scores=scoreList, models=rankedModels))
  else
    return (scoreList)
}



.formModel <- function(preprocData, TF = NULL, knownTargets = NULL,
                       testTarget = NULL, allArgs = NULL) {

  if (!is.null(testTarget))
    targets <- append(knownTargets, testTarget)
  else
    targets <- knownTargets

  error1 <- TRUE
  error2 <- TRUE

  tryCatch({
    model <- GPLearn(preprocData, TF, targets, allArgs = allArgs)
    error1 <- FALSE
  }, error = function(ex) {
    warning("Stopped due to an error.\n")
  })

  if (error1) {
    success <- FALSE
    i <- 0
    allArgs$randomize <- TRUE
    while (!success && i < 10) {
      tryCatch({
        message("Trying again with different parameters.\n")
        model <- GPLearn(preprocData, TF, targets, allArgs = allArgs)
        success <- TRUE
        error2 <- FALSE
      }, error = function(ex) {
        warning("Stopped due to an error.\n")
      })
      i <- i + 1
    }
  }
  else {
    error2 <- FALSE
  }

  if (error2) {
    logLikelihood <- -Inf
    params <- NA
    rankedModel <- NA
  }
  else {
    logLikelihood <- modelLogLikelihood(model)
    params <- modelExtractParam(model)
    rankedModel <- model
  }

  return (list(ll=logLikelihood, model=rankedModel, params=params))
}


generateModels <- function(preprocData, scores) {
  models <- list()

  # recreate the models for each gene in the scoreList
  for (i in seq(along=params(scores))) {
    args <- modelArgs(scores)[[i]]
    args$initParams <- params(scores)[[i]]
    args$dontOptimise <- TRUE
    models[[i]] <- GPLearn(preprocData, allArgs=args)
  }

  return (models)
}


.getProcessedData <- function(data) {
  times <- data$modeltime
  experiments <- data$experiments

  y <- assayDataElement(data, 'exprs')
  if ('var.exprs' %in% assayDataElementNames(data))
    yvar <- assayDataElement(data, 'var.exprs')
  else
    yvar <- 0 * y

  scale <- sqrt(rowMeans(y^2))
  scale[scale==0] <- 1
  scaleMat <- scale %*% array(1, dim = c(1, ncol(data)))
  y <- y / scaleMat
  yvar <- yvar / scaleMat^2

  ylist <- list()
  yvarlist <- list()

  expids <- unique(experiments)
  for (i in seq(along=expids)) {
    ylist[[i]] <- y[,experiments==expids[i], drop=FALSE]
    yvarlist[[i]] <- yvar[,experiments==expids[i], drop=FALSE]
  }
  times <- times[experiments==expids[1]]

  genes <- featureNames(data)

  newData <- list(y = ylist, yvar = yvarlist, genes = genes, times = times)
  return (newData)
}



.filterTargets <- function(data, filterLimit) {
  y <- assayDataElement(data, 'exprs')
  yvar <- assayDataElement(data, 'var.exprs')

  zScores <- rowMeans(y / sqrt(yvar))
  testTargets <- names(which(zScores > filterLimit))

  return (testTargets)
}

Try the tigre package in your browser

Any scripts or data that you put into this service are public.

tigre documentation built on Nov. 8, 2020, 5:32 p.m.