R/grn_status.R

Defines functions geneScores ccn_queryGRNstatus utils_reduceMat ccn_reduceMatLarge ccn_extract_SN_DF zscore meanTraining ccn_trainNorm ccn_normalizeScores ccn_score ccn_rawScore minDif ccn_make_tVals_predict ccn_make_tVals ccn_netScores

Documented in ccn_extract_SN_DF ccn_make_tVals ccn_make_tVals_predict ccn_netScores ccn_normalizeScores ccn_queryGRNstatus ccn_rawScore ccn_reduceMatLarge ccn_score ccn_trainNorm geneScores meanTraining minDif utils_reduceMat zscore

#' GRN status
#'
#' Calculates the status of all GRNs in query samples as compared to training data for
#' @param expDat query expression matrix
#' @param subList of ct => genes
#' @param tVals tvals
#' @param classList classList
#' @param minVals minVals
#' @param classWeight class weight
#' @param exprWeight  expression weight
#' @return grn scores (not normalized)
ccn_netScores<-function(expDat, genes, tVals, ctt, classList=NULL, classWeight=TRUE, classWeightVal = 3, exprWeight=TRUE, exprWeightVal = 3, xmax=1e3){
  cat(ctt,"\n")
  aMat = matrix(0, nrow=length(genes), ncol=ncol(expDat))
  rownames(aMat) = names(genes)

  weights = rep(1, length(genes))
  names(weights) = names(genes)

  cat(dim(aMat),"\n")
  if(exprWeight){
    meanVect = unlist(tVals[[ctt]][['mean']][names(genes)])
    weights = (exprWeightVal*meanVect)/sum(exprWeightVal*meanVect) # also arbritary value on the weight you are putting on the expression
  }

  if(classWeight){

    classImp = weights
    for(gene in names(classList[[ctt]])) {
      if(gene %in% names(classImp)) {
        classImp[gene] = classList[[ctt]][gene] + classWeightVal # arbritary value
      }
    }

    weights = weights*classImp
  }

  for(gene in names(genes)){

    zzs = ccn_rawScore(expDat[gene,], tVals[[ctt]][['mean']][[gene]], tVals[[ctt]][['sd']][[gene]], xmax=xmax, reg_type = genes[gene])

    aMat[gene,] = zzs
  }

  print(dim(aMat))
  xscores = apply(aMat, 2, weighted.mean, w=weights)

  return(xscores)
}

#' Estimate gene expression dist in CTs
#'
#' Calculate mean and SD
#' @param expDat training data
#' @param sampTab, ### training sample table
#' @param dLevel="description1", ### column to define CTs
#' @param predictSD=FALSE ### whether to predict SD based on expression level
#'
#' @return tVals list of ct->mean->named vector of average gene expression, ->sd->named vector of gene standard deviation
ccn_make_tVals<-function(expDat, sampTab, dLevel="description1", predictSD=FALSE){

  if(predictSD){
    ans = ccn_make_tVals_predict(expDat, sampTab, dLevel)
  }
  else{
    # Note: returns a list of dName->gene->mean, sd, where 'dName' is a ctt or lineage
    # make sure everything is lined up
    expDat = expDat[,rownames(sampTab)]
    tVals = list()
    dNames = unique(as.vector(sampTab[,dLevel]))
    allGenes = rownames(expDat)
    for(dName in dNames){

      xx = which(sampTab[,dLevel]==dName)
      sids = rownames(sampTab[xx,])
      xDat = expDat[,sids]
      means = apply(xDat, 1, mean)
      sds = apply(xDat, 1, sd)
      tVals[[dName]][['mean']] = as.list(means)
      tVals[[dName]][['sd']] = as.list(sds)
    }
    ans = tVals
  }

  return(ans)
}

#' ccn_make_tVals_predict
#'
#' predicts SD based on mean expression using linear regression
#' @param expDat training data
#' @param sampTab training sample table
#' @param dLevel="description1" column to define CT
#'
#' @return tVals list of ct->mean->named vector of average gene expression, ->sd->named vector of gene standard deviation
ccn_make_tVals_predict<-function(expDat, sampTab, dLevel="description1"){
  # Note: returns a list of dName->gene->mean, sd, where 'dName' is a ctt or lineage
  # make sure everything is lined up
  expDat = expDat[,rownames(sampTab)]
  tVals = list()
  dNames = unique(as.vector(sampTab[,dLevel]))
  allGenes = rownames(expDat)

  # make a model to predict SD given average expression level across all samples
  sdT = apply(expDat, 1, sd)
  mT = apply(expDat, 1, mean)
  myModel = lm(sdT~mT)
  for(dName in dNames){
    xx = which(sampTab[,dLevel]==dName)
    sids = rownames(sampTab[xx,])
    xDat = expDat[,sids]
    means = apply(xDat, 1, mean)
    sds = predict(myModel, data.frame(mT=means))
    tVals[[dName]][['mean']] = as.list(means)
    tVals[[dName]][['sd']] = as.list(sds)
  }

  return(tVals)
}

#' Min Difference
#'
#' computes mean gene A in CT 1 - mean gene A in CT 2, where CT2 has the non CT1 max value. does this for genes
#' @param tVals tVals
#' @param genes vector of gene names
#' @param ct ct to compare to
#' @return vector of differences
minDif<-function(tVals, genes, ct){
  octs = setdiff(names(tVals), ct)
  qq = lapply(tVals[octs], "[[", "mean")

  tmpMat = matrix(unlist(lapply(qq, "[", genes)), nrow=length(genes))
  rownames(tmpMat) = genes
  maxes = apply(tmpMat, 1, max)

  return(unlist(tVals[[ct]][["mean"]][genes])-maxes)
}

#' computes the raw z score for a gene as xmax-abs(zscore).
#'
#' better values are higher.
#' @param vect a vector of gene expression values for multiple samples
#' @param mmean mean value in training data
#' @param ssd standard deviation in training data
#' @param reg_type the type of regulation (up or down) for the specific subnetwork
#' @return transformed (but not normalized) GRN score
#'
ccn_rawScore<-function(vect, mmean, ssd, xmax=1e3, reg_type){
  zcs = zscore(vect, mmean, ssd)

  if(as.numeric(reg_type) == 1) { # if the
    #zcs[zcs > 0 & zcs < 1] = 0 # change this to revert back to original place
    zcs[zcs > 0] = 0
    return(xmax - abs(zcs))
  }
  else {
    #zcs[zcs < 0 & zcs > -1] = 0 # change this to revert back to original place
    zcs[zcs < 0] = 0
    return(xmax - abs(zcs))
  }
}

#' GRN status
#'
#' Calculates the GRN status in query samples as compared to training data
#' @param expDat query expression matrix
#' @param subList of ct => genes
#' @param tVals list of ctt->list(means->genes, sds->genes)
#' @param classList class list
#' @param minVals  min vals
#' @param classWeight classweght
#' @param exprWeight  expression weight
#' @return GRN scores
#' @export
ccn_score <- function(expDat, subList, tVals, classList=NULL, minVals=NULL, classWeight=FALSE, exprWeight=TRUE, xmax=1e3){
  #nSubnets<-sum(sapply(subList, length))
  if(class(expDat) != "matrix") {
    expDat = as.matrix(expDat)
  }
  nSubnets = length(subList)
  ans = matrix(0, nrow=nSubnets, ncol=ncol(expDat))
  ctts = names(subList)
  rnames = vector()
  rIndex = 1
  for(ctt in ctts){
    cat(ctt,"\n")
    genes = subList[[ctt]]
    # 06-06-16 -- added to allow for use of GRNs defined elsewhere
    genes = genes[intersect(names(genes), rownames(expDat))]  # only select the genes that are

    ans[rIndex,] = ccn_netScores(expDat, genes, tVals=tVals, ctt=ctt,classList=classList, classWeight=classWeight,exprWeight=exprWeight, xmax=xmax)
    rnames = append(rnames, ctt)
    rIndex = rIndex+1

  }
  rownames(ans) = rnames
  colnames(ans) = colnames(expDat)
  if(!is.null(minVals)){
    minVals = minVals[rownames(ans)]
    ans = ans-minVals
  }

  return(ans)
}

#' Normalize grn status as compared to training data
#'
#' Divide the query scores by the mean values in the training data.
#' @param ctrlScores a list of subnet->mean value, all subnets
#' @param queryScores a matrix, rownames = subnet names, all subnets
#' @param subNets a vector of subnets names to use
#'
#' @return normalized grn status matrix
#' @export
ccn_normalizeScores <- function(ctrlScores, queryScores, subNets){

  ans = matrix(0, nrow=length(subNets), ncol=ncol(queryScores))
  rownames(ans) = subNets
  for(subNet in subNets){
    ans[subNet,] = queryScores[subNet,] / ctrlScores[[subNet]]
  }
  colnames(ans) = colnames(queryScores)

  return(ans)
}


#' Figure out normalization factors for GRNs, and norm training data
#'
#' Exactly that.
#' @param expTrain expression matrix
#' @param stTrain  sample table
#' @param subNets named list of genes, one list per CTT, tct=>gene vector
#' @param classList list of classifiers
#' @param dLevel column name to group on
#' @param tVals seful when debugging
#' @param classWeight weight GRN status by importance of gene to classifier
#' @param exprWeight weight GRN status by expression level of gene?
#' @param sidCol sample id colname
#' @param xmax the maximum raw score that a sample could receive per gene
#' @param meanNorm normalize raw scores based on the lowest mean in a category
#'
#' @return list of trainingScores, normVals, raw_scores, minVals, tVals=tVals
#' @export
ccn_trainNorm <- function(expTrain, stTrain, subNets, classList = NULL,  dLevel = "description1", tVals=NULL, classWeight=TRUE, exprWeight=FALSE, sidCol='sample_id', xmax=1e3, meanNorm = FALSE){

  if(is.null(tVals)){
    tVals = ccn_make_tVals(expTrain, stTrain, dLevel)
  }

  ctts = as.vector(unique(stTrain[,dLevel]))
  scoreList = list()
  normList = list()  # a list of ctt->subnet->mean value
  minVect = vector()  # a list of ctt->subnet->min value, used to shift raw grn est scores

  cat("calculating GRN scores on training data ...\n")
  tmpScores = ccn_score(expTrain, subNets, tVals, classList, minVals=NULL, classWeight=classWeight, exprWeight=exprWeight, xmax=xmax)


  if(meanNorm == TRUE) {
    train_meanScores = meanTraining(tmpScores, stTrain, dLevel, sidCol)
    minVect = apply(train_meanScores, 1, min)
    names(minVect) = rownames(train_meanScores)

  } else {
    minVect = apply(tmpScores, 1, min)
    names(minVect) = rownames(tmpScores)
  }


  # shift the raw scores so that min=0
  tmpScores = tmpScores - minVect
  cat("norm factors\n")
  for(ctt in ctts){
    # determine nomalization factors
    snets = ctt

    scoreDF = ccn_extract_SN_DF(tmpScores, stTrain, dLevel, snets, sidCol=sidCol)
    scoreDF = ccn_reduceMatLarge(scoreDF, "score", "description", "subNet")
    xdf = scoreDF[which(scoreDF$grp_name==ctt),]
    tmpSNS = as.list(xdf$mean)
    names(tmpSNS) = xdf$subNet
    normList[names(tmpSNS)] = tmpSNS
  }

  # normalize training scores
  nScores = ccn_normalizeScores(normList, tmpScores, rownames(tmpScores))

  scoreDF = ccn_extract_SN_DF(nScores, stTrain, dLevel, sidCol=sidCol)

  scoreDF = ccn_reduceMatLarge(scoreDF, "score", "description", "subNet")

  return(list(trainingScores=scoreDF,
       normVals=normList,
       raw_scores=tmpScores,
       minVals=minVect,
       tVals=tVals))
}

#' @title calculate the mean GRN scores across categories
#' @description calculate the mean GRN scores across categories
#' @param grnScores the GRN scores calculated through ccn_score
#' @param stTrain the sample table used for training
#' @param dLevel the name of the column with all the categories
#' @return an averaged GRN score matrix
meanTraining <- function(grnScores, stTrain, dLevel, sidCol) {
  rownames(stTrain) = as.vector(stTrain[, sidCol])
  # return matrix
  meanMatrix = matrix(0, nrow = nrow(grnScores), ncol = length(unique(stTrain[, dLevel])))

  rownames(meanMatrix) = rownames(grnScores)
  colnames(meanMatrix) = unique(stTrain[, dLevel])

  for(cancerName in unique(stTrain[, dLevel])) {
    tempStTrain = stTrain[stTrain[, dLevel] == cancerName, ]
    tempGRNscores = grnScores[, rownames(tempStTrain)]

    meanMatrix[, cancerName] = apply(tempGRNscores, 1, mean)
  }

  return(meanMatrix)
}
#' Z scores
#' Figure out Z score given mean and standard deviation
#' @param x query score
#' @param meanVal mean of the distribution
zscore<-function(x,meanVal,sdVal){

  return((x-meanVal)/sdVal)
}


#' returns a DF of: sample_id, description, ctt, subnet_name, score
#'
#' returns a DF of: sample_id, description, ctt, subnet_name, score
#' @param scores a matrix of subNet scores
#' @param sampTab sample table
#' @param dLevel column name of sampTab to group values by
#' @param rnames rownames to extract
#' @param sidCol sample identifier column name
#'
#' @return returns a DF of: sample_id, description, ctt, subnet_name, score
ccn_extract_SN_DF <- function(scores, sampTab, dLevel, rnames=NULL, sidCol="sample_id") {

  if(is.null(rnames)){
    rnames = rownames(scores)
  }

  tss = scores[rnames,]
  if(length(rnames)==1){
    tss = t(as.matrix(scores[rnames,]))
    rownames(tss) = rnames
  }

  nSamples = ncol(tss)
  stTmp = sampTab[colnames(tss),]  ####
  snNames = rownames(tss)
  num_subnets = length(snNames)
  snNames = unlist(lapply(snNames, rep, times=nSamples))
  sample_ids = rep(as.vector(stTmp[,sidCol]), num_subnets)
  descriptions = rep(as.vector(stTmp[,dLevel]), num_subnets)

  scores = as.vector(t(tss))

  return(data.frame(sample_id=sample_ids,
             description=descriptions,
             subNet = snNames,
             score=scores))
}

#' reduce large matrix
#' reduce large matrix from ccn_extract_SN_DF
#'
#' @param datFrame the result from ccn_extract_SN_DF
#' @param valCol the column name of the score
#' @param cName the column name of the different groups
#' @param iterOver the column name of the subnetworks
#'
#' @return reduced large matrix
ccn_reduceMatLarge <- function(datFrame, valCol="score", cName="description", iterOver="subNet") {

  iterOvers = unique(as.vector(datFrame[,iterOver]))
  ans = data.frame()
  for(io in iterOvers){
    xi = which(datFrame[,iterOver]==io)
    dfTmp = datFrame[xi,]
    x = utils_reduceMat(dfTmp,valCol=valCol,cName=cName)
    x = cbind(x, subNet=rep(io, nrow(x)))
    ans = rbind(ans, x)
  }

  return(ans)
}

#' @title  Reduce data matrix
#' @description reduce the data.matrix values by averaging and getting st dvs
#'
#' @param datFrame the dataframe
#' @param valCol the column with scores
#' @param cName column with groups
#'
#' @return df of grp_name, mean, sd
utils_reduceMat <- function(datFrame, valCol, cName='ann') {

  mids = unique(as.vector(datFrame[,cName]))
  means = rep(0, length(mids))
  sds = rep(1, length(mids))
  indexI = rep(0, length(mids))  # index to extract other columns from original data.frame

  for(i in seq(length(mids))){
    mid = mids[i]
    xi = which(datFrame[,cName]==mid)
    tmpDat = datFrame[xi,]
    means[i] = mean(tmpDat[,valCol])
    sds[i] = sd(tmpDat[,valCol])
  }

  return(data.frame(grp_name=mids, mean=means, stdev=sds))
}

#' @title Get Query GRN status
#' @description Get the GRN status of query samples
#'
#' @param expQuery logRanked query expression matrix
#' @param expTrain logRanked training expression matrix
#' @param stTrain sample table of training expression matrix
#' @param dLevel the name of the column with cancer types
#' @param sidCol the name of the column with sample IDs
#' @param grn_return the grn list that is returned from \code{\link{ccn_makeGRN}}
#' @param trainNorm normalization statistics from \code{\link{ccn_trainNorm}}. If you are using pre-calculated normalization statistics please make sure all the parameters are the same for applying to query and calculating normalization
#' @param classifier_return the classifier_return list that is returned from \code{\link{broadClass_train}}
#' @param classWeight boolean indicating whether to take the importance of the classification into status calculation
#' @param exprWeight boolean indicating whether to take the weight of gene expression into status calculation
#' @param prune boolean indicating whether to select exclusive genes for processing classification gene importance
#' @param predSD a parameter for calculating normalization statistics from training data
#' @return a matrix indicating the GRN status
#' @export
ccn_queryGRNstatus <- function(expQuery, expTrain, stTrain, dLevel, sidCol, grn_return, trainNorm = NULL, classifier_return,  classWeight = TRUE, exprWeight = FALSE, prune = TRUE, xmax = 1e3, predSD=FALSE) {
  cnProc = classifier_return$cnProc

  trainNorm_prior = trainNorm
  geneImportance = processImportance(classifier = cnProc$classifier, xpairs = classifier_return$xpairs_list, prune = prune)

  if(is.null(trainNorm) == TRUE) {
    trainNorm = ccn_trainNorm(expTrain, stTrain, subNets=grn_return$ctGRNs$geneLists, classList = geneImportance, dLevel = dLevel, sidCol = sidCol, classWeight = classWeight, exprWeight = exprWeight)
    cat("Finished finding normalization statistics", "\n")
  }

  status_score = ccn_score(expDat = expQuery,
                           subList = grn_return$ctGRNs$geneLists, tVals = trainNorm$tVals,
                           classList = geneImportance, minVals = trainNorm$minVals,
                           classWeight = classWeight, exprWeight = exprWeight,
                           xmax = xmax)
  normScoresQuery = ccn_normalizeScores(trainNorm$normVals, status_score, rownames(status_score))

  if(is.null(trainNorm_prior) == TRUE) {
    return(list("trainNorm" = trainNorm, "query_GRNstatus" = normScoresQuery))
  }
  else {
    return(normScoresQuery)
  }
}

#' @title score indvidual genes
#' @description score individual genes
#' @param queryMatrix query ranked expression matrix
#' @param ctt the name of the cancer type specific network
#' @param trainNorm the normalization statistics from \code{\link{ccn_trainNorm}}
#' @param grn_all the grn construction results from \code{\link{ccn_makeGRN}}
#' @param importantGenes a list of gene importances
#' @return a matrix with individual gene z score for each query sample, and the expression direction it should move to achieve ideal similarity with subnetwork. The order of the gene is ranked by importance from the classifier
#' @export
geneScores <- function(queryMatrix, ctt, trainNorm, grn_all, importantGenes) {
  tvals = trainNorm$tVals
  cgenes = grn_all$ctGRNs$geneLists[[ctt]]

  returnMatrix = matrix(nrow = length(cgenes), ncol = ncol(queryMatrix))
  rownames(returnMatrix) = names(cgenes)
  colnames(returnMatrix) = colnames(queryMatrix)

  for(gene in names(cgenes)) {
    returnMatrix[gene, ] = cgenes[[gene]] * (1000 - ccn_rawScore(queryMatrix[gene, ], tvals[[ctt]][["mean"]][[gene]], tvals[[ctt]][["sd"]][[gene]], reg_type = cgenes[[gene]], xmax = 1000))

  }

  genes_important = importantGenes[[ctt]]
  orderGenes = genes_important[intersect(names(genes_important), rownames(returnMatrix))]
  orderGenes = sort(orderGenes, decreasing = TRUE)

  notImportantGenes = rownames(returnMatrix)[!(rownames(returnMatrix) %in% names(orderGenes))]

  orderGenes = c(names(orderGenes), notImportantGenes)
  returnMatrix = returnMatrix[orderGenes, ]
  return(returnMatrix)
}
pcahan1/cancerCellNet documentation built on July 16, 2022, 12:12 a.m.