R/MetaKTSP_Source.R

Defines functions MetaTSP.Pvalue MetaTSP.mean meta.mul.rule.predict predict.mtsp

Documented in meta.mul.rule.predict MetaTSP.mean MetaTSP.Pvalue predict.mtsp

######################################################################################################################
######################################################################################################################
######################################################################################################################
#
# MetaKTSP: A Meta-Analytic Top Scoring Pair Method for Robust Cross-Study Validation of Omics Prediction Analysis
# Version : 1.1.1
# Authors : SungHwan Kim, Chien-Wei Lin and George C. Tseng
# latest update : 08/10/2016
#
######################################################################################################################
######################################################################################################################
######################################################################################################################
##' Fit a prediction rule of meta top scoring pair
##'
##'
##' @title A Meta-Analytic Top Scoring Pair Method for Robust Cross-Study Validation of Omics Prediction Analysis
##' @param DList Input variable matrix (a list of multiple datasets; row=features, column=samples).
##' @param Method "Fisher" = fisher's meta-analysis, "Stouffer" = Stouffer's meta-analysis, NULL = mean score method.
##' @param K.max The maximum number of top score pairs.
##' @param is.VO Logical value indicating whether K selection is Variance optimization (VO).
##' @param Para.num.gene # of top scoring pairs that are initially considered.
##' @export
##' @return meta.tspobj = a list of meta top scoring pairs results
##' meta.tspobj$model = Meta top scoring pairs (n by k-1) meta.tspobj$num.K = #
##' of K meta top scoring pairs
##' @author Chien-Wei Lin <masaki396@gmail.com>
##' @example
MetaTSP.Pvalue <- function(DList,  Method = "Fisher", K.max, is.VO=TRUE, Para.num.gene = 10000) {

  # Description
  #     Fit a prediction rule of meta top scoring pair
  # Usage
  #     S = MetaTSP(DList,  Method = "Fisher", K.max, is.VO=TRUE, Para.num.gene = 10000)
  # Input
  #     X = Input variable matrix (a list of multiple datasets; row=features, column=samples)
  #     Method = "Fisher" = fisher's meta-analysis, "Stouffer" = Stouffer's meta-analysis, NULL = mean score method
  #     K.max = # of top score pairs
  #     is.VO = Logical value indicating whether K selection is Variance optimization (VO).
  #     Para.num.gene = # of top scoring pairs that are initially considered.
  # Output
  #     meta.tspobj = a list of meta top scoring pairs results
  #        meta.tspobj$model = Meta top scoring pairs (n by k-1)
  #        meta.tspobj$num.K = # of K meta top scoring pairs

  # Computing the scores of all gene pairs
  out <- foreach( j=1:length(DList)) %dopar% {
    dat  <- t(DList[[j]])
    n <- dim(dat)[1]
    m <- dim(dat)[2]
    grp <- as.numeric(rownames(dat))
    labels <- as.character(unique(grp))
    xx <- as.matrix(cbind(rep(1,n), grp))
    hatxx <- solve(t(xx) %*% xx) %*% t(xx)
    tmp <- foreach(i = c(1:(m-1))) %dopar% {
      yy <- dat[ ,-c(1:i)] < dat[,i]
      t(t((hatxx %*% yy)[2,]))
    }
    tmp <-do.call(rbind, tmp)
    return(tmp)
  }
  out <- do.call(cbind, out)
  #column refer to different study

  #gene pair index
  m <- dim(t(DList[[1]]))[2]
  ind <- foreach(i=1:(m-1)) %dopar% {
    dat  <- t(DList[[1]])
    cbind(rep(i, length(i:m)-1 ), (i+1):m,  rep(colnames(dat)[i], length(i:m)-1), colnames(dat)[(i+1):m])
  }
  ind <- do.call(rbind, ind)

  # Standardize the score
  out <- scale(out)
  Score.pvalue <- 1 - pnorm(out, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
  rownames(Score.pvalue) = NULL

  if(Method == "Fisher")
  {
  ##  Opposite direction p-value is required.
    Score.pvalue <- rbind(Score.pvalue, 1 - Score.pvalue)
  }

  tmp.metaTSP <- switch( Method,
                         Stouffer = {
                           Score_quantile <- -qnorm(Score.pvalue, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
                           Score_stouffer <- rowSums(Score_quantile) / sqrt(ncol(Score_quantile))
                           upper.q <- quantile(Score_stouffer, probs = seq(0, 1, 0.05))[20]
                           lower.q <- quantile(Score_stouffer, probs = seq(0, 1, 0.05))[2]
                           stouffer.ind <- (Score_stouffer > upper.q) | (Score_stouffer < lower.q)
                           data.frame(ind[stouffer.ind, ], Score_stouffer = Score_stouffer[stouffer.ind], Score_quantile[stouffer.ind,], stringsAsFactors = F)
                         },
                         Fisher = {
                           tmp <-  -rowSums(log(Score.pvalue))
                           .ind <- rbind(ind, ind)
                           score.direction <- c(rep(1,nrow(ind)),rep(-1,nrow(ind)))
                           data.frame(.ind, score.direction, tmp, -log(Score.pvalue), stringsAsFactors = F)
                         }
  )

  tspobj <- list()
  if(Method =="Fisher"){
    tmp <- tmp.metaTSP[ ,6]
    o <-  order(tmp, decreasing=TRUE)
    tspobj$m.index <- tmp.metaTSP[o[1:Para.num.gene], ]
  } else {
    tspobj$m.index <- tmp.metaTSP[order(abs(as.numeric(tmp.metaTSP[ ,5])), decreasing=TRUE ), ]
  }

  #remove the genes that appeared more than one time
  #note that, even only one gene in a pair appear multiple time, the entire pair was removed

  tmp.index <- c()
  comp.tmp <- c()
  o.chs <- dim(tspobj$m.index)[1]

  i=1
  repeat{
    if(sum(comp.tmp %in% c(as.character(tspobj[[1]][i,1]),as.character(tspobj[[1]][i,2]))) == 0){
      tmp.index[i] <- sum(comp.tmp %in% c(as.character(tspobj[[1]][i,1]),as.character(tspobj[[1]][i,2]))) == 0
      comp.tmp <- c(comp.tmp, c(as.character(tspobj[[1]][i,1]),as.character(tspobj[[1]][i,2])))
    }else{
      tmp.index[i] <- sum(comp.tmp %in% c(as.character(tspobj[[1]][i,1]),as.character(tspobj[[1]][i,2]))) == 0
    }
    i <- i + 1
    if(sum(tmp.index) == K.max) {break}
  }

  tspobj$m.index <- tspobj$m.index[c(tmp.index, rep( FALSE, o.chs-length(tmp.index)))   ,]

  if(Method == "Fisher"){
    colnames(tspobj$m.index) = c("GeneIndex1", "GeneIndex2", "Gene1", "Gene2", "Direction", "Score_overall",
                                 paste("Score_study", 1:length(DList), sep = ""))

  } else {
    colnames(tspobj$m.index) = c("GeneIndex1", "GeneIndex2", "Gene1", "Gene2", "Score_overall",
                                 paste("Score_study", 1:length(DList), sep = ""))

  }
  gene.pair.table = tspobj$m.index[1:min(nrow(tspobj$m.index), K.max),]

  K <- K.max
  if(is.VO){
    k.try = 2:K.max
    Total_tau <- foreach(k =  k.try ,.combine=c) %dopar% {
      Var.0 <- var(apply(foreach(ii=1:k, .combine=rbind)%do%{
        as.numeric(foreach( j = 1:length(DList) ,.combine=c) %do% {
          label <- colnames(DList[[j]])
          DList[[j]][tspobj$m.index[ii, 3], label=="0" ]  >  DList[[j]][tspobj$m.index[ii, 4], label== "0" ]
        })},2,sum))

      Var.1 <- var(apply(foreach(ii=1:k, .combine=rbind)%do%{
        as.numeric(foreach( j = 1:length(DList) ,.combine=c) %do% {
          label <- colnames(DList[[j]])
          DList[[j]][tspobj$m.index[ii, 3], label=="1" ]  >  DList[[j]][tspobj$m.index[ii, 4], label== "1" ]
        })},2,sum))

      sum(abs(as.numeric(tspobj$m.index[1:k, 5]))) / sqrt(Var.0 + Var.1)
    }

    i <- which(Total_tau==max(Total_tau))[1]
    tspobj$m.index <- tspobj$m.index[1:k.try[i], ]
  } else {

    if(K==1){
      tspobj$m.index <- tspobj$m.index
    } else{
      ## When "is.VO=FALSE", just simple K collection.
      tspobj$m.index <- tspobj$m.index[1:K,]
    }
  }

  rownames(tspobj$m.index) <- rownames(gene.pair.table) <- NULL
  meta.tspobj <- list()
  meta.tspobj$model <- tspobj$m.index
  meta.tspobj$num.K <- dim(tspobj$m.index)[1]
  meta.tspobj$gene.pair.table <- gene.pair.table
  if(is.VO) meta.tspobj$VO <- Total_tau

  return(meta.tspobj)
}

##' MetaTSP.mean is a function of a meta-analytic top scoring pair (MetaTSP) algorithm via Mean score method.
##'
##'
##' @title A Meta-Analytic K Top Scoring Pair via Mean Score Method
##' @param DList Input variable matrix (a list of multiple datasets; row=features, column=samples).
##' @param K.max The maximum number of top score pairs.
##' @param is.VO Logical value indicating whether K selection is Variance optimization (VO)
##' @param quantile.index A quantile threshold for selecting top scoring pair. Default is 10.
##' @return An object of class list is returned with components as following:
##' model includes meta k top scoring pairs (n by k-1).
##' num.K is the number of K meta top scoring pairs.
##' @author Chien-Wei Lin
##' @export
##' @examples
MetaTSP.mean <- function(DList, K.max, is.VO=TRUE, quantile.index=10){
  # Description
  #     Fit a prediction rule of meta top scoring pair
  # Usage
  #     S = MetaTSP(DList,  Method = "Fisher", K.max, is.VO=TRUE, Para.num.gene = 10000)
  # Input
  #     X = Input variable matrix (a list of multiple datasets; row=features, column=samples)
  #     Method = "Fisher" = fisher's meta-analysis, "Stouffer" = Stouffer's meta-analysis, NULL = mean score method
  #     K.max = # of top score pairs
  #     is.VO = Logical value indicating whether K selection is Variance optimization (VO).
  #     Para.num.gene = # of top scoring pairs that are initially considered.
  # Output
  #     meta.tspobj = a list of meta top scoring pairs results
  #        meta.tspobj$model = Meta top scoring pairs (n by k-1)
  #        meta.tspobj$num.K = # of K meta top scoring pairs

  # Computing the scores of all gene pairs
  out <- foreach( j=1:length(DList)) %dopar% {
    dat  <- t(DList[[j]])
    n <- dim(dat)[1] #number of samples
    m <- dim(dat)[2] #number of features
    grp <- as.numeric(rownames(dat))
    labels <- as.character(unique(grp))
    xx <- as.matrix(cbind(rep(1,n), grp))
    hatxx <- solve(t(xx) %*% xx) %*% t(xx) #2 x n matrix
    tmp <- foreach(i = c(1:(m-1))) %dopar% {
      yy <- dat[ ,-c(1:i)] < dat[,i] # n x m matrix
      t(t((hatxx %*% yy)[2,]))
    }
    tmp <-do.call(rbind, tmp)
    return(tmp)
  }
  out <- do.call(cbind, out)

  score <- rowSums(out)

  tmp.thread <- abs(score) > quantile(abs(score), probs = seq(0, 1, 0.1))[quantile.index]
  score <- score[tmp.thread]
  out.tmp <- out[tmp.thread,]
  rownames(out.tmp) = NULL

  m <- dim(t(DList[[1]]))[2] #number of features
  ind <- foreach(i=1:(m-1)) %dopar% {
    dat  <- t(DList[[1]])
    cbind(rep(i, length(i:m)-1), (i+1):m, rep(colnames(dat)[i], length(i:m)-1), colnames(dat)[(i+1):m])
  }
  ind <- do.call(rbind, ind)
ind <- ind[tmp.thread,]


  o.score <- order(abs(score), decreasing=T)
  o.chs = ifelse(dim(ind)[1]<1000, dim(ind)[1], 1000) #only use up to 1000 pairs
  tspobj <- list()
  tspobj$m.index <- data.frame(ind[o.score,][1:o.chs,], score[o.score][1:o.chs], out.tmp[o.score,][1:o.chs,], stringsAsFactors = F)
  #---------------------------------------------------------------------------
  # remove the genes that appeared more than one time
  # note that, even only one gene in a pair appear multiple time,
  # the entire pair was removed
  #---------------------------------------------------------------------------
  tmp.index <- c()
  comp.tmp <- c()
  i=1
  repeat{
    if(sum(comp.tmp %in% c(as.character(tspobj[[1]][i,1]),as.character(tspobj[[1]][i,2]))) == 0){
      tmp.index[i] <- sum(comp.tmp %in% c(as.character(tspobj[[1]][i,1]),as.character(tspobj[[1]][i,2]))) == 0
      comp.tmp <- c(comp.tmp, c(as.character(tspobj[[1]][i,1]),as.character(tspobj[[1]][i,2])))
    }else{
      tmp.index[i] <- sum(comp.tmp %in% c(as.character(tspobj[[1]][i,1]),as.character(tspobj[[1]][i,2]))) == 0
    }
    i <- i + 1
    if( i == dim(tspobj$m.index)[1]) {break}
}
  tspobj$m.index <- tspobj$m.index[c(tmp.index, rep( FALSE, o.chs-length(tmp.index))) ,]
  colnames(tspobj$m.index) = c("GeneIndex1", "GeneIndex2", "Gene1", "Gene2", "Score_overall",
                               paste("Score_study", 1:length(DList), sep = ""))
  gene.pair.table = tspobj$m.index[1:min(nrow(tspobj$m.index), K.max),]
  ### Variation optimization approach ###
  if(is.VO){
    # k.try = seq(3, dim(tspobj$m.index)[1]  ,2)[seq(3, dim(tspobj$m.index)[1]  ,2) <= K.max]
    k.try = 2:K.max
    Total_tau <- foreach(k = k.try,.combine=c) %dopar% {
      Var.0 <- var(apply(foreach(ii=1:k, .combine=rbind)%do%{
        as.numeric(foreach( j = 1:length(DList) ,.combine=c) %do% {
          label <- colnames(DList[[j]])
          DList[[j]][tspobj$m.index[ii, 3], label=="0" ]  >  DList[[j]][tspobj$m.index[ii, 4], label== "0" ]
        })},2,sum))

      Var.1 <- var(apply(foreach(ii=1:k, .combine=rbind)%do%{
        as.numeric(foreach( j = 1:length(DList) ,.combine=c) %do% {
          label <- colnames(DList[[j]])
          DList[[j]][tspobj$m.index[ii, 3], label=="1" ]  >  DList[[j]][tspobj$m.index[ii, 4], label== "1" ]
        })},2,sum))

      sum(abs(as.numeric(tspobj$m.index[1:k, 5]))) / sqrt(Var.0 + Var.1)
    }
    i <- which(Total_tau==max(Total_tau))[1]
                                        # tspobj$m.index <- tspobj$m.index[1:seq(3, dim(tspobj$m.index)[1]  ,2)[i], ]
    tspobj$m.index <- tspobj$m.index[1:k.try[i], ]
  } else {

    tspobj$m.index <- tspobj$m.index[1:K.max,]

}


  rownames(tspobj$m.index) <- rownames(gene.pair.table) <- NULL
  meta.tspobj <- list()
  meta.tspobj$model <- tspobj$m.index
  meta.tspobj$num.K <- dim(tspobj$m.index)[1]
  meta.tspobj$gene.pair.table <- gene.pair.table
  if(is.VO) meta.tspobj$VO <- Total_tau
  return(meta.tspobj)
}

##' Function for predicting subjects class based on given MetaKTSP model, and summarize the results
##'
##'
##' @title Prediction based on a given MetaKTSP model
##' @param test.dat Test dataset.
##' @param tspobj MetaKTSP model.
##' @param K The maximum number of top score pairs.
##' @param is.youden Used youden index in result. Default TRUE.
##' @return An object of class list is returned with following three components:
##' @return "mul.rule" is the prediction result
##' @return "accuracy" is the prediction accuracy
##' @return "youden" is Youden index
##' @author Chien-Wei Lin
##' @export
meta.mul.rule.predict <- function(test.dat, tspobj, K , is.youden=TRUE){
  test.grp <- as.numeric(colnames(test.dat))

  if(K != 1){
    mul.rule <- foreach(i = 1: length(test.grp) ) %dopar% {
      cum.rule <- predict.mtsp(tspobj, test.dat, K=K)[ ,i]
      if(sum(cum.rule)/K == cum.rule[1]){
        cum.rule[1]
      }else{
        ifelse(table(cum.rule)[2] > (K/2) ,  1 ,  0)
      }
    }
    mul.rule <- do.call(cbind, mul.rule)
  }else{
    mul.rule <- predict.mtsp(tspobj, test.dat, K=K)
  }

  youden <- c()
  result<-list()
  if(is.youden){
    .tab <- table( mul.rule, test.grp)
    if(length(rownames(.tab)) == 1) {
      if(rownames(.tab)=="1"){ .tab <- rbind( c(0,0),.tab)
      } else {
        .tab <- rbind(.tab,c(0,0))
      }
    }
    if((length(rownames(.tab)) == 2 & length(colnames(.tab)) == 2)) {youden <- (.tab[1,1]  / sum(.tab[,1]))  + (.tab[2,2] / sum(.tab[,2])) - 1}
    result$youden <- youden
  }
  accuracy <- mean(as.numeric(mul.rule) == test.grp)
  result$mul.rule <- as.numeric(mul.rule)
  result$accuracy <- accuracy
  return(result)
}

##' Function for predicting subjects class based on given MetaKTSP model. For internal use in function meta.mul.rule.predict.
##'
##'
##' @title Prediction based on a given MetaKTSP model
##' @param object MetaKTSP model.
##' @param test.dat A matrix with features in row and samples in column (p x n).
##' @param K The maximum number of top score pairs.
##' @return A matrix of prediction result (K x n) with samples in column and different row correspond to different K.
##' @author Chien-Wei Lin <masaki396@gmail.com>
predict.mtsp <- function(object, test.dat, K){
  tspobj <- object
  test.dat <- t(test.dat)
  predict <- foreach(i = 1 : K, .combine=rbind)%dopar% {
    if( as.numeric(tspobj$model[i, 5]) > 0 ){
      as.numeric(test.dat[ ,colnames(test.dat) == tspobj$model[i, 3]]  >  test.dat[ ,colnames(test.dat) == tspobj$model[i, 4]])
    } else {
      as.numeric(test.dat[ ,colnames(test.dat) == tspobj$model[i, 3]]  <=  test.dat[ ,colnames(test.dat) == tspobj$model[i, 4]])
    }
  }
  return(predict)
}
metaOmics/MetaKTSP documentation built on May 29, 2019, 4:43 a.m.