R/PredScore.r

Defines functions DataPrep_test score_func

Documented in DataPrep_test score_func

#============================================================================================#
#======================= some helper functions for prediction score =========================#
#============================================================================================#

### log 1: 10/11/2019: add an argument for Demo transformation
###        10/11/2019: add a section for N-fold cv

# see the vignettes for an concrete example

### function 1
#=========================================================================================#
#========== A function to create pseudo data sets used in the score calculation ==========#
#=================================== input ===============================================#
# GeneExp: test GeneExp data                                                              #
# CellProp: test CellProp data                                                            #
# Demo: test Demo data except for the response vector; Null if no such information        #
# train_response: the response vector from training data                                  #
# include.demo: this option should be the same as the option in FastMix. Default = T      #
# w: the weight covariance matrix among subjects                                          #
#=========================================================================================#

###  A function to generate the test data. Here, w is the weight
### function. Response_idx is the idx of response variable in the Demo
### matrix Demo here is a vertor/matrix that contains demo information
### except for the Response vector, because this is the variable we
### want to classify.
DataPrep_test <- function(GeneExp, CellProp, Demo, train_Demo, train_response, include.demo=TRUE, w="iid"){
  ### add the case for n = 1

  GeneExp = as.matrix(GeneExp)
  m <- nrow(GeneExp); n <- ncol(GeneExp)
  ## assign gene names if empty
  if (is.null(rownames(GeneExp))) rownames(GeneExp) <- paste0("Gene", 1:m)
  ## turn vectors into matrix
  CellProp <- as.matrix(CellProp)
  ## assign colnames (variable names) if empty
  if (is.null(colnames(CellProp))) colnames(CellProp) <- paste0("Cell", 1:ncol(CellProp))

  ### Demo may be Null
  if(is.null(Demo)) {
    Demo0 = c()
  }
  else{
    Demo <- as.matrix(Demo)
    if(is.null(colnames(Demo))) {
      colnames(Demo) <- paste0("Var", 1:ncol(Demo))
    }

    ## standardize all clinical covariates: can not do it directly
    mean_sd = attributes(scale(train_Demo))
    for(i in 1:mean_sd$dim[2]){
      me = mean_sd$`scaled:center`[i]
      se = mean_sd$`scaled:scale`[i]
      Demo[,i] = (Demo[,i] - me) / se
    }
    Demo0 = Demo
    #Demo0 <- scale(Demo)
  }


  ### by definition,
  response_impute = sort(unique(scale(train_response)))
  #============================================================#
  #======== here we replace the Demo0 as another vector =======#
  #============================================================#

  Demo0_0 = cbind(Demo0, Res = rep(response_impute[1], n))
  Demo0_1 = cbind(Demo0, Res = rep(response_impute[2], n))
  # if(Response == 0){
  #   ### Generate the data assuming Response = 0
  #   ## actually if Demo is binary, then after scaling, the subjects with Demo = 0 should be <0.
  #   R_value = unique(Demo0[,Response_idx]) ### the unique 2 values
  #   R_value = R_value[R_value < 0] ### the value we use to impute the data
  #   Demo0[Demo0[,Response_idx] != R_value, Response_idx] = R_value ### replace the Demo0
  # }
  # else if(Response == 1){
  #   R_value = unique(Demo0[,Response_idx])
  #   R_value = R_value[R_value > 0] ### the value we use to impute the data
  #   Demo0[Demo0[,Response_idx] != R_value, Response_idx] = R_value ### replace the Demo0
  # }
  #Demo0 = Demo
  ## create interaction terms



  #============================================#
  #======= construct Data0 ====================#
  #============================================#
  Crossterms.idx <- cbind(rep(1:ncol(CellProp), ncol(Demo0_0)),
                          rep(1:ncol(Demo0_0), each=ncol(CellProp)))
  rownames(Crossterms.idx) <- paste(colnames(CellProp)[Crossterms.idx[,1]],
                                    colnames(Demo0_0)[Crossterms.idx[,2]], sep=".")
  Crossterms <- sapply(1:nrow(Crossterms.idx), function(l) {
    k <- Crossterms.idx[l,1]; p <- Crossterms.idx[l,2]
    CellProp[,k] * Demo0_0[,p]
  })
  if(is.null(dim(Crossterms))){
    Crossterms = t(as.matrix(Crossterms))
  }
  colnames(Crossterms) <- rownames(Crossterms.idx)
  ## combine all variables
  if (include.demo==TRUE){
    X0 <- cbind(CellProp, Demo0_0, Crossterms)
  } else {
    X0 <- cbind(CellProp, Crossterms)
  }

  ## by default, the weighting matrix is a diagonal matrix,
  ## representing the i.i.d. data
  if (all(w=="iid")) w <- diag(nrow(X0))
  ### weighted sample
  X0 = w %*% X0
  GeneExp = GeneExp %*% w

  ## we ignore the svd because all Demo0 is the same.
  #ss <- svd(X0)$d
  #if (min(ss)<1e-7) stop("The design matrix is numerically singular. Please consider: (a) include less cell types in the model, or (b) use 'include.demo=FALSE' to exclude the main effects of demographic covariates from the model.")
  ## create the long table for all covariates
  X <- cbind("ID"=rep(1:m, each=n),
             do.call(rbind, replicate(m,X0,simplify=FALSE)))
  ## create the long vector of Y
  Y <- as.vector(t(GeneExp))

  Data_0 = list(X=X,Y=Y)


  #============================================#
  #======= construct Data1 ====================#
  #============================================#
  Crossterms.idx <- cbind(rep(1:ncol(CellProp), ncol(Demo0_1)),
                          rep(1:ncol(Demo0_1), each=ncol(CellProp)))
  rownames(Crossterms.idx) <- paste(colnames(CellProp)[Crossterms.idx[,1]],
                                    colnames(Demo0_1)[Crossterms.idx[,2]], sep=".")
  Crossterms <- sapply(1:nrow(Crossterms.idx), function(l) {
    k <- Crossterms.idx[l,1]; p <- Crossterms.idx[l,2]
    CellProp[,k] * Demo0_1[,p]
  })
  if(is.null(dim(Crossterms))){
    Crossterms = t(as.matrix(Crossterms))
  }
  colnames(Crossterms) <- rownames(Crossterms.idx)
  ## combine all variables
  if (include.demo==TRUE){
    X0 <- cbind(CellProp, Demo0_1, Crossterms)
  } else {
    X0 <- cbind(CellProp, Crossterms)
  }

  ### weighted sample
  X0 = w %*% X0
  GeneExp = GeneExp %*% w

  ## we ignore the svd because all Demo0 is the same.
  #ss <- svd(X0)$d
  #if (min(ss)<1e-7) stop("The design matrix is numerically singular. Please consider: (a) include less cell types in the model, or (b) use 'include.demo=FALSE' to exclude the main effects of demographic covariates from the model.")
  ## create the long table for all covariates
  X <- cbind("ID"=rep(1:m, each=n),
             do.call(rbind, replicate(m,X0,simplify=FALSE)))
  ## create the long vector of Y
  Y <- as.vector(t(GeneExp))

  Data_1 = list(X=X,Y=Y)

  return(list(Data_0 = Data_0, Data_1 = Data_1))
}



### function 3
#=========================================================================================#
#========== A function to calculated summary scores used in classification ===============#
#=================================== input ===============================================#
#=========================================================================================#
# mod1: the fitted mod using FastMix from training data                                   #
# GeneExp: test GeneExp data                                                              #
# CellProp: test CellProp data                                                            #
# Demo: test Demo data. We require it to be binary here                                   #
# idx: the index for covariates without Demo interaction                                  #
#=========================================================================================#

### m, gene number; n, subjects number; interaction_index, the index
### in design matrix for the interaction between cell and Demo
### Response_idx is the single index of the main response variable.
score_func = function(mod1, Data_0, Data_1, Response_interaction_index, Response_idx, sig.level=0.05){
  ### gene-level coefficient
  gene_coef = mod1$beta.mat

  m_train = dim(mod1$beta.mat)[1]
  p_train = dim(mod1$beta.mat)[2]
  m = length(unique(Data_0$X[,1]))
  n = dim(Data_0$X)[1]/m
  p = dim(Data_0$X)[2]-1 # -1 for ID column

  if(p != p_train){
    stop("The number of covariates is different, please check!")
  }
  if(m_train != m){
    stop("The number of genes is different, please check!")
  }
  if (length(Response_idx) !=1 & !is.null(Response_idx)) stop("You must select a single Response variable or set it as null.")
  ### more than one sybjects in the test data
  if(n > 1) {
    ### because the data is stacked by ID, we also need to argument the coef_matrix
    #expand_coef = gene_coef[rep(1:nrow(gene_coef), each = n), ] # n is the number of subjects
    mu_0 = gene_coef %*% t(as.matrix(Data_0$X[1:n,-1])) ## the first col is ID, and the design matrix doesn't change over genes
    mu_1 = gene_coef %*% t(as.matrix(Data_1$X[1:n,-1])) ## the first col is ID, and the design matrix doesn't change over genes

    Y = matrix(Data_0$Y, ncol = n, nrow = m, byrow = T) ### transform the Y vector into the m by n matrix
    res = Y - mu_0   ### Y - \mu_0
    delta = mu_1 - mu_0


    #=======================================================#
    #====== step1 : generate a single summary score ========#
    #=======================================================#
    delta_scale = apply(delta, 2, function(x) sqrt(sum(x^2)))
    ### calculate the
    single_score = t(res) %*% delta
    single_score = diag(single_score) / (delta_scale)


    #=======================================================#
    #====== step2: generate multi summary scores ===========#
    #=======================================================#
    multi_score = res * delta ### m by n

    #=======================================================#
    #====== step3: generate sparse summary scores ==========#
    #=======================================================#
    ### the difference would be the calculation of delta vector

    ### we first identify DEGs in the interaction directions
    union_index = c()
    l = length(Response_interaction_index)

    ### here, we need to consider the case when there is no res in te mdoel
    if(is.null(Response_idx)){
      sparse_coef = fix_coef = matrix(0, ncol = l, nrow = m)
      #fix_coef[,1:length(Response_idx)] = mod1$fixed.results[Response_idx,1] ### the coef for Demo variable
      #sparse_coef[,1:length(Response_idx)] = mod1$beta.mat[,Response_idx]-mod1$fixed.results[Response_idx,1]
      ### fill the coef for interaction
      for(i in 1:l){
        fix_coef[,i] = mod1$fixed.results[Response_interaction_index[i],1]
        idx = which((mod1$re.ind.pvalue)[,Response_interaction_index[i]] < sig.level)
        union_index = union(union_index, idx)

        ## the reason for this "-" is because beta.mat is the individual
        ## coefficient (fix+ranodm)
        sparse_coef[idx,i] = mod1$beta.mat[idx,Response_interaction_index[i]] - mod1$fixed.results[Response_interaction_index[i],1]
        ### only random effects
      }
    } else{
      sparse_coef = fix_coef = matrix(0, ncol = l+length(Response_idx), nrow = m)
      fix_coef[,1:length(Response_idx)] = mod1$fixed.results[Response_idx,1] ### the coef for Demo variable
      sparse_coef[,1:length(Response_idx)] = mod1$beta.mat[,Response_idx]-mod1$fixed.results[Response_idx,1]
      ### fill the coef for interaction
      for(i in 1:l){
        fix_coef[,i+length(Response_idx)] = mod1$fixed.results[Response_interaction_index[i],1]
        idx = which((mod1$re.ind.pvalue)[,Response_interaction_index[i]] < sig.level)
        union_index = union(union_index, idx)

        ## the reason for this "-" is because beta.mat is the individual
        ## coefficient (fix+ranodm)
        sparse_coef[idx,i+length(Response_idx)] = mod1$beta.mat[idx,Response_interaction_index[i]] - mod1$fixed.results[Response_interaction_index[i],1]
        ### only random effects
      }
    }

    ### calculate the sparse score
    delta_sparse_fix = fix_coef %*% as.matrix(t(Data_1$X[1:n,c(Response_idx, Response_interaction_index)+1] -
                                                  Data_0$X[1:n,c(Response_idx, Response_interaction_index)+1]))
    delta_sparse_random = sparse_coef %*%  as.matrix(t(Data_1$X[1:n,c(Response_idx, Response_interaction_index)+1] -
                                                         Data_0$X[1:n,c(Response_idx, Response_interaction_index)+1]))

    delta_sparse_scale= apply(delta_sparse_fix+delta_sparse_random, 2, function(x) sqrt(sum(x^2)))
    single_sparse_score = t(res) %*% (delta_sparse_fix+delta_sparse_random)
    single_sparse_score = diag(single_sparse_score) / (delta_sparse_scale)

    #=======================================================#
    #====== step4: generate sparse multiple scores =========#
    #=======================================================#

    multi_sparse_score = rbind("fixed"=colSums(res * (delta_sparse_fix)),
                                        (res * delta_sparse_random)[union_index,]) ### m by n


    result = list(single_score = single_score, single_sparse_score = single_sparse_score, multi_score = multi_score, multi_sparse_score = multi_sparse_score)
    return(result)
  }
  else if(n == 1) {
    ### because the data is stacked by ID, we also need to argument the coef_matrix
    #expand_coef = gene_coef[rep(1:nrow(gene_coef), each = n), ] # n is the number of subjects
    mu_0 = gene_coef %*% as.matrix(Data_0$X[1:n,-1]) ## the first col is ID, and the design matrix doesn't change over genes
    mu_1 = gene_coef %*% as.matrix(Data_1$X[1:n,-1]) ## the first col is ID, and the design matrix doesn't change over genes

    Y = matrix(Data_0$Y, ncol = n, nrow = m, byrow = T) ### transform the Y vector into the m by n matrix
    res = Y - mu_0   ### Y - \mu_0
    delta = mu_1 - mu_0


    #=======================================================#
    #====== step1 : generate a single summary score ========#
    #=======================================================#
    delta_scale = apply(delta, 2, function(x) sqrt(sum(x^2)))
    ### calculate the
    single_score = t(res) %*% delta
    single_score = diag(single_score) / (delta_scale)


    #=======================================================#
    #====== step2: generate multi summary scores ===========#
    #=======================================================#
    multi_score = res * delta ### m by n

    #=======================================================#
    #====== step3: generate sparse summary scores ==========#
    #=======================================================#
    ### the difference would be the calculation of delta vector

    ### we first identify DEGs in the interaction directions
    union_index = c()
    l = length(Response_interaction_index)

    if(is.null(Response_idx)){
      sparse_coef = fix_coef = matrix(0, ncol = l+length(Response_idx), nrow = m)
      #fix_coef[,1:length(Response_idx)] = mod1$fixed.results[Response_idx,1] ### the coef for Demo variable
      #sparse_coef[,1:length(Response_idx)] = mod1$beta.mat[,Response_idx]-mod1$fixed.results[Response_idx,1]
      ### fill the coef for interaction
      for(i in 1:l){
        fix_coef[,i+length(Response_idx)] = mod1$fixed.results[Response_interaction_index[i],1]
        idx = which((mod1$re.ind.pvalue)[,Response_interaction_index[i]] < sig.level)
        union_index = union(union_index, idx)

        ## the eason for this "-" is because beta.mat is the individual coefficient (fix+ranodm)
        sparse_coef[idx,i+length(Response_idx)] = mod1$beta.mat[idx,Response_interaction_index[i]] -
          mod1$fixed.results[Response_interaction_index[i],1]
        ### only random effects
      }
    } else{
        sparse_coef = fix_coef = matrix(0, ncol = l+length(Response_idx), nrow = m)
        fix_coef[,1:length(Response_idx)] = mod1$fixed.results[Response_idx,1] ### the coef for Demo variable
        sparse_coef[,1:length(Response_idx)] = mod1$beta.mat[,Response_idx]-mod1$fixed.results[Response_idx,1]
        ### fill the coef for interaction
        for(i in 1:l){
          fix_coef[,i+length(Response_idx)] = mod1$fixed.results[Response_interaction_index[i],1]
          idx = which((mod1$re.ind.pvalue)[,Response_interaction_index[i]] < sig.level)
          union_index = union(union_index, idx)

          ## the eason for this "-" is because beta.mat is the individual coefficient (fix+ranodm)
          sparse_coef[idx,i+length(Response_idx)] = mod1$beta.mat[idx,Response_interaction_index[i]] -
            mod1$fixed.results[Response_interaction_index[i],1]
          ### only random effects
        }
    }

    ### calculate the sparse score
    delta_sparse_fix = fix_coef %*% as.matrix((Data_1$X[1:n,c(Response_idx, Response_interaction_index)+1] -
                                                  Data_0$X[1:n,c(Response_idx, Response_interaction_index)+1]))
    delta_sparse_random = sparse_coef %*%  as.matrix((Data_1$X[1:n,c(Response_idx, Response_interaction_index)+1] -
                                                         Data_0$X[1:n,c(Response_idx, Response_interaction_index)+1]))

    delta_sparse_scale= apply(delta_sparse_fix+delta_sparse_random, 2, function(x) sqrt(sum(x^2)))
    single_sparse_score = t(res) %*% (delta_sparse_fix+delta_sparse_random)
    single_sparse_score = diag(single_sparse_score) / (delta_sparse_scale)

    #=======================================================#
    #====== step4: generate sparse multiple scores =========#
    #=======================================================#

    multi_sparse_score = c("fixed"=colSums(res * (delta_sparse_fix)),
                               (res * delta_sparse_random)[union_index,]) ### m by n


    result = list(single_score = single_score, single_sparse_score = single_sparse_score, multi_score = multi_score, multi_sparse_score = multi_sparse_score)

    return(result)
  }

}
terrysun0302/FastMix documentation built on Nov. 14, 2019, 4:54 a.m.