R/LRAcluster.R

Defines functions LRAcluster

Documented in LRAcluster

#' @name LRAcluster
#' @aliases LRAcluster
#' @title integrated analysis of cancer omics data by low-rank approximation
#' @description The LRAcluster function is the main interface of this package, it gets a list of matrix as input and outputs the coordinates of the samples in the reduced space and the explained potential.
#' @param data a list of data matrix,please keep the columns are the same order of samples
#' @param types a list of data types, can be binary, gaussian, or poisson
#' @param dimension the reduced dimension
#' @param names data names
#' @return A list contains the following component:
#'
#'         \code{coordinate} A matrix of the coordinates of all the samples in the reduced space
#'
#'         \code{potential}  ratio of explained variance
#' @keywords internal
#' @author Dingming Wu, Dongfang Wang
#' @references Wu D, Wang D, Zhang MQ, Gu J (2015). Fast dimension reduction and integrative clustering of multi-omics data using low-rank approximation: application to cancer molecular classification. BMC Genomics, 16(1):1022.
LRAcluster <- function(data, types, dimension = 2, names = as.character(1:length(data)))
{
  #--------#
  # binary #
  #--------#
  epsilon.binary<-2.0
  check.binary.row<-function(arr)
  {
    if (sum(!is.na(arr))==0)
    {
      return (F)
    }
    else
    {
      idx<-!is.na(arr)
      if (sum(arr[idx])==0 || sum(arr[idx])==sum(idx))
      {
        return (F)
      }
      else
      {
        return (T)
      }
    }
  }
  check.binary<-function(mat,name)
  {
    index<-apply(mat,1,check.binary.row)
    n<-sum(!index)
    if (n>0)
    {
      w<-paste("Warning: ",name," have ",as.character(n)," invalid lines!",sep="")
      warning(w)
    }
    mat_c<-mat[index,]
    rownames(mat_c)<-rownames(mat)[index]
    colnames(mat_c)<-colnames(mat)
    return (mat_c)
  }

  base.binary.row<-function(arr)
  {
    idx<-!is.na(arr)
    n<-sum(idx)
    m<-sum(arr[idx])
    return (log(m/(n-m)))
  }

  base.binary<-function(mat)
  {
    mat_b<-matrix(0,nrow(mat),ncol(mat))
    ar_b<-apply(mat,1,base.binary.row)
    mat_b[1:nrow(mat_b),]<-ar_b
    return (mat_b)
  }

  update.binary<-function(mat,mat_b,mat_now,eps)
  {
    mat_p<-mat_b+mat_now
    mat_u<-matrix(0,nrow(mat),ncol(mat))
    idx1<-!is.na(mat) & mat==1
    idx0<-!is.na(mat) & mat==0
    index<-is.na(mat)
    arr<-exp(mat_p)
    mat_u[index]<-mat_now[index]
    mat_u[idx1]<-mat_now[idx1]+eps*epsilon.binary/(1.0+arr[idx1])
    mat_u[idx0]<-mat_now[idx0]-eps*epsilon.binary*arr[idx0]/(1.0+arr[idx0])
    return (mat_u)
  }

  stop.binary<-function(mat,mat_b,mat_now,mat_u)
  {
    index<-!is.na(mat)
    mn<-mat_b+mat_now
    mu<-mat_b+mat_u
    arn<-exp(mn)
    aru<-exp(mu)
    idx1<-!is.na(mat) & mat==1
    idx0<-!is.na(mat) & mat==0
    lgn<-sum(log(arn[idx1]/(1+arn[idx1])))+sum(log(1/(1+arn[idx0])))
    lgu<-sum(log(aru[idx1]/(1+aru[idx1])))+sum(log(1/(1+aru[idx0])))
    return (lgu-lgn)
  }

  LL.binary<-function(mat,mat_b,mat_u)
  {
    index<-!is.na(mat)
    mu<-mat_b+mat_u
    aru<-exp(mu)
    idx1<-!is.na(mat) & mat==1
    idx0<-!is.na(mat) & mat==0
    lgu<-sum(log(aru[idx1]/(1+aru[idx1])))+sum(log(1/(1+aru[idx0])))
    return (lgu)
  }

  LLmax.binary<-function(mat)
  {
    return (0)
  }

  LLmin.binary<-function(mat,mat_b)
  {
    index<-!is.na(mat)
    aru<-exp(mat_b)
    idx1<-!is.na(mat) & mat==1
    idx0<-!is.na(mat) & mat==0
    lgu<-sum(log(aru[idx1]/(1+aru[idx1])))+sum(log(1/(1+aru[idx0])))
    return (lgu)
  }

  binary_type_base <- function( data,dimension=2 ,name="test")
  {
    data<-check.binary(data,name)
    data_b<-base.binary(data)
    data_now<-matrix(0,nrow(data),ncol(data))
    data_u<-update.binary(data,data_b,data_now)
    data_u<-nuclear_approximation(data_u,dimension)
    while (T)
    {
      thr<-stop.binary(data,data_b,data_now,data_u)
      message(thr)
      if (thr<0.2)
      {
        break
      }
      data_now<-data_u
      data_u<-update.binary(data,data_b,data_now)
      data_u<-nuclear_approximation(data_u,dimension)
    }
    return (data_now)
  }

  #----------#
  # gaussian #
  #----------#

  epsilon.gaussian=0.5

  check.gaussian.row<-function(arr)
  {
    if (sum(!is.na(arr))==0)
    {
      return (F)
    }
    else
    {
      return (T)
    }
  }
  check.gaussian<-function(mat,name)
  {
    index<-array(T,nrow(mat))
    for(i in 1:nrow(mat))
    {
      if (sum(is.na(mat[i,])==ncol(mat)))
      {
        war<-paste("Warning: ",name,"'s ",as.character(i)," line is all NA. Delete this line",sep="")
        warning(war)
        index[i]<-F
      }
    }
    mat_c<-mat[index,]
    rownames(mat_c)<-rownames(mat)[index]
    colnames(mat_c)<-colnames(mat)
    return (mat_c)
  }

  base.gaussian.row<-function(arr)
  {
    idx<-!is.na(arr)
    return (mean(arr[idx]))
  }

  base.gaussian<-function(mat)
  {
    mat_b<-matrix(0,nrow(mat),ncol(mat))
    ar_b<-apply(mat,1,base.gaussian.row)
    mat_b[1:nrow(mat_b),]<-ar_b
    return (mat_b)
  }

  update.gaussian<-function(mat,mat_b,mat_now,eps)
  {
    mat_p<-mat_b+mat_now
    mat_u<-matrix(0,nrow(mat),ncol(mat))
    index<-!is.na(mat)
    mat_u[index]<-mat_now[index]+eps*epsilon.gaussian*(mat[index]-mat_p[index])
    index<-is.na(mat)
    mat_u[index]<-mat_now[index]
    return (mat_u)
  }

  stop.gaussian<-function(mat,mat_b,mat_now,mat_u)
  {
    index<-!is.na(mat)
    mn<-mat_b+mat_now
    mu<-mat_b+mat_u
    ren<-mat[index]-mn[index]
    reu<-mat[index]-mu[index]
    lgn<- -0.5*sum(ren*ren)
    lgu<- -0.5*sum(reu*reu)
    return (lgu-lgn)
  }

  LL.gaussian<-function(mat,mat_b,mat_u)
  {
    index<-!is.na(mat)
    mu<-mat_b+mat_u
    reu<-mat[index]-mu[index]
    lgu<- -0.5*sum(reu*reu)
    return (lgu)
  }

  LLmax.gaussian<-function(mat)
  {
    return (0.0)
  }

  LLmin.gaussian<-function(mat,mat_b)
  {
    index<-!is.na(mat)
    reu<-mat[index]-mat_b[index]
    lgu<- -0.5*sum(reu*reu)
    return (lgu)
  }

  gaussian_base<-function(data,dimension=2,name="test")
  {
    data<-check.gaussian(data,name)
    data_b<-base.gaussian(data)
    data_now<-matrix(0,nrow(data),ncol(data))
    data_u<-update.gaussian(data,data_b,data_now)
    data_u<-nuclear_approximation(data_u,dimension)
    while(T)
    {
      thr<-stop.gaussian(data,data_b,data_now,data_u)
      message(thr)
      if (thr<0.2)
      {
        break
      }
      data_now<-data_u
      data_u<-update.gaussian(data,data_b,data_now)
      data_u<-nuclear_approximation(data_u,dimension)
    }
    return (data_now)
  }

  #---------#
  # poisson #
  #---------#

  epsilon.poisson<-0.5

  check.poisson.row<-function(arr)
  {
    if (sum(!is.na(arr))==0)
    {
      return (F)
    }
    else
    {
      idx<-!is.na(arr)
      if (sum(arr[idx]<0)>0)
      {
        return (F)
      }
      else
      {
        return (T)
      }
    }
  }

  check.poisson<-function(mat,name)
  {
    w<-paste(name," is poisson type. Add 1 to all counts",sep="")
    message(w)
    index<-apply(mat,1,check.poisson.row)
    n<-sum(!index)
    if (n>0)
    {
      w<-paste("Warning: ",name," have ",as.character(n)," invalid lines!",sep="")
      warning(w)
    }
    mat_c<-mat[index,]+1
    rownames(mat_c)<-rownames(mat)[index]
    colnames(mat_c)<-colnames(mat)
    return (mat_c)
  }

  base.poisson.row<-function(arr)
  {
    idx<-!is.na(arr)
    m<-sum(log(arr[idx]))
    n<-sum(idx)
    return(m/n)
  }

  base.poisson<-function(mat)
  {
    mat_b<-matrix(0,nrow(mat),ncol(mat))
    ar_b<-apply(mat,1,base.poisson.row)
    mat_b[1:nrow(mat_b),]<-ar_b
    return (mat_b)
  }

  update.poisson<-function(mat,mat_b,mat_now,eps)
  {
    mat_p<-mat_b+mat_now
    mat_u<-matrix(0,nrow(mat),ncol(mat))
    index<-!is.na(mat)
    mat_u[index]<-mat_now[index]+eps*epsilon.poisson*(log(mat[index])-mat_p[index])
    index<-is.na(mat)
    mat_u[index]<-mat_now[index]
    return (mat_u)
  }

  stop.poisson<-function(mat,mat_b,mat_now,mat_u)
  {
    index<-!is.na(mat)
    mn<-mat_b+mat_now
    mu<-mat_b+mat_u
    lgn<-sum(mat[index]*mn[index]-exp(mn[index]))
    lgu<-sum(mat[index]*mu[index]-exp(mu[index]))
    return (lgu-lgn)
  }

  LL.poisson<-function(mat,mat_b,mat_u)
  {
    index<-!is.na(mat)
    mu<-mat_b+mat_u
    lgu<-sum(mat[index]*mu[index]-exp(mu[index]))
    return (lgu)
  }

  LLmax.poisson<-function(mat)
  {
    index<-!is.na(mat)
    lgu<-sum(mat[index]*log(mat[index])-mat[index])
    return (lgu)
  }

  LLmin.poisson<-function(mat,mat_b)
  {
    index<-!is.na(mat)
    lgu<-sum(mat[index]*mat_b[index]-exp(mat_b[index]))
    return (lgu)
  }

  poisson_type_base<-function(data,dimension=2,name="test")
  {
    data<-check.poisson(data,name)
    data_b<-base.poisson(data)
    data_now<-matrix(0,nrow(data),ncol(data))
    data_u<-update.poisson(data,data_b,data_now)
    data_u<-nuclear_approximation(data_u,dimension)
    while(T)
    {
      thr<-stop.poisson(data,data_b,data_now,data_u)
      message(thr)
      if (thr<0.2)
      {
        break
      }
      data_now<-data_u
      data_u<-update.poisson(data,data_b,data_now)
      data_u<-nuclear_approximation(data_u,dimension)
    }
    return (data_now)
  }

  #----#
  # na #
  #----#

  nuclear_approximation<-function(mat,dimension)
  {
    svd<-svd(mat,nu=0,nv=0)
    if (dimension<length(svd$d))
    {
      lambda<-svd$d[dimension+1]
      svd<-svd(mat,nu=dimension,nv=dimension)
      indexh<-svd$d>lambda
      indexm<-svd$d<lambda
      dia<-array(svd$d,length(svd$d))
      dia[indexh]<-dia[indexh]-lambda
      dia[indexm]<-0
      mat_low<-svd$u%*%diag(c(dia[1:dimension],0))[1:dimension,1:dimension]%*%t(svd$v)
    }
    else
    {
      mat_low<-mat
    }
    return (mat_low)
  }

  #------------#
  # LRAcluster #
  #------------#
  check.matrix.element<-function(x)
  {
    if (!is.matrix(x))
    {
      return (T)
    }
    else
    {
      return (F)
    }
  }

  ncol.element<-function(x)
  {
    return (ncol(x))
  }

  nrow.element<-function(x)
  {
    return (nrow(x))
  }

  check<-function(mat,type,name)
  {
    if (type=="binary")
    {
      return (check.binary(mat,name))
    }
    else if (type=="gaussian")
    {
      return (check.gaussian(mat,name))
    }
    else if (type=="poisson")
    {
      return (check.poisson(mat,name))
    }
    else
    {
      e<-paste("unknown type ",type,sep="")
      stop(e)
    }
  }

  eps<-0.0
  if (!is.list(data))
  {
    stop("the input data must be a list!")
  }
  c<-sapply(data,check.matrix.element)
  if (sum(c)>0)
  {
    stop("each element of input list must be a matrix!")
  }
  c<-sapply(data,ncol.element)
  if (length(levels(factor(c)))>1)
  {
    stop("each element of input list must have the same column number!")
  }
  if (length(data)!=length(types))
  {
    stop("data and types must be the same length!")
  }
  nSample<-c[1]
  loglmin<-0
  loglmax<-0
  loglu<-0.0
  nData<-length(data)
  for (i in 1:nData)
  {
    data[[i]]<-check(data[[i]],types[[i]],names[[i]])
  }
  nGeneArr<-sapply(data,nrow.element)
  nGene<-sum(nGeneArr)
  indexData<-list()
  k=1
  for(i in 1:nData)
  {
    indexData[[i]]<- (k):(k+nGeneArr[i]-1)
    k<-k+nGeneArr[i]
  }
  base<-matrix(0,nGene,nSample)
  now<-matrix(0,nGene,nSample)
  update<-matrix(0,nGene,nSample)
  thr<-array(0,nData)
  for (i in 1:nData)
  {
    if (types[[i]]=="binary")
    {
      base[indexData[[i]],]<-base.binary(data[[i]])
      loglmin<-loglmin+LLmin.binary(data[[i]],base[indexData[[i]],])
      loglmax<-loglmax+LLmax.binary(data[[i]])
    }
    else if (types[[i]]=="gaussian")
    {
      base[indexData[[i]],]<-base.gaussian(data[[i]])
      loglmin<-loglmin+LLmin.gaussian(data[[i]],base[indexData[[i]],])
      loglmax<-loglmax+LLmax.gaussian(data[[i]])
    }
    else if (types[[i]]=="poisson")
    {
      base[indexData[[i]],]<-base.poisson(data[[i]])
      loglmin<-loglmin+LLmin.poisson(data[[i]],base[indexData[[i]],])
      loglmax<-loglmax+LLmax.poisson(data[[i]])
    }
  }
  for (i in 1:nData)
  {
    if (types[[i]]=="binary")
    {
      update[indexData[[i]],]<-update.binary(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
    }
    else if (types[[i]]=="gaussian")
    {
      update[indexData[[i]],]<-update.gaussian(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
    }
    else if (types[[i]]=="poisson")
    {
      update[indexData[[i]],]<-update.poisson(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
    }
  }
  update<-nuclear_approximation(update,dimension)
  nIter<-0
  thres<-array(Inf,3)
  epsN<-array(Inf,2)
  while(T)
  {
    for (i in 1:nData)
    {
      if (types[[i]]=="binary")
      {
        thr[i]<-stop.binary(data[[i]],base[indexData[[i]],],now[indexData[[i]],],update[indexData[[i]],])
      }
      else if (types[[i]]=="gaussian")
      {
        thr[i]<-stop.gaussian(data[[i]],base[indexData[[i]],],now[indexData[[i]],],update[indexData[[i]],])
      }
      else if (types[[i]]=="poisson")
      {
        thr[i]<-stop.poisson(data[[i]],base[indexData[[i]],],now[indexData[[i]],],update[indexData[[i]],])
      }
    }
    nIter<-nIter+1
    thres[1]<-thres[2]
    thres[2]<-thres[3]
    thres[3]<-sum(thr)
    epsN[1]<-epsN[2]
    epsN[2]<-eps
    if (nIter>5)
    {
      if (runif(1)<thres[1]*thres[3]/(thres[2]*thres[2]+thres[1]*thres[3]))
      {
        eps<-epsN[1]+0.05*runif(1)-0.025
      }
      else
      {
        eps<-epsN[2]+0.05*runif(1)-0.025
      }
      if (eps< -0.7)
      {
        eps<- 0
        epsN<-c(0,0)
      }
      if (eps > 1.4)
      {
        eps<-0
        epsN<-c(0,0)
      }
    }
    if (sum(thr)<nData*0.2)
    {
      break
    }
    now<-update
    for (i in 1:nData)
    {
      if (types[[i]]=="binary")
      {
        update[indexData[[i]],]<-update.binary(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
      }
      else if (types[[i]]=="gaussian")
      {
        update[indexData[[i]],]<-update.gaussian(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
      }
      else if (types[[i]]=="poisson")
      {
        update[indexData[[i]],]<-update.poisson(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
      }
    }
    update<-nuclear_approximation(update,dimension)
  }
  for (i in 1:nData)
  {
    if (types[[i]]=="binary")
    {
      loglu<-loglu+LL.binary(data[[i]],base[indexData[[i]],],update[indexData[[i]],])
    }
    else if (types[[i]]=="gaussian")
    {
      loglu<-loglu+LL.gaussian(data[[i]],base[indexData[[i]],],update[indexData[[i]],])
    }
    else if (types[[i]]=="poisson")
    {
      loglu<-loglu+LL.poisson(data[[i]],base[indexData[[i]],],update[indexData[[i]],])
    }
  }
  sv<-svd(update,nu=0,nv=dimension)
  coordinate<-diag(c(sv$d[1:dimension],0))[1:dimension,1:dimension]%*%t(sv$v)
  colnames(coordinate)<-colnames(data[[1]])
  rownames(coordinate)<-paste("PC ",as.character(1:dimension),sep="")
  ratio<-(loglu-loglmin)/(loglmax-loglmin)
  return (list("coordinate"=coordinate,"potential"=ratio))
}
xlucpu/MOVICS documentation built on July 24, 2021, 9:23 p.m.