R/palimpsest_utils.R

Defines functions compare_results palimpsest_distCosine signature_colour_generator cit.discretize cit.peaks cit.density factochar cit.pangenomPlot cit.pangenomCoord cit.genomOrder bed2index palimpsest_makeMutypeMatFromVcf palimpsest_distCosine palimpsest_dfPosXSegm

Documented in bed2index cit.density cit.discretize cit.genomOrder cit.pangenomCoord cit.pangenomPlot cit.peaks compare_results factochar palimpsest_dfPosXSegm palimpsest_distCosine palimpsest_makeMutypeMatFromVcf signature_colour_generator

#' palimpsest_dfPosXSegm
#'
#' Function to annotate positions in a data frame (dfPos) using segments in another data frame (dfSegm)
#' @param dfPos Data frame with positions to annotate
#' @param dfPos.chrom.col Chromosome column in dfPos
#' @param dfPos.pos.col Position column in dfPos
#' @param dfSegm Data frame with segments to use for annotating dfPos
#' @param dfSegm.chrom.col Chromosome column in dfSegm
#' @param dfSegm.start.col Start position column in dfSegm
#' @param dfSegm.end.col End position column in dfSegm
#' @param colsToAdd Names of columns in dfSegm that should be used to annotate dfPos
#' @param namesColsToAdd Column names to give in the columns added to dfPos
#' @param multseg required
#'
#' @export
#' @import gtools

palimpsest_dfPosXSegm <- function(dfPos=NULL,
                               dfPos.chrom.col="chrom",
                               dfPos.pos.col="pos",
                               dfSegm=NULL,
                               dfSegm.chrom.col="chrom",
                               dfSegm.start.col="start",
                               dfSegm.end.col="end",
                               colsToAdd=NULL,
                               namesColsToAdd=NULL,
                               multseg=c(NA,"first","all")[1]
)
{
  for(col in namesColsToAdd)	dfPos[,col] <- NA
  dfSegm. <- split(dfSegm,dfSegm[,dfSegm.chrom.col])
  dfPos. <- split(dfPos,dfPos[,dfPos.chrom.col])
  chr_listy <- mixedsort(intersect(names(dfSegm.),names(dfPos.)))
  total <- length(chr_listy)
  pb <- txtProgressBar(min = 0, max = total, style = 3)
  counter_start <- c()
  if(is.na(multseg)){
    for(chr in chr_listy)
    {
      counter_start <- match(chr,chr_listy)
      Sys.sleep(0.1)
      # update progress bar
      setTxtProgressBar(pb, counter_start)

      ind <- unlist(sapply(dfPos.[[chr]][,dfPos.pos.col],function(pos){
        tmp <- which(dfSegm.[[chr]][,dfSegm.start.col] <= pos & dfSegm.[[chr]][,dfSegm.end.col] >= pos)
        if(length(tmp)==1)	tmp else{NA}
      }))
      if (all(is.na(ind)) == TRUE)
        dfPos.[[chr]][, namesColsToAdd] <-
        NA
      if (all(is.na(ind)) != TRUE) {
        dfPos.[[chr]][, namesColsToAdd] <- dfSegm.[[chr]][ind,
                                                          colsToAdd]
      }
    }
    close(pb)

  }else{
    if(multseg=="first"){
      for(chr in chr_listy)
      {
        counter_start <- match(chr,chr_listy)
        Sys.sleep(0.1)
        # update progress bar
        setTxtProgressBar(pb, counter_start)
        ind <- unlist(sapply(dfPos.[[chr]][,dfPos.pos.col],function(pos){
          tmp <- which(dfSegm.[[chr]][,dfSegm.start.col] <= pos & dfSegm.[[chr]][,dfSegm.end.col] >= pos)[1]
        }))
        dfPos.[[chr]][,namesColsToAdd] <- dfSegm.[[chr]][ind,colsToAdd]
      }
      close(pb)

    }
    if(multseg=="all"){
      for(chr in chr_listy)
      {
        counter_start <- match(chr,chr_listy)
        Sys.sleep(0.1)
        # update progress bar
        setTxtProgressBar(pb, counter_start)

        ind <- sapply(dfPos.[[chr]][,dfPos.pos.col],function(pos){
          tmp <- which(dfSegm.[[chr]][,dfSegm.start.col] <= pos & dfSegm.[[chr]][,dfSegm.end.col] >= pos)
          tmp
        })
        for(j in 1:length(colsToAdd)){
          dfPos.[[chr]][,namesColsToAdd[j]] <- unlist(lapply(ind,function(z)	paste(dfSegm.[[chr]][z,colsToAdd[j]],collapse=",")))
          dfPos.[[chr]][which(sapply(ind,length)==0),namesColsToAdd[j]] <- NA
        }
      }
      close(pb)
    }
  }
  dfPos <- unsplit(dfPos.,dfPos[,dfPos.chrom.col])
  dfPos
}

#' palimpsest_distCosine
#'
#' Function to calculate cosine distance
#' @param required m 
#'
#' @export
#' @importFrom lsa cosine

palimpsest_distCosine <- function(m)
{
  nsamp <- nrow(m)
  res <- matrix(NA,nrow=nsamp,ncol=nsamp)
  rownames(res) <- colnames(res) <- rownames(m)
  for(i in 1:nsamp)
  {
    for(j in 1:nsamp)
    {
      res[i,j] <- cosine(m[i,],m[j,])
    }
  }
  as.dist(1-res)
}

#' palimpsest_makeMutypeMatFromVcf
#'
#' Function to create a matrix in mutation type x sample format with either counts or proportions
#' @param vcf vcf data frame containing the mutations
#' @param sample.col Name of column containing sample names
#' @param mutcat.col Name of column containing mutation categories
#' @param mutypes List of categories to be included in the output
#' @param proportion If TRUE, the output matrix will indicate mutation proportions instead of numbers
#'
#' @export

palimpsest_makeMutypeMatFromVcf <- function(vcf,
                                 sample.col="sample",
                                 mutcat.col="mutcat3",
                                 mutypes=c("CA","CG","CT","TA","TC","TG"),
                                 proportion=TRUE)
{
  tmp <- split(vcf,vcf[,sample.col])
  tmp <- lapply(tmp,function(d){
    sapply(mutypes,function(m){sum(d[,mutcat.col]==m,na.rm=T)})
  })
  tmp <- as.matrix(as.data.frame(tmp))
  if(proportion){for(j in 1:ncol(tmp))	tmp[,j] <- tmp[,j]/sum(tmp[,j])}
  tmp
}

#' bed2index
#'
#' convert bed to index object
#' @param x bed file
#' @param sort required
#'
#' @export

bed2index <- function(x, sort = TRUE) {
  index <- paste0(
    x[, 1],
    ':',
    x[, 2],
    '-',
    x[, 3]
  );
  if (sort) {
    index <- bedr.sort.region(index);
  }
  index;
}

#' cit.genomOrder
#'
#' @param d data.frame
#' @param chrom chromosome column name
#' @param pos sequence location column name
#' @param startPos the column in d giving the start base pair position
#' @param endPos the column in d giving the end base pair position
#' @param chromNum the name of added column whith chromosome from 1 to 24
#'
#' @export

cit.genomOrder <- function(d,
                            chrom     = "chrom",
                            pos       = "pos",
                            startPos  = "start",
                            endPos    = "end" ,
                            chromNum  = "chrNum"
)
{
  d <- factochar(d)
  if(!chromNum %in% names(d))  d <- cit.chromString2num(d,chrom=chrom)

  if (is.null(pos)) {
    pos <- "meanPos"
    if (options("verbose")$verbose)
      warning("missing value for parameter pos -> using default value (meanPos).")
  }

  if(!pos %in% colnames(d))
  {
    if(all(c(startPos,endPos) %in% colnames(d))){d$meanPos <- rowMeans(d[,c(startPos,endPos)],na.rm=T)}else{
      stop("Please set valid pos or startPos and endPos values.")
    }
    pos <- "meanPos"
  }
  ord <- order(d[,chromNum], as.numeric(d[,pos]), na.last = TRUE)
  d <- d[ord,]

  d
}


#' cit.pangenomCoord
#'
#' @param x data.frame which columns include genomic position information
#' @param chrom the column in x giving the chromosome position (in 1:22,X,Y)
#' @param pos the column in x giving the base pair position   (optional)
#' @param startPos the column in x giving the start base pair position
#' @param endPos the column in x giving the end base pair position
#' @param cyto cytoband object (optional)
#' @param absPos name of added column giving absolute pangenomic order
#' @param chromNum the column in x giving the chromosome position with X = 23 & Y =24
#'
#' @export

cit.pangenomCoord <- function( x,
                                chrom="chrom",
                                pos="meanPos",
                                startPos="start",
                                endPos="end",
                                cyto=NULL,
                                absPos="absPos",
                                chromNum="chrNum"
)
{
  if(is.null(cyto))  stop("Cytoband object required")

  x <- cit.genomOrder(d=x,
                       chrom     = chrom,
                       pos       = pos,
                       startPos  = startPos,
                       endPos    = endPos ,
                       chromNum  = chromNum)

  if(absPos %in% colnames(x))  return(x)

  if(!endPos %in% colnames(x)){
    endPos <- "endPos"
    if(pos %in% colnames(x)){
      x[,endPos] <- x[,pos]
      if(options("verbose")$verbose) warning("uncorrect value for parameter endPos -> using pos value.")
    }else{
      stop("missing value for pos or endPos.")
    }
  }

  if(is.null(absPos)){
    absPos <- "absPos"
    if(options("verbose")$verbose) warning("missing value for parameter absPos -> using default value.")
  }

  if(!absPos %in% names(x)){
    x$absPos <- as.numeric(x[,pos])
    if(pos %in% colnames(x)){
      x[,absPos] <- as.numeric(x[,pos])
      if(options("verbose")$verbose) warning("uncorrect value for parameter absPos -> using pos value.")
    }else{
      stop("missing value for absPos or pos.")
    }
  }

  if(!is.null(cyto)){
    if(!"ChrNumeric" %in% names(cyto) ) cyto <-  cit.chromString2num(cyto,chrom="Chromosome",chromNum="ChrNumeric")

    allchrnum <- sort(unique(c(cyto$ChrNumeric,x[,chromNum])))

    maxByChr <- sapply(allchrnum,function(z){
      m <- 0
      w <- which(cyto$"ChrNumeric"==z)
      if(length(w)>0) m <- max(cyto[w,"End"],na.rm=TRUE)
      w <- which(x[,chromNum]==z)
      if(length(w)>0){
        if( which.max(c(m,max(x[w,endPos])))==2 )
          warning("In cit.pangenomCoord, some positions in your data are superior to max cytoband in cyto.\ncyto object used does not seem to be the right cytoband object version.\n")
        m <- max(c(m,max(x[w,endPos],na.rm=TRUE)))
      }
      m})

    maxByChr <- cumsum(as.numeric(maxByChr))

  }else{
    maxByChr <- cumsum(as.numeric(sapply(split(x[,endPos],x[,chromNum]),max)))
  }


  for(i in setdiff(unique(x[,chromNum]),1)){
    w <- which(x[,chromNum] == i)
    x[w,absPos] <-  x[w,absPos] + maxByChr[i-1]
  }
  w <- which(is.na(x[,chromNum]))
  if(length(w)>0) x[w,absPos] <- NA
  x
}

#' cit.pangenomPlot
#'
#' pangenomic plot
#' @param d data.frame which columns include genomic position information and the wanted y axis
#' @param ycol the column in d to be used as y axis
#' @param chrom the column in  d giving the chromosome position
#' @param pos the column in d giving the base pair position   (optional)
#' @param startPos the column in d giving the start base pair position
#' @param endPos the column in  d giving the end base pair position
#' @param colorscol (optional) column in d giving the color for each point from ycol or a unique color name or number (default black)
#' @param cyto cytoband object (optional)
#' @param absPos  the column in d giving the absolute base pair position (will be calculated if not present in d)
#' @param rectangleOption boolean : if {TRUE} rectangles are plotted instead of points (default) - NB : for points, several 'looks' can be obtained (ex using parameter type , pch,...)
#' @param plotCentro boolean : should centromeric delimitation be plotted ?
#' @param plotCentro.color color of the centromeric delimitation
#' @param plotChrom boolean : should chromosome delimitation be plotted ?
#' @param plotnew boolean : should the a new graph be plotted (default={TRUE}), or should something be added to the current graph (->{plotnew=FALSE})
#' @param plotyaxis boolean : should an Y axis be plotted ? (considered only if {plotnew = TRUE})
#' @param yaxmark  y axis 'at' parameter (considered only if {plotyaxis = TRUE})
#' @param yaxlab y axis 'labels' parameter (considered only if {plotyaxis = TRUE})
#' @param chromToPlot a vector of the chromosomes to be plotted (default c(1:22,"X","Y"))
#'
#' @export

cit.pangenomPlot <- function(d=NULL,
                              ycol=NULL,
                              chrom="chrom",
                              pos="meanPos",
                              startPos="start",
                              endPos="end",
                              colorscol=NULL,
                              cyto=NULL,
                              absPos="absPos",
                              rectangleOption=FALSE,
                              plotCentro = TRUE,
                              plotCentro.color = "lightgrey",
                              plotChrom =TRUE,
                              plotnew=TRUE,
                              plotyaxis=TRUE,
                              plotxaxis=TRUE,
                              yaxmark=NULL,
                              yaxlab=NULL,
                              chromToPlot=c(1:22,"X","Y"),
                              xlim=NULL,
                              CexAxis=0.8,
                              ...)
{
  if(! ycol %in% names(d))  stop("ycol ", ycol, " not found in d.\n")
  if(is.null(cyto))  stop("Cytoband object required")

  d <- cit.pangenomCoord(d,chrom=chrom,pos=pos,startPos=startPos,endPos=endPos,cyto=cyto)
  y <- d[,ycol]

  if(is.null(colorscol) ){colorY <-"black"}else{colorY <- d[,colorscol]}

  if(!is.null(chromToPlot))	cyto <- cyto[which(cyto$Chromosome %in% chromToPlot),] # added to rm X and Y chromosome from graphics
  cyto <- cit.pangenomCoord (cyto, chrom="ChrNumeric", pos="End", startPos="Start", endPos="End", cyto=cyto, absPos="absPos")
  if(is.null(xlim)) {xlim <- c(0,max(cyto$absPos,na.rm=T))}
  delta <- diff(xlim)*0.02;xlim <- xlim+c(-delta,delta)
  delta <- diff(xlim)*0.04/0.92;xlim <- xlim+c(delta,-delta)

  if(plotnew){
    plot( d[,absPos],y,axes=FALSE,col=colorY,xlim=xlim, ...)

    box()

    if(plotyaxis){
      okk <- FALSE
      if(is.null(yaxmark)){
        yaxmark <- pretty(y,n=10)
        okk <- TRUE
      }
      if(is.null(yaxlab)) yaxlab <- yaxmark
      axis(2, #1=below, 2=left, 3=above and 4=right
           at = yaxmark,
           labels = yaxlab, cex.axis=CexAxis, las=2)
    }

  }else{
    points( d[,absPos],y, col=colorY, ...)
  }

  wat <- NULL
  if(!is.null(cyto)){
    w <- which(cyto$"Centro"==1)
    wat <- sapply(split(cyto[w,"absPos"],cyto[w,"ChrNumeric"]),mean)
    if(length(w) == 0) wat <- sapply(split(cyto[,"absPos"],cyto[,"ChrNumeric"]),mean)

    if(plotCentro & length(w) != 0)abline(v=wat,lty=3,col=plotCentro.color)

    if(plotChrom ){

      if(plotxaxis){
        axis(1, #1=below, 2=left, 3=above and 4=right
             at = wat,
             labels = c(1:22,"X","Y")[c(1:22,"X","Y") %in% unique(cyto$Chromosome)],#c(1:22,"X","Y"),
             cex.axis=CexAxis, las=2)
      }

      wat <-  sapply(split(cyto[,"absPos"],cyto$"ChrNumeric"),max)
      abline(v=wat[-length(wat)],lty=1)
    }
  }
  d
}


#' factochar
#'
#' Function to convert factors factors in a data frame to characters
#' @param d Data frame to be converted
#'
#' @export

factochar <- function(d) {
  for(i in 1:ncol(d)) if(is.factor(d[,i])) d[,i] <- as.character(d[,i])
  d
}

#' cit.chromString2num
#'
#' convert chromosomes X, Y and MT into numerics 23, 24 and 25
#' @param d data.frame which columns include genomic position information
#' @param chrom the column in \code{d} giving the chromosome position
#'
#' @export

cit.chromString2num <- function (d,
                                  chrom = "chrom",
                                  chromNum  = "chrNum"
)
{
  d[, chromNum] <- as.character(sub("chr","",d[, chrom]))

  maxchr <- suppressWarnings( max(as.numeric(d[,chromNum]),na.rm=TRUE) )

  w <- which(d[, chromNum] == "X")
  if (length(w) > 0)d[w, chromNum] <- maxchr+1

  w <- which(d[, chromNum] == "Y")
  if (length(w) > 0)d[w, chromNum] <- maxchr+2

  w <- which(d[, chromNum] %in% c("M","MT"))
  if (length(w) > 0)d[w, chromNum] <- maxchr+3

  w <- which(!(d[, chromNum] %in% as.character(1:25)))
  if (length(w) > 0) d[w, chromNum] <- NA

  d[, chromNum] <- as.numeric(d[, chromNum])
  d
}

#' cit.density
#'
#' wrapping of the function 'density' adding down and top points to the result
#' @param x a numeric vector
#' @param doplot boolean
#' @param pc ?
#' @param ...
#'
#' @export

cit.density <- function(x,doplot=FALSE,pc=.05,...){
  dx <- density(x,na.rm=TRUE,...)
  ymax <- diff(range(dx$y))
  n <- length(dx$y)
  croissance <- as.numeric((dx$y[-1] - dx$y[-n])>0)
  wB <- 1+which(diff(croissance)== 1)
  wH <- 1+which(diff(croissance)== -1)
  pointsBas   <- dx$x[wB]
  pointsHauts <- dx$x[intersect(wH,which(dx$y>pc*ymax))]

  if(length(pointsHauts)>0)pointsBas   <- sapply(split(pointsBas,cit.discretize(pointsBas,pointsHauts)),median)

  if(doplot){
    plot(dx,...)
    abline(v=pointsBas,lty=3,col="blue")
    abline(v=pointsHauts,lty=3,col="red")
  }
  L <- c(dx,list("down"=pointsBas,"top"=pointsHauts))
  attr(L,"class") <- "density"
  L
}

#' cit.peaks
#'
#' wrapping of the function 'cit.density' to control the maximum number of peaks
#' @param x a numeric vector
#' @param percentHighestPeak percentHighestPeak
#' @param maxNbPeaks maxNbPeaks
#' @param minDeltaBetweenPeaks minDeltaBetweenPeaks
#' @param deltaApproach deltaApproach
#' @param doplot boolean
#' @param ...
#' 
#' @export

cit.peaks <- function(x,
                       percentHighestPeak=.2,
                       maxNbPeaks=NULL,
                       minDeltaBetweenPeaks=.03,
                       deltaApproach=1,
                       doplot=FALSE,
                       ...){
  if(is.na( minDeltaBetweenPeaks ))minDeltaBetweenPeaks <- NULL
  v <- cit.density(x,pc =percentHighestPeak)
  m <- v$y[which(v$x %in% v$top)]
  w <- which(m/max(m) > percentHighestPeak)
  xw <- v$top[w]
  which.eq <- function(z){ order(abs(v$x-z))[1]}
  if(deltaApproach==1 & !is.null( minDeltaBetweenPeaks ) & length(xw)>1){
    for(i in 1:(length(xw)-1)){
      if(xw[i+1]-xw[i] <  minDeltaBetweenPeaks) {
        xw[i:(i+1)] <- xw[c(i:(i+1))][which.max(m[i:(i+1)])]
      }
    }

    xw <- unique(xw)

  }
  if(!is.null(maxNbPeaks)){
    if(length(xw)> maxNbPeaks)
      for(i in 1:(length(xw)-maxNbPeaks)){
        oneamong <- xw[c(0,1)+which.min(diff(xw))]
        w1 <-  which.eq(oneamong[1])
        w2 <-  which.eq(oneamong[2])
        out <- oneamong[which.min(v$y[c(w1,w2)])]
        xw <- setdiff(xw,out)

      }
  }
  umin <- NULL
  if(length(xw)>1){
    for(i in 1:(length(xw)-1)){
      possiblemin <- v$down[which(v$down >= xw[i] & v$down <= xw[i+1])]
      if(length(possiblemin)>1){
        w<- sapply(possiblemin,which.eq)
        possiblemin <- possiblemin[which.min(v$y[w])]
      }
      umin <- c(umin,possiblemin)
    }

  }
  L <- list("x abciss big peaks"=xw,
            "x abciss inter-peaks"=umin,
            "nb big peaks"=length(xw))

  peaks <- L[[1]]
  interpeaks <- L[[2]]
  temp <- as.data.frame(t(sapply(peaks,function(pic) {
    wleft <- which(interpeaks < pic)
    if(length(wleft)>0){
      wleft <- max(interpeaks[wleft])
    }else{
      wleft <- min(x,na.rm=TRUE)
    }
    wright <- which(interpeaks > pic)
    if(length(wright)>0){
      wright <- min(interpeaks[wright])
    }else{
      wright <- max(x,na.rm=TRUE)
    }
    c(pic,wleft,wright,length(which(x>=wleft & x<=wright)))
  }) ))
  names(temp) <- c("peak","left born","right born","size")
  L$"peaks x size" <- temp


  if(deltaApproach==2 &!is.null( minDeltaBetweenPeaks ) & L$'nb big peaks'>1){
    Lini <- L
    names(Lini) <- paste("initial",names(L))

    while(any( diff(L$"x abciss big peaks") < minDeltaBetweenPeaks) & L$"nb big peaks" >1){

      w <- which.min(abs(diff(L$"x abciss big peaks")) )


      lb <- min(L$"peaks x size"[w:(w+1),"left born"])
      rb <- max(L$"peaks x size"[w:(w+1),"right born"])
      si <- sum(L$"peaks x size"[w:(w+1),"size"])
      pe <- median(x[which(x>=lb & x<=rb)])

      L$"peaks x size"[w,] <- c(pe,lb,rb,si)
      L$"peaks x size" <- L$"peaks x size"[-(w+1),]

      L$"nb big peaks" <- L$"nb big peaks" - 1
      L$"x abciss big peaks"[w] <- pe
      L$"x abciss big peaks" <- L$"x abciss big peaks"[-(w+1)]
      L$"x abciss inter-peaks" <- L$"x abciss inter-peaks"[-(w+1)]
    }

    L <- c(Lini,L)
  }

  if(doplot){
    plot(v,...)
    abline(v=L$"x abciss big peaks",col="red")
    if(length(L$"x abciss inter-peaks")>0)abline(v=L$"x abciss inter-peaks",col="green",lty=3)
  }

  L
}

#' cit.discretize
#'
#' Discretize a continous variable by specified \code{lim} cut-off(s)
#' @param x a vector
#' @param lim cut-off(s)
#' @param quant if TRUE (default FALSE) the \code{x} is discretize by quantile and \code{lim} is considered as cut-off(s) for quantile, ie 0<lim<1
#' @param addlevels add character levels indicating the cut-offs (i.e. for un cut-off iqq levels=c("<iqq",">=iqq") )
#'
#' @export

cit.discretize <- function(x, lim, quant = FALSE, addlevels = FALSE){

  lim <- sort(lim)

  if( quant & ( any(lim>1) | any(lim<0) ) )
    stop("lim must be [0;1] as quant=T\n")

  res <- rep(NA, length(x))

  if(quant) lim  <- quantile(x, probs=lim, na.rm=TRUE)
  n <- length(lim)
  for(i in n:1) res[which(x<lim[i])] <- i
  res[which(x>=lim[n])] <- n+1

  if (addlevels) {
    res <- as.factor(res)

    if(quant)
      lim <- gsub(" ", "", prettyNum(lim, format="g", digits=1))

    if(length(lim)==1)
      lev <- c(paste("<", lim[1], sep = ""), paste(">=", lim[1], sep = ""))
    else
      lev <- c(paste("<", lim[1], sep = ""), paste("[",lim[-length(lim)],";",lim[-1],"[",sep=""), paste(">=", lim[length(lim)], sep = ""))
    levels(res) <- lev[as.numeric(levels(res))]
  }
  res
}


#' factoall
#'
#' Function to convert factors in a data frame to the appropriate format (character, numeric...)
#' @param d Data frame to be converted
#' @param ncmax Maximum number of characters for numeric fields
#'
#' @export

factoall <- function (d, ncmax = 10)
{
  n <- ncol(d)
  for (i in 1:n) {
    if (is.factor(d[, i])) {
      d[, i] <- as.character(d[, i])
      na <- which(is.na(d[, i]))
      num <- suppressWarnings(as.numeric(d[, i]))
      nanum <- which(is.na(num))
      if (length(nanum) == length(na)) {
        d[, i] <- num
      }
    }
  }
  d
}


#' "Not in" function 
#'
#' The opposite of << %in% >>, used by several palimpsest functions.
#' @keywords Signatures
#' @export
#' @examples
#' days <- c("monday","tuesday","wednesday","thursday","friday","saturday","sunday")
#' weekdays <- days[days %!in% c("saturday", "sunday")]

'%!in%' <- function(x,y)!('%in%'(x,y))



#' signature_colour_generator
#'
#' Generates colours for de novo signatures.
#' @param signature_names Character vector of the names of the mutational signatures for which colours are to be generated.
#' @keywords Signatures
#' @export
#' @import RColorBrewer
#' @examples
#' SBS_colours <- signature_colour_generator(signature_names = rownames(SBS_denovo_sigs))

signature_colour_generator <- function(signature_names = NULL){
  requireNamespace("RColorBrewer",quietly = T)
  qual_col_pals <- brewer.pal.info[brewer.pal.info$category == 'qual',]
  sig_cols <- unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
  sig_cols <- sig_cols[sample.int(length(sig_cols),length(signature_names))];names(sig_cols) <- signature_names
  return(sig_cols)
}


#' load2object
#'
#' loads object to a particular name. 
#' @export 

load2object <- function (filename) 
{
  if (file.exists(filename)) 
    return(eval(parse(text = load(filename))))
  cat(paste("error! the file ", filename, 
            " was not found :("))
  NULL
}




#' palimpsest_distCosine
#'
#' Function to calculate cosine distance
#' @param m input
#'
#' @export
#' @importFrom lsa cosine

palimpsest_distCosine <- function(m)
{
  nsamp <- nrow(m)
  res <- matrix(NA,nrow=nsamp,ncol=nsamp)
  rownames(res) <- colnames(res) <- rownames(m)
  for(i in 1:nsamp)
  {
    for(j in 1:nsamp)
    {
      res[i,j] <- cosine(m[i,],m[j,])
    }
  }
  as.dist(1-res)
}


#' compare_results
#'
#' compare the results of one or two de novo extraction(s) to a set of reference signatures (e.g. Palimpsest SBS de novo result to SBS COSMIC de novo signatures)
#' @export

compare_results <- function(reference_sigs = NA, extraction_1 = NA, extraction_2 = NULL, extraction_1_name ="Palimp", extraction_2_name = NA, lower_threshold = 0.6, upper_threshold = 0.9){

  #ifelse(!is.na(extraction_2),print("good to go"),stop("function currently needs 2 denovo sets to work"))
  if(!missing(extraction_2)){
    rownames(extraction_1) <- c(paste0(extraction_1_name,"_",c(1:nrow(extraction_1))))
    rownames(extraction_2) <- c(paste0(extraction_2_name,"_",c(1:nrow(extraction_2))))
    
    refs <- rownames(reference_sigs)
    denovs1 <- rownames(extraction_1)
    denovs2 <- rownames(extraction_2)
    
    longest <- max(c(length(denovs1),length(denovs2)))
    
    # res <- as.data.frame(matrix(nrow = nrow(reference_sigs)+longest,ncol = 0)) %>% 
      # mutate(References = NA, denovo1 = NA, ref_denovo1_cos = NA, denovo2 = NA, ref_denovo2_cos = NA, denovo1_denovo2_cos = NA, keep_version = NA)

    repnum = nrow(reference_sigs) + longest
    res = data.frame(References = rep(NA,repnum),  denovo1 = rep(NA,repnum), ref_denovo1_cos = rep(NA,repnum), keep_version = rep(NA,repnum))
  
    res$References[1:nrow(reference_sigs)] <- c(rownames(reference_sigs))
                                                
                                                
    # perfrom cosine similarity analysis
    mutmat1 <- t(rbind(extraction_1, reference_sigs))
    m1 <- as.matrix(palimpsest_distCosine(t(mutmat1)))
    m1 <- 1-m1
    mutmat2 <- t(rbind(extraction_2, reference_sigs))
    m2 <- as.matrix(palimpsest_distCosine(t(mutmat2)))
    m2 <- 1-m2
    mutmat3 <- t(rbind(extraction_1, extraction_2))
    m3 <- as.matrix(palimpsest_distCosine(t(mutmat3)))
    m3 <- 1-m3
    
    # make results table
    for(i in 1:nrow(reference_sigs)){
      sig <- res$References[i]
      if(max(m1[sig,colnames(m1) %in% denovs1]) > lower_threshold){
        res$ref_denovo1_cos[i] <- max(m1[sig,colnames(m1) %in% denovs1])
        res$denovo1[i] <- rownames(m1)[m1[sig,]==res$ref_denovo1_cos[i]]
      }
      if( max(m2[sig,colnames(m2) %in% denovs2]) > lower_threshold){
        res$ref_denovo2_cos[i] <- max(m2[sig,colnames(m2) %in% denovs2])
        res$denovo2[i] <- rownames(m2)[m2[sig,]==res$ref_denovo2_cos[i]]
      }
    }
    
    manque1 <- denovs1[denovs1 %!in% res$denovo1]
    manque2 <- denovs2[denovs2 %!in% res$denovo2]
    
    if(length(manque1) > 0){
      for(i in 1:length(manque1)){
          res$denovo1[length(refs)+i] <- manque1[i]
          sig <- manque1[i]
          res$denovo1_denovo2_cos[length(refs)+i] <- max(m3[sig,colnames(m3) %in% denovs2])
          res$denovo2[length(refs)+i] <- rownames(m3)[m3[sig,]==res$denovo1_denovo2_cos[length(refs)+i]]
      }
    }
    for(i in 1:(length(refs)+length(manque1))){
      if(!is.na(res$denovo1[i]) & !is.na(res$denovo2[i])) res$denovo1_denovo2_cos[i]<- m3[res$denovo1[i],res$denovo2[i]]
    }
    
    manque2x <- manque2[manque2 %!in% res$denovo2]
    if(length(manque1) > 0){
      for(i in 1:length(manque2x)){
        if(length(manque2x)==0) break
        res$denovo2[length(refs)+i+length(manque1)] <- manque2x[i]
      }
    }
    
    for(i in (length(refs)+1):nrow(res)){
      sig1 <- res$denovo1[i]; sig2 <- res$denovo2[i]
      if(!is.na(sig1)){
        if(max(m1[sig1,colnames(m1) %in% refs]) > lower_threshold){
          res$ref_denovo1_cos[i] <- max(m1[sig1,colnames(m1) %in% refs])
          res$References[i] <- colnames(m1)[m1[sig1,]==res$ref_denovo1_cos[i]]
          if(!is.na(sig2)) res$ref_denovo2_cos[i] <- m2[sig2,res$References[i]]
        }
      }
      if(!is.na(sig2)){
        if(max(m2[sig2,colnames(m2) %in% refs]) > lower_threshold){
          res$ref_denovo2_cos[i] <- max(m2[sig2,colnames(m2) %in% refs])
          res$References[i] <- colnames(m2)[m2[sig2,]==res$ref_denovo2_cos[i]]
          if(!is.na(sig1)) res$ref_denovo1_cos[i] <- m1[sig1,res$References[i]]
        }
      }
    }
    
    # add suggestion annotations
    res_f = res %>% 
      filter(rowSums(is.na(.)) < ncol(.)) %>% 
      mutate(keep = ifelse((ref_denovo1_cos > lower_threshold & ref_denovo2_cos > lower_threshold)|ref_denovo1_cos > upper_threshold | ref_denovo2_cos > upper_threshold,"Yes",NA)) #mark those to keep
    
    res_f$keep[res_f$denovo1_denovo2_cos > upper_threshold] <- "Yes"
    res_f$keep[is.na(res_f$keep)] <- "No"
    
    res_f = res_f %>%
      mutate(keep_version = ifelse(keep == "Yes" & References %in% refs & ref_denovo1_cos > upper_threshold | ref_denovo2_cos > upper_threshold , "Keep_Reference",keep_version)) #highlight for which ref is defo best
      
    res_f = res_f %>% 
      mutate(keep_version = ifelse(keep == "Yes" & is.na(keep_version),"Check_denovo_results",keep_version)) #highlight for which user should check
    res_f$keep_version[res_f$keep == "No"] <- "Discard"
    duped <- unique(res_f$References[duplicated(res_f$References) & !is.na(res_f$References)])
    res_f$keep_version[res_f$References %in% duped] <- "Check_denovo_results"
    
    colnames(res_f) <- c("Ref_Signature",paste0(extraction_1_name,"_Equivalent"),paste0("Ref_",extraction_1_name,"_cosine_score"),paste0(extraction_2_name,"_Equivalent"),
                         paste0("Ref_",extraction_2_name,"_cosine_score"), paste0(extraction_1_name,"_",extraction_2_name,"_cosine_score"),"Suggested_Action","Keep")
    
    res_f <- res_f[,c(1:7)]
    res_f
  }
  else{
    rownames(extraction_1) <- c(paste0(extraction_1_name,"_",c(1:nrow(extraction_1))))

  refs <- rownames(reference_sigs)
  denovs1 <- rownames(extraction_1)
    
  # res <- as.data.frame(matrix(nrow = nrow(reference_sigs)+length(denovs1),ncol = 0)) %>% 
    # mutate(References = NA, denovo1 = NA, ref_denovo1_cos = NA, keep_version = NA)

  repnum = nrow(reference_sigs) + length(denovs1)
  res = data.frame(References = rep(NA,repnum),  denovo1 = rep(NA,repnum), ref_denovo1_cos = rep(NA,repnum), keep_version = rep(NA,repnum))
  
  res$References[1:nrow(reference_sigs)] <- c(rownames(reference_sigs))
  
  # perfrom cosine similarity analysis
  mutmat1 <- t(rbind(extraction_1, reference_sigs))
  m1 <- as.matrix(palimpsest_distCosine(t(mutmat1)))
  m1 <- 1-m1

  
  # make results table
  for(i in 1:nrow(reference_sigs)){
    sig <- res$References[i]
    if(max(m1[sig,colnames(m1) %in% denovs1]) > lower_threshold){
      res$ref_denovo1_cos[i] <- max(m1[sig,colnames(m1) %in% denovs1])
      res$denovo1[i] <- rownames(m1)[m1[sig,]==res$ref_denovo1_cos[i]]
    }
  }
  
  manque1 <- denovs1[denovs1 %!in% res$denovo1]
  
  if(length(manque1) > 0){
    for(i in 1:length(manque1)){
      res$denovo1[length(refs)+i] <- manque1[i]
    }
  }

  
  
  # add suggestion annotations
  res_f = res %>% 
    filter(rowSums(is.na(.)) < ncol(.)) %>% 
    mutate(keep = ifelse((ref_denovo1_cos > lower_threshold)|ref_denovo1_cos > upper_threshold,"Yes",NA)) #mark those to keep
  
  res_f$keep[is.na(res_f$keep)] <- "No"
  
  res_f = res_f %>%
    mutate(keep_version = ifelse(keep == "Yes" & References %in% refs & ref_denovo1_cos > upper_threshold, "Keep_Reference",keep_version)) #highlight for which ref is defo best
  
  res_f = res_f %>% 
    mutate(keep_version = ifelse(keep == "Yes" & is.na(keep_version),"Check_denovo_results",keep_version)) #highlight for which user should check
  res_f$keep_version[res_f$keep == "No"] <- "Discard"
  duped <- unique(res_f$References[duplicated(res_f$References) & !is.na(res_f$References)])
  res_f$keep_version[res_f$References %in% duped] <- "Check_denovo_results"
  
  colnames(res_f) <- c("Ref_Signature",paste0(extraction_1_name,"_Equivalent"),paste0("Ref_",extraction_1_name,"_cosine_score"),
                       "Suggested_Action","Keep")
  
  res_f <- res_f[,c(1:4)]
  res_f
    
}
}
FunGeST/Palimpsest documentation built on June 2, 2024, 4:21 a.m.