R/utils_drift.R

Defines functions .getDrift .estimateZcutoff .estimateDrift .compSegs .addSegSd

Documented in .addSegSd .compSegs .estimateDrift .estimateZcutoff .getDrift

#' addSegSd
#' @description Adds segment SD to CBS segment objects
#'
#' @param seg.obj an object returned from DNAcopy::segment()
#' @param winsor Winsorization threshold (Default = 0.95)
#' @param verbose Default = FALSE
#' @param ... Extra para
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom GenomeInfoDb seqlevelsStyle
#' @importFrom GenomicRanges findOverlaps
#' @importFrom GenomicRanges mcols
#' @importFrom S4Vectors subjectHits
#' @importFrom S4Vectors queryHits
#' @importFrom stats quantile
#' @importFrom stats sd
#' @export
#' @keywords internal
.addSegSd <- function(seg.obj, winsor=0.95, verbose=FALSE, ...){
  adj.segs <- lapply(split(seg.obj$output, f=seg.obj$output$ID), function(seg){
    if (verbose) print(paste0(unique(seg$ID), "..."))
    seg.dat <- as.data.frame(seg.obj$data)
    seg.dat$chrom <- as.character(seg.dat$chrom)
    
    ## Loop through each segment and find SD of the raw data
    gr.dat <- makeGRangesFromDataFrame(seg.dat, seqnames.field='chrom', 
                                       start.field = c('maploc', 'pos'),
                                       end.field = c('maploc', 'pos'), 
                                       keep.extra.columns = TRUE)
    gr.seg <- makeGRangesFromDataFrame(seg, keep.extra.columns = TRUE)
    seqlevelsStyle(gr.seg) <- 'UCSC'
    seqlevelsStyle(gr.dat) <- 'UCSC'
    
    ov.idx <- findOverlaps(gr.dat, gr.seg)
    s.idx <- grep(paste0("^", unique(gr.seg$ID), "$"), colnames(mcols(gr.dat)))
    sd.per.seg <- sapply(split(ov.idx, subjectHits(ov.idx)), function(ov.i, winsorize.data=FALSE){
      dat <- mcols(gr.dat[queryHits(ov.i),])[, s.idx]
      if(winsorize.data){
        # if (verbose) print("Winsorizing")
        lim <- quantile(dat, probs=c(winsor, 1-winsor), na.rm=TRUE) ##winsorization
        dat[dat < min(lim) ] <- min(lim)
        dat[dat > max(lim) ] <- max(lim)
      }
      
      return(round(sd(dat, na.rm = TRUE),3))
      # t.dat <- tryCatch({
      #   t.test(na.omit(dat))
      # }, error=function(e){
      #   data.frame("statistic"=NA, "p.value"=NA, "stderr"=NA)
      # })
      # setNames(round(c(t.dat$statistic, t.dat$p.value, t.dat$stderr),5),
      #          c("t", "p", "seg.sd"))
    }, ...)
    seg$seg.sd <- sd.per.seg
    # seg <- cbind(seg, abs(t(sd.per.seg)))
    # seg$'seg.sd' <- 0
    
    return(seg)
  })
  
  return(do.call(rbind, adj.segs))
}


#' .compSegs
#' @description compare the difference between BAF profiles between pairwise
#' samples using a t-test of the raw probeset data
#'
#' @param seg.obj A seg obj
#' @param verbose Verbose (Default = FALSE)
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom GenomicRanges mcols
#' @importFrom GenomicRanges mcols<-
#' @importFrom GenomeInfoDb seqlevelsStyle
#' @importFrom GenomeInfoDb seqlevelsStyle<-
#' @importFrom IRanges findOverlapPairs
#' @importFrom GenomicRanges pintersect
#' @importFrom GenomicRanges findOverlaps
#' @importFrom S4Vectors subjectHits
#' @importFrom S4Vectors queryHits
#' @importFrom stats t.test
#'
#' @return A list of seg objects
#' @export
#' @keywords internal
.compSegs <- function(seg.obj, verbose=FALSE){
  spl.segs <- split(seg.obj$output, f=seg.obj$output$ID)
  seg.dat <- as.data.frame(as.matrix(seg.obj$data))
  gr.dat <- makeGRangesFromDataFrame(seg.dat, seqnames.field='chrom', 
                                     start.field = c('maploc', 'pos'),
                                     end.field = c('maploc', 'pos'), 
                                     keep.extra.columns = TRUE)
  dat.m <- as.matrix(mcols(gr.dat))
  storage.mode(dat.m) <- 'numeric'
  mcols(gr.dat) <- dat.m
  seqlevelsStyle(gr.dat) <- 'UCSC'
  
  
  diff.segs <- lapply(spl.segs, function(seg0){
    ## Take one segment
    gr.seg0 <- makeGRangesFromDataFrame(seg0, keep.extra.columns = TRUE)
    seg0.id <- unique(gr.seg0$ID)
    
    diff.seg1 <- lapply(spl.segs, function(seg1){
      ## Take a second segment to compare against
      gr.seg1 <- makeGRangesFromDataFrame(seg1, keep.extra.columns = TRUE)
      seg1.id <- unique(gr.seg1$ID)
      if(seg0.id == seg1.id) return(NULL)
      #if (verbose)(paste0(seg0.id, " - ", seg1.id))
      
      ## Find all overlap/intersects between segments
      ov.segs <- findOverlapPairs(gr.seg0, gr.seg1)
      seg <- pintersect(ov.segs)
      mcols(seg) <- NULL
      
      ## Find the raw SNP probes that populate those intersect segments
      dat.seg.ov <- findOverlaps(gr.dat, seg)
      s0.idx <- grep(seg0.id, colnames(mcols(gr.dat)), fixed = TRUE)
      s1.idx <- grep(seg1.id, colnames(mcols(gr.dat)), fixed = TRUE)
      
      ## Test the difference between the raw probe data
      test.diff <- sapply(split(dat.seg.ov, subjectHits(dat.seg.ov)), function(ov.i){
        dat <- mcols(gr.dat[queryHits(ov.i),])[, c(s0.idx, s1.idx)]
        dat.t <- t.test(dat[,1], dat[,2])
        return(round(c(dat.t$statistic, dat.t$p.value),3))
      })
      
      ## Append new metadata to the intersect/overlap between the segments
      new.meta <- cbind(mcols(ov.segs@first)[,c("ID", "seg.mean"),drop=FALSE],
                        mcols(ov.segs@second)[,c("ID", "seg.mean")],
                        t(test.diff))
      colnames(new.meta) <- c("refID", "seg.a", "ID", "seg.b", "seg.mean", "p")
      new.meta$t <- floor(abs(new.meta$seg.mean))
      mcols(seg) <- new.meta
      names(seg) <- NULL
      return(seg)
    })
    
    return(unlist(as(diff.seg1[-which(sapply(diff.seg1, is.null))], "GRangesList")))
  })

  return(diff.segs)
}


#' .estimateDrift
#' @description Estimates genetic drift given an SD adjusted DNAcopy::segment()
#' object.  Estimates this based on a t-statistic
#' 
#' @param ... Extra param
#' @param seg.obj DNAcopy::segment() object
#'
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom GenomicRanges mcols
#' @importFrom GenomicRanges width
#' @importFrom stats setNames
#' @importFrom methods as
#' @export
#' @keywords internal
.estimateDrift <- function(seg.obj, ...){
  seg.gr <- makeGRangesFromDataFrame(seg.obj$output, keep.extra.columns = TRUE)
  drift.dat <- lapply(split(seg.gr, seg.gr$ID), function(seg, z.cutoff=NULL, ...){
    ##Calculate Z-score of each seg.mean
    if('t' %in% colnames(mcols(seg))){
      seg$seg.z <- seg.z <- seg$t
    } else {
      # seg.sd <- mean(rep(seg$seg.sd, (width(seg) / 1000000)), na.rm=TRUE)
      seg.z <- round((seg$seg.mean / seg$seg.sd), 3) ## z
      seg$seg.z <- seg.z
    }
    frac.cnv <- round(width(seg) / sum(width(seg)),3)
    ## Create filter criteria
    if(is.null(z.cutoff)){
      seg <- .estimateZcutoff(seg, ...)
      diff.sum <- sapply(split(seg, seg$t), function(tseg) round(sum(width(tseg)) / sum(width(seg)),3))
    } else {
      diff.idx <- sapply(setNames(z.cutoff, z.cutoff), function(z){which(seg.z > z | seg.z < -z)})
      diff.sum <- sapply(setNames(diff.idx, z.cutoff), function(idx) {sum(frac.cnv[idx])})
      seg$seg.diff <- NA
      seg$t <- as.integer(cut(abs(seg$seg.z), breaks = z.cutoff, right = FALSE)) - 1
    }
    # seg$t <- floor(abs(seg$seg.z))
    
    return(list("seg"=seg, "sum"=diff.sum))
  }, ...)

  ## Reform the DNAcopy segment object into original populated data structure
  seg.out <- unlist(as(sapply(drift.dat, function(i) i$seg), "GRangesList"))
  names(seg.out) <- NULL
  seg.out <- as.data.frame(seg.out)[,-c(4,5)]
  colnames(seg.out)[c(1:3)] <- c("chrom", "loc.start", "loc.end")
  seg.out <- seg.out[,c(colnames(seg.obj$output), "seg.z", "t", "seg.diff")]
  
  return(list("frac"=sapply(drift.dat, function(i){ i$sum }),
              "seg"=seg.out))
}


#' .estimateZcutoff
#' @description Creates a theoreticla framework for difference
#' between BAFs, and then compares the observed z-differences
#' against the theoretical diff to look for differences
#'
#' @param seg an output dataframe from a CNA object
#' @param data.type Data type, supports only 'baf' at this time
#' @param verbose Verbose, default = FALSE
#'
#' @return Returned seg with $t and $seg.diff
#' @export
#' @keywords internal
.estimateZcutoff <- function(seg, data.type='baf', verbose=FALSE){
  ref.frac <- switch(data.type,
                     "baf"={
                       if (verbose) print("Estimating BAF diff significance from theoretical framework")
                       ref.frac <- sapply(c(1:5), function(tcn){
                         sapply(c(0:4), function(alt){
                           if(alt <= tcn) alt/tcn else 0
                         })
                       })
                       ref.frac <- round(unique(as.numeric(ref.frac)), 3)
                       ref.frac <- ref.frac[ref.frac <= 0.5]
                       (ref.frac)
                      },
                     "lrr"={
                       ref.frac <- seq(0, 1, by=0.1)
                       (ref.frac)
                     },
                     stop("data.type must be either baf or lrr"))

  ## Find all theoretical differences between BAFs in 100% purity state
  delta.frac <- abs(as.numeric(sapply(ref.frac, function(i) i -ref.frac)))
  delta.frac <- as.numeric(unique(as.character(round(sort(delta.frac),3))))
  
  ## Find which z-scores are greater than the theoretical SD-adjusted seg.mean difference
  theor.cutoff <- t(sapply(seg$seg.sd, function(s) round((delta.frac / s), 3)))
  t.mat <- (sweep(theor.cutoff, 1, as.matrix(abs(seg$seg.z))) <= 0)
  seg$t <-  rowSums(t.mat) - 1
  seg$seg.diff <- delta.frac[seg$t+1]
  # head(as.data.frame(seg), 30)
  return(seg)
}

#' .getDrift
#' @description Returns the genomic fraction of drift for significant
#' regions (z>3)
#' 
#' @param i List object returned from bafDrift()
#' @param idx Index to return (1 = input compared to all matching cell lines, 2 = reference cell lines to all others)
#' @export
#' @keywords internal
.getDrift <- function(i, idx=1){
  if(length(i$frac) >= idx){
    ## Select the "z > 3" row from all baf-drift estimates
    ## idx = 1: Should be VCF compare to all matching cell lines
    ## idx = 2: Should be SNP cell line compared to all other matching SNP cell line
    if(is.list(i$frac)){
      delta <- i$frac[[idx]][3,,drop=FALSE]
    } else {
      delta <- i$frac[3,,drop=FALSE]
    }
    rownames(delta) <- gsub("RNA_", "", names(i$frac)[1])
  } else {
    delta <- NULL
  }
  if(!is.null(delta)) colnames(delta) <- gsub("_.*", "", colnames(delta))
  return(as.data.frame(delta))
}
quevedor2/CCLid documentation built on July 15, 2020, 4:05 a.m.