SCclust: Single Cell Copy Number Analysis and Clustering

Documented in cbs.segment_one gc_one

#'GC normalization.
#'
#'Normalize the bin count file to adjust for nonuniformity results from variation in GC content across the genome.
#'@param bin_mat The bin count files of cells. For example, one bin count file for a cell is CJA1023.varbin.20k.txt
#'@param gc The GC content table.
#'@return The normalized bin count table based on GC content.
#'@export

gc_one <- function(bin_mat, gc) {
  bin_mat$gc.content <- gc$gc.content
  a <- bin_mat$bincount + 1
  bin_mat$ratio <- a / mean(a)
  bin_mat$lowratio <- lowess.gc(bin_mat$gc.content, bin_mat$ratio)
  return(bin_mat)
}



#'Segment the normalzied bin count profile for individual cell for GC content.
#'
#'Generate the segmented profile for individual cell with CBS.
#'@param bin_mat The normalized bin count files generated by gc_one.
#'@param alpha Parameter alpha defined in DNAcopy package.
#'@param nperm Parameter nperm defined in DNAcopy package.
#'@param undo.SD Parameter undo.SD defined in DNAcopy package.
#'@param min.width Parameter min.width defined in DNAcopy package.
#'@param method Default: "multiplier" to transform the ratio data to integer CN state. When genome is hg-dm hybrid, set method as "dmploidies" to use dm plodies as reference.
#'@param genome Default: "hg" (Human genome). hg-dm hybrid genome: "hgdm".
#'@param graphic Default: TRUE. Generate the copy number profile plot.
#'@return The segmented profile for individual cell
#'@export

cbs.segment_one <- function(bin_mat, alpha, nperm, undo.SD, min.width, method = "multiplier", genome = "hg", graphic = TRUE){

  set.seed(25)

  bin_mat$chrom.numeric <- substring(bin_mat$chrom, 4)
  bin_mat$chrom.numeric[which(bin_mat$chrom == "chrX")] <- "23"
  bin_mat$chrom.numeric[which(bin_mat$chrom == "chrY")] <- "24"
  if (genome == "hgdm"){
    bin_mat$chrom.numeric[which(bin_mat$chrom == "dm_chrX")] <- "25"
    bin_mat$chrom.numeric[which(bin_mat$chrom == "dm_chr2L")] <- "26"
    bin_mat$chrom.numeric[which(bin_mat$chrom == "dm_chr2R")] <- "27"
    bin_mat$chrom.numeric[which(bin_mat$chrom == "dm_chr3L")] <- "28"
    bin_mat$chrom.numeric[which(bin_mat$chrom == "dm_chr3R")] <- "29"
    bin_mat$chrom.numeric[which(bin_mat$chrom == "dm_chr4")] <- "30"
  }

  bin_mat$chrom.numeric <- as.numeric(bin_mat$chrom.numeric)

  CNA.object <- DNAcopy::CNA(log(bin_mat$lowratio, base=2), bin_mat$chrom.numeric,
                             bin_mat$chrompos, data.type="logratio")
  smoothed.CNA.object <- DNAcopy::smooth.CNA(CNA.object)
  segment.smoothed.CNA.object <- DNAcopy::segment(smoothed.CNA.object, alpha=alpha,
                                                  nperm=nperm, undo.splits="sdundo", undo.SD=undo.SD, min.width=2)
  #A table contains segmented regions and the corresponding segment means (log)
  thisShort <- segment.smoothed.CNA.object[[2]]

  #A long table:each row corresponds to the segment mean for each bin
  m <- matrix(data=0, nrow=nrow(bin_mat), ncol=1)
  prevEnd <- 0
  for (i in 1:nrow(thisShort)) {
    thisStart <- prevEnd + 1
    thisEnd <- prevEnd + thisShort$num.mark[i]
    m[thisStart:thisEnd, 1] <- 2^thisShort$seg.mean[i]
    prevEnd = thisEnd
  }
  cbs.long.nobad <- m[, 1]

  #####  NEW STUFF  also check min.width=2 above

  workShort <- thisShort
  workShort$segnum <- 0
  workShort$seg.start <- 0
  workShort$seg.end <- 0

  prevEnd <- 0
  for (i in 1:nrow(thisShort)) {
    thisStart <- prevEnd + 1
    thisEnd <- prevEnd + thisShort$num.mark[i]
    workShort$seg.start[i] <- thisStart
    workShort$seg.end[i] <- thisEnd
    workShort$segnum[i] <- i
    prevEnd = thisEnd
  }

  ### deal with short segments (appending, combining)
  discardSegments <- TRUE
  while (discardSegments) {
    orderShort <- workShort[order(workShort$num.mark, abs(workShort$seg.mean)), ]
    if (orderShort[1, "num.mark"] < min.width) {
      workShort <- remove.segment(workShort, orderShort[1, "segnum"], bin_mat, undo.SD)
    } else {
      discardSegments <- FALSE
    }
  }

  workShort <- sdundo.all(workShort, bin_mat, undo.SD)
  thisShort <- workShort

  #####  END NEW STUFF

  m <- matrix(data=0, nrow=nrow(bin_mat), ncol=1)
  prevEnd <- 0
  for (i in 1:nrow(thisShort)) {
    thisStart <- prevEnd + 1
    thisEnd <- prevEnd + thisShort$num.mark[i]
    m[thisStart:thisEnd, 1] <- 2^thisShort$seg.mean[i]
    prevEnd = thisEnd
  }

  bin_mat$seg.mean.LOWESS <- m[, 1]

  chr <- bin_mat$chrom.numeric
  chr.shift <- c(chr[-1], chr[length(chr)])
  vlines <- c(1, bin_mat$abspos[which(chr != chr.shift)+1],
              bin_mat$abspos[nrow(bin_mat)])
  vlines.shift <- c(vlines[-1], 4*10^9)
  chr.at <- vlines + (vlines.shift - vlines)/2
  chr.text <- sub(".*chr", "", unique(as.character(bin_mat$chrom)))
  col.chr.text <- c(rep("black", 24), rep("darkred", 6))[1:length(chr.text)]

  old.par <- par(mfrow=c(2, 1))
  par(mar=c(1.1,4.2,1.1,1.1))

  if (graphic) {
    plot(x = bin_mat$abspos, y = bin_mat$lowratio, log = "y", xaxt = "n", ylab = "Ratio", xlab = "", col = "#CCCCCC")
     #axis(1, at = x.at, labels = x.labels)
    lines(x = bin_mat$abspos, y = bin_mat$lowratio, col = "#CCCCCC")
    points(x = bin_mat$abspos, y = bin_mat$seg.mean.LOWESS, col = "#0000AA")
    lines(x = bin_mat$abspos, y = bin_mat$seg.mean.LOWESS, col = "#0000AA")
    hlines = c(0.5, 1.0, 1.5, 2.0)
    mtext(chr.text[1:24], at = chr.at[1:24], cex = 0.7, col = col.chr.text[1:24],las = 1)

    if (genome == "hgdm"){
      text(chr.at[25], max(bin_mat$lowratio),  chr.text[25], col = col.chr.text[25],srt = 90,
           adj = 1 ,cex=1)
      mtext(chr.text[26], at = chr.at[26], cex = 0.7, col = col.chr.text[26],las = 1)
      text(chr.at[27], max(bin_mat$lowratio),  chr.text[27], col = col.chr.text[27],srt = 90,
           adj = 1 ,cex=1)
      mtext(chr.text[28], at = chr.at[28], cex = 0.7, col = col.chr.text[28],las = 1)
      text(chr.at[29], max(bin_mat$lowratio),  chr.text[29], col = col.chr.text[29],srt = 90,
           adj = 1 ,cex=1)
      mtext(chr.text[30], at = chr.at[30], cex = 0.7, col = col.chr.text[30],las = 1)
    }

    abline(h = hlines, col="black", lty = 3)
    abline(v = vlines, col="black", lty = 2)

  }


  if (method == "multiplier"){
    thisGrid <- seq(1.5, 5.5, by=0.05)
    thisOuter <- bin_mat$seg.mean.LOWESS %o% thisGrid
    thisOuterRound <- round(thisOuter)
    thisOuterDiff <- (thisOuter - thisOuterRound) ^ 2
    thisOuterColsums <- colSums(thisOuterDiff, na.rm = FALSE, dims = 1)
    thisMultiplier <- thisGrid[which.min(thisOuterColsums)]

    bin_mat$seg.quantal <- bin_mat$seg.mean.LOWESS * thisMultiplier
    bin_mat$ratio.quantal <- bin_mat$lowratio * thisMultiplier

    if (graphic) {
      plot(x = bin_mat$abspos, y = bin_mat$ratio.quantal, log = "y", xaxt = "n", yaxt = "n", ylab = "Ratio.quantal", xlab= "", col = "#CCCCCC")
      # axis(1, at = x.at, labels = x.labels)
      lines(x = bin_mat$abspos, y = bin_mat$ratio.quantal, col = "#CCCCCC")
      points(x = bin_mat$abspos, y = bin_mat$seg.quantal, col = "#0000AA")
      lines(x = bin_mat$abspos, y = bin_mat$seg.quantal, col = "#0000AA")
      hlines = c(1, 3, 4, 5, 6)
      mtext(chr.text[1:24], at = chr.at[1:24], cex = 0.7, col = col.chr.text[1:24],las = 1)

      if (genome == "hgdm"){
        text(chr.at[25], max(bin_mat$lowratio),  chr.text[25], col = col.chr.text[25],srt = 90,
             adj = 1 ,cex=1)
        mtext(chr.text[26], at = chr.at[26], cex = 0.7, col = col.chr.text[26],las = 1)
        text(chr.at[27], max(bin_mat$lowratio),  chr.text[27], col = col.chr.text[27],srt = 90,
             adj = 1 ,cex=1)
        mtext(chr.text[28], at = chr.at[28], cex = 0.7, col = col.chr.text[28],las = 1)
        text(chr.at[29], max(bin_mat$lowratio),  chr.text[29], col = col.chr.text[29],srt = 90,
             adj = 1 ,cex=1)
        mtext(chr.text[30], at = chr.at[30], cex = 0.7, col = col.chr.text[30],las = 1)
      }

      abline(h = hlines, col="black", lty = 3)
      abline(v = vlines, col="black", lty = 2)
      abline(h = 2, col="red", lty = 1)
      axis(2, at = c(2,4,6), labels = c("2","4","6"))

    }

  }

  if (method == "dmploidies"){
    dmbinID <- which(bin_mat$chrom.numeric%in%(25:30))
    median_ratio <- median(bin_mat$lowratio[dmbinID], na.rm = TRUE)
    theMultiplier <- 2/median_ratio
    bin_mat$seg.quantal <- bin_mat$seg.mean.LOWESS * theMultiplier
    bin_mat$ratio.quantal <- bin_mat$lowratio * theMultiplier

    if (genome == "hg"){
      stop("method dmploidies only works when genome is hgdm!")
    }

    if (graphic) {
      plot(x = bin_mat$abspos, y = bin_mat$ratio.quantal, log = "y", xaxt = "n",yaxt="n", ylab = "Ratio.quantal", xlab="", col = "#CCCCCC")
      # axis(1, at = x.at, labels = x.labels)
      lines(x = bin_mat$abspos, y = bin_mat$ratio.quantal, col = "#CCCCCC")
      points(x = bin_mat$abspos, y = bin_mat$seg.quantal, col = "#0000AA")
      lines(x = bin_mat$abspos, y = bin_mat$seg.quantal, col = "#0000AA")
      hlines = c(1, 3, 4, 5, 6)
      mtext(chr.text[1:24], at = chr.at[1:24], cex = 0.7, col = col.chr.text[1:24],las = 1)
      text(chr.at[25], max(bin_mat$lowratio),  chr.text[25], col = col.chr.text[25],srt = 90,
           adj = 1 ,cex=1)
      mtext(chr.text[26], at = chr.at[26], cex = 0.7, col = col.chr.text[26],las = 1)
      text(chr.at[27], max(bin_mat$lowratio),  chr.text[27], col = col.chr.text[27],srt = 90,
           adj = 1 ,cex=1)
      mtext(chr.text[28], at = chr.at[28], cex = 0.7, col = col.chr.text[28],las = 1)
      text(chr.at[29], max(bin_mat$lowratio),  chr.text[29], col = col.chr.text[29],srt = 90,
           adj = 1 ,cex=1)
      mtext(chr.text[30], at = chr.at[30], cex = 0.7, col = col.chr.text[30],las = 1)

      abline(h = hlines, col="black", lty = 3)
      abline(v = vlines, col="black", lty = 2)
      abline(h = 2, col="red", lty = 1)
      axis(2, at = c(2,4,6), labels = c("2","4","6"))
    }
  }

  par(old.par)

  return(bin_mat)
}