R/facets-procreads.R

Defines functions MakeLoessObject FindBestNormalParameters PreProcSnpPileup counts2logROR scanSnp procXSnps procSnps

Documented in counts2logROR FindBestNormalParameters MakeLoessObject PreProcSnpPileup procSnps procXSnps scanSnp

procSnps <- function(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, gbuild="hg19", unmatched=FALSE, ndepthmax=5000, donorCounts=NULL) {
  #' heterozygous and keep flags of the SNPs
  #' @param rcmat input counts matrix
  #' @param ndepth (numeric) minimum normal sample depth to keep
  #' @param het.thresh (numeric) vaf threshold to call a SNP heterozygous
  #' @param snp.nbhd (logical) window size
  #' @param gbuild (character) genome build version.
  #' @param unmatched (logical)
  #' @param ndepthmax (numeric) loci for which normal coverage exceeds this number (default is 5000) will be discarded as PCR duplicates. Fof high coverage sample increase this and ndepth commensurately.
  #' @param donorCounts count matrix of donor sample(s)
  #' @importFrom utils hasName
  #process SNPs,  keep only chromsomes 1-22 & X for humans and 1-19, X for mice
    if (gbuild %in% c("hg19", "hg38", "hg18")) {
        chromlevels <- c(1:22,"X")
    } else {
        chromlevels <- c(1:19,"X")
    }
    chr.keep <- rcmat$Chromosome %in% chromlevels
    # keep only snps with normal read depth between ndepth and 1000
    depthN.keep <- (rcmat$NOR.DP >= ndepth) & (rcmat$NOR.DP < ndepthmax)
    # reduce the data frame to these snps
    rcmat <- rcmat[chr.keep & depthN.keep,]
    # output data frame
    out <- list()
    out$chrom <- rcmat$Chromosome
    out$maploc <- rcmat$Position
    out$Ref = rcmat$Ref
    out$Alt = rcmat$Alt
    out$rCountT <- rcmat$TUM.DP
    out$rCountN <- rcmat$NOR.DP
    # if count matrix has unmatched normal as well include it
    if (hasName(rcmat, "UMN.DP")) out$umNrCount <- rcmat$UMN.DP
    if (hasName(rcmat, "UMNX.DP")) out$umNrXCount <- rcmat$UMNX.DP
    out$vafT <- 1 - rcmat$TUM.RD/rcmat$TUM.DP
    out$vafN <- 1 - rcmat$NOR.RD/rcmat$NOR.DP
    out$TUM.DP = rcmat$TUM.DP
    # make chromosome ordered and numeric
    out$chrom <- as.numeric(ordered(out$chrom, levels=chromlevels))
    # call a snp heterozygous if min(vafN, 1-mafN) > het.thresh
    if (unmatched) {
        #if (het.thresh == 0.25) het.thresh <- 0.1
        out$het <- 1*(pmin(out$vafT, 1-out$vafT) > het.thresh & out$TUM.DP >= 100)
    } else {
        out$het <- 1*(pmin(out$vafN, 1-out$vafN) > het.thresh)
    }
    # scan maploc for snps that are close to one another (within snp.nbhd bases)
    # heep all the hets (should change if too close) and only one from a nbhd
    out$keep <- scanSnp(out$maploc, out$het, snp.nbhd)
    out = as.data.frame(out)
    if (!is.null(donorCounts)){
      donorcount = length(grep("^RefDonor([0-9]{1,})DP$", colnames(donorCounts)))
      
      dmatrix = subset(donorCounts, select=c(Chromosome, Position,Ref, Alt))
      for(i in 1:donorcount){
        
        tempVAF = paste('RefDonor', i, "VAF", sep="")
        tempR = paste("RefDonor", i, "R", sep="")
        tempDP = paste("RefDonor", i, "DP", sep="")
        tempHET = paste("RefDonor", i, "het", sep="")
        dmatrix[,tempVAF] = 1 - (donorCounts[,tempR]/donorCounts[,tempDP])
        dmatrix[,tempHET] =  1*(pmin(dmatrix[,tempVAF], 1-dmatrix[,tempVAF]) > het.thresh  & tempDP>=ndepth)
      }
      hetcols = grep("^RefDonor[0-9]{1,}het", colnames(dmatrix))
      dhet = dmatrix[dmatrix[,c(hetcols)]==1,]
      dhet = dhet[complete.cases(dhet),]
      write.table(dhet, file="baseline_donor_hetsnps.txt", sep = "\t", row.names=FALSE, quote=FALSE)
      dhet$key = paste(dhet$Chromosome, dhet$Position, dhet$Ref, dhet$Alt, sep = ":")
      out$key = paste(out$chrom, out$maploc, out$Ref, out$Alt, sep=":")
      write.table(out, file="baseline_host_snps.txt", sep = "\t", row.names=FALSE, quote=FALSE)
      out$het = ifelse(out$het==1 & out$key %in% dhet$key, 1, 0)
      write.table(out, file="sample_snps_donor_filtered.txt", sep = "\t", row.names=FALSE, quote=FALSE)
      out = subset(out, select=-c(key, Ref, Alt))
    }
    out
}

procXSnps <- function(pileup, ndepth=35, het.thresh=0.25, snp.nbhd=250, gbuild="hg19", unmatched=FALSE, ndepthmax=5000, phet=0.01) {
  #' Takes a snp-pileup file and determines sex of sample and any unmatched normals based on number of chrX het SNPs, as males should not have het X
  #' @param pileup (character) A snp-pileup generated pileup file that has been analyzed with readSnpMatrix(). Expect columns "Chromosome", "Position", "NOR.DP", "NOR.RD", "TUM.DP", and "TUM.RD". Can be a pileup file that has been merged with common loci of reference normals processed with PreProcSnpPileup()
  #' @param ndepth (numeric) minimum normal sample depth to keep
  #' @param het.thresh (numeric) vaf threshold to call a SNP heterozygous
  #' @param snp.nbhd (logical) window size
  #' @param gbuild (character) genome build version.
  #' @param unmatched (logical)
  #' @param ndepthmax (numeric) loci for which normal coverage exceeds this number (default is 5000) will be discarded as PCR duplicates. Fof high coverage sample increase this and ndepth commensurately.
  #' @param phet (numeric) proportion heterzygous chrX SNPs to classify sample as Female
  #' @return output a table of samples analyzed and imputed sex.
  #' @export

    chromlevels = "X"
    chr.keep <- pileup$Chromosome %in% chromlevels
    # keep only snps with normal read depth between ndepth and 1000
    depthN.keep <- (pileup$NOR.DP >= ndepth) & (pileup$NOR.DP < ndepthmax)
    # reduce the data frame to these snps
    rcmatX <- pileup[chr.keep & depthN.keep,]
    # output data frame
    out <- list()
    out$chrom <- rcmatX$Chromosome
    out$maploc <- rcmatX$Position


    out$TUM.DP <- rcmatX$TUM.DP
    out$NOR.DP <- rcmatX$NOR.DP
    out$vafT <- 1 - rcmatX$TUM.RD/rcmatX$TUM.DP
    out$vafN <- 1 - rcmatX$NOR.RD/rcmatX$NOR.DP
    out = as.data.frame(out)
    
    normCount = length(grep("^File([3-9]|[1-9]{2,})DP$", colnames(rcmatX)))
    RefnormCount = length(grep("^RefFile([0-9]{1,})DP$", colnames(rcmatX)))
    
    if (normCount>0){
      for(i in 3:(3+normCount-1)){
        name = paste0('File', i)
        tempVAF = paste0(name, "VAF")
        tempR = paste0(name, "R")
        tempDP = paste0(name, "DP")
        tempHET = paste0(tempDP, "_nhet")
  
        out[,tempVAF] = 1 - (rcmatX[,tempR]/rcmatX[,tempDP])
        out[,tempHET] =  1*(pmin(out[,tempVAF], 1-out[,tempVAF]) > het.thresh )
        out[,tempDP] = rcmatX[,tempDP] 
      }
    }
    out$NOR.DP_nhet <- 1*(pmin(out$vafN, 1-out$vafN) > het.thresh)
    out$TUM.DP_nhet <- 1*(pmin(out$vafT, 1-out$vafT) > het.thresh)
    
    out$keep <- scanSnp(out$maploc, out$NOR.DP_nhet, snp.nbhd)
 
    if (RefnormCount>0){
      for(i in 1:RefnormCount){
        name = paste0("RefFile", i)
        tempVAF = paste0(name, "VAF")
        tempR = paste0(name, "R")
        tempDP = paste0(name, "DP")
        tempHET = paste0(tempDP, "_nhet")
        
        out[,tempVAF] = 1 - (rcmatX[,tempR]/rcmatX[,tempDP])
        out[,tempHET] =  1*(pmin(out[,tempVAF], 1-out[,tempVAF]) > het.thresh )
        out[,tempDP] = rcmatX[,tempDP] 
      }
    }
    out = out[out[,'keep']==1,]
    
    samples = colnames(out)[grep("DP$", colnames(out))]
    out.hets = NULL
    for(i in samples){
      #out = out[out[,i]>ndepth,]
      index =paste0(i, "_nhet")
      sumhets = as.data.frame(aggregate(out[,index] ~ out[,'chrom'], FUN=sum)[[2]])
      colnames(sumhets) =  "numHet"
      
      countsnp =  as.data.frame(dim(out)[1])
      colnames(countsnp) = "num.mark"
      
      row = as.data.frame(c(sumhets, countsnp))
      out.hets = rbind(out.hets, row)
    }
    row.names(out.hets) = samples 
    out.hets$pHet  = out.hets$numHet / out.hets$num.mark
    out.hets$sampleSex = ifelse(out.hets$pHet>phet,"Female", "Male")
    
    out.hets
}

scanSnp <- function(maploc, het, nbhd) {
  #' scan maploc for snps that are close to one another (within snp.nbhd bases)
  #' keep all the hets (should change if too close) and only one from a nbhd
  #' @param maploc (numeric) chrom genomic mapping position
  #' @param het (numeric) binary heterozygous status of position
  #' @param nbhd (numeric) window size, bp
  #' @return a vector of snp positions to keep
  #' @export
    n <- length(maploc)
    zzz <- .Fortran("scansnp",
                    as.integer(n),
                    as.double(maploc),
                    as.double(het),
                    keep=double(n),
                    as.double(nbhd))
    zzz$keep
}

counts2logROR <- function(mat, gbuild, unmatched=FALSE, MandUnormal=FALSE, f, spanT, spanA, spanX) {
  #' obtain logR and logOR from read counts and GC-correct logR
  #' @param mat input data
  #' @param gbuild e.g. "hg19"
  #' @param unmatched (logical)
  #' @param MandUnormal analyzing both matched and unmatched normal for log ratio normalization
  #' @param f default span value for loess
  #' @param spanT span value tumor
  #' @param spanA span value autosomes
  #' @param spanX span value X
  #' @importFrom pctGCdata getGCpct
    out <- mat[mat$keep==1,]
    #out$chrom = gsub('X', '23', out$chrom) #testing replace X with 23

    # gc percentage
    out$gcpct <- rep(NA_real_, nrow(out))
    # get GC percentages from pctGCdata package
    # loop thru chromosomes
    #nchr <- max(mat$chrom) # IMPACT doesn't have X so only 22

    for (i in c(1:23)) {
   # for (i in 1:nchr) {
        ii <- which(out$chrom==i)
        # allow for chromosomes with no SNPs i.e. not targeted
        if (length(ii) > 0) {
            out$gcpct[ii] <- getGCpct(i, out$maploc[ii], gbuild)
        }

    }
    out = out[which(!is.na(out$gcpct)),]
    x.idx <- grep("X|23",out$chrom)
    ##### log-ratio with gc correction and maf log-odds ratio steps
    chrom <- out$chrom
    maploc <- out$maploc
    if (hasName(mat, "umNrCount")) {
        rCountN <- out$umNrCount
    } else {
        rCountN <- out$rCountN
    }
    
    if (hasName(mat, "umNrXCount")) {
            rCountNX <- out$umNrXCount
            normal_X_sqrt = sqrt(rCountNX)
    }
    rCountT <- out$rCountT
    vafT <- out$vafT
    vafN <- out$vafN
    het <- out$het
    gcpct <- out$gcpct
    gcpct.auto = gcpct[-x.idx]
    gcpct.x = gcpct[x.idx]

    # compute gc bias
    ncount <- tapply(rCountN, gcpct, sum)
    tcount <- tapply(rCountT, gcpct, sum)
    pctgc <- as.numeric(names(ncount))
    tscl <- sum(ncount)/sum(tcount)
    gcb <- lowess(pctgc, log2(tcount*tscl)-log2(ncount), f=f)
    jj <- match(gcpct, gcb$x)
    gcbias <- gcb$y[jj]
    # compute cn log-ratio (gc corrected) and baf log odds-ratio
    #####################################
    #square root transform count vectors.
    tumor_sqrt = sqrt(rCountT)
    tumor_sqrt.auto = tumor_sqrt[-x.idx]
    tumor_sqrt.x    = tumor_sqrt[x.idx]

    normal_sqrt     = sqrt(rCountN)
    normal_sqrt.auto = normal_sqrt[-x.idx]
    normal_sqrt.x    = normal_sqrt[x.idx]

    loess_tumor.all = lowess(gcpct,tumor_sqrt, f=spanT) #need to change this to input value from span.fits
    jj=match(gcpct, loess_tumor.all$x)
    fit<-loess_tumor.all$y[jj]

    normalized_t.all<-(tumor_sqrt-fit+median(tumor_sqrt))/(median(tumor_sqrt[which(tumor_sqrt != 0)]));
    
    tumor_rt = normalized_t.all
    tumor_rt = tumor_rt^2
    
    #loess regression for lr normal autosomes and X seperately.
    loess_normal.auto <-lowess(gcpct.auto, normal_sqrt.auto,f=spanA);
    jj=match(gcpct.auto, loess_normal.auto$x)
    fit<-loess_normal.auto$y[jj]
    normalized_n.auto<-(normal_sqrt.auto-fit+median(normal_sqrt.auto))/(median(normal_sqrt.auto[which(normal_sqrt.auto != 0)]));

    if (hasName(mat, "umNrXCount")) {
    loess_normal.x <-lowess(gcpct,normal_X_sqrt,f=spanX);
    jj=match(gcpct, loess_normal.x$x)
    fit<-loess_normal.x$y[jj]
    #temp<-predict(loess_normal.x);
    normalized_n.x<-(normal_X_sqrt-fit+median(normal_X_sqrt))/(median(normal_X_sqrt[which(normal_X_sqrt != 0)]));
    normal_rt = c(normalized_n.auto, normalized_n.x[x.idx])
    }
    else{
      loess_normal.x <-lowess(gcpct.x,normal_sqrt.x,f=spanX);
      jj=match(gcpct.x, loess_normal.x$x)
      fit<-loess_normal.x$y[jj]
      #temp<-predict(loess_normal.x);
      normalized_n.x<-(normal_sqrt.x-fit+median(normal_sqrt.x))/(median(normal_sqrt.x[which(normal_sqrt.x != 0)]));
      normal_rt = c(normalized_n.auto, normalized_n.x)
    }
    normal_rt = normal_rt^2
    #calculate log2 ratios
    cnlr = log2(0.25+tumor_rt) - log2(0.25+normal_rt)

    #####################################
    #use old method of cnlr calc if matched normal
    if (!MandUnormal) cnlr <- log2(1+rCountT*tscl) - log2(1+rCountN) - gcbias
    # minor allele log-odds ratio and weights
    rCountN <- out$rCountN # reset normal depth in case umNrCount exists
    lorvar <- valor <- rep(NA_real_, length(maploc))
    if (unmatched) {
        # read count matrix for odds ratio etc
        rcmat <- round(cbind(vafT[het==1]*rCountT[het==1], (1-vafT[het==1])*rCountT[het==1]))
        # folded log of Tukey (with 1/6 correction)
        valor[het==1] <- log(rcmat[,1]+1/6) - log(rcmat[,2]+1/6)
        # variance - approximation using delta method
        lorvar[het==1] <- 1/(rcmat[,1]+1/6) + 1/(rcmat[,2]+1/6)
    } else {
        # read count matrix for odds ratio etc
        rcmat <- round(cbind(vafT[het==1]*rCountT[het==1], (1-vafT[het==1])*rCountT[het==1], vafN[het==1]*rCountN[het==1], (1-vafN[het==1])*rCountN[het==1]))
        # log-odds-ratio (Haldane correction)
        valor[het==1] <- log(rcmat[,1]+0.5) - log(rcmat[,2]+0.5) - log(rcmat[,3]+0.5) + log(rcmat[,4]+0.5)
        # variance of log-odds-ratio (Haldane; Gart & Zweifel Biometrika 1967)
        lorvar[het==1] <- (1/(rcmat[,1]+0.5) + 1/(rcmat[,2]+0.5) + 1/(rcmat[,3]+0.5) + 1/(rcmat[,4]+0.5))
    }
    # put them together
    out$lorvar <- out$valor <- out$cnlr <- out$gcbias <- rep(NA_real_, nrow(out))
    out$gcbias <- gcbias
    out$cnlr <- cnlr
    out$valor <- valor
    out$lorvar <- lorvar
    out
}

############################################################################
PreProcSnpPileup <- function(filename, err.thresh=Inf, del.thresh=Inf,
                             is.Reference=FALSE, gbuild="hg19") {
  #' PreProcSnpPileup takes a snp-pileup generated pileup file and process it into a data frame format for downstream processing.
  #' @param filename (character) File path to snp-pileup generated pileup file.
  #' @param err.thresh (numeric) Error threshold to be used to filter snp-pileup data frame.
  #' @param del.thresh (numeric) Deletion threshold to be used to filter snp-pileup data frame.
  #' @param is.Reference (logical) Indicate whether the snp-pilep is Reference data.
  #' @param gbuild (character) genome build version.
  #' @return A data drame of pileup depth values filtered against del.thresh and err.thresh values.
  #' @export

  pileup <- read.csv(filename, stringsAsFactors=FALSE)
  # remove chr if present in Chrom
  if (pileup$Chromosome[1] == "chr1") {
    pileup$Chromosome <- gsub("chr", "", pileup$Chromosome)
  }
  if (gbuild %in% c("hg19", "hg38", "hg18")) chromlevels <- c(1:22,"X")
  if (gbuild %in% c("mm9", "mm10")) chromlevels <- c(1:19,"X")
  pileup <- pileup[which(pileup$Chromosome %in% chromlevels),]

  # if pileup data contains tumor and matched normal data, use those columns
  #  to filter against err.thresh and del.thresh
  if (!is.Reference) {
  err.columns <- colnames(pileup)[grep("^File[0-9]+E$", colnames(pileup))][1:2]
  del.columns <- colnames(pileup)[grep("^File[0-9]+D$", colnames(pileup))][1:2]

  # remove loci where errors and deletions exceed thresholds
  select.loci <- apply(
    cbind(apply(pileup[err.columns], 2, function(x) x<=err.thresh),
          apply(pileup[del.columns], 2, function(x) x<=del.thresh)),
    1, all)

  # select loci in pileup data and skip identifiers
  pileup.select <- pileup[select.loci,]
  }
  else {
    pileup.select <- pileup
  }

  # retain genomic coordinates
  pileup.select.key <- paste(pileup.select$Chromosome, pileup.select$Position, sep=":")

  # calculate total depth for each samples at all loci
  for(i in 1:((ncol(pileup.select)-4)/4)){
    temp <- paste0("File", i, "DP")
    tempR <- paste0("File", i, "R")
    tempA <- paste0("File", i, "A")
   # tempRD <-paste0("File", i, "RD")
    pileup.select[,temp] <- pileup.select[,tempR] + pileup.select[,tempA]
   # pileup.select[,tempRD] <- pileup.select[,tempR]
  }
  pileup.select<- cbind(key=pileup.select.key,
                        pileup.select)
  
  if (is.Reference) {
    colnames(pileup.select) = gsub("^File", "RefFile", colnames(pileup.select))
  }

  return(pileup.select)
}

###########################################################################
FindBestNormalParameters <- function(TumorLoess, TumorPileup,
                                     ReferenceLoess=NULL, ReferencePileup=NULL,
                                     MinOverlap=0.90, useMatchedX=FALSE, refX=FALSE, unmatched=FALSE) {
  #' FindBestNormalParameters takes takes a facets2n generated tumor loess object and snp-pileup generated pileup file, and optional similar files for reference normals, and returns the pileup data for the best normal for T/N CNLR.
  #' @param TumorLoess (matrix) A facets2n generated TumorLoess matrix with header and span values in the first row.
  #' @param TumorPileup (data frame) snp-pileup generated pileup data frame with sample columns that match with the TumorLoess object.
  #' @param ReferenceLoess (matrix) A ReferenceLoess matrix with a header and span values in the first row.
  #' @param ReferencePileup (data frame) A snp-pileup generated pileup data frame with sample columns that match with the ReferenceLoess object.
  #' @param MinOverlap (numeric) A numeric between 0 and 1 that denotes the fraction overlap of loci between TumorLoess and the optional ReferenceLoess
  #' @param useMatchedX (logical) Force select matched normal for normalization in ChrX.
  #' @param unmatched (logical)
  #' @param refX (logical) Use matched or reference normal for chrX normalization. excludes unmatched normals, such as pooled references, present in tumor counts matrix. 
  #' @return A list of data frame with pileup depth values of Tumor, matched Normal, and a best unmatched normal, and the associated span values.
  #' @export

  TumorLoess.span <- TumorLoess[1,]
  TumorLoess <- as.data.frame(TumorLoess[-1,], stringsAsFactors=TRUE)
  colnames(TumorLoess)[1] <- "key"

  if (!is.null(ReferencePileup)) {
   
    ReferenceLoess.span <- ReferenceLoess[1,]
    ReferenceLoess <- as.data.frame(ReferenceLoess[-1,],stringsAsFactors=TRUE)
    colnames(ReferenceLoess)[1] <- "key"

    common.loci <- intersect(ReferenceLoess$key, TumorLoess$key)
    if (length(common.loci) / max(length(TumorLoess$key),
                                  length(ReferenceLoess$key)) < MinOverlap ) {
      warning(sprintf("Overlap of loci between the two Loess dataframes\
           is less than defined MinOverlap fraction of %s\n", MinOverlap))

    }
    combined.loess <- cbind(
      TumorLoess[which(TumorLoess$key %in% common.loci),],
      ReferenceLoess[which(ReferenceLoess$key %in% common.loci),][,-1]
    )
    TumorPileup.common = TumorPileup
    TumorPileup.common = TumorPileup.common[which(TumorPileup.common$key %in% common.loci),]
    TumorPileup.common$NOR.DP <- TumorPileup.common$File1R + TumorPileup.common$File1A
    TumorPileup.common$NOR.RD <- TumorPileup.common$File1R
    TumorPileup.common$TUM.DP <- TumorPileup.common$File2R + TumorPileup.common$File2A
    TumorPileup.common$TUM.RD <- TumorPileup.common$File2R
    
    ReferencePileup.common = ReferencePileup[which(ReferencePileup$key %in% common.loci),]
  
    colkeep = colnames(TumorPileup.common)[grep("File([3-9]|[1-9]{2,})", colnames(TumorPileup.common))]
    combined.pileup <- cbind(
      TumorPileup.common[,c(colkeep,"Ref", "Alt","File1DP", "NOR.DP", "NOR.RD", "TUM.DP", "TUM.RD")],
      ReferencePileup.common
    )

    combined.span <- c(TumorLoess.span, ReferenceLoess.span[-1])
  }
  else {
    combined.pileup <- TumorPileup
    combined.pileup$NOR.DP <- combined.pileup$File1R + combined.pileup$File1A
    combined.pileup$NOR.RD <- combined.pileup$File1R
    combined.pileup$TUM.DP <- combined.pileup$File2R + combined.pileup$File2A
    combined.pileup$TUM.RD <- combined.pileup$File2R
    
    combined.loess <- TumorLoess
    combined.span <- TumorLoess.span
    common.loci <- TumorLoess$key
  }

  # Assumptions: 
    # First column of data is the key,
    # Second column belongs to the Normal sample
    # Third column belongs to the Tumor sample
    #remaining columns are unmatched normals samples
  
  MatchedNormalIdentifier <- colnames(combined.loess)[2]
  TumorIdentifier <- colnames(combined.loess)[3]

  #identify probes on ChrX for separate processing
  x.idx <- as.vector(grep('^X\\:', combined.loess$key))
  x.idx.values <- as.vector(grep('^X\\:', combined.loess$key, value = T))

  #determine sex of sample and unmatched nornmals
  snpsX = procXSnps(combined.pileup, phet=0.01)
  
  sampleSex ="Female"
  if (unmatched){
    sampleSex = snpsX["TUM.DP", "sampleSex"]
    message("imputed patient sex from Tumor, experimental: ", sampleSex)
  }else{
    sampleSex = snpsX["NOR.DP", "sampleSex"]
    message("imputed patient sex from matched normal: ", sampleSex)
  }


  #calculate noise of tumor against normals for autosomes and ChrX seperately
  noiseAuto <- do.call('rbind',list(apply(combined.loess[-x.idx,-c(1,3), drop=F],2,function(column){
    lr = log2(as.numeric(levels(combined.loess[-x.idx,3]))[combined.loess[-x.idx,3]]) -
      log2(as.numeric(column))
    return(sum(lr^2, na.rm=T));
  })));

  #pick the normals that minimize noise
  best_normAuto <- colnames(noiseAuto)[which(noiseAuto == min(noiseAuto) & noiseAuto != 0)][1]
  if(useMatchedX) {
    best_normX <- MatchedNormalIdentifier
  }
  else {
    #limit normals for X normalization to those matching patient sex
    combined.loess.useX = row.names(snpsX[which(snpsX$sampleSex==sampleSex),])
    combined.loess.useX = gsub("NOR.DP", "File1DP", combined.loess.useX)
    if(refX){
      combined.loess.useX = combined.loess.useX[grep("File1DP|^RefFile", combined.loess.useX)]
    }else{
      combined.loess.useX = combined.loess.useX[grep("^File|^RefFile", combined.loess.useX)]
    }

    noiseX <- do.call('rbind',list(apply(subset(combined.loess[x.idx,-c(1,3), drop=F],select = c(combined.loess.useX)),2,function(column){
      lr = log2(as.numeric(levels(combined.loess[x.idx,3]))[combined.loess[x.idx,3]]) -
        log2(as.numeric(column))
      return(sum(lr^2, na.rm=T));
    })));

    best_normX <- colnames(noiseX)[which(noiseX == min(noiseX) & noiseX != 0)][1]
  }

  message(sprintf("Best normal for autosomes: %s\nBest normal for ChrX: %s\n",
                  best_normAuto, best_normX))
  
  rcmat <- cbind(combined.pileup[,c("Chromosome", "Position", "Ref", "Alt","NOR.DP", "NOR.RD", "TUM.DP", "TUM.RD")],
            UMN.DP=c(combined.pileup[-x.idx, best_normAuto], combined.pileup[x.idx,best_normX]), UMNX.DP = combined.pileup[,best_normX])
  
  
  return(list("rcmat"=rcmat,
            "spanT"=as.numeric(combined.span[TumorIdentifier]), # Assume that tumor data is always in the third column as expected
            "spanA"=as.numeric(combined.span[best_normAuto]),
            "spanX"=as.numeric(combined.span[best_normX])
            )
        )
}

######################################################################
MakeLoessObject <- function(pileup, write.loess=FALSE, outfilepath="./loess.txt", is.Reference = FALSE, gbuild="hg19") {

  #' MakeLoessObject takes a pipleup file generated by snp-pileup and generates a loess/lowess object, which is also optinally written into an output file.
  #' @importFrom pctGCdata getGCpct
  #' @importFrom utils write.table
  #' @param pileup (data frame) A data franme of snp-pileup generated depth.
  #' @param write.loess (logical) Write loess object into file, instead of returning it as a matrix?
  #' @param outfilepath (character) Filepath for writing loess object.
  #' @param is.Reference (logical) Indicate whether the snp-pilep is Reference data.
  #' @param gbuild (character) genome build version.
  #' @return A dataframe of loess normalized values for all input samples against filtered loci or None, if the loess normalized value is to be written to an output file.
  #' @export

  # read pileup data and select rows based on used-defined err and del thresholds
  pileup.select <- pileup

  pileup.select.dp <- pileup.select[,grep(paste(c("^key$", "^Chromosome$", "^Position$", "File.*DP$"),collapse="|"), colnames(pileup.select))]
  
  if (is.Reference) {
    message("starting loess normalization for reference samples")
    #pileup.select.dp$medianDP<- apply(pileup.select.dp[,grep("RefFile.*DP", colnames(pileup.select.dp))], 1, median, na.rm=T)
    #pileup.select.dp$q25<- apply(pileup.select.dp[,grep("RefFile.*DP", colnames(pileup.select.dp))], 1, quantile, probs=0.25, na.rm=T)
    #pileup.select.dp <- pileup.select.dp[which(pileup.select.dp$q25>quantile(pileup.select.dp$medianDP, 0.1)),]
    #pileup.select.dp <- subset(pileup.select.dp, select=-c(q25, medianDP))
  }
  else {
    #message("skipping quantile covg filtering")
    #pileup.select.dp$medianDP<- apply(pileup.select.dp[,grep("File([1]|[3-9]|[1-9]{2,})DP$", colnames(pileup.select.dp))][,-c(2), drop=F], 1, median, na.rm=T)
    #pileup.select.dp$q25<- apply(pileup.select.dp[,grep("File([1]|[3-9]|[1-9]{2,})DP$", colnames(pileup.select.dp))][,-c(2), drop=F], 1, quantile, probs=0.25, na.m=T)
    #pileup.select.dp <- pileup.select.dp[which(pileup.select.dp$q25>quantile(pileup.select.dp$medianDP, 0.1)),]
    #pileup.select.dp <- subset(pileup.select.dp, select=-c(q25, medianDP))
  }

  gcout <- subset(pileup.select.dp, select=c(Chromosome, Position))
  gcout$gcpct <- rep(NA_real_, nrow(gcout))

  for (i in c(1:22,'X')) {
    ii <- which(gcout$Chromosome==i)
    if (length(ii) > 0) {
      gcout$gcpct[ii] <- getGCpct(i, gcout$Position[ii], gbuild)
    }
  }
  gcout$key <-  paste(gcout$Chromosome, gcout$Position, sep = ":")

  #remove positions near centromeres, where we don't calculate GCpct
  gcout <-gcout[!is.na(gcout$gcpct),]
  #subset counts dfs to those with gc calc
  pileup.select.dp <- pileup.select.dp[which(pileup.select.dp$key %in% gcout$key),]

  #add gcpct values to the count file
  pileup.select.dp$gcpct<-gcout[match(pileup.select.dp$key, gcout$key),"gcpct"]

  gc.bias <- pileup.select.dp$gcpct
  
  #determine span values for lowess normalization
  span.fits <- do.call('rbind',list(
    apply(subset(pileup.select.dp, select= -c(Chromosome, Position, key, gcpct)),2,function(column){
    column_sqrt<-sqrt(column)
    testspan <- function(spanvalue){  #change this to get span values seperately for auto, X?
      loess.obj<-lowess(gc.bias, column_sqrt,f=spanvalue);
      jj <- match(pileup.select.dp$gcpct, loess.obj$x)
      fit <- loess.obj$y[jj]#Calculation of the loess fit for each spanvalue
      normalized<-(column_sqrt-fit+median(column_sqrt))/(median(column_sqrt[which(column_sqrt != 0)]));#Data normalized for each spanvalue

      loess.obj2<-lowess(gc.bias,normalized,f=spanvalue);
      fit2 <- loess.obj2$y  #The "fit" of each normalized data point - the result gives the flat-ish line
      spanvar <- var(fit2,na.rm=TRUE) #Calculate the variance to find the flattest line after fitting
      return(round(spanvar,5));
    }
    optimize.obj <-	optimize(testspan,interval=c(0.1,0.9),maximum=F);
    return(c('min'=optimize.obj$minimum,'obj'=optimize.obj$objective));
  })));

  span.fits <- t(span.fits)

  pileup.select.dp.lowess <- do.call('cbind',lapply(seq(1,ncol(pileup.select.dp)-4,1),function(i){
    column_sqrt <- sqrt(pileup.select.dp[,grep("File.*DP", colnames(pileup.select.dp))][,i])
    loess.obj <-lowess(gc.bias, column_sqrt,f=span.fits[i,'min']);
    jj <- match(pileup.select.dp$gcpct, loess.obj$x)
    fit <- loess.obj$y[jj]
    normalized <- (column_sqrt-fit+median(column_sqrt))/(median(column_sqrt[which(column_sqrt != 0)]));
    return(normalized);
  }));

  colnames(pileup.select.dp.lowess) <- setdiff(
    colnames(pileup.select.dp), c("Chromosome", "Position", "key", "gcpct"))

  pileup.select.dp.lowess <- cbind(key=paste0(pileup.select.dp$Chromosome, ":",
                                  pileup.select.dp$Position), pileup.select.dp.lowess)

  # add span values as the first row
  pileup.select.dp.lowess <- rbind(c("span", span.fits[,"min"]),as.matrix(pileup.select.dp.lowess))

  if(write.loess) {
    write.table(pileup.select.dp.lowess, file=outfilepath, sep="\t", quote=F, col.names=T, row.names=F)
  }
  else {
    return(pileup.select.dp.lowess)
  }
}
rptashkin/facets2n documentation built on May 11, 2022, 1:34 p.m.