R/champ.DMR.R

Defines functions champ.DMR

Documented in champ.DMR

if(getRversion() >= "3.1.0") utils::globalVariables(c("myNorm","myLoad","mylimma","detectCores","probe.features","illumina450Gr","seqlevels<-","seqlevels"))

champ.DMR <- function(beta=myNorm,
                      pheno=myLoad$pd$Sample_Group,
                      compare.group=NULL,
                      arraytype="450K",
                      method = "Bumphunter",
                      minProbes=7,
                      adjPvalDmr=0.05,
                      cores=3,
                      ## following parameters are specifically for Bumphunter method.
                      maxGap=300,
                      cutoff=NULL,
                      pickCutoff=TRUE,
                      smooth=TRUE,
                      smoothFunction=loessByCluster,
                      useWeights=FALSE,
                      permutations=NULL,
                      B=250,
                      nullMethod="bootstrap",
                      ## following parameters are specifically for probe ProbeLasso method.
                      meanLassoRadius=375,
                      minDmrSep=1000,
                      minDmrSize=50,
                      adjPvalProbe=0.05,
                      Rplot=T,
                      PDFplot=T,
                      resultsDir="./CHAMP_ProbeLasso/",
                      ## following parameters are specifically for DMRcate method.
                      rmSNPCH=T,
                      fdr=0.05,
                      dist=2,
                      mafcut=0.05,
                      lambda=1000,
                      C=2)
{
  message("[===========================]")
  message("[<<<<< ChAMP.DMR START >>>>>]")
  message("-----------------------------")
  
  message("!!! important !!! We just upgrate champ.DMR() function, since now champ.DMP() could works on multiple phenotypes, but ProbeLasso can only works on one DMP result, so if your pheno parameter contains more than 2 phenotypes, and you want to use ProbeLasso function, you MUST specify compare.group=c(\"A\",\"B\"). Bumphunter and DMRcate should not be influenced.")
  
  message("\n[ Section 1:  Check Input Pheno Start ]\n")
  
  if(length(which(is.na(beta)))>0) message(length(which(is.na(beta)))," NA are detected in your beta Data Set, which may cause fail or uncorrect of SVD analysis. You may want to impute NA with champ.impute() function first.")
  
  if(!class(pheno) %in% c("character","factor","numeric")) stop("pheno parameter must be a category vector, could be character, factor or numeric (numeric is not recommended).")
  
  message("  You pheno is ",class(pheno)," type.")
  message("    Your pheno information contains following groups. >>")
  sapply(unique(pheno),function(x) message("    <",x,">:",sum(pheno==x)," samples."))
  
  
  if(method == "ProbeLasso")
  {
    message("  ProbeLasso Method can only be done between two phenotypes. So we need to do more check here...")
    if(length(unique(pheno))>2)
    {
      message("    Your pheno contains more than two phenotypes.")
      message("    You may specify compare.group to do comparision between certain two phenotypes")
      if(is.null(compare.group))
      {
        stop("    You did not specifically compare.group parameter to specify which two phenotypes you want to analysis.")
      } else if(sum(compare.group %in% unique(pheno)) == 2) {
        message("    Your compare.group is in accord with your pheno parameter, which is good.")
        message("    Now champ.DMR() would extract values for only these two phenotypes to analysis.")
        beta <- beta[,which(pheno %in% compare.group)]
        pheno <- pheno[which(pheno %in% compare.group)]
      } else {
        stop("    Seems you specified compare.group, but elements in your compare.group are not all found in your pheno parameter. Please recheck your pheno or compare.group.")
      }
    } else if(length(unique(pheno))==2) {
      message("    Your pheno parameter contains extactly two phenotypes, which is good and compare.group is not needed, champ.DMR() would proceed with your whole data set.")
    } else {
      stop("    Seems something wrong with your pheno data. champ.DMR() can not proceed. Please recheck your pheno information.")
    }
  }
  message("\n[ Section 1:  Check Input Pheno Done ]\n")
  
  message("\n[ Section 2:  Run DMR Algorithm Start ]\n")
  
  # if(arraytype=="EPIC"){
  #   RSobject <- RatioSet(beta, annotation = c(array = "IlluminaHumanMethylationEPIC",annotation = "ilm10b4.hg19"))
  # }else{
  #   RSobject <- RatioSet(beta, annotation = c(array = "IlluminaHumanMethylation450k",annotation = "ilmn12.hg19"))
  # }
  # probe.features <- getAnnotation(RSobject)
  
  if(arraytype %in% c("EPIC", "EPICv2")) {
    data("probe.features.epicv2")
  } else if(arraytype == "EPICv1") {
    data("probe.features.epicv1")
  } else if(arraytype == "450K") { 
    data("probe.features")
  } else (
    stop("arraytype must be `EPICv2`, `EPICv1`, `450K`")
  )
  
  if(cores > detectCores()) cores <- detectCores()
  
  
  if(method=="Bumphunter")
  {
    message("<< Find DMR with Bumphunter Method >>")
    
    message(cores," cores will be used to do parallel Bumphunter computing.")
    registerDoParallel(cores = cores)
    
    
    cpg.idx <- intersect(rownames(beta),rownames(probe.features))
    Anno <- probe.features[cpg.idx,]
    Anno <- Anno[order(Anno$CHR, Anno$MAPINFO),]
    cpg.idx <- rownames(Anno)
    
    cl <- clusterMaker(Anno$CHR, Anno$MAPINFO, maxGap=maxGap)
    names(cl) <- cpg.idx
    bumphunter.idx <- cpg.idx[which(cl %in% names(which(table(cl)>minProbes)))]
    
    message("According to your data set, champ.DMR() detected ",sum(table(cl)>minProbes)," clusters contains MORE THAN ",minProbes," probes within ",maxGap," maxGap. These clusters will be used to find DMR.\n")
    
    X <- cbind(1,(as.numeric(as.factor(pheno))-1))
    Beta <- beta[bumphunter.idx,]
    Beta <- replace(Beta,which(Beta <= 0.001),0.001)
    Beta <- replace(Beta,which(Beta >= 0.999),0.999)
    Y <- log((Beta/(1-Beta)),2)
    
    Bumps <- bumphunter(Y,
                        design=X,
                        chr=Anno[bumphunter.idx,]$CHR,
                        pos=Anno[bumphunter.idx,]$MAPINFO,
                        cluster=cl[bumphunter.idx],
                        cutoff=cutoff,
                        pickCutoff=pickCutoff,
                        smooth=smooth,
                        smoothFunction=smoothFunction,
                        useWeights=useWeights,
                        permutations=permutations,
                        verbose=TRUE,
                        B=B,
                        nullMethod=nullMethod)
    
    message("<< Calculate DMR success. >>")
    DMR <- Bumps$table[which(Bumps$table$p.valueArea <= adjPvalDmr),]
    message("Bumphunter detected ",nrow(DMR)," DMRs with P value <= ",adjPvalDmr,".")
    
    if(nrow(DMR) == 0) stop("No DMR detected.")
    
    rownames(DMR) <- paste("DMR",1:nrow(DMR),sep="_")
    #DMRProbes <- apply(DMR,1,function(x) Anno[which(Anno$chr==x[1] & Anno$pos>= as.numeric(x[2]) & Anno$pos<= as.numeric(x[3])),])
    DMR <- data.frame(DMR[,1:3],width=DMR[,3]-DMR[,2],strand="*",DMR[,4:14])
    colnames(DMR)[1:3] <- c("seqnames","start","end") 
    
    #OutputDMR <- list(BumphunterDMR=DMR,BumphunterDMRProbes=DMRProbes)
    OutputDMR <- list(BumphunterDMR=DMR)
    
  } else if(method == "ProbeLasso")
  {
    if (!file.exists(resultsDir)) dir.create(resultsDir)
    message("champ.DMR Results will be saved in ",resultsDir)
    
    message("<< Find DMR with ProbeLasso Method >>")
    gc()
    
    DMP <- champ.DMP(beta=beta,pheno=pheno,adjPVal=1)
    if(length(DMP) > 1) stop("Your pheno parameter seems contains more than 2 phenotypes. champ.DMR() only take covariates with only 2 phenotypes. Please manually extract your sample and covariates, then retry champ.DMR()")
    DMP <- DMP[[1]]
    
    if(arraytype=="EPIC") data(illuminaEPICGr) else data(illumina450Gr)
    if(length(which(DMP$adj.P.Val < adjPvalProbe))==0) stop("There is no probe show significant difference from champ.DMP() function.")
    
    
    myResultsGr <- illumina450Gr[match(rownames(DMP), names(illumina450Gr))]
    myResultsGr$P.Value <- DMP$P.Value[match(names(myResultsGr), rownames(DMP))];
    myResultsGr$adj.P.Val <- DMP$adj.P.Val[match(names(myResultsGr), rownames(DMP))]
    seqlevels(myResultsGr) <- sort(seqlevels(myResultsGr));
    myResultsGr <- sort(myResultsGr,ignore.strand=T) # sort for later
    ### readjust pValues after masking
    myResultsGr$adj.P.Val <- p.adjust(mcols(myResultsGr)$P.Value, method = "BH")
    ### Probe spacing and quantile derivation
    
    message("<< Get closestProbe for each Probe >>")
    closestProbe <- as.data.frame(distanceToNearest(myResultsGr,ignore.strand=T))$distance
    closestProbeSp <- split(closestProbe, mcols(myResultsGr)$featureCgi); rm(closestProbe)
    
    message("<< Get lassoQuantileThreshold for each featureCgi >>")
    lassoQuantileDeviation <- abs(meanLassoRadius - rowMeans(as.data.frame(lapply(closestProbeSp,function(x) quantile(x,(1:1000)/1000)))))
    lassoQuantileThreshold <- which.min(lassoQuantileDeviation) / 1000;
    lassoSizes <- lapply(closestProbeSp, function(x) quantile(x, lassoQuantileThreshold, na.rm = T))
    
    message("<< Get expend ranges for each probe >>")
    myResultsGrSp <- split(myResultsGr, myResultsGr$featureCgi) # splits myResultsGr by 'featureCgi'; length = 28
    lassoGr <- mapply(function(x, y) promoters(x, upstream = y, downstream = y), x = myResultsGrSp, y = lassoSizes)
    lassoGr <- unlist(GRangesList(lassoGr)); rm(myResultsGrSp)
    myResultsSigGr <- myResultsGr[which(mcols(myResultsGr)$adj.P.Val < adjPvalProbe)]
    lassoProbeCountOverlap <- countOverlaps(lassoGr, myResultsSigGr, ignore.strand = T);rm(myResultsSigGr)
    
    message("<< Get DMR from overlapped probes >>")
    dmrGr <- reduce(lassoGr[which(lassoProbeCountOverlap >= minProbes)], min.gapwidth = minDmrSep, ignore.strand=TRUE);
    rm(lassoProbeCountOverlap, lassoGr) # lassos capturing 'minSigProbesLasso', merged 
    strand(dmrGr) <- '*'
    dmrGr <- dmrGr[which(width(dmrGr) > minDmrSize)] # remove DMRs < minDmrSize
    probeIndex <- as.data.frame(findOverlaps(dmrGr, myResultsGr))
    pValuesGr <- myResultsGr[probeIndex$subjectHits, "P.Value"]
    myBetas <- beta[match(names(pValuesGr), rownames(beta)), ]
    myBetas <- split(as.data.frame(myBetas), probeIndex$queryHits)
    
    message("<< Get adjusted P value for DMR >>")
    correl <- lapply(myBetas, function(x) cor(t(x)))
    weights <- lapply(correl, function(x) 1/apply(x^2,1,sum)); rm(correl)
    dmrQP <- qnorm(mcols(pValuesGr)$P.Value); dmrQP <- split(dmrQP, probeIndex$queryHits)
    dmrQPW <- mapply("*", dmrQP, weights); rm(dmrQP)
    if(class(dmrQPW) == "matrix") dmrStat <- sum(dmrQPW) else dmrStat <- lapply(dmrQPW, sum)
    rm(dmrQPW)
    dmrSd <- lapply(weights, function(x) sqrt(sum(x^2))); rm(weights)
    dmrP <- mapply(function(x,y) pnorm(x,0, sd=y), dmrStat, dmrSd); rm(dmrStat, dmrSd)
    dmrP <- p.adjust(dmrP, method = "BH")
    goodDmr <- which(dmrP < adjPvalDmr)
    dmrGr <- dmrGr[goodDmr] 
    dmrP <- dmrP[goodDmr]
    dmrpRank <- rank(dmrP, ties.method="min"); rm(goodDmr)
    
    
    
    ### get pvalues and betas for GOOD DMRs
    message("<< Get Start-End Ranges for each DMR >>")
    probeIndex <- as.data.frame(findOverlaps(dmrGr, myResultsGr))
    dmrProbesGr <- myResultsGr[probeIndex$subjectHits]
    myBetas <- beta[match(names(dmrProbesGr), rownames(beta)), ]; myBetas <- as.data.frame(myBetas)
    dmrCoreStart <- start(dmrProbesGr)
    dmrCoreEnd <- end(dmrProbesGr)
    myBetas <- split(myBetas, probeIndex$queryHits)
    dmrCoreStart <- split(dmrCoreStart, probeIndex$queryHits); dmrCoreStart <- sapply(dmrCoreStart, min)
    dmrCoreEnd <- split(dmrCoreEnd, probeIndex$queryHits); dmrCoreEnd <- sapply(dmrCoreEnd, max)
    ### calculate methylation scores for each DMR in each Sample_Group
    
    message("<< Calculate Methylation Scores for each DMR >>")
    groupIndex <- pheno
    dmrGroupMeans <- do.call(rbind, lapply(myBetas, function(x) sapply(split(t(x), groupIndex), mean)))
    colnames(dmrGroupMeans) <- paste("betaAv", colnames(dmrGroupMeans), sep = "_")
    probeGroupMeans <- lapply(myBetas, function(x) split(as.data.frame(t(x)), groupIndex)); rm(groupIndex, myBetas)
    probeGroupMeans <- lapply(probeGroupMeans, function(x) lapply(x, colMeans))
    probeGroupMeans <- do.call(rbind, lapply(probeGroupMeans, function(x) t(do.call(rbind, x))))
    colnames(probeGroupMeans) <- paste("betaAv", colnames(probeGroupMeans), sep = "_")
    
    ### probe-level data and DMR metadata
    
    message("<< Generate Probe-level Data >>")
    myDmrProbesGr <- myResultsGr[probeIndex$subjectHits]
    myDmrProbesGr <- as(cbind(as.data.frame(myDmrProbesGr), probeGroupMeans), "GRanges");
    rm(probeGroupMeans)
    myDmrProbesGr$dmrNo <- probeIndex$queryHits
    myDmrProbesGr$dmrP <- dmrP[probeIndex$queryHits]
    myDmrProbesGr$dmrpRank <- dmrpRank[probeIndex$queryHits]
    myDmrProbesGr$dmrChrom <- seqnames(dmrGr[probeIndex$queryHits])
    myDmrProbesGr$dmrStart <- start(dmrGr[probeIndex$queryHits])
    myDmrProbesGr$dmrEnd <- end(dmrGr[probeIndex$queryHits])
    myDmrProbesGr$dmrSize <- width(dmrGr[probeIndex$queryHits])
    myDmrProbesGr$dmrCoreStart <- dmrCoreStart[probeIndex$queryHits]
    myDmrProbesGr$dmrCoreEnd <- dmrCoreEnd[probeIndex$queryHits]    
    myDmrProbesGr$dmrCoreSize <- myDmrProbesGr$dmrCoreEnd - myDmrProbesGr$dmrCoreStart + 1
    
    ### DMR metadata
    message("<< Generate DMR metadata >>")
    myDmrGr <- dmrGr
    myDmrGr$dmrNo <- unique(probeIndex$queryHits)
    myDmrGr$dmrP <- dmrP; rm(dmrP)
    myDmrGr$dmrpRank <- dmrpRank; rm(dmrpRank)
    myDmrGr$dmrChrom <- seqnames(dmrGr) 
    myDmrGr$dmrStart <- start(dmrGr)
    myDmrGr$dmrEnd <- end(dmrGr)
    myDmrGr$dmrSize <- width(dmrGr); rm(dmrGr)
    myDmrGr$dmrCoreStart <- dmrCoreStart
    myDmrGr$dmrCoreEnd <- dmrCoreEnd
    myDmrGr$dmrCoreSize <- myDmrGr$dmrCoreEnd - myDmrGr$dmrCoreStart + 1
    genes <- split(as.data.frame(myResultsGr)[probeIndex$subjectHits, c("ensemblID", "geneSymbol")], probeIndex$queryHits); rm(probeIndex)
    myDmrGr$ensemblID <- sapply(genes, function(x) paste(unique(unlist(strsplit(x$ensemblID, ";"))), collapse = ";"))
    myDmrGr$geneSymbol <- sapply(genes, function(x) paste(unique(unlist(strsplit(x$geneSymbol, ";"))), collapse = ";")); rm(genes)
    myDmrGr <- as(cbind(as.data.frame(myDmrGr), dmrGroupMeans), "GRanges"); rm(dmrGroupMeans) 
    
    # plot of lassos
    interplot <- function(illumina450Gr,lassoSizes)
    {
      cgiNames <- levels(illumina450Gr$cgi)
      featureNames <- levels(illumina450Gr$feature)
      sfo <- c(6, 7, 3, 1, 4, 2, 5)
      scgio <- c(1, 4, 3, 2)
      sfcgio <- rep((sfo - 1) *4, each = 4) + rep(scgio, 7)
      lassoSizes <- round(unlist(lassoSizes))
      #imageName <- paste(getwd(),"myLassos.pdf",sep="/")
      #pdf(imageName,width=9,height=9)
      
      par(mar = c(7, 4, 4, 3)+0.5)
      plot(c(1,28), y=c(range(0.3*sqrt(lassoSizes))[1]*0.8, range(0.3*sqrt(lassoSizes))[2]*1.2), type="n", xaxt="n", xlab="", yaxt="n", ylab="lasso radius [bp]", main=paste("lasso quantile = ", round(lassoQuantileThreshold,2), "\nmean lasso radius = ", meanLassoRadius, "bp", sep = ""), bty="n")
      segments(1:28, rep(0,28), 1:28, 0.3*sqrt(lassoSizes[sfcgio]), lty=3, col="grey")
      points(1:28, 0.3*sqrt(lassoSizes[sfcgio]), pch=16, cex=0.3*sqrt(lassoSizes[sfcgio]), col=rep(rainbow(7,alpha=0.5)[sfo], each=4))
      text(1:28, 0.3*sqrt(lassoSizes[sfcgio]), lassoSizes[sfcgio], pos=3, cex=0.8)
      axis(1, at = 1:28, labels = rep(cgiNames[scgio], 7), las = 2)
      par(xpd = T)
      segments(seq(1, 28, 4), rep(-2.5, 7), seq(4, 28, 4), rep(-2.5, 7))
      mtext(text = featureNames[sfo], side = 1, at = seq(2.5, 28, 4), line = 5.5, las = 1, cex.axis = 1)
      axis(2, at=c(0,max(0.3*sqrt(lassoSizes))), labels=F)
    }
    if(Rplot) interplot(illumina450Gr,lassoSizes)
    if(PDFplot)
    {
      pdf(paste(resultsDir,"myLassos.pdf",sep="/"),width=9,height=9)
      interplot(illumina450Gr,lassoSizes)
      dev.off()
    }
    DMRProbes <- as.data.frame(myDmrProbesGr)
    DMRProbes <- data.frame(probe.features[rownames(DMRProbes),],DMRProbes[,which(colnames(DMRProbes)=="P.Value"):which(colnames(DMRProbes)=="dmrNo")])
    DMRProbes <- split(DMRProbes,DMRProbes$dmrNo)
    DMR <- as.data.frame(myDmrGr)
    
    message("ProbeLasso detected ",nrow(DMR)," DMRs with P value <= ",adjPvalDmr,".")
    if(nrow(DMR) == 0) stop("No DMR detected.")
    
    rownames(DMR) <- paste("DMR",DMR$dmrNo,sep="_")
    names(DMRProbes) <- rownames(DMR)
    
    if(arraytype=="EPIC")
      DMR[,1] <- paste("chr",DMR[,1],sep="")
    
    #OutputDMR <- list(ProbeLassoDMR=DMR,ProbeLassoDMRProbes=DMRProbes)
    OutputDMR <- list(ProbeLassoDMR=DMR)
    
  }else if(method=="DMRcate")
  {
    
    message(cores," cores will be used to do parallel DMRcate computing.")
    
    message("<< Find DMR with DMRcate Method >>")
    myMs <- logit2(beta)
    if(rmSNPCH) myMs <- rmSNPandCH(myMs, dist=dist, mafcut=mafcut)
    design <- model.matrix(~ pheno)
    
    if(arraytype == "450K"){
      myannotation <- cpg.annotate(datatype="array",fdr=fdr, myMs,design=design,coef=ncol(design), analysis.type="differential",annotation=c(array = "IlluminaHumanMethylation450k", annotation = "ilmn12.hg19"),what="M")
    } else{
      myannotation <- cpg.annotate(datatype="array",fdr=fdr, myMs,design=design,coef=ncol(design), analysis.type="differential",annotation=c(array = "IlluminaHumanMethylationEPIC", annotation = "ilm10b4.hg19"),what="M")
    }
    M <- do.call("cbind", lapply(myannotation, as.data.frame))
    colnames(M) <- names(myannotation)
    
    dmrcoutput <- dmrcate(myannotation, min.cpgs = minProbes, lambda=lambda, C=C,mc.cores = cores)
    data(dmrcatedata)
    DMR <- as.data.frame(extractRanges(dmrcoutput, genome = "hg19"))
    
    message("DMRcate detected ",nrow(DMR)," DMRs with mafcut as= ",adjPvalDmr,".")
    if(nrow(DMR) == 0) stop("No DMR detected.")
    
    if(nrow(DMR)!=0)
    {
      #DMRProbes <- apply(DMR,1,function(x) M[which(M[,3]==x[1] & M[,4]>= as.numeric(x[2]) & M[,4]<= as.numeric(x[3])),])
      rownames(DMR) <- paste("DMR",1:nrow(DMR),sep="_")
      #names(DMRProbes) <- rownames(DMR)
      #for(i in names(DMRProbes)) rownames(DMRProbes[[i]]) <- DMRProbes[[i]]$ID
      #X <- lapply(DMRProbes,as.data.frame)
      #DMRProbes <- lapply(X,function(x) cbind(ID=x[,1],probe.features[rownames(x),],x[,c(2,5,6,7)]))
      
      OutputDMR <- list(DMRcateDMR=DMR)
    }else
    {
      OutputDMR <- NULL
    }
    #OutputDMR <- list(DMRcateDMR=DMR,DMRcateDMRProbes=DMRProbes)
    
  } else {
    stop("Please assign correct DMR method: 'Bumphunter' or 'ProbeLasso'")
  }
  message("\n[ Section 2:  Run DMR Algorithm Done ]\n")
  
  message("[<<<<<< ChAMP.DMR END >>>>>>]")
  message("[===========================]")
  message("[You may want to process DMR.GUI() or champ.GSEA() next.]\n")
  return(OutputDMR)
}
JoshuaTian/ChAMP documentation built on Feb. 21, 2023, 4:57 p.m.