R/PlotMetagene.R

Defines functions plotResBar plotRes plotMetaGene2 plotMetaGene scaleMeta scalefun addSym makeCovList

Documented in addSym makeCovList plotMetaGene scalefun scaleMeta

#'Function to import coverage matrix to R
#'
#'This function will import the .mat.gz coverage matrix generated by deeptools
#'computeMatrix command, and split it into a list of coverage matrices - one for
#'each bigwig file input into the command.
#'
#'@param matfile Character indicating the path to the matrix file (ending in '.mat.gz')
#'@param header Numeric indicating the size of the header in matfile (i.e. number of lines to skip while reading file in). Default=3
#'@param bedfile Character indicating the path to the bed region files used in computeMatrix (can be left 'NULL' to use regions in .mat.gz file). MUST be same order as .mat.gz regions
#'@param sampleTable data.frame with two columns: sampleName and sampleID. sampleName contains the name that will appear for each sample in the figure. SampleID contains a pattern denoting each unique sample (can be the same as sampleName). NOTE: samples must be the same order as that of bigwigs input into computeMatrix.
#'@param annobed Boolean indicating if bed file should be annotated to include transcript IDs (default=FALSE)
#'@param species Character indicating species if annobed=T. Must be "human" or "mouse". Can set to "other" but TxDb object and annoDb must be provided.
#'@param TxDb TxDb object. Used if species="other", otherwise, "TxDb.Hsapiens.UCSC.hg38.knownGene" or "TxDb.Mmusculus.UCSC.mm10.knownGene" are used.
#'@param db Annotation database (as character) to use if species="other", otherwise "org.Hs.eg.db" or "org.Mm.eg.db" are used - must have corresponding bioconductor package installed.
#'@return list of length 3: covList containing list of coverage matrices, bed containing regions for each row of the coverage matrices, and cols: a numeric vector of length 4 providing the suggested column parameters for plotMetaGene() and subsetquent functions. Use only when computeMatrix was executed with 'scale-regions' and NOT 'reference-point'
#'@export

makeCovList<-function(matfile, header=3, bedfile=NULL, sampleTable, annobed=F, species="human", TxDb=NULL, db=NULL){
  #matfile = .mat.gz file from deeptools computeMatrix output
  #bedfile = .bed file used to define regions for computeMatrix
  #sampleTable = dataframe with two columns: sampleName and sampleID
  x<-read.delim(matfile, skip=header, header=F)
  x[is.na(x)]<-0
  #First six columns are details of the region, remaining columns are for each sample
  perSamp<-(ncol(x)-6)/nrow(sampleTable)
  cnames<-c("seqnames","start","end","width","score","strand")
  for(i in 1:nrow(sampleTable)){
    cnames<-c(cnames, paste(sampleTable$sampleID[i], c(1:perSamp), sep="."))
  }
  colnames(x)<-cnames
  y<-NULL
  if(isFALSE(annobed)){
    if(!is.null(bedfile)){
      y<-read.delim(bedfile, header=T)
    } else {
      y<-x[,c(1:6)]
    }
  }
  if(isTRUE(annobed)){
    TxDb<-NULL
    db<-NULL
    if(species=="human"){
      require(org.Hs.eg.db, quietly = T)
      TxDb<-TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
      db<-"org.Hs.eg.db"
    }
    if(species=="mouse"){
      require(org.Mm.eg.db, quietly = T)
      TxDb<-TxDb.Mmusculus.UCSC.mm10.knownGene::TxDb.Mmusculus.UCSC.mm10.knownGene
      db<-"org.Mm.eg.db"
    }
    if(species=="other"){
      message("Species set to other. Be sure to have TxDb and db set to correct organism.")
    }
    message("Annotating bedfile")
    if(is.null(bedfile)){
      bedfile<-GenomicRanges::makeGRangesFromDataFrame(x[,c(1:6)], keep.extra.columns = T)
    }
    y<-ChIPseeker::annotatePeak(bedfile, tssRegion=c(-1000,1000),
                                TxDb=TxDb,
                                annoDb=db)
    n<-c("seqnames","start","end","width","strand","transcriptId")
    y<-as.data.frame(y@anno)[,which(colnames(as.data.frame(y@anno)) %in% n)]
    colnames(y)[which(colnames(y) %in% "transcriptId")]<-"name"
  }
  x_list<-list()
  for(i in 1:nrow(sampleTable)){
    x_list[[i]]<-x[,grep(sampleTable$sampleID[i], colnames(x))]
    names(x_list)[i]<-sampleTable$sampleName[i]
  }
  half<-ncol(x_list[[1]])/2
  cols<-c(0, half-50, half+50, ncol(x_list[[1]]))
  return(list(cov=x_list, bed=y, cols=cols))
}

#' Provide gene symbols for transcripts in metaList
#'
#' @param metaList object created from makeCovList(). Used by default over 'bed' and 'covList' if provided.
#' @param bed data.frame of regions associated with covList. Leave NULL if providing metaList
#' @param covList list of data.frame coverage matrices. Leave NULL if providing metaList
#' @param cols numeric vector of suggested columns for plotMetaGene() and subsequent functions. leave NULL if providing metaList
#' @param species character indicating species from which to derive gene symbols. At the moment only "hsapiens" or "mmusculus" are accepted.
#' @param rmZeroes Boolean indicating if rows with no coverage are to be removed from coverage matrices. Default=TRUE.
#' @param control character indicating name of control entry from sampleTable given to makeCovList().
#' @return Same list object as returned by makeCovList(), but now gene symbols are annotated in $bed object
#' @export

addSym<-function(metaList=NULL, bed=NULL, covList=NULL, cols=NULL, species="hsapiens", rmZeroes=T, control="scr_total"){
  if(!is.null(metaList)){
    bed<-metaList$bed
    covList<-metaList$cov
    cols<-metaList$cols
  }
  x<-gprofiler2::gconvert(sapply(strsplit(bed$name, split=".", fixed=T),'[[',1),
                          organism=species, target="HGNC", mthreshold=1)

  message("Starting dimensions:")
  print("covList:")
  print(lapply(covList, dim))
  print("bed:")
  print(dim(bed))

  to_remove<-which(x$name=="nan", arr.ind=T) #Remove these rows with no symbol
  if(length(to_remove)>0){
    for(i in 1:length(covList)){
      covList[[i]]<-covList[[i]][-to_remove,]
    }
    x<-x[-to_remove,]
    message("Removed ",length(to_remove)," nan values...")
  }
  #find duplicated symbols in promoters
  dups<-unique(x$name[duplicated(x$name)])
  message("Identified ",length(dups)," duplicate gene symbols.")
  #return(dups)
  #Find the transcript IDs that aren't duplicates to keep:
  keep<-x$input[which(!x$name %in% dups)]
  #Now to find the top PolII coverage for duplicated genes from the scr_total
  if(length(dups)>=1){
    message("Duplicate gene symbols found. Filtering based on coverage in control promoters...")
    pb<-txtProgressBar(min=0, max=length(dups), style=3)
    for(i in 1:length(dups)){
      #Now we need to find the max coverage for each duplicate:
      tmp_IDs<-x$input[which(x$name %in% dups[i])] #Tx IDs of current duplicate
      tryCatch({tmp_cov<-rowSums(covList[[which(names(covList) %in% control)]][which(x$name %in% dups[i], arr.ind=T),], na.rm=T)},
               error=function(e) print(which(x$name %in% dups[i], arr.ind=T)))
      #Get the Tx ID of the largest coverage to add to keep
      keep<-c(keep, tmp_IDs[which(tmp_cov==max(tmp_cov), arr.ind=T)])
      setTxtProgressBar(pb, i)
    }
    message("Keeping ", length(keep)," genes after removing duplicates...")
    to_keep<-which(x$input %in% keep, arr.ind=T)
    for(i in 1:length(covList)){
      covList[[i]]<-covList[[i]][to_keep,]
    }
    x<-x[to_keep,]
  }
  #Check the dimensions:
  dim=nrow(x)
  #Reorder everything so that the rows are all in the same order:
  message("Reordering data frames...")
  for(i in 1:length(covList)){
    covList[[i]]<-covList[[i]][order(x$input),]
  }
  x<-x[order(x$input),]
  message("Printing dimensions:")
  print("covList:")
  print(lapply(covList, dim))
  print("bed:")
  print(dim(x))
  if(isTRUE(rmZeroes)){
    message("removing regions with coverage of 0 in promoter control...")
    tmp_cov<-rowSums(covList[[1]])
    zeroes<-which(tmp_cov==0, arr.ind=T)
    if(length(zeroes)>0){
      message(length(zeroes)," regions with zero coverage found. Removing from analysis...")
      for(i in 1:length(covList)){
        covList[[i]]<-covList[[i]][-zeroes,]
      }
      x<-x[-zeroes,]
    }
  }
  return(list(cov=covList, bed=x, cols=cols))
}

#'Scale the upstream/downstream regions of a coverage matrix to a certain number of columns
#'
#'@param df data.frame object of a single coverage matrix
#'@param cols numeric vector of length 4 indicating landmark columns (start=0, end of upstream region, end of main region, ncol(df))
#'@param b numeric indicating the number of columns the upstream region (cols[1]-cols[2]) should be scaled down to. Default=100.
#'@param a numeric indicating the number of columns the downstream region (cols[3]-cols[4]) should be scaled down to. Default=100.
#'@return Data.frame of a coverage matrix with upstream and downstream regions scaled down (i.e. reduced columns) for clearer plotting
#'@export

scalefun<-function(df, cols=c(0,100,300,700), b=100, a=100){
  #print(dim(df))
  #print(cols)
  #print(b)
  #print(a)
  res<-NULL
  if(diff(cols[c(1,2)])>b){
    #message("scaling before region from ",diff(cols[c(1,2)])," to ", b,"...")
    step<-round(diff(cols[c(1,2)])/b)
    #message("Step size of:",step,".")
    #pb<-txtProgressBar(min=0, max=cols[2], style=3)
    tmp<-NULL
    i<-1
    while(i<cols[2]){
      tmp<-cbind(tmp, rowMeans(df[,c(i:(i+step))]))
      #setTxtProgressBar(pb, i)
      #i<-i+(step+1)
      i<-i+step
    }
    #message("Before: ", ncol(tmp))
    #message("Region:", ncol(df[,c(cols[2]:cols[3])]))
    res<-cbind(tmp, df[,c(cols[2]:cols[3])])
  } else {
    #message("Before region already correct size of ",b," (or smaller)...")
    res<-df[,c(1:cols[2])]
  }
  if(diff(cols[c(3,4)])>a){
    #message("scaling after region from ",diff(cols[c(3,4)])," to ", a,"...")
    step<-round(diff(cols[c(3,4)])/a)
    #message("Step size of:",step,".")
    #pb<-txtProgressBar(min=cols[3], max=cols[4], style=3)
    tmp<-NULL
    i<-cols[3]
    while(i<ncol(df)){
      if((i+step)>ncol(df)){
        #print(i)
        #print(step)
        step<-ncol(df)-i
      }
      tmp<-cbind(tmp, rowMeans(df[,c(i:(i+step))]))
      #setTxtProgressBar(pb, i)
      #i<-i+(step+1)
      #print(i)
      i<-i+step
    }
    #message("After: ", ncol(tmp))
    res<-cbind(res, tmp)
  } else {
    message("Before region already correct size of ",b," (or smaller)...")
    res<-cbind(res,df[,c(cols[3]:cols[4])])
  }
  return(res)
}

#'Apply the scalefun() to a metalist
#'
#'@param metaList output of either makeCovList() or addSym()
#'@param cols numeric vector of length 4 indicating landmark columns (start=0, end of upstream region, end of main region, ncol(df))
#'@param b numeric indicating the number of columns the upstream region (cols[1]-cols[2]) should be scaled down to. Default=100.
#'@param a numeric indicating the number of columns the downstream region (cols[3]-cols[4]) should be scaled down to. Default=100.
#'@return metaList of coverage matrices with upstream and downstream regions scaled down (i.e. reduced columns) for clearer plotting
#'@export

scaleMeta<-function(metaList, cols=c(0,100,300,700), b=100, a=100){
  #Function to scale the regions downstream of TTS to same # of bins as upstream of TSS
  #Essentially make cols 301-700 301-400
  tmp<-list()
  for(i in 1:length(metaList$cov)){
    message("Scaling ",names(metaList$cov)[i],"...")
    tmp[[i]]<-scalefun(metaList$cov[[i]], cols, a=a, b=b)
    names(tmp)[i]<-names(metaList$cov)[i]
  }
  return(list(cov=tmp, bed=metaList$bed))
}

#' Make a custom metagene plot
#'
#' @param metaList metaList object output from makeCovList(), addSym(), or scaleMeta()
#' @param samples character vector indicating the names of the samples in metaList$cov to plot. NULL plots all.
#' @param subgenes Character vector of gene symbols to subset coverage matrices by. Must provide gene symbols to metaList after 'addSym()' has been used.
#' @param title Character indicating the title of the plot
#' @param cols Numeric vector of length 3 or 4. Length 3 indicates computeMatrix run with reference-point, length 4 indicates computeMatrix run with scale-regions. Leave NULL to use suggested cols paramters from metaList. Replace to override manually. This will affect any scaling (if indicated) and will affect the placement of the markers on the x-axis.
#' @param marks Character vector of length (length(cols)) indicating how landmarks should be labeled on the x-axis. Defaults to c("-500","TSS","TTS","+2000")
#' @param xlab Character for x-axis label. Default is "Position (bp)"
#' @param ylab Character for y-axis label. Default is "Occupancy".
#' @param pal Character indicating RColourBrewer palette name for colour schemes or custom character vector containing either colour names, hexadecimal or rgb() values.
#' @param lty Numeric indicating the line type for each sample. Default is NULL (lty=1 for each sample - solid line)
#' @param a numeric indicating the number of columns downstream region (cols[3]-cols[4]) should be scaled down to, if scaling indicated
#' @param b numeric indicating the number of columns upstream region (cols[1]-cols[2]) should be scaled down to, if scaling indicated
#' @param scale Boolean indicating if up/downstream regions should be scaled. Default is FALSE. If set to TRUE, must be on computeMatrix output from scale-regions, and cols must be length 4.
#' @param retDat Indicating which type of data should be returned. Default NULL returns no data. Logical TRUE returns data for a metagene plot. "box" returns data for boxplots of coverage.
#' @return Custom metagene figure
#' @export

plotMetaGene<-function(metaList, samples=NULL, subgenes=NULL, title="", cols=NULL, marks=c("-500","TSS","TTS","+2000"),
                       xlab="Position (bp)", ylab="Occupancy", pal="Dark2", lty=NULL, a=100, b=100, scale=F, retDat=F){
  if(!is.null(samples)){
    message("Subseting to samples:")
    print(samples)
    metaList$cov<-metaList$cov[which(names(metaList$cov) %in% samples)]
  }
  if(!is.null(subgenes)){
    message("Subsetting using subgenes...")
    for(i in 1:length(metaList$cov)){
      metaList$cov[[i]]<-metaList$cov[[i]][which(metaList$bed$name %in% subgenes),]
    }
  }
  if(is.null(cols)){
    cols<-metaList$cols
  }
  tmp<-NULL
  if(isTRUE(scale)){
    sm<-scaleMeta(metaList, cols=cols, b=b, a=a)
    tmp<-lapply(sm$cov, colMeans, na.rm=T)
    cols<-c(0, b, b+diff(cols[c(2,3)]), b+diff(cols[c(2,3)])+a)
    #print(cols)
  } else {
    tmp<-lapply(metaList$cov, colMeans, na.rm=T)
  }
  newrange<-function(x){
    range(x[is.finite(x)])
  }
  yrange<-newrange(unlist(tmp)[-c(length(tmp[[1]]),length(tmp[[1]])*length(tmp))])
  #print(yrange)
  plot(tmp[[1]], pch=16, col="white", xaxt="n", ylab=ylab, xlab=xlab, main=title, ylim=yrange)
  axis(1, at=cols, labels=marks)
  if(is.null(lty)){
    lt=rep(1, length(tmp))
  } else {
    lt<-lty
    if(length(lt)<length(tmp)){
      message("lty length is less than number of samples, using first value for each...")
      lt<-rep(lt[1], length(tmp))
    }
  }
  for(i in 1:length(tmp)){
    lines(tmp[[i]][1:length(tmp[[i]])-1], col=BinfTools::colPal(pal)[i], lwd=2, lty=lt[i])
  }
  if(length(cols)==4){
    abline(v=cols[c(2,3)], lty=2)
  }
  legend("topright", legend=names(metaList$cov), col=BinfTools::colPal(pal)[1:length(tmp)], lty=lt, lwd=2)
  if(!is.null(retDat)){
    if(isTRUE(retDat)){
      return(tmp)
    }
    if(retDat=="box"){
      return(metaList$cov)
    }
  }
}


plotMetaGene2<-function(metaList, samples=NULL, subgenes=NULL, title="", cols=NULL, marks=c("-500","TSS","TTS","+2000"),
                       xlab="Position (bp)", ylab="Occupancy", pal="Dark2", a=100, b=100, scale=F, retDat=F, eb="se"){
  if(!is.null(samples)){
    message("Subseting to samples:")
    print(samples)
    metaList$cov<-metaList$cov[which(names(metaList$cov) %in% samples)]
  }
  if(!is.null(subgenes)){
    message("Subsetting using subgenes...")
    for(i in 1:length(metaList$cov)){
      metaList$cov[[i]]<-metaList$cov[[i]][which(metaList$bed$name %in% subgenes),]
    }
  }
  if(is.null(cols)){
    cols<-metaList$cols
  }
  if(isTRUE(scale)){
    metaList<-scaleMeta(metaList, cols=cols, b=b, a=a)
    cols<-c(0, b, b+diff(cols[c(2,3)]), b+diff(cols[c(2,3)])+a)
  }
  colEb<-function(x, eb){
    y<-c()
    if(eb=="se"){
      #print(ncol(x))
      for(i in 1:ncol(x)){
        y<-c(y, sd(as.numeric(x[,i]), na.rm=T)/sqrt(nrow(x)))
      }
    }
    if(eb=="sd"){
      for(i in 1:ncol(x)){
        y<-c(y, sd(as.numeric(x[,i]), na.rm=T))
      }
    }
    return(y)
  }
  sumDat<-function(x, err){
    #print(length(1:ncol(x)))
    #print(length(colMeans(x)))
    #print(length(colEb(x, err)))
    y<-data.frame(index=1:ncol(x), signal=colMeans(x), eb=colEb(x, err))
    y$ymin<-y$signal-y$eb
    y$ymax<-y$signal+y$eb
    return(y)
  }
  tmp<-lapply(metaList$cov, sumDat, err=eb)
  tmp<-do.call("rbind", tmp)
  tmp$Sample<-sapply(strsplit(rownames(tmp),".",T),'[[',1)

  p<-ggplot2::ggplot(data=tmp, ggplot2::aes(x=index, y=signal, ymin=ymin, ymax=ymax, fill=Sample))+
    ggplot2::geom_ribbon(alpha=0.25) + ggplot2::geom_line(ggplot2::aes(colour=Sample), size=2) + ggplot2::theme_classic()+
    ggplot2::scale_fill_manual(values=BinfTools::colPal(pal)) + ggplot2::scale_colour_manual(values=BinfTools::colPal(pal))+
    ggplot2::scale_x_continuous(breaks=cols, labels=marks) +
    ggplot2::labs(title=title, x="Postition (bp)", y="Occupancy")
  if(length(cols)==4){
    p<-p+ggplot2::geom_vline(xintercept=cols[c(2,3)], linetype="dashed", color="black")
  }
  print(p)
    if(isTRUE(retDat)){
      return(tmp)
    }
    if(retDat=="box"){
      return(metaList$cov)
    }
  }

plotRes<-function(resList, title="test", sub_genes=NULL, col=c("#979796","#001aff"), minp=2e-16, transform=NULL, pc=1, ord=NULL, comp=NULL, yax="Occupancy"){
  require(dplyr)
  if(unique(unlist(lapply(resList, class)))=="data.frame"){
    resList<-lapply(resList, rowMeans, na.rm=T)
  }
  res<-do.call("cbind", resList)
  print(head(res))
  #res<-log2(as.data.frame(tmp[,c(4,2)]))

  if(!is.null(sub_genes)){
    res<-res[which(rownames(res) %in% sub_genes),]
  }

  res<-as.data.frame(res)

  if(!is.null(transform)){
    if(transform=="log2"){
      yax<-paste0("log2(",pc,"+",yax,")")
      res<-log2(pc+res)
    }
    if(transform=="log10"){
      yax<-paste0("log10(",pc,"+",yax,")")
      res<-log10(pc+res)
    }
    if(transform=="zscore"){
      yax<-paste("z-score",yax)
      res<-t(scale(t(res)))
    }
  }

  res<- as.data.frame(res)

  res<- res %>% tidyr::gather(key="Sample", value="Occupancy")

  res<- as.data.frame(res)

  if(!is.null(ord)){
    res <- res %>% dplyr::mutate(Sample=forcats::fct_relevel(Sample, ord))
  }
  #print(head(res))

  to_remove<-c()
  for(i in 1:nrow(res)){
    if(any(is.infinite(res[i,2]))){
      to_remove<-c(to_remove,i)
    }
  }
  if(length(to_remove)>=1){
    message("Removing ", length(to_remove)," infinite values from analysis...")
    res<-res[-to_remove,]
  }

  pwc<- res %>% rstatix::pairwise_t_test(Occupancy ~ Sample, comparisons = comp)
  pwc<- res %>% rstatix::pairwise_wilcox_test(Occupancy ~ Sample, comparisons=comp)
  print(pwc)
  pwc<- pwc %>% rstatix::add_xy_position(x="Sample")

  if(!is.null(minp)){
    if(pwc$p.adj[1]<minp){
      pwc$p.adj[1]<-as.character(minp)
    }
  }

  #print(head(res))
  #return(pwc)
  p<- ggplot2::ggplot(res) +
    ggplot2::geom_boxplot(ggplot2::aes(x=Sample, y=Occupancy, fill=Sample), notch=T) +
    ggplot2::theme_bw() + ggplot2::labs(title=title, y=yax, x="Condition") +
    ggpubr::stat_pvalue_manual(pwc, label="p.adj") + ggplot2::scale_fill_manual(values=col)

  v<-ggpubr::ggviolin(res, x="Sample",y="Occupancy", fill="Sample") +
    ggplot2::geom_boxplot(width=0.1, fill="white") +
    ggpubr::stat_pvalue_manual(pwc, label="p.adj") +
    ggplot2::labs(title=title,
                  y=yax, x="Condition") +
    ggplot2::theme_bw() + ggplot2::scale_fill_manual(values=col)

  print(p)
  #print(v)
}

plotResBar<-function(resList, title="test", sub_genes=NULL, col=c("#979796","#001aff"), minp=2e-16, transform=NULL, pc=1, ord=NULL, comp=NULL, yax="Occupancy", eb="sd"){
  require(dplyr)
  if(unique(unlist(lapply(resList, class)))=="data.frame"){
    resList<-lapply(resList, rowMeans, na.rm=T)
  }
  res<-do.call("cbind", resList)
  #print(head(res))
  #res<-log2(as.data.frame(tmp[,c(4,2)]))

  if(!is.null(sub_genes)){
    res<-res[which(rownames(res) %in% sub_genes),]
  }

  res<-as.data.frame(res)

  if(!is.null(transform)){
    if(transform=="log2"){
      yax<-paste0("log2(",pc,"+",yax,")")
      res<-log2(pc+res)
    }
    if(transform=="log10"){
      yax<-paste0("log10(",pc,"+",yax,")")
      res<-log10(pc+res)
    }
    if(transform=="zscore"){
      yax<-paste("z-score",yax)
      res<-t(scale(t(res)))
    }
  }

  res<- as.data.frame(res)

  res<- res %>% tidyr::gather(key="Sample", value="Occupancy")

  res<- as.data.frame(res)
  res$Sample<-factor(res$Sample)

  if(!is.null(ord)){
    res <- res %>% dplyr::mutate(Sample=forcats::fct_relevel(Sample, ord))
  }

  #print(head(res))

  to_remove<-c()
  for(i in 1:nrow(res)){
    if(any(is.infinite(res[i,2]))){
      to_remove<-c(to_remove,i)
    }
  }
  if(length(to_remove)>=1){
    message("Removing ", length(to_remove)," infinite values from analysis...")
    res<-res[-to_remove,]
  }

  data_sum<-function(data, eb, keepCols=c("Sample"), statCol="Occupancy"){
    #data is the data table and eb is what we want the error bars to be (sd or se)
    summary_func<-function(x, col, eb){
      sum<-NULL
      if(eb=="sd"){ #If we want sd, calcaulate sd
        #message("Calculating mean and standard deviation...")
        sum<-c(mean=mean(as.numeric(x[[col]]), na.rm=TRUE),
               eb=sd(as.numeric(x[[col]]), na.rm=T))
      }
      if(eb=="se"){ #If we want se, calculate se
        #message("Calculating mean and standard error...")
        sum<-c(mean=mean(as.numeric(x[[col]]), na.rm=T),
               eb=(sd(as.numeric(x[[col]]), na.rm=T)/sqrt(length(x[[col]]))))
      }
      if(eb==0){ #We don't want any error bars
        #message("eb=0. Error bars will not be showns...")
        sum<-c(mean=mean(as.numeric(x[[col]]), na.rm=T),
               eb=0)
      }
      return(sum)
    }
    if(eb==0){ #We don't want any error bars
      message("eb=0. Error bars will not be shown...")
    }
    data_sum<-plyr::ddply(data, keepCols, .fun=summary_func, statCol, eb)
    #data_sum<-rename(data_sum, c("mean"="reads"))
    #Set NA values for error bars (eb) to 0
    #data_sum$eb[which(is.na(data_sum$eb))]<-0
    return(data_sum)
  }

  x<-data_sum(res, eb)
  x<-x%>%dplyr::mutate(Sample=forcats::fct_relevel(Sample, names(resList)))
  if(!is.null(ord)){
    x<-x%>%dplyr::mutate(Sample=forcats::fct_relevel(Sample, ord))
  }

  pwc<- res %>% rstatix::pairwise_t_test(Occupancy ~ Sample, comparisons = comp)
  pwc<- res %>% rstatix::pairwise_wilcox_test(Occupancy ~ Sample, comparisons=comp, p.adjust.method = "BH")
  pwc<- pwc %>% rstatix::add_xy_position(x="Sample")
  y.stat<-(max(x$mean)+max(x$eb))*1.1
  pwc$y.position<-y.stat
  print(x)
  print(pwc)

  if(!is.null(minp)){
    if(pwc$p.adj[1]<minp){
      pwc$p.adj[1]<-as.character(minp)
    }
  }

  #print(head(res))
  #return(pwc)
  p<- ggplot2::ggplot(x, ggplot2::aes(x=Sample, y=mean, fill=Sample)) +
    ggplot2::geom_bar(stat="identity", colour="black", position=ggplot2::position_dodge()) +
    ggplot2::geom_errorbar(ggplot2::aes(ymin=mean-eb, ymax=mean+eb), width=.2, position=ggplot2::position_dodge(0.9))+
    ggplot2::theme_bw() + ggplot2::labs(title=title, y=yax, x="Condition") +
    ggpubr::stat_pvalue_manual(pwc, label="p.adj.signif", inherit.aes=F, step.increase = 0.1) + ggplot2::scale_fill_manual(values=colPal(col))


  print(p)
}
kevincjnixon/ChIPATAC documentation built on May 25, 2023, 9:33 p.m.