R/coverage.R

Defines functions calculate_coverage_around_gp calculate_coverage_tfbs get_mean_coverage get_coverage_bed get_mean_and_conf_intervals get_norm_local_coverage calculate_genowide_coverage

Documented in calculate_coverage_around_gp calculate_coverage_tfbs calculate_genowide_coverage get_coverage_bed get_mean_and_conf_intervals get_mean_coverage get_norm_local_coverage

#' Calculate Genome-wide coverage for a BAM file
#'
#' This function takes a BAM file and generates a TXT file with its genome-wide coverage
#'
#' @param bin_path Path to binary. Default tools/bedtools2/bin/bedtools
#' @param bam Path to BAM file.
#' @param verbose Enables progress messages. Default FALSE.
#' @param output_dir Directory to output results. If not provided then outputs in current directory
#' @export

calculate_genowide_coverage=function(bin_path="tools/bedtools2/bin/bedtools",bam="",verbose=FALSE,output_dir=""){

  sep="/"
  sample_name=ULPwgs::get_sample_name(bam)

  if(output_dir==""){
    sep=""
  }

  out_file=paste0(output_dir,sep,sample_name,"_GENOME_COVERAGE")

  if (!dir.exists(out_file)){
      dir.create(out_file)
    }

  out_file=paste0(out_file,"/",sample_name,"_genome_coverage.txt")

  if(verbose){
    print(paste(bin_path,"genomecov -ibam",bam,">",out_file))
  }
  system(paste(bin_path,"genomecov -ibam",bam,">",out_file))

}

#' Get normalized local coverage from segmentation data (CNA) for a single base position
#'
#' This function takes normalized data for local coverage as input,
#' as well as a valid genomic position for a single base and returns a log2 corrected value for said position
#'
#' @param pos Genomic position
#' @param chr Chromosome
#' @param norm_log2 DATA.FRAME with normalized local coverage
#' @return A double with the local coverage
#' @export


get_norm_local_coverage=function(pos="",chr="",norm_log2=""){
   norm_log2$log2=ifelse(is.na(norm_log2$log2),0,norm_log2$log2)
   norm=as.numeric(norm_log2[norm_log2$chr==chr & pos>norm_log2$start & pos<norm_log2$end,]$log2)
   if(!length(norm)==0){
     norm=2**norm
     if (norm==0){
       return(0.001)
    }
     else{
       return(norm)
     }
   }else{
     return (1)
   }
}

#' Calculate mean and confidence intervals for positions relative to TFBSs
#'
#' This function takes a LIST of DATA.FRAMES as input(each DATA.FRAME for each TFBS), and returns a DATA.FRAME
#' with the mean and confidence intervals for the positions relative to them
#'
#' @param data LIST of DATA.FRAMES with the data
#' @param CI Confidence Interval
#' @return A DATA.FRAME with the mean coverage values and CI
#' @export

get_mean_and_conf_intervals=function(cov_data="",CI=0.95){
  FUN=function(x,cov_data){
    dat=cov_data[[x]]
    return(dat[,"norm_cor_cov",drop=FALSE])
  }
  norm_cor_cov_list=lapply(seq(1:length(cov_data)),FUN=FUN,cov_data=cov_data)
  norm_cor_cov_df=suppressMessages(norm_cor_cov_list %>% dplyr::bind_cols())
  norm_cor_cov_means=rowMeans(norm_cor_cov_df,na.rm = TRUE)
  numb_analyz_tfbs=norm_cor_cov_df %>% is.na() %>% `!` %>% rowSums()
  if (all(numb_analyz_tfbs>3)){
      norm_cor_cov_mes=apply(norm_cor_cov_df,1,FUN=function(x){qt(CI,sum(!is.na(x))-1)*sd(x[!is.na(x)])/sqrt(sum(!is.na(x)))})
        results=data.frame(POSITION_RELATIVE_TO_TFBS=cov_data[[1]]$pos_relative_to_tfbs,MEAN_DEPTH=norm_cor_cov_means,CI95_LOWER_BOUND=norm_cor_cov_means-norm_cor_cov_mes,CI95_UPPER_BOUND=norm_cor_cov_means+norm_cor_cov_mes,TFBS_ANALYZED=numb_analyz_tfbs)
  }else{
  results=data.frame(POSITION_RELATIVE_TO_TFBS=cov_data[[1]]$pos_relative_to_tfbs,MEAN_DEPTH=norm_cor_cov_means,TFBS_ANALYZED=numb_analyz_tfbs)
  }
  return (results)
}



#' Get coverage for target regions
#'
#' This function takes a  BAM and a BED file with target regions and estimate per base coverage for them
#' using samtools depth
#'
#' @param bin_path Path to samtools. Default tools/samtools/samtools
#' @param bam Path to BAM file
#' @param bed Path to BED file
#' @param mapq Minimum mapping quality
#' @param threads Number of threads to use. Default 3
#' @param output_dir Directory to output results
#' @return A double with the mean genome coverage
#' @export

get_coverage_bed=function(bin_path="tools/samtools/samtools",bam="",bed="",mapq=0,threads=3,output_dir=""){
  region_data=read.table(bed,comment.char="$")
  region_data=region_data[,c(1,2,3,4)]
  colnames(region_data)=c("chr","start","end")
  cov_results=parallel::mclapply(1:nrow(region_data),FUN=function(x){
    cov_data=read.csv(text=system(paste(bin_path,"depth -a -Q",mapq, "-r",paste0(region_data[x,1],":",region_data[x,2],"-",region_data[x,3]),bam),intern=TRUE),header=FALSE,sep="\t",stringsAsFactors=FALSE,colClasses=c("character"));
    cov_data$Original=paste0(region_data[x,1],":",region_data[x,2],"-",region_data[x,3]);
    cov_data$Id=region_data[x,4];
    cov_data$Position=as.integer((as.numeric(region_data[x,2])+as.numeric(region_data[x,3]))/2);
    cov_data$Relative_Position=cov_data$Position-as.numeric(cov_data$V3);
    return(cov_data)
  },mc.cores=threads)
  cov_results=dplyr::bind_rows(cov_results)
  return(cov_results)
}


#' Calculate mean genome coverage
#'
#' This function takes a TXT file with genome-wide ocverage as input and returns the mean genome coverage
#'
#' @param file Path to file with genome-wide coverage
#' @param output_dir Directory to output results
#' @param region Region for which to get mean coverage. Default genome
#' @param sample_name Sample name
#' @param save Save as TXT. Default TRUE
#' @return A double with the mean genome coverage
#' @export


get_mean_coverage=function(file="",output_dir="",region="genome",sample_name="",save=TRUE){


  data=read.table(file)
  data=data %>% dplyr::mutate(tot_bases=(as.numeric(V2)*as.numeric(V3))) %>% dplyr::group_by(V1) %>% dplyr::summarise(cnt=dplyr::n(),avg_cov=sum(tot_bases)/max(V4), .groups = 'drop') %>% as.data.frame()

  if (save){
    sep="/"


    if(output_dir==""){
      sep=""
    }
    out_file=paste0(output_dir,sep,sample_name,"_mean_genome_coverage.txt")
    write.table(data.frame(data$V1,data$avg_cov),file=out_file,sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE)

  }

  return(data[data$V1==region,]$avg_cov)
}

#' Calculate Mean Coverage Depth around TFBS
#'
#' This function takes a DATA.FRAME with TFBS positions, a BAM file with the sequence to analyze, a mean genome wide coverage,
#' local coverage values for CNA, and outputs the corrected depth coverage values around TFBS and generates
#' a TSS file with them. For better understanding check: https://www.nature.com/articles/s41467-019-12714-4
#'
#'
#' @param bin_path Path to binary. Default tools/samtools/samtools
#' @param ref_data Data from BED file as Data.frame
#' @param bam Path to BAM file.
#' @param sample_name Sample name
#' @param tf_name Transcription Factor name
#' @param mean_cov Mean genome wide coverage.
#' @param norm_log2 Data.frame with normalized local coverage
#' @param tfbs_start Number of bases to analyze forward from TFBS central point. Default 1000
#' @param tfbs_end Number of bases to analyze  backward from TFBS central point. Default 1000
#' @param cov_limit Max base depth. Default 1000
#' @param max_regions Max number of TFBS to analyze. Default 100000
#' @param mapq Min quality of mapping reads. Default 0
#' @param threads Number of threads. Default 1
#' @param output_dir Directory to output results. If not provided then outputs in current directory
#' @return A list of DATA.FRAME with the coverage per base of each TFBS
#' @export
#' @import pbapply

calculate_coverage_tfbs=function(bin_path="tools/samtools/samtools",ref_data="",bam="",sample_name="",tf_name="",mean_cov="",norm_log2="",tfbs_start=1000,tfbs_end=1000,cov_limit=1000,mapq=0,threads=1,output_dir=""){

  ref_data=data.frame(chr=ref_data[,1],start=ref_data[,2],end=ref_data[,3],strand=ref_data[,which(ref_data[1,]=="+" | ref_data[1,]=="-")],pos=as.integer((ref_data[,3]+ref_data[,2])/2))
  tfbs_to_analyze=ref_data

  ## Filter for overlapping TFBS or duplicated TFBS to save time
  tfbs_to_analyze=tfbs_to_analyze %>% dplyr::filter(!grepl("_",chr),(pos-start)<1) %>% dplyr::distinct(chr, pos, .keep_all = TRUE)

  FUN=function(x,bin_path,norm_log2,start,end,mean_cov,bam,cov_limit,mapq){
    tfbs_data=t(x)

    ## IF running sambamba
    ## cov_data=read.csv(text=system(paste(bin_path,"depth base -t 3 -z -L ",paste0(tfbs_data$chr,":",as.numeric(tfbs_data$pos)-start,"-",as.numeric(tfbs_data$pos)+end),bam),intern=TRUE),header=TRUE,sep="\t")
    ## cov_data=cbind(cov_data[,1:3],strand=tfbs_data$strand)
    ## names(cov_data)=c("chr","pos","cov","strand")

    cov_data=read.csv(text=system(paste(bin_path,"depth -a -Q",mapq, "-r",paste0(tfbs_data[1],":",as.numeric(tfbs_data[5])-start,"-",as.numeric(tfbs_data[5])+end),bam),intern=TRUE),header=FALSE,sep="\t")
    colnames(cov_data)=c("chr","pos","cov")
    cov_data$chr=as.character(cov_data$chr)
    norm_cov=get_norm_local_coverage(pos=as.numeric(tfbs_data[5]),chr=tfbs_data[1],norm_log2=norm_log2)

    if(!(nrow(cov_data)==(start+end+1))){
      fix=(as.numeric(tfbs_data[5])-start):(as.numeric(tfbs_data[5])+end)
      fix=data.frame(chr=tfbs_data[1],pos=fix[!fix==cov_data$pos])
      cov_data=dplyr::bind_rows(cov_data,fix) %>% dplyr::arrange(pos)
    }

    cov_data=cbind(cov_data,strand=tfbs_data[4])
    cov_data=cov_data %>% dplyr::mutate(cor_cov=as.numeric(cov)/as.numeric(mean_cov))  %>% dplyr::mutate(norm_cor_cov=cor_cov/as.numeric(norm_cov),pos_relative_to_tfbs=dplyr::if_else(strand=="+",pos-as.numeric(tfbs_data[5]),-(pos-as.numeric(tfbs_data[5])))) %>% dplyr::arrange(pos_relative_to_tfbs)

    return(cov_data)
  }
  cl=parallel::makeCluster(threads)
  coverage_list=pbapply(X=tfbs_to_analyze,1,FUN=FUN,bin_path=bin_path,norm_log2=norm_log2,start=tfbs_start,end=tfbs_end,mean_cov=mean_cov,bam=bam,cov_limit=cov_limit,mapq=mapq,cl=cl)
  ## coverage_list=pbapply::pblapply(seq(1,nrow(tfbs_to_analyze),1),FUN=FUN,tfbs_data=tfbs_to_analyze,bin_path=bin_path,norm_log2=norm_log2,start=tfbs_start,end=tfbs_end,mean_cov=mean_cov,bam=bam,cov_limit=cov_limit,mapq=mapq,cl=cl)

  on.exit(parallel::stopCluster(cl))
  print(paste("TFBS analyzed:",nrow(tfbs_to_analyze)))
  print(paste("TFBS skipped:",nrow(ref_data)-nrow(tfbs_to_analyze)))


  return(coverage_list)
  }

#' Calculate Mean Coverage Depth around Genomic Position
#'
#' This function takes chromosome location, as well as strand information, and generates a coverage report around
#' for it. This coverage can be normalized if local segmentation values for the region are given and further corrected by the
#' the local coverage in the BAM
#'
#'
#' @param bin_path Path to binary. Default tools/samtools/samtools
#' @param chr Chromosome
#' @param bam Path to BAM file.
#' @param position Genomic position within the chromosome
#' @param strand Strand direction. If not available asumes + strand.
#' @param norm_log2 Local segment log2
#' @param start Downstream distance from genomic position to estimate coverage
#' @param end Upstream distance from genomic position to estimate coverage
#' @param mean_cov Mean genome wide coverage.
#' @param cov_limit Max coverage limit for position to be taken into account
#' @param mapq Min quality of mapping reads. Default 0
#' @return A DATA.FRAME with per base coverage por each genomic position

calculate_coverage_around_gp=function(bin_path="tools/samtools/samtools",chr="",position="",strand="",bam="",norm_log2=1,start=1000,end=1000,mean_cov=1,cov_limit=NA,mapq=0){
  cov_data=read.csv(text=system(paste(bin_path,"depth -a -Q",mapq, "-r",paste0(chr,":",as.numeric(position)-start,"-",as.numeric(position)+end),bam),intern=TRUE),header=FALSE,sep="\t",stringsAsFactors=FALSE)
  colnames(cov_data)=c("chr","pos","cov")
  cov_data$strand=strand
  cov_data=cov_data %>% dplyr::mutate(cor_cov=as.numeric(cov)/sum(as.numeric(cov)))  %>% dplyr::mutate(norm_cor_cov=cor_cov/as.numeric(norm_log2),pos_relative_to_tfbs=dplyr::if_else(strand=="+"|strand=="",pos-as.numeric(position),-(pos-as.numeric(position)))) %>% dplyr::arrange(pos_relative_to_tfbs)
  return(cov_data)
}
TearsWillFall/tfProfiling documentation built on Aug. 7, 2021, 9:40 a.m.