R/buildInput.R
In uncoverappLib: Interactive graphical application for clinical assessment of sequence coverage at the base-pair level

Documented in buildInput

#'Build input file
#'
#' @description
#' Function to build input file for unCOVERAPP when the number
#' of genes to analyze is > 50.
#'
#'
#' @return A file.bed containing tab-separated specifications of
#' genomic coordinates (chromosome, start position, end position),
#' the coverage value, and the reference:alternate allele counts for each position.
#'
#' @param geneList a text file, named with .txt extension,
#' containing HGNC official gene name(s) one per row.
#' @param genome (char) reference genome, hg19 or hg38
#' @param type_bam (char) chromosome notation of their BAM file(s). Use "number"
#' or "chr".  In the BAM file, the number option refers to 1, 2, ..., X,.M
#' chromosome notation, while the chr option refers to chr1, chr2, ... chrX,
#' chrM chromosome notation.
#' @param bamList a text file, named with .list extension,
#' containing the absolute paths to BAM file(s) one per row.
#' @param outDir (char) directory where pileup output will be stored
#' @param MAPQ.min (integer) minimum MAPQ value for an alignment
#' to be included in output file.
#' @param base.quality (integer) minimum QUAL value for each
#' nucleotide in an alignment.
#' @export
#' @import rlist
#' @import OrganismDbi
#' @import Rsamtools
#' @import org.Hs.eg.db
#' @import TxDb.Hsapiens.UCSC.hg19.knownGene
#' @import TxDb.Hsapiens.UCSC.hg38.knownGene
#' @importFrom utils write.table
#' @import GenomicRanges
#' @examples
#' gene.list<- system.file("extdata", "mygene.txt", package = "uncoverappLib")
#'
#' bam_example <- system.file("extdata", "example_POLG.bam",
#' package = "uncoverappLib")
#' cat(bam_example, file = "bam.list", sep = "\n")
#' temp_dir=tempdir()
#' buildInput(geneList= gene.list, genome= "hg19", type_bam= "chr",
#' bamList= "bam.list", outDir= temp_dir)
#'


buildInput<- function(geneList,genome,type_bam,bamList,outDir,
                      MAPQ.min=1,base.quality=1) {


    ##### Check input
    if (missing(geneList)) stop("geneList must be supplied.\n")
    if (missing(bamList)) stop("bamList must be supplied.\n")
    if (MAPQ.min <= 0 )
    stop("MAPQ.min must be greater than 0")
    if (base.quality <= 0 )
    stop("base.quality must be greater than 0")

    ##load packages

    ###read gene(s) list and retrieve genomic coordinates in grange format

    message("Reading gene name started at:")
    message(Sys.time())
    gene.List=scan(geneList, character(), quote = "")
    my_gene_name=OrganismDbi::select(org.Hs.eg.db, key= gene.List,
                                     columns="ENTREZID", keytype="ALIAS")

    ###check control: stop when gene names in list are incorrect

    for (i in my_gene_name$ENTREZID){
     if (is.na(i)){
       s=print(subset(my_gene_name$ALIAS,is.na(my_gene_name$ENTREZID)))
       utils::write.table(s, file='./preprocessing_log.txt', quote= FALSE,
                  row.names = FALSE, col.names = FALSE)
      stop('ERROR: unrecognized GENE NAME. Please choose a
         correct HGNC official gene names.')}
    else {return= NULL}
  }
  ID=my_gene_name$ENTREZID
  if (genome == "hg19"){
    all_gene= TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene}else{
      all_gene= TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene}

  b=c()
  for (i in ID){
    if( ! is.element(i, keys(all_gene, "GENEID")))
      b[i]= i
  }
  no_entrID= subset(my_gene_name, my_gene_name$ENTREZID %in% b)

  ###chech controls : remove genes name not recognize from
  #TxDb.Hsapiens.UCSC.hg19.knownGene
  message("Control HGNC gene name started at:")
  message(Sys.time())

  if (nrow(no_entrID)!=0){
    utils::write.table(no_entrID, file='./preprocessing_log1.txt',
                quote= FALSE, row.names = FALSE, col.names = FALSE)
    stop('ERROR: unrecognized GENE NAME. Please remove genes
       stored in preprocessing_log1.txt')}


  pre= data.frame()
  for (i in ID){
    txid <- OrganismDbi::select(all_gene, i, "TXNAME", "GENEID")[["TXNAME"]]
    cds <-OrganismDbi::exonsBy(all_gene, by="tx", use.names=TRUE)
    coordinate= as.data.frame(cds[names(cds) %in% txid])
    pre= rbind(coordinate,pre)
    pre$start= pre$start -10
    pre$end= pre$end +10
  }


  for_bed=data.frame()
  #for (row in 1:nrow(pre)){
  for (row in seq_len(nrow(pre))){
    sel_cc= data.frame(as.character(pre$seqnames[row]),
                       pre$start[row], pre$end[row], stringsAsFactors = FALSE)
    for_bed= rbind(for_bed, sel_cc)
  }
  colnames(for_bed)=c('chr', 'start', 'end')

  for_bed= unique(for_bed)



  if(type_bam == "number"){
    for_bed$chr= gsub("^.{0,3}", "", for_bed$chr, perl =TRUE)}



  for_grange=GenomicRanges::makeGRangesFromDataFrame(for_bed, keep.extra.columns = TRUE)

  ##set parameters for bam

  param <- Rsamtools::ScanBamParam(which= for_grange)

  p_param <- Rsamtools::PileupParam(distinguish_nucleotides=TRUE,
                         distinguish_strands=FALSE,
                         min_mapq=MAPQ.min,
                         min_base_quality=base.quality,
                         min_nucleotide_depth=0,
                         max_depth=15000)
  list= scan(bamList, character(), quote= "")
  df= list()
  for (i in list){
    pileup_df= Rsamtools::pileup(list, scanBamParam=param, pileupParam=p_param)

    df=rlist::list.append(df, pileup_df)
  }
  #remove which_label column
  lst1 <- lapply(df, function(x) transform(x[,-5]))
  #check duplicate column
  lst2<- lapply(lst1, function(x) transform(x[!duplicated(x),]))

  ##function to rearrange each df in list in order to count the total coverage
  #in each position and the count of each read nucletide

  all.col= c("seqnames","pos","nucleotide","count"  )
  for (i in seq_along(lst2)){
    colnames(lst2[[i]]) <- all.col
  }
  new= "pos"
  riarrange.df = function(list_df){
    list_df %>%
      dplyr::mutate(end= pos) %>%
      dplyr::group_by(seqnames,pos,end,nucleotide) %>%
      dplyr::summarise(count=sum(count)) %>%
      dplyr::arrange(nucleotide) %>%
      dplyr::summarise(value= paste(sum(count)),
                       counts=paste(nucleotide, count, sep=':',collapse=';'))
  }


  lst3<- lapply(lst2, riarrange.df)

  #pp=Reduce(function(...) merge(...,by= c("seqnames", "pos", "end"), all=TRUE), lst3)
  pp = Reduce(function(x,y) merge(x,y,by=c("seqnames", "pos", "end"),all=TRUE) ,lst3)


  pp[is.na(pp)] <- 0
  pp=as.data.frame(pp)
  if(type_bam == "number"){
    for (i in pp[1]){
      Chromosome<-paste("chr", i, sep="")
      pp<- cbind(Chromosome, pp)
      pp[,2]<-NULL
    }
  }

  ###write file

  dir_users=sprintf("%s/output",outDir)
  myDir <- dir_users
  if (file.exists(myDir)) unlink(myDir,recursive=TRUE)
  dir.create(myDir)
  message("Write output file in directory ", myDir, format(Sys.time(), "%a_%b_%d_%Y"),'.bed')
  message(Sys.time())

  #utils::write.table(x=pp, file =paste0(myDir,format(Sys.time(), "%a_%b_%d_%Y"),'.bed'),
   #           quote=FALSE, sep="\t", eol = "\n", row.names = FALSE,
    #          col.names = FALSE)
  utils::write.table(x=pp, file =file.path(myDir, paste0(format(Sys.time(), "%a_%b_%d_%Y"),'.bed')),
                     quote=FALSE, sep="\t", eol = "\n", row.names = FALSE,
                     col.names = FALSE)

}

Any scripts or data that you put into this service are public.

uncoverappLib documentation built on Nov. 8, 2020, 5:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

uncoverappLib
Interactive graphical application for clinical assessment of sequence coverage at the base-pair level

R/buildInput.R
In uncoverappLib: Interactive graphical application for clinical assessment of sequence coverage at the base-pair level

Defines functions buildInput

Documented in buildInput

Try the uncoverappLib package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

uncoverappLib Interactive graphical application for clinical assessment of sequence coverage at the base-pair level

R/buildInput.R In uncoverappLib: Interactive graphical application for clinical assessment of sequence coverage at the base-pair level

Defines functions buildInput

Documented in buildInput

Try the uncoverappLib package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

uncoverappLib
Interactive graphical application for clinical assessment of sequence coverage at the base-pair level

R/buildInput.R
In uncoverappLib: Interactive graphical application for clinical assessment of sequence coverage at the base-pair level