R/functions.R

Defines functions process mp_ROI return_sums extract_ROI filter_Tukey stat_by_variants plot_snp_density plot_snp_genome

Documented in extract_ROI filter_Tukey mp_ROI plot_snp_density plot_snp_genome process return_sums stat_by_variants

#' function to plot ROI and annotation over genome
#'
#'@description
#' plots ROI and viable regions over chromosome
#' @param list1 list, .bedfiles with ROIs
#' @param list2 list, gff3.gz file with annotation
#' @param list3 list, chromosome name
#' @param repeat_file file with repeats
#' @keywords plot_snp_genome
#' @return jpeg with plot and table with data
#' @export
plot_snp_genome=function(list1,list2,list3,repeat_file){#TODO include repeat_file
  ROI=read.csv(list1,sep = "\t",header=F)
  colnames(ROI)=c("seqnames","starts","ends","names","scores","strands")
  ROI["rlen"]=ROI$ends-ROI$starts
  ROI=subset(ROI,ROI$rlen<=10000)

  ROI.gr=GenomicRanges::GRanges(
    seqnames = S4Vectors::Rle(ROI$seqnames),
    ranges=IRanges::IRanges(start=ROI$starts,end=ROI$ends,names=ROI$names),
    count=ROI$scores,
    strands=ROI$strands
  )
  gff3=read.delim(list2,header = F,comment.char = "#")
  colnames(gff3)=c("seqid","source","type","start","end","score","strand","phase","attributes")
  exon=subset(gff3,gff3$type=="exon" | gff3$type=="three_prime_UTR" | gff3$type=="five_prime_UTR"| gff3$type=="mRNA"| gff3$type=="rRNA")
  exon.gr=GenomicRanges::GRanges(
    seqnames = S4Vectors::Rle(exon$seqid),
    ranges=IRanges::IRanges(start=exon$start,end=exon$end,names=exon$source),
    strands=exon$strand
  )
  exon.gr=GenomicRanges::gaps(exon.gr)

  repeats=read.delim(repeat_file,header = F,comment.char = "#")  #from ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/
  repeats=subset(repeats,repeats$V2==list3)
  repeats.gr=GenomicRanges::GRanges(
    seqnames = S4Vectors::Rle(rep(gff3$seqid[1],times=length(repeats$V2))),
    ranges=IRanges::IRanges(start=repeats$V3,end=repeats$V4)
  )
  repeats.gr=GenomicRanges::gaps(repeats.gr)

  combined.gr=unlist(c(GenomicRanges::GRangesList(repeats.gr),GenomicRanges::GRangesList(exon.gr)))


  atrack=Gviz::AnnotationTrack(ROI.gr, name="SNPs")
  gtrack=Gviz::GenomeAxisTrack()
  hits=GenomicRanges::findOverlaps(IRanges::ranges(ROI.gr),IRanges::ranges(combined.gr),type = "within")
  hltrack=Gviz::HighlightTrack(trackList =atrack,range = IRanges::ranges(combined.gr)[hits@to],chromosome = list3,col="#FF3333",fill="#FF3333")

  out=sprintf("SNP_pos_%s.bed",list3)
  output_df=data.frame(seqnames=GenomeInfoDb::seqnames(ROI.gr[hits@from]),
                       starts=BiocGenerics::start(ROI.gr[hits@from])-1,
                       ends=BiocGenerics::end(ROI.gr[hits@from]),
                       names=names(ROI.gr[hits@from]),
                       count=c(ROI.gr[hits@from]$count),
                       strands=BiocGenerics::strand(ROI.gr[hits@from]))
  write.table(output_df,file = out,quote = F,sep = "\t",row.names = F)


  out=sprintf("SNP_%s.jpeg",list3)
  jpeg(file=out,width = 800, height = 800,quality = 100)
  Gviz::plotTracks(list(gtrack,hltrack))
  dev.off()
}

################################################################################
#' function to plot variants over chromosome
#'
#'@description
#' plots variants from vcf-file over chromosome
#' @param file list, example.vcf-files or example.vcf.gz-files with variants
#' @keywords plot_snp_density
#' @return pdf with plot
#' @export
plot_snp_density =function(file){
  vcf=vcfR::read.vcfR(file,verbose=FALSE)
  x=as.data.frame(vcf@fix)
  rm(vcf)
  x$POS=type.convert(x$POS)
  x=subset(x,x$REF=="A" |x$REF=="C" |x$REF=="T" |x$REF=="G" ) # filter so reference base is only one nt
  g=ggplot2::ggplot(x,ggplot2::aes(POS))+
    ggplot2::geom_histogram(ggplot2::aes(fill=..count..),binwidth = 1000)+
    ggplot2::scale_y_continuous(breaks = seq(0,2500,by=250))+
    ggplot2::scale_x_continuous(breaks = seq(0,max(x$POS),by=100000000))+
    ggplot2::scale_fill_gradient("Count",low="blue", high="red",na.value = "grey")
  out=sprintf("%s_only_SNP.pdf",strsplit(file,"\\.")[[1]][1])
  ggplot2::ggsave(out,plot=g,device = pdf())
  rm(g)
}

################################################################################
#' function to test for significant bins
#'
#'@description
#' runs ANOVA and Tukey on variant data, variants split into bins by positions
#' @param file list, example.vcf-files or example.vcf.gz-files with variants
#' @param binsize integer, number of neighboring variant positions per bin,default 1000
#' @keywords stat_by_variants
#' @return txt with table with data
#' @export
stat_by_variants=function(file,binsize=1000){
  vcf=vcfR::read.vcfR(file,verbose=FALSE)
  x=as.data.frame(vcf@fix)
  rm(vcf)
  x$POS=type.convert(x$POS)
  x=subset(x,x$REF=="A" |x$REF=="C" |x$REF=="T" |x$REF=="G" ) # filter so reference base is only one nt
  counts=as.data.frame(table(x$POS))
  rm(x)
  l=rep(seq(1,length(counts$Freq),by=binsize),each=binsize)[1:length(counts$Freq)]
  bins=tapply(counts$Freq,l,FUN=c)
  rm(l)
  tmp=reshape2::melt(data.frame(do.call(cbind,bins)))
  rm(bins)
  out1=sprintf("%s_aov.txt",strsplit(file,"\\.")[[1]][1])
  out2=sprintf("%s_Tukey.txt",strsplit(file,"\\.")[[1]][1])
  l=rep(seq(1,length(tmp$value),by=250*binsize),each=250*binsize)[1:length(tmp$value)]
  split_df=split(tmp,l)
  rm(l)
  rm(tmp)
  aov.list=lapply(split_df, function(x) aov(formula = value~variable, data = x))
  rm(split_df)
  aov.summary=lapply(aov.list, summary)
  capture.output(aov.summary,file = out1)
  rm(aov.summary)
  Tukeys=lapply(aov.list[1:length(aov.list)], function(x) TukeyHSD(x))
  export=data.frame(Tukeys[[1]]$variable)
  write.table(export,out2,quote = F,sep = "\t")
  rm(aov.list)
  rm(Tukeys)
  rm(export)
}


################################################################################
#' filter TuckeyHSD output
#'
#'@description
#' filters Tukey output to specified p-value
#' @param file list, .txt-files with Tukey output
#' @param pval integer, p-value to filter Tukey output, default 0.05
#' @keywords filter_Tukey
#' @return txt with table with data
#' @export
filter_Tukey=function(file,pval=0.05){
  tmp=read.delim(file,sep = "\t",header=T)
  export=subset(tmp,tmp[,4]<=pval)
  out=sprintf("%s_sign_Tukey.txt",strsplit(file,"_")[[1]][1])
  write.table(export,out,quote = F,col.names = F,row.names = T,sep = "\t")
}



################################################################################
#' extract ROI
#'
#'@description
#' extract ROI based on Tukey results
#' @param file1 list, .txt-files with filtered Tukey output
#' @param file2 list, example.vcf-files or example.vcf.gz-files with variants
#' @param cutoff integer
#' @param binsize integer, number of neighboring variant positions per bin,default 1000, same as stat_by_variants function
#' @keywords extract_ROI
#' @return txt with table with data
#' @export
extract_ROI=function(file1,file2,cutoff=100,binsize=1000){ #file1=Tukey_output file2=vcf-file
  df=read.csv(file1,sep = "\t",header=F)
  colnames(df)=c("pairs","diff","lwr","upr","p")
  df$pairs=as.character(df$pairs)
  tmp=strsplit(df$pairs,"-")
  split.pairs=data.frame(pair_1=unlist(lapply(tmp,function(x) return(x[1]))),pair_2=unlist(lapply(tmp,function(x) return(x[2]))))
  rm(tmp)
  counts=lapply(split(split.pairs,split.pairs$pair_2), function(x) length(x[,1]))
  ROI=data.frame(names=as.character(names(subset(counts,counts>cutoff))),counts=unlist(subset(counts,counts>cutoff)))#welches cutoff
  ROI["help"]=as.integer(unlist(lapply(ROI$names,function(x) gsub("X","",x))))
  ROI=ROI[order(ROI$help),]
  rm(counts)

  vcf=vcfR::read.vcfR(file2,verbose=FALSE)
  x=as.data.frame(vcf@fix)
  rm(vcf)
  x$POS=type.convert(x$POS)
  x=subset(x,x$REF=="A" |x$REF=="C" |x$REF=="T" |x$REF=="G" )
  counts=as.data.frame(table(x$POS))
  counts$Var1=type.convert(counts$Var1)
  l=rep(seq(1,length(counts$Freq),by=binsize),each=binsize)[1:length(counts$Freq)]
  pos=tapply(counts$Var1,l,FUN=c)
  rm(counts)
  test=unlist(lapply(ROI$help,function(x) pos[as.character(x)]),recursive = F)
  ROI["START"]=unlist(lapply(test, min))
  ROI["END"]=unlist(lapply(test, max))
  ROI=subset(ROI,select=-c(help))
  rownames(ROI)=NULL


  gr=GenomicRanges::GRanges(
    seqnames = S4Vectors::Rle(strsplit(file1,"_")[[1]][1],length(ROI$names)),
    ranges=IRanges::IRanges(start=ROI$START,end=ROI$END,names=ROI$names),
    count=ROI$counts
  )
  export <- data.frame(seqnames=GenomeInfoDb::seqnames(gr),
                       starts=BiocGenerics::start(gr)-1,
                       ends=BiocGenerics::end(gr),
                       names=names(gr),
                       scores=S4Vectors::elementMetadata(gr)$count,
                       strands=BiocGenerics::strand(gr))
  out=sprintf("%s_ROI.bed",strsplit(file1,"_")[[1]][1])
  write.table(export, file=out, quote=F, sep="\t", row.names=F, col.names=F)
}

################################################################################
#' sum
#'
#'@description
#' help function for mp_ROI
#' @param l list, .txt-files with filtered Tukey output
#' @param sums numeric vector
#' @keywords return_sums
#' @return txt with table with data
#' @export
return_sums=function(l,sums){
  regions=strsplit(as.character(l),",")
  sum=0
  for(r in regions){
    sum=sum+unlist(sums[r])
  }
  return(sum)
}

################################################################################
#' sum
#'
#'@description
#' sorts ROI by number of SNPs they contain
#' @param file1 list, filtered list of ROI, output from plot_snp_genome
#' @param file2 list, vcf-files with variants
#' @param binsize integer, number of neighboring variant positions per bin,default 1000, same as stat_by_variants function
#' @keywords mp_ROI
#' @return txt with table with data
#' @export
mp_ROI=function(file1,file2,binsize=1000){
  vcf=vcfR::read.vcfR(file2,verbose=FALSE)
  vcf_df=as.data.frame(vcf@fix)
  rm(vcf)
  vcf_df$POS=type.convert(vcf_df$POS)
  vcf_df=subset(vcf_df,vcf_df$REF=="A" |vcf_df$REF=="C" |vcf_df$REF=="T" |vcf_df$REF=="G" )
  counts=as.data.frame(table(vcf_df$POS))
  rm(vcf_df)
  l=rep(seq(1,length(counts$Freq),by=binsize),each=binsize)[1:length(counts$Freq)]
  bins=tapply(counts$Freq,l,FUN=c)
  rm(l)
  rm(counts)
  tmp=reshape2::melt(data.frame(do.call(cbind,bins)))
  rm(bins)
  split.df=split(tmp,tmp$variable)
  sums=lapply(split.df, function(x) sum(x$value))
  rm(split.df)
  df=read.delim(file1,header = T,quote = "",sep = "\t")
  f=lapply(df$names, function(x) return_sums(x,sums))
  df["snp_nr"]=unlist(lapply(f, sum))
  rm(f)


  df=df[order(df$snp_nr,df$count,decreasing = T),]
  candidates=df[1:15,]
  out=sprintf("%s_mp_ROI.bed",strsplit(file2,"_")[[1]][1])
  write.table(candidates,file = out,quote = F,sep = "\t",row.names = F)
}

################################################################################
#' filters mp_ROI for repeats
#'
#'@description
#' filters mp_ROI for repeat regions marked in input file
#' @param file1 list, filtered list of ROI, output from plot_snp_genome
#' @param masked_reg bed file with repeat regions, can be .bed.gz
#' @keywords process
#' @return txt with table with data
#' @export
process=function(file1,masked_reg){
  masked_regions_df=read.delim(masked_reg,header = F,quote = "",sep = "\t")
  colnames(masked_regions_df)=c("name","start","end","type")
  masked_regions.gr=GenomicRanges::GRanges(
    seqnames = S4Vectors::Rle(rep("masked_region",length(masked_regions_df$name))),
    ranges=IRanges::IRanges(start=masked_regions_df$start,end=masked_regions_df$end,names=masked_regions_df$name)
  )


  ROI_df=read.delim(file1,header = T,quote = "",sep = "\t")
  ROI.gr=GenomicRanges::GRanges(
    seqnames = S4Vectors::Rle(ROI_df$seqnames),
    ranges=IRanges::IRanges(start=ROI_df$starts,end=ROI_df$ends,names=ROI_df$names),
    count=ROI_df$count,
    strands=ROI_df$strands,
    snp_nr=ROI_df$snp_nr
  )


  hits=GenomicRanges::findOverlaps(IRanges::ranges(masked_regions.gr),IRanges::ranges(ROI.gr),type = "within")
  non_overlap=GenomicRanges::setdiff(IRanges::ranges(ROI.gr),IRanges::ranges(masked_regions.gr[hits@from]))

  output_df=data.frame(seqnames=rep(unique(GenomicRanges::seqnames(ROI.gr)),length(non_overlap)),
                       starts=BiocGenerics::start(non_overlap)-1,
                       ends=BiocGenerics::end(non_overlap),
                       width=BiocGenerics::width(non_overlap)
  )
  out=sprintf("processed_ROI_%s.bed",unlist(strsplit(file1,"_"))[1])
  write.table(output_df,file = out,quote = F,sep = "\t",row.names = F)

  non_overlap=subset(non_overlap,non_overlap@width>300)
  print(non_overlap)
  if(length(non_overlap)>0){
    plot.gr=GenomicRanges::GRanges(
      seqnames = rep(unlist(strsplit(file1,"_"))[1],length(non_overlap)),
      ranges=non_overlap,
      strands=rep("*",length(non_overlap))
    )

    atrack=Gviz::AnnotationTrack(plot.gr, name="processed ROIs >300bp")
    gtrack=Gviz::GenomeAxisTrack()
    out=sprintf("processed_ROI_%s.jpeg",unlist(strsplit(file1,"_"))[1])
    jpeg(file=out,width = 800, height = 800,quality = 100)
    Gviz::plotTracks(list(gtrack,atrack))
    dev.off()
  }
}
Luk13Mad/sifrabaR documentation built on July 15, 2020, 12:54 a.m.