#' function to plot ROI and annotation over genome
#'
#'@description
#' plots ROI and viable regions over chromosome
#' @param list1 list, .bedfiles with ROIs
#' @param list2 list, gff3.gz file with annotation
#' @param list3 list, chromosome name
#' @param repeat_file file with repeats
#' @keywords plot_snp_genome
#' @return jpeg with plot and table with data
#' @export
plot_snp_genome=function(list1,list2,list3,repeat_file){#TODO include repeat_file
ROI=read.csv(list1,sep = "\t",header=F)
colnames(ROI)=c("seqnames","starts","ends","names","scores","strands")
ROI["rlen"]=ROI$ends-ROI$starts
ROI=subset(ROI,ROI$rlen<=10000)
ROI.gr=GenomicRanges::GRanges(
seqnames = S4Vectors::Rle(ROI$seqnames),
ranges=IRanges::IRanges(start=ROI$starts,end=ROI$ends,names=ROI$names),
count=ROI$scores,
strands=ROI$strands
)
gff3=read.delim(list2,header = F,comment.char = "#")
colnames(gff3)=c("seqid","source","type","start","end","score","strand","phase","attributes")
exon=subset(gff3,gff3$type=="exon" | gff3$type=="three_prime_UTR" | gff3$type=="five_prime_UTR"| gff3$type=="mRNA"| gff3$type=="rRNA")
exon.gr=GenomicRanges::GRanges(
seqnames = S4Vectors::Rle(exon$seqid),
ranges=IRanges::IRanges(start=exon$start,end=exon$end,names=exon$source),
strands=exon$strand
)
exon.gr=GenomicRanges::gaps(exon.gr)
repeats=read.delim(repeat_file,header = F,comment.char = "#") #from ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/
repeats=subset(repeats,repeats$V2==list3)
repeats.gr=GenomicRanges::GRanges(
seqnames = S4Vectors::Rle(rep(gff3$seqid[1],times=length(repeats$V2))),
ranges=IRanges::IRanges(start=repeats$V3,end=repeats$V4)
)
repeats.gr=GenomicRanges::gaps(repeats.gr)
combined.gr=unlist(c(GenomicRanges::GRangesList(repeats.gr),GenomicRanges::GRangesList(exon.gr)))
atrack=Gviz::AnnotationTrack(ROI.gr, name="SNPs")
gtrack=Gviz::GenomeAxisTrack()
hits=GenomicRanges::findOverlaps(IRanges::ranges(ROI.gr),IRanges::ranges(combined.gr),type = "within")
hltrack=Gviz::HighlightTrack(trackList =atrack,range = IRanges::ranges(combined.gr)[hits@to],chromosome = list3,col="#FF3333",fill="#FF3333")
out=sprintf("SNP_pos_%s.bed",list3)
output_df=data.frame(seqnames=GenomeInfoDb::seqnames(ROI.gr[hits@from]),
starts=BiocGenerics::start(ROI.gr[hits@from])-1,
ends=BiocGenerics::end(ROI.gr[hits@from]),
names=names(ROI.gr[hits@from]),
count=c(ROI.gr[hits@from]$count),
strands=BiocGenerics::strand(ROI.gr[hits@from]))
write.table(output_df,file = out,quote = F,sep = "\t",row.names = F)
out=sprintf("SNP_%s.jpeg",list3)
jpeg(file=out,width = 800, height = 800,quality = 100)
Gviz::plotTracks(list(gtrack,hltrack))
dev.off()
}
################################################################################
#' function to plot variants over chromosome
#'
#'@description
#' plots variants from vcf-file over chromosome
#' @param file list, example.vcf-files or example.vcf.gz-files with variants
#' @keywords plot_snp_density
#' @return pdf with plot
#' @export
plot_snp_density =function(file){
vcf=vcfR::read.vcfR(file,verbose=FALSE)
x=as.data.frame(vcf@fix)
rm(vcf)
x$POS=type.convert(x$POS)
x=subset(x,x$REF=="A" |x$REF=="C" |x$REF=="T" |x$REF=="G" ) # filter so reference base is only one nt
g=ggplot2::ggplot(x,ggplot2::aes(POS))+
ggplot2::geom_histogram(ggplot2::aes(fill=..count..),binwidth = 1000)+
ggplot2::scale_y_continuous(breaks = seq(0,2500,by=250))+
ggplot2::scale_x_continuous(breaks = seq(0,max(x$POS),by=100000000))+
ggplot2::scale_fill_gradient("Count",low="blue", high="red",na.value = "grey")
out=sprintf("%s_only_SNP.pdf",strsplit(file,"\\.")[[1]][1])
ggplot2::ggsave(out,plot=g,device = pdf())
rm(g)
}
################################################################################
#' function to test for significant bins
#'
#'@description
#' runs ANOVA and Tukey on variant data, variants split into bins by positions
#' @param file list, example.vcf-files or example.vcf.gz-files with variants
#' @param binsize integer, number of neighboring variant positions per bin,default 1000
#' @keywords stat_by_variants
#' @return txt with table with data
#' @export
stat_by_variants=function(file,binsize=1000){
vcf=vcfR::read.vcfR(file,verbose=FALSE)
x=as.data.frame(vcf@fix)
rm(vcf)
x$POS=type.convert(x$POS)
x=subset(x,x$REF=="A" |x$REF=="C" |x$REF=="T" |x$REF=="G" ) # filter so reference base is only one nt
counts=as.data.frame(table(x$POS))
rm(x)
l=rep(seq(1,length(counts$Freq),by=binsize),each=binsize)[1:length(counts$Freq)]
bins=tapply(counts$Freq,l,FUN=c)
rm(l)
tmp=reshape2::melt(data.frame(do.call(cbind,bins)))
rm(bins)
out1=sprintf("%s_aov.txt",strsplit(file,"\\.")[[1]][1])
out2=sprintf("%s_Tukey.txt",strsplit(file,"\\.")[[1]][1])
l=rep(seq(1,length(tmp$value),by=250*binsize),each=250*binsize)[1:length(tmp$value)]
split_df=split(tmp,l)
rm(l)
rm(tmp)
aov.list=lapply(split_df, function(x) aov(formula = value~variable, data = x))
rm(split_df)
aov.summary=lapply(aov.list, summary)
capture.output(aov.summary,file = out1)
rm(aov.summary)
Tukeys=lapply(aov.list[1:length(aov.list)], function(x) TukeyHSD(x))
export=data.frame(Tukeys[[1]]$variable)
write.table(export,out2,quote = F,sep = "\t")
rm(aov.list)
rm(Tukeys)
rm(export)
}
################################################################################
#' filter TuckeyHSD output
#'
#'@description
#' filters Tukey output to specified p-value
#' @param file list, .txt-files with Tukey output
#' @param pval integer, p-value to filter Tukey output, default 0.05
#' @keywords filter_Tukey
#' @return txt with table with data
#' @export
filter_Tukey=function(file,pval=0.05){
tmp=read.delim(file,sep = "\t",header=T)
export=subset(tmp,tmp[,4]<=pval)
out=sprintf("%s_sign_Tukey.txt",strsplit(file,"_")[[1]][1])
write.table(export,out,quote = F,col.names = F,row.names = T,sep = "\t")
}
################################################################################
#' extract ROI
#'
#'@description
#' extract ROI based on Tukey results
#' @param file1 list, .txt-files with filtered Tukey output
#' @param file2 list, example.vcf-files or example.vcf.gz-files with variants
#' @param cutoff integer
#' @param binsize integer, number of neighboring variant positions per bin,default 1000, same as stat_by_variants function
#' @keywords extract_ROI
#' @return txt with table with data
#' @export
extract_ROI=function(file1,file2,cutoff=100,binsize=1000){ #file1=Tukey_output file2=vcf-file
df=read.csv(file1,sep = "\t",header=F)
colnames(df)=c("pairs","diff","lwr","upr","p")
df$pairs=as.character(df$pairs)
tmp=strsplit(df$pairs,"-")
split.pairs=data.frame(pair_1=unlist(lapply(tmp,function(x) return(x[1]))),pair_2=unlist(lapply(tmp,function(x) return(x[2]))))
rm(tmp)
counts=lapply(split(split.pairs,split.pairs$pair_2), function(x) length(x[,1]))
ROI=data.frame(names=as.character(names(subset(counts,counts>cutoff))),counts=unlist(subset(counts,counts>cutoff)))#welches cutoff
ROI["help"]=as.integer(unlist(lapply(ROI$names,function(x) gsub("X","",x))))
ROI=ROI[order(ROI$help),]
rm(counts)
vcf=vcfR::read.vcfR(file2,verbose=FALSE)
x=as.data.frame(vcf@fix)
rm(vcf)
x$POS=type.convert(x$POS)
x=subset(x,x$REF=="A" |x$REF=="C" |x$REF=="T" |x$REF=="G" )
counts=as.data.frame(table(x$POS))
counts$Var1=type.convert(counts$Var1)
l=rep(seq(1,length(counts$Freq),by=binsize),each=binsize)[1:length(counts$Freq)]
pos=tapply(counts$Var1,l,FUN=c)
rm(counts)
test=unlist(lapply(ROI$help,function(x) pos[as.character(x)]),recursive = F)
ROI["START"]=unlist(lapply(test, min))
ROI["END"]=unlist(lapply(test, max))
ROI=subset(ROI,select=-c(help))
rownames(ROI)=NULL
gr=GenomicRanges::GRanges(
seqnames = S4Vectors::Rle(strsplit(file1,"_")[[1]][1],length(ROI$names)),
ranges=IRanges::IRanges(start=ROI$START,end=ROI$END,names=ROI$names),
count=ROI$counts
)
export <- data.frame(seqnames=GenomeInfoDb::seqnames(gr),
starts=BiocGenerics::start(gr)-1,
ends=BiocGenerics::end(gr),
names=names(gr),
scores=S4Vectors::elementMetadata(gr)$count,
strands=BiocGenerics::strand(gr))
out=sprintf("%s_ROI.bed",strsplit(file1,"_")[[1]][1])
write.table(export, file=out, quote=F, sep="\t", row.names=F, col.names=F)
}
################################################################################
#' sum
#'
#'@description
#' help function for mp_ROI
#' @param l list, .txt-files with filtered Tukey output
#' @param sums numeric vector
#' @keywords return_sums
#' @return txt with table with data
#' @export
return_sums=function(l,sums){
regions=strsplit(as.character(l),",")
sum=0
for(r in regions){
sum=sum+unlist(sums[r])
}
return(sum)
}
################################################################################
#' sum
#'
#'@description
#' sorts ROI by number of SNPs they contain
#' @param file1 list, filtered list of ROI, output from plot_snp_genome
#' @param file2 list, vcf-files with variants
#' @param binsize integer, number of neighboring variant positions per bin,default 1000, same as stat_by_variants function
#' @keywords mp_ROI
#' @return txt with table with data
#' @export
mp_ROI=function(file1,file2,binsize=1000){
vcf=vcfR::read.vcfR(file2,verbose=FALSE)
vcf_df=as.data.frame(vcf@fix)
rm(vcf)
vcf_df$POS=type.convert(vcf_df$POS)
vcf_df=subset(vcf_df,vcf_df$REF=="A" |vcf_df$REF=="C" |vcf_df$REF=="T" |vcf_df$REF=="G" )
counts=as.data.frame(table(vcf_df$POS))
rm(vcf_df)
l=rep(seq(1,length(counts$Freq),by=binsize),each=binsize)[1:length(counts$Freq)]
bins=tapply(counts$Freq,l,FUN=c)
rm(l)
rm(counts)
tmp=reshape2::melt(data.frame(do.call(cbind,bins)))
rm(bins)
split.df=split(tmp,tmp$variable)
sums=lapply(split.df, function(x) sum(x$value))
rm(split.df)
df=read.delim(file1,header = T,quote = "",sep = "\t")
f=lapply(df$names, function(x) return_sums(x,sums))
df["snp_nr"]=unlist(lapply(f, sum))
rm(f)
df=df[order(df$snp_nr,df$count,decreasing = T),]
candidates=df[1:15,]
out=sprintf("%s_mp_ROI.bed",strsplit(file2,"_")[[1]][1])
write.table(candidates,file = out,quote = F,sep = "\t",row.names = F)
}
################################################################################
#' filters mp_ROI for repeats
#'
#'@description
#' filters mp_ROI for repeat regions marked in input file
#' @param file1 list, filtered list of ROI, output from plot_snp_genome
#' @param masked_reg bed file with repeat regions, can be .bed.gz
#' @keywords process
#' @return txt with table with data
#' @export
process=function(file1,masked_reg){
masked_regions_df=read.delim(masked_reg,header = F,quote = "",sep = "\t")
colnames(masked_regions_df)=c("name","start","end","type")
masked_regions.gr=GenomicRanges::GRanges(
seqnames = S4Vectors::Rle(rep("masked_region",length(masked_regions_df$name))),
ranges=IRanges::IRanges(start=masked_regions_df$start,end=masked_regions_df$end,names=masked_regions_df$name)
)
ROI_df=read.delim(file1,header = T,quote = "",sep = "\t")
ROI.gr=GenomicRanges::GRanges(
seqnames = S4Vectors::Rle(ROI_df$seqnames),
ranges=IRanges::IRanges(start=ROI_df$starts,end=ROI_df$ends,names=ROI_df$names),
count=ROI_df$count,
strands=ROI_df$strands,
snp_nr=ROI_df$snp_nr
)
hits=GenomicRanges::findOverlaps(IRanges::ranges(masked_regions.gr),IRanges::ranges(ROI.gr),type = "within")
non_overlap=GenomicRanges::setdiff(IRanges::ranges(ROI.gr),IRanges::ranges(masked_regions.gr[hits@from]))
output_df=data.frame(seqnames=rep(unique(GenomicRanges::seqnames(ROI.gr)),length(non_overlap)),
starts=BiocGenerics::start(non_overlap)-1,
ends=BiocGenerics::end(non_overlap),
width=BiocGenerics::width(non_overlap)
)
out=sprintf("processed_ROI_%s.bed",unlist(strsplit(file1,"_"))[1])
write.table(output_df,file = out,quote = F,sep = "\t",row.names = F)
non_overlap=subset(non_overlap,non_overlap@width>300)
print(non_overlap)
if(length(non_overlap)>0){
plot.gr=GenomicRanges::GRanges(
seqnames = rep(unlist(strsplit(file1,"_"))[1],length(non_overlap)),
ranges=non_overlap,
strands=rep("*",length(non_overlap))
)
atrack=Gviz::AnnotationTrack(plot.gr, name="processed ROIs >300bp")
gtrack=Gviz::GenomeAxisTrack()
out=sprintf("processed_ROI_%s.jpeg",unlist(strsplit(file1,"_"))[1])
jpeg(file=out,width = 800, height = 800,quality = 100)
Gviz::plotTracks(list(gtrack,atrack))
dev.off()
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.