Nothing
#' Compute sliding window estimates of F-statistics or ratio of F-statistics over the genome
#' @param x A pooldata object containing Pool-Seq information or a countdata object containing allele count information
#' @param num.pop.idx A vector of length 1 to 4 (depending on num.stat) giving the index of the populations. If num.stat="het", num.pop.idx must be of length 1: num.pop.idx=i specifies the ith pop in x. If num.stat="div", "F2" or "Fst", num.pop.idx must be of length 2: num.pop.idx=c(i,j) specifies the pairs of populations with indexes i and j in x. If num.stat="F3" or "F3star", num.pop.idx must be of length 3 (num.pop.idx=c(i,j,k) specifies the F3(pop_i;pop_j,pop_k) populations triplet). Finally, if num.stat="F4" or "Dstat", num.pop.idx must be of length 4: num.pop.idx=c(i,j,k) specifies the F4(pop_i,pop_j;pop_k,pop_l) populations quadruplet i.e. the computed (numerator) statistic computed is (F2(pop_i,pop_k)-F2(pop_i,pop_l)-F2(pop_j,pop_k)+F2(pop_j,pop_l))/2.
#' @param den.pop.idx A vector of length 1 to 4 (see num.pop.idx description) giving the index of the populations specifying the F-statistic. If NULL, the computed statistic is the one specified by num.pop.idx.
#' @param num.stat the name of the (numerator) stat which must be "het" (1-Q1), "div" (1-Q2), "F2", "Fst", "F3", "F3star", "F4" or "Dstat"
#' @param den.stat the name of the (numerator) stat which must be "het" (1-Q1), "div" (1-Q2), "F2", "Fst", "F3", "F3star", "F4" or "Dstat"
#' @param window.def Either "nsnp" or "bp" to define windows by either a number of SNPs or a size in bp, respectively
#' @param sliding.window.size A numeric value giving the number of SNPs or the size (in bp) of the windows depending window.def
#' @param window.overlap.fact A numeric value (between 0 and 1) giving the percentage of overlap between consecutive windows (default=0.5)
#' @param bp.start.first.snp When window.def="bp", if TRUE (default) the windowing start at the first SNP position, if FALSE the windowing start at position 1
#' @param verbose If TRUE extra information is printed on the terminal
#' @details Compute sliding window estimates of F-statistics or ratio of F-statistics over the genome.
#' @return A data frame with 7 columns with for each window in a row their i) chromosome/contig of origin; ii) start and iii) end position; iv) the mid-position of each window; v) the cumulated mid-position of each window (to facilitate further plotting); vi) the number of SNPs included in the computation of window value; and vii) the estimated value of the statistic
#' @seealso To generate pooldata object, see \code{\link{vcf2pooldata}}, \code{\link{popsync2pooldata}},\code{\link{genobaypass2pooldata}} or \code{\link{genoselestim2pooldata}}. To generate coundata object, see \code{\link{genobaypass2countdata}} or \code{\link{genotreemix2countdata}}.
#' @examples
#' make.example.files(writing.dir=tempdir())
#' pooldata=popsync2pooldata(sync.file=paste0(tempdir(),"/ex.sync.gz"),poolsizes=rep(50,15))
#' @export
sliding.windows.fstat<-function(x,num.pop.idx=NULL,den.pop.idx=NULL,num.stat=NULL,den.stat=NULL,window.def=c("nsnp","bp")[1],sliding.window.size=NULL,window.overlap.fact=0.5,bp.start.first.snp=TRUE,verbose=TRUE){
#check input objects (and retrieve nbr of pop samples)
if(!(is.pooldata(x)) & !(is.countdata(x))){
stop("Input data are not formatted as valid pooldata (see the popsync2pooldata, vcf2pooldata, genobaypass2pooldata and selestim2pooldata functions) or countdata (see the genobaypass2countdata and genotreemix2countdata) object\n")}
n.pops=ifelse(is.countdata(x),x@npops,x@npools)
npop.stats=c(1,2,2,2,3,3,4,4)
names(npop.stats)=c("het","div","F2","Fst","F3","F3star","F4","Dstat")
if(is.null(num.pop.idx)){stop("No pop. indexes given for the numerator stat. (num.pop.idx argument)\n")}
if(is.null(num.stat)){stop("No numerator f-stat name given (num.stat argument)\n")}
if(!(num.stat %in% names(npop.stats))){
stop(paste0("num.stat must be one of the following names: ",paste(names(npop.stats),collapse=", ")))
}
if(length(num.pop.idx)!=npop.stats[num.stat]){
stop(paste("num.pop.idx argument for",num.stat,"must be of length",npop.stats[num.stat]))
}
if(sum(!(num.pop.idx%in%1:n.pops))>0){
stop("Check num.pop.idx indexes (must be >=1 and <=n.pops included in the countdata or pooldata input object)\n")
}
if(is.null(den.pop.idx) | is.null(den.stat)){
denom=FALSE
}else{
if(is.null(den.pop.idx)){stop("No pop. indexes given for the denominator stat. (den.pop.idx argument)\n")}
if(is.null(den.stat)){stop("No denominator f-stat name given (den.stat argument)\n")}
if(!(den.stat %in% names(npop.stats))){
stop(paste0("den.stat must be one of the following names: ",paste(names(npop.stats),collapse=", ")))
}
if(length(den.pop.idx)!=npop.stats[num.stat]){
stop(paste("den.pop.idx argument for",num.stat,"must be of length",npop.stats[num.stat]))
}
if(sum(!(den.pop.idx%in%1:n.pops))>0){
stop("Check den.pop.idx indexes (must be >=1 and <=n.pops included in the countdata or pooldata input object)\n")
}
denom=TRUE
}
if (!(window.def %in% c("nsnp","bp"))){
stop("window.def should either be nsnp (windows are defined by a number of SNPs, default) or bp (windows are defined by a size in bp)")
}else{
winbp=ifelse(window.def=="nsnp",FALSE,TRUE)
}
if(is.null(sliding.window.size)){stop("No sliding.window.size provided\n")}
if(window.overlap.fact<=0 | window.overlap.fact>=1){stop("window.overlap.fact must be between 0 and 1\n")}
step=floor(sliding.window.size*(1-window.overlap.fact))
###############
########internal function to manage computation of the stats (all checks are assumed to be OK)
########NOTE: only compute numerator of scaled stats
########
compute_snp_stat_ <- function(statname,pop.idx){
if(statname=="het"){
if(is.countdata(x)){
tmp.w=c(1)
tmp.refcount=matrix(x@refallele.count[,pop.idx],ncol=1)
tmp.totcount=matrix(x@total.count[,pop.idx],ncol=1)
}else{
tmp.w=x@poolsizes[pop.idx]/(x@poolsizes[pop.idx]-1)
tmp.refcount=matrix(x@refallele.readcount[,pop.idx],ncol=1)
tmp.totcount=matrix(x@readcoverage[,pop.idx],ncol=1)
}
snp.val=1-.compute_snpQ1onepop(tmp.refcount,tmp.totcount,tmp.w)
}
if(statname=="div"){
if(is.countdata(x)){
snp.val=1-.compute_snpQ2onepair(x@refallele.count[,pop.idx[1]],x@refallele.count[,pop.idx[2]],
x@total.count[,pop.idx[1]],x@total.count[,pop.idx[2]])
}else{
snp.val=1-.compute_snpQ2onepair(x@refallele.readcount[,pop.idx[1]],x@refallele.readcount[,pop.idx[2]],
x@readcoverage[,pop.idx[1]],x@readcoverage[,pop.idx[2]])
}
}
if(statname%in%c("F2","Fst")){
snp.val=matrix(0,x@nsnp,3) #Q1a Q1b and Q2ab
for(k in 1:2){
if(is.countdata(x)){
tmp.w=c(1)
tmp.refcount=matrix(x@refallele.count[,pop.idx[k]],ncol=1)
tmp.totcount=matrix(x@total.count[,pop.idx[k]],ncol=1)
}else{
tmp.w=x@poolsizes[pop.idx[k]]/(x@poolsizes[pop.idx[k]]-1)
tmp.refcount=matrix(x@refallele.readcount[,pop.idx[k]],ncol=1)
tmp.totcount=matrix(x@readcoverage[,pop.idx[k]],ncol=1)
}
snp.val[,k]=.compute_snpQ1onepop(tmp.refcount,tmp.totcount,tmp.w)
}
if(is.countdata(x)){
tmp.refcount=x@refallele.count[,pop.idx];tmp.totcount=x@total.count[,pop.idx]
}else{
tmp.refcount=x@refallele.readcount[,pop.idx];tmp.totcount=x@readcoverage[,pop.idx]
}
snp.val[,3]=.compute_snpQ2onepair(tmp.refcount[,1],tmp.refcount[,2],tmp.totcount[,1],tmp.totcount[,2])
snp.val=0.5*rowSums(snp.val[,1:2]) - snp.val[,3]
}
if(statname%in%c("F3","F3star")){
snp.val=matrix(0,x@nsnp,4) #Q1a Q2bc Q2ab Q2ac
if(is.countdata(x)){
tmp.w=c(1)
tmp.refcount=matrix(x@refallele.count[,pop.idx[1]],ncol=1)
tmp.totcount=matrix(x@total.count[,pop.idx[1]],ncol=1)
}else{
tmp.w=x@poolsizes[pop.idx[1]]/(x@poolsizes[pop.idx[1]]-1)
tmp.refcount=matrix(x@refallele.readcount[,pop.idx[1]],ncol=1)
tmp.totcount=matrix(x@readcoverage[,pop.idx[1]],ncol=1)
}
snp.val[,1]=.compute_snpQ1onepop(tmp.refcount,tmp.totcount,tmp.w)
if(is.countdata(x)){
snp.val[,2]=.compute_snpQ2onepair(x@refallele.count[,pop.idx[2]],x@refallele.count[,pop.idx[3]],
x@total.count[,pop.idx[2]],x@total.count[,pop.idx[3]])
for(k in 2:3){#ab et ac
snp.val[,k+1]=.compute_snpQ2onepair(x@refallele.count[,pop.idx[1]],x@refallele.count[,pop.idx[k]],
x@total.count[,pop.idx[1]],x@total.count[,pop.idx[k]])}
}else{
snp.val[,2]=.compute_snpQ2onepair(x@refallele.readcount[,pop.idx[2]],x@refallele.readcount[,pop.idx[3]],
x@readcoverage[,pop.idx[2]],x@readcoverage[,pop.idx[3]])
for(k in 2:3){#ab et ac
snp.val[,k+1]=.compute_snpQ2onepair(x@refallele.readcount[,pop.idx[1]],x@refallele.readcount[,pop.idx[k]],
x@readcoverage[,pop.idx[1]],x@readcoverage[,pop.idx[k]])}
}
snp.val=0.5*(rowSums(snp.val[,1:2]) - rowSums(snp.val[,3:4]))
}
if(statname%in%c("F4","Dstat")){
q2.req=cbind(pop.idx[c(1,2,1,2)],pop.idx[c(3,4,4,3)])#Q2ac Q2bd Q2ad Q2bc
snp.val=matrix(0,x@nsnp,nrow(q2.req))
for(i in 1:nrow(q2.req)){
if(is.countdata(x)){
snp.val[,i]=.compute_snpQ2onepair(x@refallele.count[,q2.req[i,1]],x@refallele.count[,q2.req[i,2]],
x@total.count[,q2.req[i,1]],x@total.count[,q2.req[i,2]])
}else{
snp.val[,i]=.compute_snpQ2onepair(x@refallele.readcount[,q2.req[i,1]],x@refallele.readcount[,q2.req[i,2]],
x@readcoverage[,q2.req[i,1]],x@readcoverage[,q2.req[i,2]])
}
}
snp.val=0.5*(rowSums(snp.val[,1:2]) - rowSums(snp.val[,3:4]))
}
return(snp.val)
}
##########
##########
if(verbose){cat("Computing SNP-specific values\n")} #needs to be done first (to account for NA in windows definition)
if(verbose){cat("For the num.pop.idx combination\n")}
snp.num.val=compute_snp_stat_(num.stat,num.pop.idx)
keep=!is.na(snp.num.val)
num.scaling=FALSE
#denominator of the numerator! to properly scale stats when computing multilocus window stat
if(num.stat%in%c("Fst","F3star","Dstat")){
num.scaling=TRUE
if(num.stat=="Fst"){snp.num.val.scale=compute_snp_stat_("div",num.pop.idx)}
if(num.stat=="F3star"){snp.num.val.scale=compute_snp_stat_("het",num.pop.idx[1])}
if(num.stat=="Dstat"){#/(1-Q2ab)*(1-Q2bc)
snp.num.val.scale=compute_snp_stat_("div",num.pop.idx[1:2])*compute_snp_stat_("div",num.pop.idx[3:4])}
keep=keep & !is.na(snp.num.val.scale)
}
if(denom){
if(verbose){cat("For the den.pop.idx combination\n")}
snp.den.val=compute_snp_stat_(den.stat,den.pop.idx)
keep=keep & !is.na(snp.den.val)
den.scaling=FALSE
if(den.stat%in%c("Fst","F3star","Dstat")){
den.scaling=TRUE
if(den.stat=="Fst"){snp.den.val.scale=compute_snp_stat_("div",den.pop.idx)}
if(den.stat=="F3star"){snp.den.val.scale=compute_snp_stat_("het",den.pop.idx[1])}
if(den.stat=="Dstat"){#/(1-Q2ab)*(1-Q2bc)
snp.den.val.scale=compute_snp_stat_("div",den.pop.idx[1:2])*compute_snp_stat_("div",den.pop.idx[3:4])}
keep=keep & !is.na(snp.den.val.scale)
snp.den.val.scale=snp.den.val.scale[keep]
}
snp.den.val=snp.den.val[keep]
}
snp.num.val=snp.num.val[keep]
if(num.scaling){snp.num.val.scale=snp.num.val.scale[keep]}
########
snp.keep.info=x@snp.info[keep,]
if(verbose){cat("Defining Windows\n")}
#Window definition
win.det=matrix(NA,0,9)
colnames(win.det)=c("Chr","Start","End","MidPos","CumMidPos","id_snp_start","id_snp_end","nsnp","Value")
win.det=as.data.frame(win.det)
chr=unique(as.character(snp.keep.info$Chromosome))
nchr=length(chr)
cum.midpos=0
for(c in 1:nchr){
tmp.continue=TRUE
tmp.sel=which(snp.keep.info$Chromosome==chr[c])
tmp.pos=floor(snp.keep.info$Position[tmp.sel]) #in case real (e.g., when simulated data)
tmp.nsnps=length(tmp.sel)
if(winbp){
if(bp.start.first.snp){
pos.init=tmp.pos[1]
}else{
pos.init=(tmp.pos[1]%/%sliding.window.size)*sliding.window.size + 1
if(tmp.pos[1]%%sliding.window.size==0){pos.init=pos.init-sliding.window.size}#the first SNP is the only of the first window (that may be discarded further!)
}
if((tmp.pos[tmp.nsnps]-pos.init)>sliding.window.size){
tmp.start.pos=seq(pos.init,tmp.pos[tmp.nsnps],step)
tmp.end.pos = tmp.start.pos + sliding.window.size
tmp.snp.start.idx=sapply(tmp.start.pos,f<-function(x){min(which(tmp.pos>=x))})
tmp.snp.end.idx=sapply(tmp.end.pos,f<-function(x){max(which(tmp.pos<=x))})
}else{
tmp.continue=FALSE
}
}else{
if(tmp.nsnps>sliding.window.size){
tmp.snp.start.idx=seq(1,tmp.nsnps,step)
tmp.snp.end.idx=tmp.snp.start.idx+sliding.window.size-1
tmp.snp.end.idx=pmin(tmp.snp.end.idx,tmp.nsnps)
tmp.start.pos=tmp.pos[tmp.snp.start.idx]
tmp.end.pos=tmp.pos[tmp.snp.end.idx]
}else{
tmp.continue=FALSE
}
}
if(tmp.continue){
tmp.mid.pos=(tmp.start.pos+tmp.end.pos)/2
tmp.cum.mid.pos=tmp.mid.pos+cum.midpos
cum.midpos=cum.midpos+max(tmp.mid.pos)
tmp.win.nsnp=tmp.snp.end.idx-tmp.snp.start.idx+1
tmp.nwin=length(tmp.start.pos)
tmp.windet=data.frame(Chr=rep(chr[c],tmp.nwin),Start=tmp.start.pos,End=tmp.end.pos,
MidPos=tmp.mid.pos,CumMidPos=tmp.cum.mid.pos,
id_snp_start=tmp.sel[tmp.snp.start.idx],
id_snp_end=tmp.sel[tmp.snp.end.idx],
nsnp=tmp.win.nsnp,Value=rep(NA,tmp.nwin))
if(winbp){
tmp.win.size=tmp.windet$End-tmp.windet$Start
tmp.win.keep=(tmp.win.size>0.99*sliding.window.size) & (tmp.win.size<1.01*sliding.window.size) &
tmp.windet$nsnp>1
}else{
tmp.win.keep=tmp.windet$nsnp==sliding.window.size
}
win.det=rbind(win.det,tmp.windet[tmp.win.keep,])
}
}
nwins=nrow(win.det)
if(nwins<1){
stop("No SNP-window found with the given parameters. Try decreasing sliding.window.size\n")
}
if(verbose){
cat(nwins,"(overlapping) windows identified\n")
dum=(win.det$End-win.det$Start)*1e-3
cat(" Average (min-max) window sizes (in kb):",round(mean(dum),1),"(",round(min(dum)),"-",round(max(dum)),")\n")
cat(" Average (min-max) nb. of SNPs per window:",round(mean(win.det$nsnp),1),"(",min(win.det$nsnp),"-",max(win.det$nsnp),")\n")
}
#######
if(verbose){cat("Computing window statistics\n")}
snp.win.idx=cbind(win.det$id_snp_start,win.det$id_snp_end)-1 #0-indexed (for cpp)
win.det$Value=.block_sum2(snp.num.val,snp.win.idx)
if(num.scaling){win.det$Value=win.det$Value/.block_sum2(snp.num.val.scale,snp.win.idx)}
if(denom){
win.det$Value=win.det$Value/.block_sum2(snp.den.val,snp.win.idx)
if(den.scaling){win.det$Value=win.det$Value * .block_sum2(snp.den.val.scale,snp.win.idx)}
}else{
if(!num.scaling){win.det$Value=win.det$Value/win.det$nsnp}
}
return(win.det[,c(1:5,8:9)])
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.