Nothing
#' Convert Popoolation Sync files into a pooldata object
#' @param sync.file The name (or a path) of the Popoolation sync file (might be in compressed format)
#' @param poolsizes A numeric vector with haploid pool sizes
#' @param poolnames A character vector with the names of pool
#' @param min.rc Minimal allowed read count per base. Bases covered by less than min.rc reads are discarded and considered as sequencing error. For instance, if nucleotides A, C, G and T are covered by respectively 100, 15, 0 and 1 over all the pools, setting min.rc to 0 will lead to discard the position (the polymorphism being considered as tri-allelic), while setting min.rc to 1 (or 2, 3..14) will make the position be considered as a SNP with two alleles A and C (the only read for allele T being disregarded).
#' @param min.cov.per.pool Minimal allowed read count (per pool). If at least one pool is not covered by at least min.cov.perpool reads, the position is discarded
#' @param max.cov.per.pool Maximal allowed read count (per pool). If at least one pool is covered by more than min.cov.perpool reads, the position is discarded
#' @param min.maf Minimal allowed Minor Allele Frequency (computed from the ratio overal read counts for the reference allele over the read coverage)
#' @param noindel If TRUE, positions with at least one indel count are discarded
#' @param nlines.per.readblock Number of Lines read simultaneously. Should be adapted to the available RAM.
#' @param nthreads Number of available threads for parallelization of some part of the parsing (default=1, i.e., no parallelization)
#' @return A pooldata object containing 7 elements:
#' \enumerate{
#' \item "refallele.readcount": a matrix with nsnp rows and npools columns containing read counts for the reference allele (chosen arbitrarily) in each pool
#' \item "readcoverage": a matrix with nsnp rows and npools columns containing read coverage in each pool
#' \item "snp.info": a matrix with nsnp rows and four columns containing respectively the contig (or chromosome) name (1st column) and position (2nd column) of the SNP; the allele taken as reference in the refallele.readcount matrix (3rd column); and the alternative allele (4th column)
#' \item "poolsizes": a vector of length npools containing the haploid pool sizes
#' \item "poolnames": a vector of length npools containing the names of the pools
#' \item "nsnp": a scalar corresponding to the number of SNPs
#' \item "npools": a scalar corresponding to the number of pools
#' }
#' @examples
#' make.example.files(writing.dir=tempdir())
#' pooldata=popsync2pooldata(sync.file=paste0(tempdir(),"/ex.sync.gz"),poolsizes=rep(50,15))
#' @export
popsync2pooldata<-function(sync.file="",poolsizes=NA,poolnames=NA,min.rc=1,min.cov.per.pool=-1,max.cov.per.pool=1e6,min.maf=0.01,noindel=TRUE,nlines.per.readblock=1000000,nthreads=1){
if(nthreads>1){
tmp.ncores=detectCores()
if(nthreads>tmp.ncores){nthreads=tmp.ncores}
options(cores=nthreads)
registerDoParallel() ; getDoParWorkers()
parallel=TRUE
}
if(nthreads<=1){parallel=FALSE}
if(nchar(sync.file)==0){stop("ERROR: Please provide the name of the sync file generated by Popoolation")}
if(sum(is.na(poolsizes))>0){stop("ERROR: Please provide a vector of Pool Sizes (poolsize argument)")}
poolsizes=as.numeric(poolsizes)
ngenos=6
mat.geno=matrix(c(1,1,1,2,2,3,2,3,4,3,4,4),6,2)
all1=c("A","A","A","T","T","C")
all2=c("T","C","G","C","G","G")
file.con=file(sync.file,open="r")
##
tmp.data=scan(file=file.con,nlines = 1,what="character",quiet=TRUE)
npools=length(tmp.data)-3
close(file.con)
if(length(poolsizes)!=npools){stop("ERROR: The number of pools in the sync file is different from the length of the vector of pool sizes")}
if(sum(is.na(poolnames))>0){
poolnames=paste0("P",1:npools)
}else{
poolnames=as.character(poolnames)
if(length(poolnames)!=npools){stop("ERROR: The number of pools in the sync file is different from the length of vector of pool names")}
}
###
file.con=file(sync.file,open="r")
continue.reading=TRUE
nlines.read=0
time1=proc.time()
while(continue.reading){
# tmp.data=matrix(scan(file=sync.file,skip=skip,nlines = nlines.per.readblock,what="character",quiet=TRUE),ncol=npools+3,byrow=T)
tmp.data=matrix(scan(file=file.con,nlines = nlines.per.readblock,what="character",quiet=TRUE),ncol=npools+3,byrow=T)
if(length(tmp.data)<nlines.per.readblock){continue.reading=FALSE}
npos=nrow(tmp.data)
if(npos>1){
tmp.snpdet=tmp.data[,1:3] ; tmp.cnt=tmp.data[,-1:-3]
rm(tmp.data)
if(parallel){
res=foreach(i=1:npools,.combine=cbind) %dopar% {
matrix(as.numeric(unlist(strsplit(tmp.cnt[,i],split = ":"))),ncol=6,byrow=TRUE)
}
tmp.cnt2=array(0,dim = c(npos,npools,6))
dum.deb=6*(0:(npools-1))
for(i in 1:6){tmp.cnt2[,,i]=res[,dum.deb+i]}
rm(res)
}else{
tmp.cnt2=array(0,dim = c(npos,npools,6))
for(i in 1:npools){
tmp.cnt2[,i,]=matrix(as.numeric(unlist(strsplit(tmp.cnt[,i],split = ":"))),ncol=6,byrow=TRUE)
}
}
rm(tmp.cnt)
cnt.all=matrix(0,npos,6)
for(i in 1:6){cnt.all[,i]=rowSums(tmp.cnt2[,,i])}
snp.sel=rowSums(cnt.all[,1:4]>min.rc)==2
if(noindel){snp.sel[cnt.all[,6]>0]=FALSE}
tmp.snpdet=tmp.snpdet[snp.sel,]
tmp.cnt2=tmp.cnt2[snp.sel,,1:4]
cnt.all=cnt.all[snp.sel,1:4]
tmp.snpdet=cbind(tmp.snpdet,tmp.snpdet[,1:2])
tmp.Y=tmp.N=matrix(0,sum(snp.sel),npools)
for(i in 1:ngenos){
tmp.cntall.col=cnt.all[,mat.geno[i,]]
tmp.cnt.col=tmp.cnt2[,,mat.geno[i,]]
dum.sel=rowSums(tmp.cntall.col>min.rc)==2
if(sum(dum.sel)>0){
tmp.Y[dum.sel,]=tmp.cnt.col[dum.sel,,1]
tmp.N[dum.sel,]=tmp.cnt.col[dum.sel,,1] + tmp.cnt.col[dum.sel,,2]
tmp.snpdet[dum.sel,4]=all1[i] ; tmp.snpdet[dum.sel,5]=all2[i]
}
}
##filtres sur couverture et maf
tmp.maf=0.5-abs(0.5-rowSums(tmp.Y)/rowSums(tmp.N))
dum.sel=(rowSums(tmp.N>=min.cov.per.pool)==npools) & (rowSums(tmp.N<=max.cov.per.pool)==npools) & (tmp.maf>min.maf)
tmp.Y=tmp.Y[dum.sel,] ; tmp.N=tmp.N[dum.sel,] ; tmp.snpdet=tmp.snpdet[dum.sel,]
if(nlines.read==0){
data.Y=tmp.Y ; data.N=tmp.N ; snpdet=tmp.snpdet[,-3]
}else{
data.Y=rbind(data.Y,tmp.Y)
data.N=rbind(data.N,tmp.N)
snpdet=rbind(snpdet,tmp.snpdet[,-3])
}
nlines.read=nlines.read+npos
cat(nlines.read/1000000,"millions lines processed in",round((proc.time()-time1)[1]/60,2)," min.; ",nrow(data.Y),"SNPs found\n")
}
}
close(file.con)
res<-new("pooldata")
res@npools=npools
res@nsnp=nrow(data.Y)
res@refallele.readcount=data.Y
rm(data.Y)
res@readcoverage=data.N
rm(data.N)
res@snp.info=data.frame(Chromosome=as.character(snpdet[,1]),
Position=as.numeric(snpdet[,2]),
RefAllele=as.character(snpdet[,3]),
AltAllele=as.character(snpdet[,4]),
stringsAsFactors = FALSE)
rm(snpdet)
res@poolsizes=poolsizes
res@poolnames=poolnames
cat("Data consists of",res@nsnp,"SNPs for",res@npools,"Pools\n")
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.