R/popsync2pooldata.R

Defines functions popsync2pooldata

Documented in popsync2pooldata

#' Convert Popoolation Sync files into a pooldata object
#' @param sync.file The name (or a path) of the Popoolation sync file (might be in compressed format)
#' @param poolsizes A numeric vector with haploid pool sizes
#' @param poolnames A character vector with the names of pool
#' @param min.rc Minimal allowed read count per base. Bases covered by less than min.rc reads are discarded and considered as sequencing error. For instance, if nucleotides A, C, G and T are covered by respectively 100, 15, 0 and 1 over all the pools, setting min.rc to 0 will lead to discard the position (the polymorphism being considered as tri-allelic), while setting min.rc to 1 (or 2, 3..14) will make the position be considered as a SNP with two alleles A and C (the only read for allele T being disregarded).
#' @param min.cov.per.pool Minimal allowed read count (per pool). If at least one pool is not covered by at least min.cov.perpool reads, the position is discarded
#' @param max.cov.per.pool Maximal allowed read count (per pool). If at least one pool is covered by more than min.cov.perpool reads, the position is discarded
#' @param min.maf Minimal allowed Minor Allele Frequency (computed from the ratio overal read counts for the reference allele over the read coverage)
#' @param noindel If TRUE, positions with at least one indel count are discarded
#' @param nlines.per.readblock Number of Lines read simultaneously. Should be adapted to the available RAM.
#' @param nthreads Number of available threads for parallelization of some part of the parsing (default=1, i.e., no parallelization)
#' @return A pooldata object containing 7 elements:
#' \enumerate{
#' \item "refallele.readcount": a matrix with nsnp rows and npools columns containing read counts for the reference allele (chosen arbitrarily) in each pool
#' \item "readcoverage": a matrix with nsnp rows and npools columns containing read coverage in each pool
#' \item "snp.info": a matrix with nsnp rows and four columns containing respectively the contig (or chromosome) name (1st column) and position (2nd column) of the SNP; the allele taken as reference in the refallele.readcount matrix (3rd column); and the alternative allele (4th column)
#' \item "poolsizes": a vector of length npools containing the haploid pool sizes
#' \item "poolnames": a vector of length npools containing the names of the pools
#' \item "nsnp": a scalar corresponding to the number of SNPs
#' \item "npools": a scalar corresponding to the number of pools
#' }
#' @examples
#'  make.example.files(writing.dir=tempdir())
#'  pooldata=popsync2pooldata(sync.file=paste0(tempdir(),"/ex.sync.gz"),poolsizes=rep(50,15))
#' @export
popsync2pooldata<-function(sync.file="",poolsizes=NA,poolnames=NA,min.rc=1,min.cov.per.pool=-1,max.cov.per.pool=1e6,min.maf=0.01,noindel=TRUE,nlines.per.readblock=1000000,nthreads=1){
  if(nthreads>1){
    tmp.ncores=detectCores()
    if(nthreads>tmp.ncores){nthreads=tmp.ncores}
    options(cores=nthreads)
    registerDoParallel()  ;  getDoParWorkers()  
    parallel=TRUE
  }
  if(nthreads<=1){parallel=FALSE}
  if(nchar(sync.file)==0){stop("ERROR: Please provide the name of the sync file generated by Popoolation")}
  if(sum(is.na(poolsizes))>0){stop("ERROR: Please provide a vector of Pool Sizes (poolsize argument)")}
  poolsizes=as.numeric(poolsizes)
  ngenos=6
  mat.geno=matrix(c(1,1,1,2,2,3,2,3,4,3,4,4),6,2)
  all1=c("A","A","A","T","T","C")
  all2=c("T","C","G","C","G","G")
  
  file.con=file(sync.file,open="r") 
  ##
  tmp.data=scan(file=file.con,nlines = 1,what="character",quiet=TRUE)
  npools=length(tmp.data)-3
  close(file.con)
  if(length(poolsizes)!=npools){stop("ERROR: The number of pools in the sync file is different from the length of the vector of pool sizes")}
  if(sum(is.na(poolnames))>0){
    poolnames=paste0("P",1:npools)
  }else{
    poolnames=as.character(poolnames)
    if(length(poolnames)!=npools){stop("ERROR: The number of pools in the sync file is different from the length of vector of pool names")}
  }
  ###
  file.con=file(sync.file,open="r")
  continue.reading=TRUE
  nlines.read=0
  time1=proc.time()
  while(continue.reading){
    # tmp.data=matrix(scan(file=sync.file,skip=skip,nlines = nlines.per.readblock,what="character",quiet=TRUE),ncol=npools+3,byrow=T)  
    tmp.data=matrix(scan(file=file.con,nlines = nlines.per.readblock,what="character",quiet=TRUE),ncol=npools+3,byrow=T)  
    if(length(tmp.data)<nlines.per.readblock){continue.reading=FALSE}
    npos=nrow(tmp.data)
    if(npos>1){
      tmp.snpdet=tmp.data[,1:3] ; tmp.cnt=tmp.data[,-1:-3]
      rm(tmp.data) 
      if(parallel){
        res=foreach(i=1:npools,.combine=cbind) %dopar% {
          matrix(as.numeric(unlist(strsplit(tmp.cnt[,i],split = ":"))),ncol=6,byrow=TRUE)
        }
        tmp.cnt2=array(0,dim = c(npos,npools,6))
        dum.deb=6*(0:(npools-1))
        for(i in 1:6){tmp.cnt2[,,i]=res[,dum.deb+i]}
        rm(res)
      }else{
        tmp.cnt2=array(0,dim = c(npos,npools,6))
        for(i in 1:npools){
          tmp.cnt2[,i,]=matrix(as.numeric(unlist(strsplit(tmp.cnt[,i],split = ":"))),ncol=6,byrow=TRUE)
        }
      }
      rm(tmp.cnt)    
      cnt.all=matrix(0,npos,6)
      for(i in 1:6){cnt.all[,i]=rowSums(tmp.cnt2[,,i])}
      snp.sel=rowSums(cnt.all[,1:4]>min.rc)==2
      if(noindel){snp.sel[cnt.all[,6]>0]=FALSE}
      tmp.snpdet=tmp.snpdet[snp.sel,]
      tmp.cnt2=tmp.cnt2[snp.sel,,1:4]
      cnt.all=cnt.all[snp.sel,1:4]
      
      tmp.snpdet=cbind(tmp.snpdet,tmp.snpdet[,1:2])
      tmp.Y=tmp.N=matrix(0,sum(snp.sel),npools)
      
      for(i in 1:ngenos){
        tmp.cntall.col=cnt.all[,mat.geno[i,]]
        tmp.cnt.col=tmp.cnt2[,,mat.geno[i,]]
        dum.sel=rowSums(tmp.cntall.col>min.rc)==2
        if(sum(dum.sel)>0){
          tmp.Y[dum.sel,]=tmp.cnt.col[dum.sel,,1]
          tmp.N[dum.sel,]=tmp.cnt.col[dum.sel,,1] + tmp.cnt.col[dum.sel,,2]
          tmp.snpdet[dum.sel,4]=all1[i] ; tmp.snpdet[dum.sel,5]=all2[i]
        }
      }
      ##filtres sur couverture et maf
      tmp.maf=0.5-abs(0.5-rowSums(tmp.Y)/rowSums(tmp.N))
      dum.sel=(rowSums(tmp.N>=min.cov.per.pool)==npools) & (rowSums(tmp.N<=max.cov.per.pool)==npools) & (tmp.maf>min.maf)
      tmp.Y=tmp.Y[dum.sel,] ; tmp.N=tmp.N[dum.sel,] ; tmp.snpdet=tmp.snpdet[dum.sel,]
      
      if(nlines.read==0){
        data.Y=tmp.Y ; data.N=tmp.N ; snpdet=tmp.snpdet[,-3]
      }else{
        data.Y=rbind(data.Y,tmp.Y)
        data.N=rbind(data.N,tmp.N)
        snpdet=rbind(snpdet,tmp.snpdet[,-3])
      }
      nlines.read=nlines.read+npos
      cat(nlines.read/1000000,"millions lines processed in",round((proc.time()-time1)[1]/60,2)," min.; ",nrow(data.Y),"SNPs found\n")
    }
  }
  close(file.con)
  
  res<-new("pooldata")
  res@npools=npools
  res@nsnp=nrow(data.Y)
  res@refallele.readcount=data.Y
  rm(data.Y)
  res@readcoverage=data.N
  rm(data.N)
  res@snp.info=data.frame(Chromosome=as.character(snpdet[,1]),
                          Position=as.numeric(snpdet[,2]),
                          RefAllele=as.character(snpdet[,3]),
                          AltAllele=as.character(snpdet[,4]),
                          stringsAsFactors = FALSE)
  rm(snpdet)
  res@poolsizes=poolsizes
  res@poolnames=poolnames
  
  cat("Data consists of",res@nsnp,"SNPs for",res@npools,"Pools\n")
  return(res)
}

Try the poolfstat package in your browser

Any scripts or data that you put into this service are public.

poolfstat documentation built on Sept. 8, 2023, 5:49 p.m.