#################################
#' Read data generated by Hudson ms program
#'
#' Read data generated by
#' \href{http://home.uchicago.edu/rhudson1/source/mksamples.html}{Hudson ms}
#' program, either as Haplotypes or as SNPs.
#'
#' With argument what="SNP", each site is read as a SNP, with the ancestral allele encoded as 0 and the alternate
#' allele encoded as 1. If the ms output file contains several replicates,
#' the different replicates will be collated together. Hence, the number of loci is the sum of all
#' sites from all replicates.
#'
#' With argument what="Haplotype", each different sequence from a replicate
#' is read as a haplotype,
#' by converting it first to a factor, and then to an integer. There will be as many loci
#' as there are replicates, and the number of alleles per locus will be the number of different
#' haplotypes in the corresponding replicate.
#'
#' @usage read.ms(fname,what=c("SNP","Haplotype"))
#' @param fname file name containing ms output
#' @param what whether to read ms output as SNPs or haplotypes
#' @return alldat a data frame with nloc+1 columns, the first being the population
#' to which the individual belongs and the next being the genotypes, one column per locus;
#' and one row per (haploid) individual.
#'
#'
#' @author Jerome Goudet \email{jerome.goudet@@unil.ch}
#' @references \href{https://academic.oup.com/bioinformatics/article-abstract/18/2/337/225783}{Hudson, R. R. (2002) Generating samples under a Wright-Fisher neutral model of genetic variation}. Bioinformatics 18 : 337-338.
#'
#' @examples
#' \dontrun{
#' datH<-read.ms(system.file("extdata","2pops_asspop.txt",package="hierfstat"),what="Haplotype")
#' dim(datH)
#' head(datH[,1:10]
#' datS<-read.ms(system.file("extdata","2pops_asspop.txt",package="hierfstat"),what="SNP")
#' dim(datS)
#' head(datS[,1:10])
#' }
#' @export
#####################################################################################
read.ms<-function(fname,what=c("SNP","Haplotype")){
cl<-match.call()
if(!is.na(pmatch(what,"SNP")))
what<-"SNP"
WHATS<-c("SNP","Haplotype")
what<-pmatch(what,WHATS)
if (is.na(what)) stop("Invalid data type")
if (what==-1) stop("Ambiguous data type")
dat<-readLines(fname)
tmp<-strsplit(dat[1],split=" ")[[1]]
pos.i<-which(tmp=="-I")
if (length(pos.i)>0) {
np<-as.integer(tmp[pos.i+1])
ni<-as.integer(tmp[(pos.i+2):(pos.i+1+np)])
}
else ni<-as.integer(tmp[2])
nrep<-as.integer(tmp[3])
dat1<-grep("^[0,1]",dat[-c(1:4)],value=TRUE)
nt<-sum(ni)
dats<-split(dat1,rep(1:nrep,each=nt))
if (what==2){
datia<-lapply(dats,function(x)as.integer(as.factor(x)))
datm<-matrix(unlist(datia),nrow=nt,byrow=FALSE)
alldat<-data.frame(Pop=rep(1:np,ni),datm)
names(alldat)<-c("Pop",paste("loc",1:dim(datm)[2],sep=""))
return(alldat)
}
if (what==1){
nsnps<-as.integer(unlist(lapply(strsplit(grep("segsites:",dat,value=TRUE),split=":"),function(x) x[[2]])))
dats1<-lapply(dats,strsplit,"")
dats2<-lapply(dats1,function(x) matrix(as.integer(unlist(x)),nrow=nt,byrow=TRUE))
alldat<-matrix(numeric(nt*sum(nsnps)),nrow=nt)
cnsnps<-cumsum(nsnps)
alldat[,1:cnsnps[1]]<-dats2[[1]]
if (nrep>1)
for (i in 2:nrep)
alldat[,(cnsnps[i-1]+1):cnsnps[i]]<-dats2[[i]]
popid<-rep(1:length(ni),ni)
alldat.names<-c("Pop",paste("loc",1:dim(alldat)[2],sep=""))
alldat<-data.frame(Pop=popid,alldat)
names(alldat)<-alldat.names
return(alldat)
}
}
####
#' Import \code{ms} output
#'
#' Import the output of the \code{ms} program into suitable format for further manipulations
#'
#' @usage ms2dos(fname,chrom=NULL)
#'
#' @param fname a text file containing the output of the \code{ms program}
#' @param chrom the chromosomes (replicates) to be imported. default to all
#'
#' @return alldat a matrix with as many row as (haploid) individuals and as many columns as SNPs
#' @pop a vector containing individuals population identifier
#' @return bim a data frame with two components
#' chr contains the chromosome (replicate) id;
#' pos contains the SNPs positions on the chromosome
#'
#'
#' @export
####
ms2dos<-function(fname,chrom=NULL){
#take an ms file as input
#return a matrix of haploid gametes
#for loop rather than lapply for speeding up if nrep too large
char2int<-function(datas){
tmp<-strsplit(datas,"")
matrix(as.integer(unlist(tmp)),nrow=length(datas),byrow=TRUE)
}
dat<-readLines(fname)
tmp<-strsplit(dat[1],split=" ")[[1]]
pos.i<-which(tmp=="-I")
if (length(pos.i)>0) {
np<-as.integer(tmp[pos.i+1])
ni<-as.integer(tmp[(pos.i+2):(pos.i+1+np)])
}
else {
np<-1
ni<-as.integer(tmp[2])
}
pos.i<-which(tmp=="-r")
if (length(pos.i)>0){
nbp<-as.integer(tmp[pos.i+2])
}
else nbp<-NULL
nrep<-as.integer(tmp[3])
dat1<-grep("^[0,1]",dat[-c(1:4)],value=TRUE)
nt<-sum(ni)
dats<-split(dat1,rep(1:nrep,each=nt))
if (!is.null(chrom)) dats<-dats[chrom] else chrom<-1:nrep
nsnps<-as.integer(unlist(lapply(strsplit(grep("segsites:",dat,value=TRUE),split=":"),function(x) x[[2]])))[chrom]
if (!is.null(nbp)){
pos<-round(unlist(lapply(lapply(strsplit(grep("positions",dat,value=T),split=":"),
function(x) strsplit(x[[2]],split=" ")),function(y) as.numeric(y[[1]][-1]))[chrom])*nbp,0)
} else {
pos<-unlist(lapply(nsnps,function(x) 1:x))
}
if((1.0*nsnps[1]*nt) <1.0e+8){
dats2<-lapply(dats,char2int)
} else{
nblocks<-ceiling(1.0*max(nsnps)*nt /1.0e+8)
indgroups<-split(1:nt,cut(1:nt,nblocks))
dats2<-list(length=length(chrom))
for (ic in 1:length(chrom)){
dats1<-NULL
dum<-lapply(indgroups,function(ii) char2int(dats[[ic]][ii]))
for (ib in 1:nblocks) dats1<-rbind(dats1,dum[[ib]])
dats2[[ic]]<-dats1
gc()
}
}
alldat<-NULL
for (i in 1:length(dats2))
alldat<-cbind(alldat,dats2[[i]])
gc()
return(list(alldat=alldat,pop=rep(1:np,ni),bim=data.frame(chr=rep(chrom,nsnps),pos=pos)))
}
####
#' Import the output of the \code{ms} program in a \code{BED} object
#'
#' Import the output of the \code{ms} program into a \code{BED} object, as defined in the
#' \href{https://cran.r-project.org/package=gaston}{gaston} package
#'
#' @usage ms2bed(fname,chrom=NULL)
#'
#' @param fname the name of the text file containing \code{ms} output
#' @param chrom the number of chromosomes (replicates) to import (default to all)
#'
#'
#' @return a bed object
#'
#' @details \code{ms2bed} relies on \code{\link{ms2dos}}.
#' The population identifier is in bed@@ped$famid
#'
#' @export
####
ms2bed<-function(fname,chrom=NULL){
dos<-ms2dos(fname,chrom)
x<-0:(dim(dos$alldat)[1]%/%2-1)
dos2<-dos$alldat[x*2+1,]+dos$alldat[x*2+2,]
bed<-gaston::as.bed.matrix(dos2)
bed@ped$famid<-dos$pop[x*2+1]
bed@snps$pos<-dos$bim$pos
bed@snps$chr<-dos$bim$chr
gc()
return(bed)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.