###############################################################
# Various functions to run pop genetic analyses and practicals
###############################################################
#library(popgen);
#########################################################
# To read in genotype data (0,1,2,? as data types
# could also implement PHASE type. 1 is heterozygote)
#########################################################
read.genotype.data<-function(file, data.format="unphased", names=FALSE, header=TRUE, positions=TRUE) {
if (header == TRUE) {data <- read.table(file, fill=TRUE, na.strings="?", skip=1, header=FALSE);}
else {data <- read.table(file, fill=TRUE, na.strings="?", header=FALSE);}
if (positions == TRUE) {n.seq <- nrow(data)-1;}
else {n.seq <- nrow(data);}
if (names == TRUE) {l.seq <- ncol(data)-1;}
else {l.seq <- ncol(data);}
if (positions == TRUE) {
pos <- as.numeric(as.vector(data[1,]));
pos <- pos[1:l.seq];
data <- data[2:nrow(data),];
}
else {pos <- c(1:l.seq);}
if (names == TRUE) {
nm<- as.vector(data[,1]);
data <- data[,2:ncol(data)];
}
else {nm <- c(1:n.seq);}
seq <- as.matrix(data);
;
return(list(type="genotype", n.seq=n.seq, l.seq=l.seq, pos=pos, names=nm, seq=seq));
}
#Reads in haplotype data in PHASE-like format
read.haplotype.data<-function(file, data.format="phased", names=FALSE, header=TRUE, positions=TRUE) {
if (header == TRUE) {data <- read.table(file, fill=TRUE, na.strings="?", skip=1, header=FALSE);}
else {data <- read.table(file, fill=TRUE, na.strings="?", header=FALSE);}
if (positions == TRUE) {n.seq <- nrow(data)-1;}
else {n.seq <- nrow(data);}
if (names == TRUE) {l.seq <- ncol(data)-1;}
else {l.seq <- ncol(data);}
if (positions == TRUE) {
pos <- as.numeric(as.vector(data[1,]));
pos <- pos[1:l.seq];
data <- data[2:nrow(data),];
}
else {pos <- c(1:l.seq);}
if (names == TRUE) {
nm<- as.vector(data[,1]);
data <- data[,2:ncol(data)];
}
else {nm <- c(1:n.seq);}
seq <- as.matrix(data);
return(list(type="haplotype", n.seq=n.seq, l.seq=l.seq, pos=pos, names=nm, seq=seq));
}
#Reads the output from Selsim
read.selsim.data<-function(seq.file="seq.txt", tree.file="SStree.txt", plot=TRUE) {
x=read.table(seq.file,skip=4,fill=T, as.is=T)
pos<-as.matrix(x[1,]);
x<-as.matrix(x[2:(nrow(x)-2),]);
data <- list(type="haplotype", n.seq=nrow(x), l.seq=ncol(x), pos=pos, names=c(1:nrow(x)), seq=x);
data <- subset.data(data, maf=0)
if (plot==TRUE) {
tree<-read.table(tree.file, as.is=T, header=T);
x11();
o<-plot.tree(tree)
haplotype.plot(data, order=o);
}
return(data);
}
#Generic wrapper for reading data files
read.data <- function(file, format="haplotype", names=FALSE, positions=TRUE, header=TRUE, aligned=TRUE, gap.as.state=TRUE, biallelic.only=TRUE) {
if (format=="selsim") {data<-(read.selsim.data(file));}
else if (format == "haplotype") {data<-(read.haplotype.data(file, names=names, positions=positions, header=header));}
else if (format == "genotype") {data<-(read.genotype.data(file, names=names, positions=positions, header=header));}
else if (format == "fasta") {data <- read.fasta(file, aligned=aligned, gap.as.state=gap.as.state, biallelic.only=biallelic.only);}
cat(paste("\n\nRead ",data$n.seq, " sequences of length ",data$l.seq, "\n\n",sep=""));
return(data);
}
#snp.frequency
snp.frequency <- function(data, maf=TRUE) {
ct<-apply(data$seq, 2, sum, na.rm=T)
norm<-apply(!is.na(data$seq), 2, sum)
if (data$type == "genotype") norm <- norm*2;
freq<-ct/norm;
if (maf==TRUE) {
f.inv<-1-freq;
freq<-apply(cbind(freq, f.inv), 1, min);
}
return(freq)
}
#Selects data subset on basis of position and MAF
subset.data <- function(data, maf=-1, pos.lim=c(), select.snp=c(), select.ind=c(), convert.to.minor=FALSE, missing.data.thresh=0.2) {
#Just keep those individuals desired
if (length(select.ind)>0) {
data$seq<-data$seq[select.ind,];
data$n.seq<-length(select.ind);
data$names<-data$names[select.ind];
}
#Remove SNPs with >thresh missing data
na<-apply(is.na(data$seq), 2, mean);
data$seq<-data$seq[,na<missing.data.thresh];
data$pos<-data$pos[na<missing.data.thresh];
if (length(data$snp.id>0)) data$snp.id<-data$snp.id[na<missing.data.thresh];
data$l.seq<-sum(na<missing.data.thresh);
#Remove inds with >thresh missing data
na<-apply(is.na(data$seq), 1, mean);
data$seq<-data$seq[na<missing.data.thresh,];
data$names<-data$names[na<missing.data.thresh];
data$n.seq<-sum(na<missing.data.thresh);
select <- c(1:data$l.seq)
#On allele frequency
if (maf>=0) {
f <- snp.frequency(data);
select <- select[f>maf];
}
#On limits
if (length(pos.lim) == 2) {
select <- select[data$pos[select]>pos.lim[1] & data$pos[select]<pos.lim[2]];
}
#On defined snps - identified by relative position - problematic if using filters from above
if (length(select.snp)>0) {
select<-select[select %in% select.snp];
}
if (length(select) == 0) {
cat("\nNo data to return\n");
return(NULL);
}
if (convert.to.minor==TRUE) {
f<-apply(data$seq[,select], 2, mean);
data$seq[,select[f>0.5]]<-1-data$seq[,select[f>0.5]]
}
cat(paste("\n\nNumber of SNPs returned = ", length(select), "\n\n", sep=""));
if (length(data$snp.id>0)) {
return(list(type=data$type, n.seq=data$n.seq, l.seq=length(select), pos=data$pos[select], snp.id=data$snp.id[select],
names=data$names, seq=as.matrix(data$seq[,select])));
}
else {
return(list(type=data$type, n.seq=data$n.seq, l.seq=length(select), pos=data$pos[select],
names=data$names, seq=as.matrix(data$seq[,select])));
}
}
#Summarises data
summarise.data <- function(data, single.plot=TRUE, ld.plot=TRUE, maf=0.0, n.int=10, new.plot=TRUE) {
if (data$l.seq<2) {
cat("\n\n*** Error: too little data to analyse (S<2) ***\n\n");
}
data<-subset.data(data, maf=0.0)
if (data$l.seq<2) {
cat("\n\n*** Error: too little data to analyse (S<2) ***\n\n");
}
cat("\n\n#####################################################\n");
cat(" Coalescent summary of data\n");
cat("#####################################################\n\n");
if (data$type=="haplotype") cat(paste("Number of sequences = ",data$n.seq,"\n",sep=""));
if (data$type=="genotype") cat(paste("Number of individuals = ",data$n.seq,"\n",sep=""));
cat(paste("Number of polymorphic sites = ",data$l.seq,"\n",sep=""));
if (new.plot) x11();
if (single.plot == TRUE) par(mfrow=c(2,2));
freq <- snp.frequency(data);
#Count singletons
fq<-apply(data$seq, 2, sum, na.rm=T);
n.na<-apply(is.na(data$seq), 2, sum);
if (data$type=="haplotype") n.sing <- sum(fq==1)+sum(fq==(data$n.seq-n.na-1));
if (data$type=="genotype") n.sing <- sum(fq==1)+sum(fq==(2*data$n.seq-2*n.na-1));
if (n.sing<1) n.sing<-0;
d <- dist(data$seq, method="manhattan");
if (data$type=="genotype") d<-d/2;
pwd = mean(d);
hl=hclust(d, method="average");
ns <- data$n.seq;
if (data$type=="genotype") ns<-2*ns;
s <- data$l.seq;
con=vector(length=10);
con[1]=sum(1/c(1:(ns-1)));
con[2]=sum((1/c(1:(ns-1)))^2);
con[3]=(ns+1)/(3*ns-3);
con[4]=2*(ns*ns+ns+3)/(9*ns*ns-9*ns);
con[5]=con[3]-1/con[1];
con[6]=con[4]-(ns+2)/(ns*con[1])+con[2]/(con[1]*con[1]);
con[7]=con[5]/con[1];
con[8]=con[6]/(con[1]*con[1]+con[2]);
con[9]=2*(ns*con[1]-2*(ns-1))/((ns-1)*(ns-2));
con[10]=con[9]+(ns-2)/((ns-1)*(ns-1))+((2/(ns-1))*(1.5-(2*(con[1]+1/ns)-3)/(ns-2)-1/ns));
con[11]=(ns*ns*con[2]/((ns-1)*(ns-1))+con[1]*con[1]*con[10]-2*ns*con[1]*(con[1]+1)/((ns-1)*(ns-1)))/(con[1]*con[1]+con[2]);
con[12]=ns/(ns-1)*(con[1]-ns/(ns-1))-con[11];
cat(paste("\nWatterson's estimate of theta = ", round(data$l.seq/con[1],2), "\n", sep=""));
cat(paste("Average paiwise differences = ", round(pwd,2), "\n", sep=""));
cat(paste("Number of singletons = ", n.sing, "\n", sep=""));
cat(paste("Tajima D statistic = ", round((pwd-s/con[1])/sqrt(con[7]*s+con[8]*s*(s-1)),2), "\n", sep=""));
cat(paste("Fu and Li D statistic = ", round((ns/(ns-1)*s-con[1]*n.sing)/sqrt(con[12]*s+con[11]*s*s),2), "\n", sep=""));
breaks = c(-1.5,-0.5,0.5,1.5,2.5)
image(x=data$pos, z=t(data$seq[hl$order,]), col=c("white", "blue", "yellow","red"), breaks=breaks, xlab="Position", ylab="Sample - same order as cluster", main="Haplotype plot", yaxt="n");
if (single.plot == FALSE) {x11();}
breaks<-c(0:n.int)/(2*n.int);
breaks[length(breaks)]<-breaks[length(breaks)]*1.1;
exp<-rep(0, n.int);
for (i in 1:(data$n.seq-1)) {
wt<-1/i;
f.i<-findInterval(min(i, data$n.seq-i)/data$n.seq, breaks);
exp[f.i]<-exp[f.i]+wt;
}
obs<-hist(freq, breaks=breaks, plot=F)$counts;
exp<-exp*length(freq)/sum(exp);
mpt<-(breaks[1:n.int]+breaks[2:(n.int+1)])/2;
barplot(rbind(obs, exp), xlab="Minor Allele Frequency", ylab="Count", beside=T,
col=c("blue", "cyan"), main="Minor Allele Frequency", names=mpt, space=c(0,0.25),
cex.names=0.5);
if (single.plot == FALSE) {x11();}
hist(d, main="Pairwise differences", xlab="Pairwise differences", ylab="Count", col="blue");
if (single.plot == FALSE) x11();
plot(x=hl, labels=data$names, main="UPGMA tree of pairwise differences", ylab="Distance", hang=-1);
if (ld.plot == TRUE) {
x11();
par(mfrow=c(1,1));
plotld(data, cutoff=0.05);
}
}
#Makes LD matrix from data
makeld<-function(data){
cor=array(0, dim=c(data$l.seq, data$l.seq))
for(i in 1:(nrow(cor)-1)){
for(j in (i+1):ncol(cor)){
which.use<-which(!is.na(data$seq[,i]) & !is.na(data$seq[,j]));
if (length(which.use>0)) {
if (data$type=="haplotype") {
d<-(length(which.use)-1)/length(which.use)*cov(data$seq[which.use,i],data$seq[which.use,j], use="pairwise");
fi<-mean(data$seq[which.use,i]);
fj<-mean(data$seq[which.use,j]);
}
if (data$type=="genotype") {
d = estimateD.genotype.EM(data$seq[which.use,c(i,j)]);
fi<-d$fi;
fj<-d$fj;
d<-d$D;
}
r<-d/sqrt(fi*fj*(1-fi)*(1-fj));
cor[i,j]=r^2;
if(!is.na(d) & d>=0) cor[j,i]=d/min(fi*(1-fj),(1-fi)*fj);
if(!is.na(d) & d<0) cor[j,i]=-d/min(fi*fj,(1-fi)*(1-fj));
}
if (length(which.use)==0) {
is.na(cor[j,i])<-TRUE;
is.na(cor[i,j])<-TRUE;
}
}
}
if (data$type=="haplotype") {
min.rec = vector(length=ncol(cor));
min.rec[1]=0;
if (!is.na(cor[2,1]) & cor[2,1]<0.999) {min.rec[2]=1;}
else {min.rec[2]=0;}
for (k in 3:ncol(cor)) {
min.rec[k]=0;
for (i in 2:(k-1)) {
if (!is.na(cor[k,i]) & cor[k,i]<0.999) {r.add=1;}
else {r.add=0;}
if (min.rec[i]+r.add>min.rec[k]) {min.rec[k]=min.rec[i]+r.add;}
}
}
}
if (data$type=="genotype") min.rec<-min.rec.gt(data);
cat(paste("\n\nMinimum number of recombination events (HK85) = ",min.rec[ncol(cor)],"\n\n", sep=""));
return(cor);
}
plotld=function(data,cutoff=0,left=-1,right=-1,cor=c(), col=c(), breaks=c()){
data<-subset.data(data, maf=cutoff);
cat("\n\nCalculating LD matrix");
cor=makeld(data)
if (length(col)==0) {
blues=rgb(0,0,(20:0)/20);
reds=rgb((0:20)/20,0,0);
col=c(blues, reds);
}
if (length(breaks)==0) {
image(data$pos,data$pos,cor,zlim=c(0,1.02),xlab="Position",ylab="Position",
main="LD matrix: r2 - |D'|", col=col)
}
else {
image(data$pos,data$pos,cor,zlim=c(0,1.5),xlab="Position",ylab="Position",
main="LD matrix: r2 - |D'|", col=col, breaks=breaks)
}
}
#########################################################
# Haplotype plot of data
#########################################################
haplotype.plot<-function(data, maf=-1, plot.lims=c(), plot.new=TRUE, col=c(), order=c(), spacing="distance",
type="block", return=FALSE, add.legend=TRUE, cex=1, sort.hap.position=0, f.half=0.2, method="normal") {
if (data$l.seq<2) {
cat("\n\n*** Error: too little data to analyse (S<2) ***\n\n");
}
if (maf>=0 | length(plot.lims)>0) data<-subset.data(data, maf=maf, pos=plot.lims);
if (plot.new==TRUE) {x11();}
if (length(order)==0) {
if (sort.hap.position==0) {d <- dist(data$seq, method="manhattan");}
else {d<-make.weighted.dist(data, position=sort.hap.position, f.half<-f.half, method=method);}
hl<-hclust(d, method="average");
order<-hl$order
}
if (is.na(pmatch(spacing, "distance"))) {pos<-c(1:data$l.seq);}
else {pos<-data$pos;}
#Block like picture
if (is.na(pmatch(type, "block"))==FALSE) {
breaks = c(-1.5,-0.5,0.5,1.5,2.5)
if (length(col)==0) col=c("white", "blue", "yellow","red");
image(x=pos, z=t(data$seq[order,]), col=col, breaks=breaks,
xlab="Position", ylab="Chromosome", main="Haplotype plot", yaxt="n");
}
#Stick and ball picture
else {
plot(0,0,type="n", xlim=c(min(pos), max(pos)), ylim=c(0,(data$n.seq*1.05)), yaxt="n", xlab="Position", bty="n", ylab="");
if (length(col)==0) {col=rep("blue", data$l.seq);}
for (i in 1:data$n.seq) segments(x0=pos[1], y0=i, x1=pos[length(pos)], y1=i, lwd=1, col=grey(0.85));
for (i in 1:data$n.seq) {
snps<-pos[data$seq[order[i],]==1]
points(x=snps, y=rep(i, length(snps)), pch=19, col=col[data$seq[order[i],]==1], cex=cex);
}
if (is.na(pmatch(spacing, "distance")) & add.legend==TRUE) {
segments(x0=pos[1], y0=data$n.seq*1.05, x1=pos[length(pos)], y1=data$n.seq*1.05, lwd=1, col=grey(0.75));
low<-data$pos[1];
high<-data$pos[data$l.seq];
snp.pos<-1+(data$l.seq-1)*(data$pos-low)/(high-low);
segments(snp.pos, data$n.seq*1.04, snp.pos, data$n.seq*1.05, lwd=1, col=col);
segments(pos, data$n.seq+1, snp.pos, data$n.seq*1.03, lwd=1, col="black");
}
}
if (return==TRUE) return(order);
}
#############################################################
# Make distance matrix weighting exponentially from location
#############################################################
make.weighted.dist<-function(data, position=0, f.half=1, method="exponential") {
if (method=="exponential") {
lambda<-log(2)/(f.half*(data$pos[length(data$pos)]-data$pos[1]));
wts<-exp(-lambda*abs(data$pos-position));
}
else {
sig<-f.half*(data$pos[length(data$pos)]-data$pos[1])/sqrt(2*log(2));
wts<-exp(-(data$pos-position)^2/(2*sig^2));
}
d<-matrix(nrow=data$n.seq, ncol=data$n.seq);
for (i in 1:(data$n.seq-1)) for (j in (i+1):data$n.seq) {
d[j,i]<-sum(((data$seq[i,]-data$seq[j,])^2)*wts);
}
return(as.dist(d));
}
#########################################################
#Summarises output from interval program in LDhat2.1
#########################################################
summarise.interval=function(rates.file="rates.txt", burn.in=30, locs.file=FALSE) {
x = read.table(rates.file, skip=1, fill=T);
x = as.matrix(x);
low = as.integer(nrow(x)*burn.in/100);
cat("\n\nSummarise output from MCMC estimation of recombination rates in INTERVAL (LDhat 2.1)\n\n");
cat(paste("Number of SNPs = ", ncol(x), "\n", sep=""));
cat(paste("Number of samples = ", nrow(x), "\n", sep=""));
cat(paste("Burn-in period = ", low, " samples\n", sep=""));
x11();
par(mfrow=c(1,2));
# plot(x[,1], type="s", col=rgb(0,0,0.5), xlab="Sample", ylab="Total map length",
# main="Mixing of total map length");
# image(x=c(1:nrow(x)), y=c(1:(ncol(x)-1)), z=log(x[,2:ncol(x)]), xlab="Sample",
# ylab="log(rate) at SNP", main="Mixing of rates");
means<-apply(x[low:nrow(x),], 2, mean, na.rm=T);
q.95<-apply(x[low:nrow(x),], 2, quantile, probs=c(0.025, 0.5, 0.975), na.rm=T);
cat(paste("\nMean posterior total map length (4Ner) = ", signif(mean(means), 4), "\n", sep=""));
x11();
if (locs.file==FALSE) {pos<-c(1:ncol(x)); xlab<-"Position (SNP)";}
else {pos<-as.vector(as.matrix(read.table(locs.file, as.is=T, skip=1))); xlab<-"Position"}
plot(pos[1:(length(pos)-1)], y=means[2:length(means)], type="s", col=rgb(0,0,0.5),
xlab=xlab, ylab="Posterior mean rate", main="Posterior mean rates");
lines(pos[1:(length(pos)-1)], y=q.95[1,2:length(means)], type="s", col=grey(0.75), lty="dotted");
lines(pos[1:(length(pos)-1)], y=q.95[3,2:length(means)], type="s", col=grey(0.75), lty="dotted");
op<-cbind(means, t(q.95));
colnames(op)<-c("Mean", "q2.5", "Median", "q97.5");
return(op);
}
#########################################################
#summarises output from rhomap program in LDhat2.1
#########################################################
summarise.rhomap<-function(rates.file="rates.txt", burn.in=30, locs.file=FALSE) {
return(summarise.interval(rates.file=rates.file, burn.in=burn.in, locs.file=locs.file));
}
#########################################################
#summarises output from pairwise program in LDhat2.1
#########################################################
summarise.pairwise<-function(surf=TRUE, window=FALSE, rm=FALSE, test=FALSE, ci=FALSE, locs.file=FALSE) {
cat("\n\nSummarising output from PAIRWISE program in LDhat 2.1\n\n");
if (locs.file==TRUE) pos<-as.vector(as.matrix(read.table(locs.file, as.is=T, skip=1)));
if (surf) {
surface = read.table("outfile.txt", skip=10, fill=T);
x11();
plot(surface[,1:2], type="l", col=rgb(0,0,0.5), xlab="4Ner",
ylab="Composite likelihood", main="Likelihood surface for 4Ner over region");
xmax=surface[1,1];
ymax = surface[1,2];
for (i in 2:nrow(surface)) {
if (surface[i,2]>ymax) {ymax=surface[i,2]; xmax=surface[i,1];}
}
cat(paste("Maximum composite likelihood estimate of 4Ner = ", xmax, "\n", sep=""));
}
if (window) {
win = read.table("window_out.txt", skip=5);
x11();
par(mfrow=c(3,1));
plot(x=(win[,1]+win[,2])/2, win[,3], type="n", xlab="Window", ylab="Number SNPs", main="SNP density");
segments(x0=win[,1], y0=win[,3], x1=win[,2], y1=win[,3], col="blue");
plot(x=(win[,1]+win[,2])/2, win[,4], type="n", xlab="Window", ylab="4Ner per kb/bp", main="Local recombination rate");
segments(x0=win[,1], y0=win[,4], x1=win[,2], y1=win[,4], col="red");
av = vector(length=nrow(win));
for(i in 1:nrow(win)) av[i]=xmax/(win[nrow(win),2]-win[1,1]);
lines(x=(win[,1]+win[,2])/2, y=av, col="green");
plot(x=(win[,1]+win[,2])/2, y=win[,5], type="n", xlab="Window midpoint", ylab="CLR", main="Composite likelihood ratio");
segments(x0=win[,1], y0=win[,5], x1=win[,2], y1=win[,5], col="black");
}
if (rm) {
rmin = read.table("rmin.txt", skip=5, fill=T);
rmin= as.matrix(rmin[1:nrow(rmin),2:ncol(rmin)]);
f<-as.matrix(read.table("freqs.txt", as.is=T, skip=5)[,2:6]);
n.chr<-sum(f[1,]);
anc<-apply(f[,2:5], 1, sum);
mx<-apply(f[,2:5], 1, max);
seg<-mx<anc;
rmin<-rmin[seg[1:(length(seg)-1)],];
rmin<-rmin[,seg[2:length(seg)]];
maxrm=0;
maxscrm=0;
for (i in 1:nrow(rmin)) {
if (i<ncol(rmin)) {for (j in (i):ncol(rmin)) {if (rmin[i,j]>maxrm) maxrm=rmin[i,j];}}
if (i>1) {for (j in 1:(i-1)) {if(rmin[i,j]>maxscrm) maxscrm=rmin[i,j];}}
}
xu=matrix(0,nrow=nrow(rmin)+1, ncol=nrow(rmin)+1);
xl=matrix(0,nrow=nrow(rmin)+1, ncol=nrow(rmin)+1);
for (i in 1:nrow(rmin)) {
if (i<ncol(rmin)) {
for (j in (i):ncol(rmin)) {xu[i,j+1]=rmin[i,j]; xl[i,j+1]=0;}
}
if (i>1) {
for (j in 1:(i-1)) {xl[i,j]=rmin[i,j]; xu[i,j+1]=0;}
}
xu[i,i]=0;
xl[i,i]=0;
}
blues = c(rgb(0,0,(15:6)/15));
reds = c(rgb((6:15)/15,0,0));
x11();
par(mfrow=c(2,1));
image(xu, xaxt="n", yaxt="n", xlab="Position", ylab="Position", main="Total recombination",
col=c("white", blues, reds, "yellow"));
image(log(xl+0.001), main="Recombination scaled by distance", xaxt="n", yaxt="n",
xlab="Position", ylab="Position", col=c("white", blues, reds, "yellow"),
breaks=as.vector(quantile(xl, probs=seq(0,1,length.out=23))));
rm(xu);
rm(xl);
rm(rmin);
}
if (test) {
xx<-max(surface[,2]);
rd<-as.matrix(read.table("rdist.txt", skip=4)[,2:5]);
x11();
hist(rd[,2], col="blue", breaks=seq(xlim[1], xlim[2], length.out=20), ,
main="Likelihood permutation test", xlab="Composite Likelihood");
points(xx,1,pch=25, col="red", bg="red");
}
if (ci) {
cat("\n\nNothing implemented yet to display sampling distribution\n\n");
}
}
#########################################################
#Function to plot the results of Jonathan,s ps program
#########################################################
plot.ps.results=function(ps.output) {
x11();
pal = c(rgb(0,0,(15:0)/15), rgb((0:15)/15,0,0));
image(x=c(1:nrow(ps.output$Q)), y=c(1:ncol(ps.output$Q)), z=ps.output$Q, xlab="Individual", ylab="Posterior probability from K",col=pal, main="Posterior probs for individual assignment");
x11();
pal = c("red", "blue", "green", "cyan", "black", "purple", "orange");
cmax = max(ps.output$c);
plot(ps.output$c[,1], type="n", xlab="Sample", ylab="c-parameter", ylim=c(0,cmax), main="Samples of the population c parameters");
for (i in 1:ncol(ps.output$c)) {
lines(ps.output$c[,i], col=pal[i]);
}
}
#########################################################
#Writes LDhat format file from data - need to cope with missing data still
#currently outputs as NA rather than ?/N/n/-
#########################################################
write.ldhat.format = function(x, seq.file="seqs.txt", loc.file="locs.txt", in.kb=TRUE, maxseq=-1, na.char="?") {
if (x$type == "haplotype") {hd=1;}
else {hd=2;}
if ("names" %in% names(x)) {
names<-x$names;
}
else {
names<-paste("seq", 1:x$n.seq, sep="");
}
if (in.kb) {
write(paste(x$l.seq, as.character(max(x$pos)/1000), "L", sep=" "), loc.file);
}
else {
write(paste(x$l.seq, max(x$pos), "L", sep=" "), loc.file);
}
if (in.kb) {write(as.character(x$pos/1000), file=loc.file, append=TRUE, ncolumns=1);}
else {
write(x$pos, loc.file, append=TRUE, ncolumns=1);
}
if (maxseq>0) {
n.seq <- min(maxseq, x$n.seq);
}
else {n.seq <- x$n.seq;}
cat(paste(n.seq, x$l.seq, hd, "\n", sep=" "), file=seq.file);
for (i in 1:n.seq) {
s<-as.character(x$seq[i,]);
s[is.na(s)]<-na.char;
#s[s=="2"]<--1;
#s[s=="1"]<-2;
#s[s=="-1"]<-1;
#cat (paste(">", names[i], "\n", s, "\n", sep="", coll=""), file=seq.file, append=TRUE);
cat (paste(">", names[i], "\n", sep=""), file=seq.file, append=TRUE);
cat (paste(s, sep="", collapse=""), file=seq.file, append=TRUE);
cat ("\n", file=seq.file, append=TRUE);
}
}
#########################################################
#Writes PHASE format file from data - need to cope with missing data still
#currently outputs as NA rather than ?/N/n/-
#########################################################
write.phase.format = function(x, file="data.phase") {
cat(x$pos, sep="\t", file=file);
for (i in 1:x$n.seq) {
cat("\n", file=file, append=T);
cat(x$seq[i,], file=file, append=T);
}
}
#########################################################
#Convert from standard data format to ps format
#########################################################
convert2ps <- function(data) {
y = array(dim=c(data$n.seq, 2, data$l.seq));
if (data$type == "haplotype") {
for (i in 1:data$n.seq) {
for (j in 1:data$l.seq) {
y[i,1,j]=data$seq[i,j];
y[i,2,j]=-1;
}
}
}
else if (data$type == "genotype") {
for (i in 1:data$n.seq) {
for (j in 1:data$l.seq) {
if (data$seq[i,j]==0) {y[i,1,j]=0; y[i,2,j]=0;}
else if (data$seq[i,j]==1) {y[i,1,j]=0; y[i,2,j]=1;}
else if (data$seq[i,j]==2) {y[i,1,j]=1; y[i,2,j]=1;}
else {y[i,1,j]=-1; y[i,2,j]=-1;}
}
}
}
return(y);
}
#########################################################
#To plot haplotype structure around a SNP
#########################################################
plot.snp.hap<-function(data, snp=1, pos=c(), maf=0.0, gap=3, cex=2, scale=10, min.r2=0.2, show=0) {
snppos<-data$pos[snp];
d<-subset.data(data, pos=pos, maf=maf);
snp<-c(1:length(d$pos))[d$pos==snppos];
d.l<-d$seq[,1:(snp-1)];
d.r<-d$seq[,(snp+1):d$l.seq];
f1pos<-sum(d$seq[,snp]);
haps.l<-unique(d.l);
haps.r<-unique(d.r);
nhaps.l<-nrow(haps.l);
nhaps.r<-nrow(haps.r);
dist.l<-dist(haps.l, method="manhattan");
dist.r<-dist(haps.r, method="manhattan");
h.l<-hclust(dist.l, method="average");
h.r<-hclust(dist.r, method="average");
hap.count.l<-matrix(nrow=nhaps.l, ncol=6);
hap.id.l<-matrix(nrow=nhaps.l, ncol=1);
hap.count.r<-matrix(nrow=nhaps.r, ncol=6);
hap.id.r<-matrix(nrow=nhaps.r, ncol=1);
colnames(hap.count.l)<-c("Haplotype", "Count0", "Count1", "Freq", "D", "r2");
colnames(hap.count.r)<-c("Haplotype", "Count0", "Count1", "Freq", "D", "r2");
for (hap in 1:nhaps.l) {
hap.id.l[hap,1]<-paste(haps.l[hap,], sep="", collapse="");
hap.count.l[hap,1]<-hap;
m<-t(d.l)-haps.l[hap,];
mn<-apply(m, 2, min);
mx<-apply(m, 2, max);
id<-c(1:ncol(m))[mn==0 & mx==0];
allele<-d$seq[id,snp];
f1hap<-sum(allele);
fhap<-length(id);
hap.count.l[hap,3]<-f1hap;
hap.count.l[hap,2]<-fhap-f1hap;
hap.count.l[hap,4]<-fhap/d$n.seq;
dnum<-f1hap*d$n.seq-fhap*f1pos;
dden<-f1pos*(d$n.seq-f1pos)*fhap*(d$n.seq-fhap);
hap.count.l[hap,5]<-dnum;
hap.count.l[hap,6]<-dnum*dnum/dden;
}
for (hap in 1:nhaps.r) {
hap.id.r[hap,1]<-paste(haps.r[hap,], sep="", collapse="");
hap.count.r[hap,1]<-hap;
m<-t(d.r)-haps.r[hap,];
mn<-apply(m, 2, min);
mx<-apply(m, 2, max);
id<-c(1:ncol(m))[mn==0 & mx==0];
allele<-d$seq[id,snp];
f1hap<-sum(allele);
fhap<-length(id);
hap.count.r[hap,3]<-f1hap;
hap.count.r[hap,2]<-fhap-f1hap;
hap.count.r[hap,4]<-fhap/d$n.seq;
dnum<-f1hap*d$n.seq-fhap*f1pos;
dden<-f1pos*(d$n.seq-f1pos)*fhap*(d$n.seq-fhap);
hap.count.r[hap,5]<-dnum;
hap.count.r[hap,6]<-dnum*dnum/dden;
}
plot(0,0,type="n", xlab="", ylab="", xaxt="n", yaxt="n",
xlim=c(0,d$l.seq+1+2*gap), ylim=c(0,max(nhaps.l, nhaps.r)), bty="n");
pal<-c("white", "black");
mpt<-max(nhaps.l, nhaps.r)/2;
for (hap in 1:nhaps.l) if (hap.count.l[hap,6]>min.r2) {
col<-"red";
if (show==0 & hap.count.l[h.l$order[hap],5]>0) {col<-"blue"};
if (show==1 & hap.count.l[h.l$order[hap],5]<0) {col<-"blue"};
segments(x0=snp+gap, y0=mpt, x1=snp-1, y1=hap, lwd=scale*hap.count.l[h.l$order[hap],6], col=col);
}
for (hap in 1:nhaps.r) if (hap.count.r[hap,6]>min.r2) {
col<-"red";
if (show==0 & hap.count.r[h.r$order[hap],5]>0) {col<-"blue"};
if (show==1 & hap.count.r[h.r$order[hap],5]<0) {col<-"blue"};
segments(x0=snp+gap, y0=mpt, x1=snp+2*gap, y1=hap, lwd=scale*hap.count.r[h.r$order[hap],6], col=col);
}
for (hap in 1:nhaps.l) {
lines(x=c(1, ncol(haps.l)), y=c(hap, hap));
points(x=c(1:ncol(haps.l)), y=rep(hap, ncol(haps.l)), pch=21, bg=pal[haps.l[h.l$order[hap],]+1], cex=cex);
}
for (hap in 1:nhaps.r) {
lines(x=c(snp+2*gap+1, ncol(haps.r)+snp+2*gap), y=c(hap, hap));
points(x=c(1:ncol(haps.r))+snp+2*gap, y=rep(hap, ncol(haps.r)), pch=21, bg=pal[haps.r[h.r$order[hap],]+1], cex=cex);
}
points(snp+gap, mpt, pch=21, bg=pal[show+1], cex=cex*2);
return(list(hapID.L=hap.id.l, hapCT.L=hap.count.l, hapID.R=hap.id.r, hapCT.R=hap.count.r));
}
#########################################################
# Make a list of all descenddant of a node
#########################################################
find.descendants<-function(node, nodes, descendant.list) {
if (nodes[node,3]==0) {descendant.list<-c(descendant.list, node);}
else {
descendant.list<-find.descendants(nodes[node,3], nodes, descendant.list);
descendant.list<-find.descendants(nodes[node,4], nodes, descendant.list);
}
return(descendant.list);
}
#########################################################
# Find time underneath a node
#########################################################
node.time<-function(nodes, node, time) {
if (nodes[node,3]>0) {
time<-time+nodes[node,2]-nodes[nodes[node,3],2];
time<-node.time(nodes, nodes[node,3], time);
time<-time+nodes[node,2]-nodes[nodes[node,4],2];
time<-node.time(nodes, nodes[node,4], time);
}
return(time);
}
#########################################################
# Choose a node to mutate on the basis of a cumulative time
#########################################################
choose.node<-function(nodes, ctime, mrca) {
parent<-match(c(1:(mrca-1)), nodes[,3:4])%%mrca;
parent[parent==0]=mrca;
dtime<-nodes[parent,2]-nodes[1:(mrca-1),2];
for (node in 1:(mrca-1)) {
ctime<-ctime-dtime[node];
if (ctime<=0) return(node);
}
cat("\n\n*** Error: gone beyond end of nodes ***\n\n");
}
#########################################################
# Define order
#########################################################
order.seqs<-function(data, node=nrow(data), ord=c()) {
if (data[node,3]==0) {
ord[length(ord)+1]<-node;
}
else {
ord<-order.seqs(data=data, node=data[node, 3], ord);
ord<-order.seqs(data=data, node=data[node, 4], ord);
}
return(ord);
}
#########################################################
# To plot a tree vertically
# Data is
# Col1 = name
# Col2 = Time
# COl3 = D1
# Col4 = No. mutations
#########################################################
plot.tree<-function(data, maxdepth=-1, add.mutations=FALSE, y.low=0, names.plot=TRUE, lwd=1, cex.mtn=0.5, plot.order=c(),cex.txt=0.5, srt.txt=90, col=c()) {
nseq<-(nrow(data)+1)/2;
mrca = nrow(data);
if (length(plot.order)==0) plot.order<-order.seqs(data);
if (maxdepth<0) maxdepth=max(data[,2]);
plot(0,0, type="n", xaxt="n", xlab="", ylab="Time", bty="n", ylim=c(y.low,maxdepth), xlim=c(0,nseq+1));
branch.pos<-order(plot.order);
for (node in (nseq+1):mrca) {
branch.pos[node]<-(branch.pos[data[node,3]]+branch.pos[data[node,4]])/2;
#DO horizontal line
segments(x0=branch.pos[data[node,3]], y0=data[node,2], x1=branch.pos[data[node,4]], y1=data[node, 2], lwd=lwd);
#DO LH vertical
segments(branch.pos[data[node,3]], data[data[node,3],2], branch.pos[data[node,3]], data[node, 2], lwd=lwd);
if (add.mutations==T && data[data[node,3],5]>0) {
time.span<-c(data[data[node,3],2], data[node,2]);
y.vals<-time.span[1]+runif(data[data[node,3],5])*(time.span[2]-time.span[1]);
points(rep(branch.pos[data[node,3]], data[data[node,3],5]), y.vals, pch=19, col=rgb(0,0,0.5), cex=cex.mtn);
}
#DO RH vertical
segments(branch.pos[data[node,4]], data[data[node,4],2], branch.pos[data[node,4]], data[node, 2], lwd=lwd);
if (add.mutations==T && data[data[node,4],5]>0) {
time.span<-c(data[data[node,4],2], data[node,2]);
y.vals<-time.span[1]+runif(data[data[node,4],5])*(time.span[2]-time.span[1]);
points(rep(branch.pos[data[node,4]], data[data[node,4],5]), y.vals, pch=19, col=rgb(0,0,0.5), cex=cex.mtn);
}
}
if (names.plot==TRUE & length(col)==0) {
text(x=c(1:nseq), y=rep(0, nseq), labels=data[plot.order,1], srt=srt.txt, cex=cex.txt, pos=1)
}
if (length(col)==nseq) {
points(x=c(1:nseq), y=rep(0, nseq), pch=19, col=col[plot.order]);
}
return(plot.order)
}
#########################################################
# To plot a tree horizontally
# Data is
# Col1 = name
# Col2 = Time
# COl3 = D1
# Col4 = No. mutations
#########################################################
plot.tree.horizontal<-function(data, maxdepth=-1, add.mutations=FALSE, y.low=0, names.plot=TRUE, lwd=1, cex.mtn=0.5) {
nseq<-(nrow(data)+1)/2;
mrca = nrow(data);
plot.order<-order.seqs(data);
if (maxdepth<0) maxdepth=max(data[,2]);
plot(0,0, type="n", yaxt="n", xlab="Time", ylab="", bty="n", xlim=c(y.low,maxdepth), ylim=c(0,nseq+1));
branch.pos<-order(plot.order);
for (node in (nseq+1):mrca) {
branch.pos[node]<-(branch.pos[data[node,3]]+branch.pos[data[node,4]])/2;
#DO horizontal line
segments(y0=branch.pos[data[node,3]], x0=data[node,2], y1=branch.pos[data[node,4]], x1=data[node, 2], lwd=lwd);
#DO LH vertical
segments(data[data[node,3],2], branch.pos[data[node,3]], data[node, 2], branch.pos[data[node,3]], lwd=lwd);
if (add.mutations==T && data[data[node,3],5]>0) {
time.span<-c(data[data[node,3],2], data[node,2]);
x.vals<-time.span[1]+runif(data[data[node,3],5])*(time.span[2]-time.span[1]);
points(y=rep(branch.pos[data[node,3]], data[data[node,3],5]), x=x.vals, pch=19, col=rgb(0,0,0.5), cex=cex.mtn);
}
#DO RH vertical
segments(data[data[node,4],2], branch.pos[data[node,4]], data[node, 2], branch.pos[data[node,4]], lwd=lwd);
if (add.mutations==T && data[data[node,4],5]>0) {
time.span<-c(data[data[node,4],2], data[node,2]);
x.vals<-time.span[1]+runif(data[data[node,4],5])*(time.span[2]-time.span[1]);
points(y=rep(branch.pos[data[node,4]], data[data[node,4],5]), x=x.vals, pch=19, col=rgb(0,0,0.5), cex=cex.mtn);
}
}
if (names.plot==TRUE) {
text(y=c(1:nseq), x=rep(0, nseq), labels=data[plot.order,1], srt=0, pos=2, cex=0.5)
}
return(plot.order)
}
#########################################################
# To simulate data under the coalescent
#########################################################
simulate.coalescent<-function(sample=10, theta=10, sites=10, rho=0, rmap=c(), nrun=1, plot.tree=TRUE, return=TRUE, n.tree=5, all.seg=TRUE, inf.sites=TRUE) {
cat("\n\n#####################################################\n");
cat(" Coalescent simulation of data\n");
cat(paste(" samples = ",sample,"\n",sep=""));
cat("#####################################################\n\n");
if (sample>1000) {
cat("\n\nToo many samples: please use n<=100\n\n");
}
if (rho>100) {
cat("\n\nToo much recombination: please use rho<=100\n\n");
}
if (length(rmap)>0) if (rmap[length(rmap)]>100) {
cat("\n\nToo much recombination: please use rho<=100\n\n");
}
#No recombination
if (length(rmap)==0 && rho<1e-6) {
cat(paste("\n\nNo recombination: infinite-sites theta = ",theta,"\n",sep=""));
nodes<-matrix(nrow=2*sample-1, ncol=5);
colnames(nodes)<-c("Name", "Time", "D1", "D2", "Mutations");
nodes[,1]<-c(1:nrow(nodes));
nodes[,2:5]<-0;
k<-sample;
klist<-c(1:sample);
time<-0;
current.node<-k+1;
while(k>1) {
rate<-k*(k-1)/2;
dt<--log(runif(1))/rate;
time<-time+dt;
l1<-ceiling(runif(1)*k);
tmp<-klist[l1];
klist[l1]<-klist[k];
klist[k]<-tmp;
l2<-ceiling(runif(1)*(k-1));
nodes[current.node,2]<-time;
nodes[current.node,3]<-klist[k];
nodes[current.node,4]<-klist[l2];
nodes[nodes[current.node,3],5]<-rpois(1, theta*(time-nodes[nodes[current.node,3],2])/2);
nodes[nodes[current.node,4],5]<-rpois(1, theta*(time-nodes[nodes[current.node,4],2])/2);
klist[l2]<-current.node;
current.node<-current.node+1;
k<-(k-1);
klist<-klist[1:k];
}
n.mtns<-sum(nodes[,5]);
mut.list<-runif(n.mtns);
seqs<-matrix(0, nrow=sample, ncol=n.mtns);
cum.mut<-1;
for (node in 1:(nrow(nodes)-1)) if (nodes[node,5]>0) {
who.list<-find.descendants(node, nodes, c());
for (mut in 1:nodes[node,5]) {
seqs[who.list,cum.mut]=1;
cum.mut<-cum.mut+1;
}
}
seqs<-seqs[,order(mut.list)];
mut.list<-mut.list[order(mut.list)];
if (plot.tree == TRUE) plot.tree(nodes, add.mutations=T);
if (return==TRUE) return(list(tree=nodes,
data=list(type="haplotype", n.seq=sample, l.seq=n.mtns, pos=mut.list, names=c(1:sample), seq=seqs)
));
}
#With recombination, use finfast type
else {
cat(paste("\n\nWith recombination: sites = ",sites,"\n",sep=""));
if (length(rmap)>0) {cat(paste("Using Rmap: length = ",rmap[length(rmap)],"\n",sep=""));}
else {cat(paste("Assuming constant rec rate: rho = ",rho,"\n",sep=""));}
nodes<-array(dim=c(sites, 2*sample-1, 5));
dimnames(nodes)<-list(c(1:sites),c(1:(2*sample-1)),
c("Name", "Time", "D1", "D2", "Mutations"));
for (site in 1:sites) {
nodes[site,,1]<-c(1:(2*sample-1));
nodes[site,,2:5]<-0.0;
}
k<-sample;
klist<-c(1:sample);
anc<-matrix(ncol=sample, nrow=sites);
for (i in 1:sites) anc[i,]<-c(1:sample);
time<-0;
if (length(rmap)==0) rmap<-c(0:(sites-1))*rho/(sites-1);
current.nodes<-rep(sample+1,sites); #Ref to current node at each site
rho.lin<-cbind(rep(1,sample), rep(sites, sample), rep(rmap[sites]-rmap[1],sample)); #Rho for each lineage
colnames(rho.lin)=c("Lower", "Upper", "Rho");
n.rec<-0;
mrca<-2*sample-1;
new.lin<-sample+1;
while(k>1) {
rho.tot<-sum(rho.lin[,3]);
rate<-k*(k-1)/2+rho.tot/2;
dt<--log(runif(1))/rate;
time<-time+dt;
#Recombination
if (runif(1)<rho.tot/(rho.tot+k*(k-1))) {
n.rec<-n.rec+1;
cum.rec<-cumsum(rho.lin[,3]);
tmp<-runif(1)*rho.tot;
l1<-c(1:k)[cum.rec>tmp][1];
cum.rec<-rmap[anc[,l1]>0]
tmp<-runif(1)*rho.lin[l1,3];
pos<-rho.lin[l1,1]+c(1:sites)[cum.rec-rmap[rho.lin[l1,1]]>tmp][1]-1;
# cat(paste("\nRecombination on Lin = ", l1, " start = ", rho.lin[l1,1],
# " end = ", rho.lin[l1,2], " pos = ", pos, sep=""));
klist<-c(klist,new.lin);
new.lin<-new.lin+1;
anc<-cbind(anc, anc[,l1]);
anc[(1:sites)>=pos,l1]=0
anc[(1:sites)<pos,k+1]=0
rho.lin<-rbind(rho.lin, rho.lin[l1,]);
rho.lin[k+1,2]=rho.lin[l1,2];
rho.lin[k+1,1]=min(c(1:sites)[anc[,k+1]>0]);
rho.lin[k+1,3]=rmap[rho.lin[k+1,2]]-rmap[rho.lin[k+1,1]]
rho.lin[l1,2]=max(c(1:sites)[anc[,l1]>0]);
rho.lin[l1,3]=rmap[rho.lin[l1,2]]-rmap[rho.lin[l1,1]]
k<-k+1;
}
#Coalescent
else {
l1<-ceiling(runif(1)*k);
#First move chosen lineage to end of klist, anc and rho.lin
tmp<-klist[l1];
klist[l1]<-klist[k];
klist[k]<-tmp;
tmp<-anc[,l1];
anc[,l1]<-anc[,k];
anc[,k]<-tmp;
tmp<-rho.lin[l1,];
rho.lin[l1,]<-rho.lin[k,];
rho.lin[k,]<-tmp;
l2<-ceiling(runif(1)*(k-1));
new.nodes<-apply(anc[,c(k,l2)], 1, max);
is.co<-c(1:sites)[apply(anc[,c(k,l2)], 1, min)>0];
co.nodes<-current.nodes[is.co];
for (co in 1:length(is.co)) {
nodes[is.co[co],co.nodes[co],2]<-time;
nodes[is.co[co],co.nodes[co],3]<-anc[is.co[co],k];
nodes[is.co[co],co.nodes[co],4]<-anc[is.co[co],l2];
}
new.nodes[is.co]<-co.nodes;
anc[,l2]<-new.nodes;
current.nodes[is.co]<-co.nodes+1;
# cat(paste("\nCoalescent between lineages ", l1, " and ", l2,sep=""));
k<-k-1;
klist<-klist[1:k];
anc<-anc[,1:k];
rho.lin<-rho.lin[1:k,];
#Remove lineages at MRCA
if(k>1) {
anc[anc[,l2]==mrca,l2]<-0;
if (sum(anc[,l2])>0) {
rho.lin[l2,1]<-min(c(1:sites)[anc[,l2]>0]);
rho.lin[l2,2]<-max(c(1:sites)[anc[,l2]>0]);
if (rho.lin[l2,1]<rho.lin[l2,2]) {
rho.lin[l2,3]<-rmap[rho.lin[l2,2]]-rmap[rho.lin[l2,1]];
}
else {rho.lin[l2,3]<-0;}
}
else {
klist[k]<-klist[l2];
anc[,l2]<-anc[,k];
rho.lin[l2,]<-rho.lin[k,];
k<-k-1;
klist<-klist[1:k];
anc<-anc[,1:k];
rho.lin<-rho.lin[1:k,];
}
}
}
}
cat(paste("\nTotal of ",n.rec," recombination events\n\n",sep=""));
if (plot.tree == TRUE) {
par(mfrow=c(1,n.tree));
pos<-ceiling(c(1:n.tree)*sites/(n.tree+1));
mx<-max(nodes[pos,,2]);
for (tree in 1:n.tree) {
plot.tree(nodes[pos[tree],,], maxdepth=mx, add.m=TRUE);
title(main=paste("Position ", pos[tree], sep=""));
}
cat(paste("\nPrinting trees at position", pos, "\n", sep=" "));
par(mfrow=c(1,1));
}
seqs<-matrix(0, nrow=sample, ncol=sites);
is.seg<-c(1:sites);
times.sites<-rep(0,sites);
for (site in 1:sites) times.sites[site]<-node.time(nodes[site,,], mrca, 0);
if (all.seg == FALSE) {
for (site in 1:sites) {
if (runif(1)<exp(-theta*times.sites[site]/(2*sites))) is.seg[site]<-0;
}
cat(paste("Total of ", sum(is.seg>0), " sites segregating\n", sep=""));
}
else {
cat(paste("All sites (", sites, ") segregating\n", sep=""));
}
for (site in 1:sites) if (is.seg[site]>0) {
node<-choose.node(nodes[site,,], runif(1)*times.sites[site], mrca);
who.list<-find.descendants(node, nodes[site,,], c());
seqs[who.list,site]=1;
}
if (return==TRUE) return(list(tree=nodes,
data=list(type="haplotype", n.seq=sample, l.seq=sum(is.seg>0),
pos=c(1:sites)[is.seg>0], names=c(1:sample), seq=seqs[,is.seg>0])));
}
}
#########################################################################
#To simulate new haplotypes from existing ones using LS method (approx)
#########################################################################
simulate.seqs.LS<-function(data, rho=0, rmap=c(), n.seq=1, theta=0.001) {
h<-data$seq;
if (length(rmap)==0) rmap=data$pos/(max(data$pos)-min(data$pos))*rho;
rmap=1-exp(-diff(rmap)/data$n.seq);
p.ident<-exp(-theta/data$n.seq);
seqs.new<-matrix(nrow=n.seq, ncol=data$l.seq);
h.new<-vector(length=data$l.seq);
for (i in 1:n.seq) {
states<-as.integer(runif(data$l.seq)*data$n.seq)+1;
switch<-runif(data$l.seq-1)<=rmap;
mutate<-runif(data$l.seq)>p.ident;
h.new[1]<-states[1];
for (j in 1:length(switch)) {
if (switch[j]==TRUE) {h.new[j+1]<-states[j+1];}
else {h.new[j+1]<-h.new[j];}
}
for (j in 1:data$l.seq) {
if (mutate[j]==FALSE) {seqs.new[i,j]<-h[h.new[j],j];}
else {seqs.new[i,j]<-1-h[h.new[j],j];}
}
}
return(list(type="haplotype", n.seq=n.seq, l.seq=data$l.seq,
pos=data$pos, names=c(1:n.seq), seq=seqs.new));
}
#########################################################
# Some example coalescent simulations
#########################################################
example.sims<-function(value=1, seed=235654) {
set.seed(seed);
if (value==1) {
cat("\n\nData simulated with no recombination\n\n");
s<-simulate.coalescent(sample=50, sites=50);
summarise.data(s$data);
}
else if (value==2) {
cat("\n\nData simulated with intermediate recombination\n\n");
s<-simulate.coalescent(rho=10, sample=50, sites=50);
summarise.data(s$data);
}
else if (value==3) {
cat("\n\nData simulated with high recombination\n\n");
s<-simulate.coalescent(rho=100, sample=50, sites=50);
summarise.data(s$data);
}
else if (value==4) {
cat("\n\nData simulated with a hotspot model\n\n");
s<-simulate.coalescent(rmap=c(rep(0,25), rep(50,25)), sample=50, sites=50);
summarise.data(s$data);
}
return(s);
}
###########################################################
# Count the number of haplotypes and haplotype homozygosity
###########################################################
count.haps<-function(data) {
s<-data$seq;
d<-dist(s, method="manhattan");
hc<-hclust(d, method="average")$order;
d<-as.matrix(d);
d<-d[hc,hc];
n.hap<-1;
n.hap.i<-1;
pairs.ident<-0;
last.hap<-1;
for (j in 2:nrow(d)) {
if (d[last.hap,j]==0) {n.hap.i<-n.hap.i+1;}
else {
n.hap<-n.hap+1;
pairs.ident<-pairs.ident+n.hap.i*(n.hap.i-1)/2;
n.hap.i<-1;
last.hap<-j;
}
}
cat(paste("\n\nNumber of sequences = ", data$n.seq, sep=""));
cat(paste("\nNumber of haplotypes = ", n.hap, sep=""));
cat(paste("\n\nHaplotype homozygosity = ", pairs.ident*2/(data$n.seq*(data$n.seq-1)),
"\n\n", sep=""));
}
#############################################################
# To simulate 2-population data from beta-binomial model
#############################################################
simulate.bb<-function(n.seq=c(10,10), l.seq=10, fst=0.01, f.min=0.01) {
#Location of SNPs
pos<-c(1:l.seq);
#Allele frequencies in ancestor: drawn from 1/x distribution with min = f.min
anc.freq<-exp(runif(l.seq)*log(f.min));
alpha<-anc.freq/(fst*(1+anc.freq));
beta<-alpha*(1-anc.freq)/anc.freq;
#Allele frequencies in each population
pop1.freq<-rbeta(l.seq, alpha, beta);
pop2.freq<-rbeta(l.seq, alpha, beta);
#Haplotypes for each population
pop1.haps<-matrix(nrow=n.seq[1], ncol=l.seq);
for (i in 1:n.seq[1]) pop1.haps[i,]<-rbinom(l.seq, 1, pop1.freq);
pop2.haps<-matrix(nrow=n.seq[2], ncol=l.seq);
for (i in 1:n.seq[2]) pop2.haps[i,]<-rbinom(l.seq, 1, pop2.freq);
return(list(type="haplotype", n.seq=sum(n.seq), l.seq=l.seq, pos=pos,
names=c(1:sum(n.seq)), seq=rbind(pop1.haps, pop2.haps)));
}
############################################################################
# To simulate data under the coalescent with n popns and single split time
############################################################################
simulate.coalescent.split<-function(sample=10, theta=10, sites=10, rho=0, rmap=c(), nrun=1,
plot.tree=TRUE, return=TRUE, n.tree=5, all.seg=TRUE, inf.sites=TRUE,
pop.assign=c(), split.time=0.0) {
cat("\n\n#####################################################\n");
cat(" Coalescent simulation of data\n");
cat(paste(" samples = ",sample,"\n",sep=""));
cat("#####################################################\n\n");
if (sample>100) {
cat("\n\nToo many samples: please use n<=100\n\n");
}
if (rho>100) {
cat("\n\nToo much recombination: please use rho<=100\n\n");
}
if (length(rmap)>0) if (rmap[length(rmap)]>100) {
cat("\n\nToo much recombination: please use rho<=100\n\n");
}
if (length(pop.assign)==0) {
pop.assign<-rep(1, sample);
}
#No recombination
if (length(rmap)==0 && rho<1e-6) {
cat(paste("\n\nNo recombination: infinite-sites theta = ",theta,"\n",sep=""));
nodes<-matrix(nrow=2*sample-1, ncol=5);
colnames(nodes)<-c("Name", "Time", "D1", "D2", "Mutations");
nodes[,1]<-c(1:nrow(nodes));
nodes[1:sample,1]<-pop.assign;
nodes[,2:5]<-0;
k<-sample;
klist<-c(1:sample);
time<-0;
current.node<-k+1;
while(k>1) {
rate<-k*(k-1)/2;
dt<--log(runif(1))/rate;
time<-time+dt;
if (time>split.time) pop.assign<-rep(1, k);
#Choose a first lineage and move to end of klist
l1<-ceiling(runif(1)*k);
tmp<-klist[l1];
klist[l1]<-klist[k];
klist[k]<-tmp;
tmp<-pop.assign[l1];
pop.assign[l1]<-pop.assign[k];
pop.assign[k]<-tmp;
#Choose a second lineage
l2<-ceiling(runif(1)*(k-1));
if (pop.assign[k]==pop.assign[l2]) {
nodes[current.node,2]<-time;
nodes[current.node,3]<-klist[k];
nodes[current.node,4]<-klist[l2];
nodes[nodes[current.node,3],5]<-rpois(1, theta*dt/2);
nodes[nodes[current.node,4],5]<-rpois(1, theta*dt/2);
klist[l2]<-current.node;
current.node<-current.node+1;
k<-(k-1);
klist<-klist[1:k];
pop.assign<-pop.assign[1:k];
}
}
n.mtns<-sum(nodes[,5]);
mut.list<-runif(n.mtns);
seqs<-matrix(0, nrow=sample, ncol=n.mtns);
cum.mut<-1;
for (node in 1:(nrow(nodes)-1)) if (nodes[node,5]>0) {
who.list<-find.descendants(node, nodes, c());
for (mut in 1:nodes[node,5]) {
seqs[who.list,cum.mut]=1;
cum.mut<-cum.mut+1;
}
}
seqs<-seqs[,order(mut.list)];
mut.list<-mut.list[order(mut.list)];
if (plot.tree == TRUE) plot.tree(nodes, add.mutations=T);
if (return==TRUE) return(list(tree=nodes,
data=list(type="haplotype", n.seq=sample, l.seq=n.mtns, pos=mut.list, names=c(1:sample), seq=seqs)
));
}
else {
cat("\n\nNot yet imlpemented recombination and pop division\n\n");
}
}
######################################################################
# To read in data dumped from HapMap web-site
######################################################################
read.hapmap.data<-function(file="dumped_region.txt", maf=FALSE, convert.to.binary=TRUE) {
data<-read.table(file, as.is=T, fill=T, sep="-");
l<-nrow(data);
snp<-c(1:l)[data[,1]=="snps:"];
hap<-c(1:l)[data[,1]=="phased_haplotypes:"];
snps<-data[(snp+1):(hap-1),2];
n.snp<-length(snps);
pos<-c();
snp.id<-c();
snps<-strsplit(snps, ":");
for (i in 1:n.snp) {
pos[i]<-as.numeric(snps[[i]][2]);
snp.id[i]<-strsplit(snps[[i]][1], " ")[[1]][2];
}
haps<-data[(hap+1):nrow(data),2];
n.haps<-length(haps);
haps<-strsplit(haps, ":");
seqs.char<-c();
names<-c();
for (i in 1:n.haps) {
names[i]<-haps[[i]][1];
seqs.char[i]<-haps[[i]][2];
}
#Convert characters to values A=1, C=2, G=3, T=4
seqs<-matrix(nrow=n.haps, ncol=n.snp);
for (i in 1:n.haps) {
hap<-strsplit(seqs.char[i], "")[[1]];
hap<-hap[2:length(hap)];
base<-c(1:n.snp)[hap=="A"];
seqs[i,base]<-1;
base<-c(1:n.snp)[hap=="C"];
seqs[i,base]<-2;
base<-c(1:n.snp)[hap=="G"];
seqs[i,base]<-3;
base<-c(1:n.snp)[hap=="T"];
seqs[i,base]<-4;
}
#Convert to 0/1
if (convert.to.binary==TRUE) {
mn<-apply(seqs, 2, min);
for (i in 1:n.snp) {
seqs[,i]<-seqs[,i]==mn[i]
}
}
#Convert 0 to major allele - best not to really
if (maf==TRUE) {
sm<-apply(seqs, 2, sum);
for (i in 1:n.snp) {
if (sm[i]>n.haps/2) seqs[,i]<-1-seqs[,i];
}
}
data<-list(type="haplotype", n.seq=n.haps, l.seq=n.snp, pos=pos, snp.id=snp.id, names=names, seq=seqs);
cat(paste("\n\nRead ", n.haps, " sequences of length ", n.snp, "\n\n", sep=""));
return(data);
}
######################################################################
# To read in data from multiple populations dumped from HapMap web-site
######################################################################
read.combined.hapmap.data<-function(file.list=c("dumped_region.ceu.txt", "dumped_region.yri.txt","dumped_region.jpt+chb.txt"), convert.to.binary=TRUE) {
d<-list();
for (i in 1:length(file.list)) d[[i]]<-read.hapmap.data(file.list[i], convert=F);
n.snp<-d[[1]]$l.seq;
for (i in 2:length(file.list)) if (d[[i]]$l.seq != n.snp) {
cat("\n\nDifferent numbers of snps in data sets\n\n");
return();
}
#Combine data
seqs<-d[[1]]$seq;
for (i in 2:length(file.list)) seqs<-rbind(seqs, d[[i]]$seq);
#Convert to 0/1
if (convert.to.binary==TRUE) {
mn<-apply(seqs, 2, min);
for (i in 1:n.snp) {
seqs[,i]<-seqs[,i]==mn[i]
}
}
#Combine names;
names<-d[[1]]$names;
tot.haps<-d[[1]]$n.seq;
for (i in 2:length(file.list)) {
names<-c(names, d[[i]]$names);
tot.haps<-tot.haps+d[[i]]$n.seq;
}
return(list(type="haplotype", n.seq=tot.haps, l.seq=n.snp, pos=d[[1]]$pos, snp.id=d[[1]]$snp.id, names=names, seq=seqs));
}
######################################################################
# To calculate various selection statistics for a data set
######################################################################
selection.statistics<-function(data, derived=TRUE, rho=-1, fixed=FALSE, ehh.position=-1, f.cut=0.05, f.half=0.1) {
ns <-data$n.seq;
s <- data$l.seq;
d <- dist(data$seq, method="manhattan");
pwd = 2*sum(d)/(data$n.seq*(data$n.seq-1));
f<-apply(data$seq, 2, sum);
n.sing<-sum(f==1);
if (derived == FALSE) n.sing<-n.sing + sum(f==(data$n.seq-1));
con=vector(length=10);
con[1]=sum(1/c(1:(ns-1)));
con[2]=sum((1/c(1:(ns-1)))^2);
con[3]=(ns+1)/(3*ns-3);
con[4]=2*(ns*ns+ns+3)/(9*ns*ns-9*ns);
con[5]=con[3]-1/con[1];
con[6]=con[4]-(ns+2)/(ns*con[1])+con[2]/(con[1]*con[1]);
con[7]=con[5]/con[1];
con[8]=con[6]/(con[1]*con[1]+con[2]);
con[9]=2*(ns*con[1]-2*(ns-1))/((ns-1)*(ns-2));
con[10]=con[9]+(ns-2)/((ns-1)*(ns-1))+((2/(ns-1))*(1.5-(2*(con[1]+1/ns)-3)/(ns-2)-1/ns));
con[11]=(ns*ns*con[2]/((ns-1)*(ns-1))+con[1]*con[1]*con[10]-2*ns*con[1]*(con[1]+1)/((ns-1)*(ns-1)))/(con[1]*con[1]+con[2]); con[12]=ns/(ns-1)*(con[1]-ns/(ns-1))-con[11];
if (derived==TRUE) {
f<-snp.frequency(data, maf=FALSE)*data$n.seq;
h<-hist(f[f>0 & f<data$n.seq], breaks=c(0:(data$n.seq-1)), plot=F)$counts;
theta.der<-2*sum(h*c(1:(data$n.seq-1))^2)/(data$n.seq*(data$n.seq-1));
}
if (derived == FALSE) {
return(list(n.seq=ns, s=s, pi=round(pwd,2), theta.Watterson=round(data$l.seq/con[1],2),
theta.singleton = n.sing,
Tajima.D= round((pwd-s/con[1])/sqrt(con[7]*s+con[8]*s*(s-1)),2),
FuLi.D = round((ns/(ns-1)*s-con[1]*n.sing)/sqrt(con[12]*s+con[11]*s*s),2)));
}
if (ehh.position>0 & fixed==FALSE) {
pos<-which(data$pos==ehh.position);
iehh<-calculate.iehh(data=data, snp.pos=pos, rho=rho, f.cut=f.cut, f.half=f.half);
return(list(n.seq=ns, s=s, pi=round(pwd,2), theta.Watterson=round(data$l.seq/con[1],2),
theta.derived = round(theta.der,2), theta.singleton = n.sing,
Tajima.D= round((pwd-s/con[1])/sqrt(con[7]*s+con[8]*s*(s-1)),2),
FuLi.D = round((ns/(ns-1)*s-con[1]*n.sing)/sqrt(con[12]*s+con[11]*s*s),2),
FayWu.H = round(pwd,2)-round(theta.der, 2),
ehh.position=ehh.position, ehh.freq = sum(data$seq[,pos])/data$n.seq, iehh=iehh));
}
if (ehh.position<1 | fixed==TRUE) {
return(list(n.seq=ns, s=s, pi=round(pwd,2), theta.Watterson=round(data$l.seq/con[1],2),
theta.derived = round(theta.der,2), theta.singleton = n.sing,
Tajima.D= round((pwd-s/con[1])/sqrt(con[7]*s+con[8]*s*(s-1)),2),
FuLi.D = round((ns/(ns-1)*s-con[1]*n.sing)/sqrt(con[12]*s+con[11]*s*s),2),
FayWu.H = round(pwd,2)-round(theta.der, 2)));
}
}
#Calulate Integrated EHH and plot
calculate.iehh<-function(data, snp.pos=1, rho=-1, der.allele=1, anc.allele=0, f.cut=0.05, plot.order=c(), f.half=0.1) {
if (length(rho)==1) {
rmap=(data$pos-data$pos[1])/(data$pos[data$l.seq]-data$pos[1]);
if (rho>0) rmap<-rmap*rho;
}
list.der<-which(data$seq[,snp.pos]==der.allele);
seq.der<-data$seq[data$seq[,snp.pos]==der.allele,];
f.der<-apply(seq.der, 2, mean);
seq.anc<-data$seq[data$seq[,snp.pos]==anc.allele,];
f.anc<-apply(seq.anc, 2, mean);
#Set all derived mutations polymoprhic on der background only to 0
#seq.der[,f.der>0 & f.der<1 & f.anc==0]<-0;
#Indetify core set left and right from selected SNP
core<-rep(0, data$l.seq);
core[snp.pos]<-1;
#Set up output sequences
seq2<-data$seq;
seq2[list.der, snp.pos]<-seq2[list.der, snp.pos]+1.5;
iehh<-0;
#Left
which.core<-c(1:nrow(seq.der));
pos<-snp.pos-1;
while (length(which.core)>1 & pos>0) {
f<-mean(seq.der[which.core,pos]);
if (f<0.5) core[pos]<-0;
if (f>=0.5) core[pos]<-1;
which.core<-which.core[seq.der[which.core,pos]==core[pos]];
if (length(which.core)/data$n.seq < f.cut) which.core<-c();
seq2[list.der[which.core], pos]<-seq2[list.der[which.core], pos]+0.5;
iehh<-iehh+length(which.core)*(rmap[pos+1]-rmap[pos]);
pos<-pos-1;
}
#Right
which.core<-c(1:nrow(seq.der));
pos<-snp.pos+1;
while (length(which.core)>1 & pos<=data$l.seq) {
f<-mean(seq.der[which.core,pos]);
if (f<0.5) core[pos]<-0;
if (f>=0.5) core[pos]<-1;
which.core<-which.core[seq.der[which.core,pos]==core[pos]];
if (length(which.core)/data$n.seq < f.cut) which.core<-c();
seq2[list.der[which.core], pos]<-seq2[list.der[which.core], pos]+0.5;
iehh<-iehh+length(which.core)*(rmap[pos]-rmap[pos-1]);
pos<-pos+1;
}
pal<-c(hsv(2/3,0.3,1), hsv(2/3,1,1), hsv(1/6,0.3,1), hsv(1/6,1,1), "red");
breaks = c(-1.5,0.2,0.75,1.25,2, 3);
if (length(plot.order)==0) {
d<-make.weighted.dist(data, method="normal", f.half=f.half, position=data$pos[snp.pos]);
plot.order<-hclust(d, method="average")$order
}
image(x=data$pos, z=t(seq2[plot.order,]), col=pal, breaks=breaks,
xlab="Position", ylab="Chromosome", main="EHH Haplotype plot", yaxt="n");
return(iehh);
}
#Summarise edited output from recmin file
summarise.recmin = function(file) {
x = read.table(file);
x = as.matrix(x);
pos<-x[1,];
x<-x[2:nrow(x),];
di<-cbind(c(1:nrow(x)), c(1:nrow(x)));
x[di]<-0;
pal = c(rgb(0,0,(0:15)/15), heat.colors(12));
cat(paste("Summarising data for ", file, "\n\n", sep=""));
cat(paste("Minimum number of recombination events = ", x[1,ncol(x)],"\n",sep=""));
x11();
par(mfrow=c(2,1));
breaks<-c(-10,0.5,1000000);
image(x=pos, y=pos, z=x, col=c("white", "black"), breaks=breaks,
main="Incompatibility matrix", xlab="Position", ylab="Position");
image(x=pos, y=pos, z=x, main="Rh matrix", xlab="Position", ylab="Position", col=pal);
for (i in 1:nrow(x)) {
for (j in 1:ncol(x)) {
if (j>i) {x[i,j] = x[i,j]/(pos[j]-pos[i]);}
else {x[i,j]=0;}
}
}
x11();
par(mfrow=c(1,1));
image(x=pos, y=pos, z=x, main="Scaled Rh matrix", xlab="Position", ylab="Position", col=pal, zlim=c(0,0.003));
}
###############################################################################
# To read the OXSTAT human genetic map from the web
# Can return the map for the whole chromosome (default),
# a segment (using lims) or selected positions (using pos)
###############################################################################
read.genetic.map<-function(lims=c(), pos=c(), chromosome=1, ncbi.build=35) {
if (ncbi.build==35) fname<-paste("http://www.stats.ox.ac.uk/~mcvean/OXSTAT/GeneticMap_b",
ncbi.build, "/genetic_map_chr", chromosome, ".txt.gz", sep="");
if (ncbi.build==36) fname<-paste("http://www.stats.ox.ac.uk/~mcvean/OXSTAT/GeneticMap_b",
ncbi.build, "/genetic_map_chr", chromosome, "_b36.txt.gz", sep="");
map<-scan(gzcon(url(fname)), sep=" ", skip=1);
if (ncbi.build==35) {
map<-matrix(map, nrow=length(map)/6, ncol=6, byrow=T);
colnames(map)<-c("position_b35", "CEU_rate(cM/Mb)", "YRI_rate(cM/Mb)",
"JPT+CHB_rate(cM/Mb)", "COMBINED_rate(cM/Mb)", "Genetic_Map(cM)");
map<-map[,c(1,5,6)];
}
if (ncbi.build==36) {
map<-matrix(map, nrow=length(map)/3, ncol=3, byrow=T);
colnames(map)<-c("position_b36", "COMBINED_rate(cM/Mb)", "Genetic_Map(cM)");
}
if (length(lims)>0) {
map<-map[map[,1]>=lims[1] & map[,1]<=lims[2],];
if(nrow(map)==0) {
cat("\n\n*** Warning: not data to return - check limits ***\n\n");
return();
}
return(map);
}
if (length(pos)>0) {
op<-approx(map[,1], map[,3], xout=pos);
rts<-diff(op$y)*1000000/diff(op$x);
rts<-c(rts, rts[length(rts)]);
map<-cbind(pos, rts, op$y);
if (ncbi.build==35) colnames(map)<-c("position_b35", "COMBINED_rate(cM/Mb)", "Genetic_Map(cM)");
if (ncbi.build==36) colnames(map)<-c("position_b36", "COMBINED_rate(cM/Mb)", "Genetic_Map(cM)");
return(map);
}
return(map);
}
###########################################################
# Routine to read in data from ms/msHOT
###########################################################
read.ms<-function(file) {
data<-list();
input<-scan(file, what="character");
n.samp<-as.numeric(input[2]);
n.rep<-as.numeric(input[3]);
rep<-0;
for (i in 1:length(input)) {
if (input[i]=="//") {
rep<-rep+1;
l.seq<-as.numeric(input[i+2]);
new.data<-list(type="haplotype", n.seq=n.samp, l.seq=l.seq,
pos=as.numeric(input[(i+4):(i+3+l.seq)]), names=c(1:n.samp), seq=matrix(0, nrow=n.samp, ncol=as.numeric(input[i+2])));
i<-i+3+l.seq;
for (j in 1:n.samp) {
new.data$seq[j,]<-as.numeric(strsplit(input[i+j], split="")[[1]]=="1")
}
data[[rep]]<-new.data;
}
}
return(data);
}
###########################################################
# Routine to convert data from ms/msHOT to LDhat format
###########################################################
ms2ldhat<-function(file, total.length=1) {
input<-read.ms(file);
for (i in 1:length(input)) {
input[[i]]$pos<-round(input[[i]]$pos*total.length, 3);
locs.file<-paste("locs_", i, ".txt", sep="");
sites.file<-paste("sites_", i, ".txt", sep="");
write.ldhat.format(input[[i]], seq.file=sites.file, loc.file=locs.file);
}
}
###########################################################
#Convert hclust output into my standard tree format
###########################################################
hc2tree<-function(hc) {
n<-length(hc$height)+1;
tree<-matrix(0, nrow=2*n-1, ncol=5);
colnames(tree)<-c("Node", "Height", "D1", "D2", "Mutations");
tree[,1]<-c(1:nrow(tree));
for (i in 1:nrow(hc$merge)) {
which.node<-n+i;
tree[which.node,2]<-hc$height[i];
if (hc$merge[i,1]<0) {tree[which.node,3]<--hc$merge[i,1];}
else {tree[which.node,3]<-hc$merge[i,1]+n;}
if (hc$merge[i,2]<0) {tree[which.node,4]<--hc$merge[i,2];}
else {tree[which.node,4]<-hc$merge[i,2]+n;}
}
return(tree);
}
###########################################################
#Script to read in data to R in FASTA format
###########################################################
read.fasta<-function(file, aligned=TRUE, gap.as.state=TRUE, biallelic.only=TRUE, recode.biallelic=TRUE) {
allowed.bases<-c("T", "C", "A", "G", "t", "c", "a", "g", "U", "u", "?", "N", "n", "W", "w", "R", "r", "Y", "y", "M", "m", "K", "k", "S", "s", "B", "b", "D", "d", "H", "h", "V", "v", "-");
bases.coding <-c(1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5);
input<-scan(file, what="char", comment="#", sep="\n");
seq.starts<-grep(">", input);
seqs<-list();
for (i in 1:length(seq.starts)) {
seqs[[i]]<-list();
seqs[[i]]$name <-strsplit(input[seq.starts[i]], ">")[[1]][2];
s<-c();
if (i != length(seq.starts)) up<-seq.starts[i+1]-1;
if (i==length(seq.starts)) up<-length(input);
for (j in (seq.starts[i]+1):up) {
s<-c(s, input[j]);
}
s<-unlist(strsplit(s, ""));
s<-s[s %in% allowed.bases];
s<-bases.coding[match(s, allowed.bases)];
seqs[[i]]$seq<-s;
seqs[[i]]$length <- length(seqs[[i]]$seq);
}
#If not aligned, just return;
if (aligned==FALSE) return (seqs);
#If aligned, check all same length
for (i in 2:length(seqs)) {
if (seqs[[i]]$length != seqs[[1]]$length) {
cat("\n\n*** Error in file: sequences must be same length ***\n\n");
return();
}
}
#Now create numeric representation of data - first convert to
#0=?/N/ambig, 1=T/t, 2=C/c, 3=A/a, 4=G/g, 5=gap/-
seqs.numeric<-matrix(0, nrow=length(seqs), ncol=seqs[[1]]$length);
for (i in 1:length(seqs)) seqs.numeric[i,]<-seqs[[i]]$seq;
#Now set things to missing data
for (i in 1:ncol(seqs.numeric)) {
is.na(seqs.numeric[seqs.numeric[,i]==0,i])<-TRUE;
if (gap.as.state==FALSE) is.na(seqs.numeric[which(seqs.numeric[,i]==5),i])<-TRUE;
}
#Now count number of alleles
allele.ct<-rep(0, ncol(seqs.numeric));
for (i in 1:ncol(seqs.numeric)) {
a<-unique(seqs.numeric[,i]);
a<-a[is.na(a)==F]
allele.ct[i]<-length(a);
}
if (biallelic.only==TRUE) which.keep<-which(allele.ct==2);
if (biallelic.only==FALSE) which.keep<-which(allele.ct>1);
pos<-which.keep;
seqs.numeric<-seqs.numeric[,which.keep, drop=F];
allele.ct<-allele.ct[which.keep];
#select poly sites and re-code biallelic ones as 0/1 with 0=major allele;
for (i in 1:length(pos)) if (allele.ct[i]==2 & recode.biallelic) {
a.base<-max(seqs.numeric[,i], na.rm=T);
a.alt<-min(seqs.numeric[,i], na.rm=T);
f<-mean(seqs.numeric[,i]==a.base, na.rm=T);
if (f>0.5) {
seqs.numeric[,i]<-as.numeric(seqs.numeric[,i]==a.alt);
}
if (f<=0.5) {
seqs.numeric[,i]<-as.numeric(seqs.numeric[,i]==a.base);
}
}
names<-c();
for (i in 1:length(seqs)) names[i]<-seqs[[i]]$name;
#Finally make a big data type
data<-list(type="haplotype", n.seq=nrow(seqs.numeric), l.seq=ncol(seqs.numeric), names=names, pos = pos, seqs=seqs.numeric);
return(data);
}
################################################################
#Script to estimate LD statistic D from genotype data using EM
################################################################
#Work out if evidence for recombination for 2 SNPs with GT data
is.rec.gt<-function(gts) {
gts<-gts[apply(is.na(gts), 1, sum)<1,];
val<-gts[,1]*3+gts[,2]+1;
counts<-hist(val, plot=F, breaks=c(0:9))$counts
is.present<-rep(0,4);
is.present[1]<-counts[1]+counts[2];
is.present[2]<-counts[2]+counts[3]+counts[6];
is.present[3]<-counts[4]+counts[7]+counts[8];
is.present[4]<-counts[8]+counts[9];
return(as.numeric(min(is.present)>0));
}
min.rec.gt<-function(data) {
min.r<-0;
min.i<-rep(0, data$l.seq);
min.i[2]<-is.rec.gt(data$seq[,1:2]);
for (i in 3:data$l.seq) {
curr.max<-min.i[i-1];
min.i[i]<-curr.max;
j<-i-1;
while(j>0 & min.i[j]==curr.max) {
new.rec<-is.rec.gt(data$seq[,c(j,i)]);
if (new.rec) {
min.i[i]<-curr.max+1;
j<-1;
}
j<-j-1;
if (j==0) break();
}
}
return(min.i);
}
#GTs in order 00, 01, 02, 10, 11, 12, 20, 21, 22
#HTs in order 00, 01, 10, 11
llk.gt.2locus<-function(gt.counts, ht.freq){
llk<-1;
gt.freq<-rep(0,9);
gt.freq[1]<-ht.freq[1]^2;
gt.freq[2]<-2*ht.freq[1]*ht.freq[2];
gt.freq[3]<-ht.freq[2]^2;
gt.freq[4]<-2*ht.freq[1]*ht.freq[3];
gt.freq[5]<-2*ht.freq[1]*ht.freq[4]+2*ht.freq[2]*ht.freq[3];
gt.freq[6]<-2*ht.freq[2]*ht.freq[4];
gt.freq[7]<-ht.freq[3]^2;
gt.freq[8]<-2*ht.freq[3]*ht.freq[4];
gt.freq[9]<-ht.freq[4]^2;
which.present<-which(gt.counts>0);
if (min(gt.freq[which.present])<=0) is.na(llk)<-TRUE;
if (!is.na(llk)) llk<-sum(gt.counts[which.present]*log(gt.freq[which.present]));
return(llk);
}
estimateD.genotype.EM<-function(gts, tol=1e-4, max.iterations=100, return.freq=FALSE, verbose=FALSE){
#Input genotypes are columns of 0/1/2/NA
#Counts contains 00/01/02/10/11/12/20/21/22 genotypes
#Freqs contains 00/01/10/11 haplotype frequencies
gts<-gts[apply(is.na(gts), 1, sum)<1,];
val<-gts[,1]*3+gts[,2]+1;
counts<-hist(val, plot=F, breaks=c(0:9))$counts
freq<-vector(length=4);
freq[1]<-2*counts[1]+counts[2]+counts[4]+counts[5]*0.5;
freq[2]<-2*counts[3]+counts[2]+counts[6]+counts[5]*0.5;
freq[3]<-2*counts[7]+counts[4]+counts[8]+counts[5]*0.5;
freq[4]<-2*counts[9]+counts[6]+counts[8]+counts[5]*0.5;
freq.dh<-counts[5]/sum(counts);
freq<-freq/sum(freq);
D0<-freq[1]*freq[4]-freq[2]*freq[3];
if(verbose) {
cat(paste("\nInitial haplotype freqs = ", paste(freq, sep="\t", coll=""), sep=""));
cat(paste("\nInitial log likelihood = ", llk.gt.2locus(counts, freq), sep=""));
}
#Only perform EM if double hets exist
if (freq.dh>1e-6) {
del<-1;
its<-0;
while(del>tol & its<max.iterations) {
rr<-freq[1]*freq[4]/(freq[1]*freq[4]+freq[2]*freq[3]);
freq[1]<-2*counts[1]+counts[2]+counts[4]+counts[5]*rr;
freq[2]<-2*counts[3]+counts[2]+counts[6]+counts[5]*(1-rr);
freq[3]<-2*counts[7]+counts[4]+counts[8]+counts[5]*(1-rr);
freq[4]<-2*counts[9]+counts[6]+counts[8]+counts[5]*rr;
freq<-freq/sum(freq);
D1<-freq[1]*freq[4]-freq[2]*freq[3];
del<-abs(D1-D0);
D0<-D1;
its<-its+1;
if(verbose) {
cat(paste("\nIt ", its, " haplotype freqs = ", paste(freq, sep="\t", coll=""), sep=""));
cat(paste("\nIt ", its, " log likelihood = ", llk.gt.2locus(counts, freq), sep=""));
}
}
}
if (return.freq==TRUE) return(freq);
return(list(D=D0, fi=(freq[3]+freq[4])/sum(freq), fj=(freq[2]+freq[4])/sum(freq)));
}
######################################################################
# To read in data directly from HapMap web-site
######################################################################
get.hapmap.data<-function(pop=c("CEU", "YRI"), chr=17, start=38349000, stop=38579000, maf=FALSE, convert.to.binary=TRUE, rel="phased_hapmap3", md.thresh=0.2) {
url<-"http://www.hapmap.org/cgi-perl/";
tmp<-pop[1];
if (length(pop)>1) for (i in 2:length(pop)) {
tmp<-paste(tmp, "+", pop[i], sep="");
}
file<-paste(url, rel, "?chr=chr", chr, "&pop=", tmp, "&start=", as.integer(start), "&stop=", as.integer(stop), "&ds=r2&out=txt", sep="");
data<-scan(file, what="character");
pos<-as.integer(data[grep("rs", data)+1]);
snp.id<-data[grep("rs", data)];
snp.id<-unlist(strsplit(snp.id, ":"));
n.snp<-length(pos);
names<-data[grep("NA", data)];
names<-unlist(strsplit(names, ":"));
seqs.char<-data[grep("NA", data)+1];
n.haps<-length(seqs.char);
#Convert characters to values A=1, C=2, G=3, T=4
seqs<-matrix(nrow=n.haps, ncol=n.snp);
for (i in 1:n.haps) {
hap<-strsplit(seqs.char[i], "")[[1]];
seqs[i,hap=="A"]<-1;
seqs[i,hap=="C"]<-2;
seqs[i,hap=="G"]<-3;
seqs[i,hap=="T"]<-4;
is.na(seqs[i,hap=="N"])<-TRUE;
}
#Remove inds with high rates NA
md<-apply(is.na(seqs), 1, mean);
which.keep<-which(md<=md.thresh);
seqs<-seqs[which.keep,];
n.haps<-length(which.keep);
names<-names[which.keep];
#Convert to 0/1/NA
if (convert.to.binary==TRUE) {
mn<-apply(seqs, 2, min, na.rm=T);
for (i in 1:n.snp) {
seqs[,i]<-seqs[,i]==mn[i]
}
}
#Convert 0 to major allele - best not to really
if (maf==TRUE) {
sm<-apply(seqs, 2, sum, na.rm=T);
for (i in 1:n.snp) {
if (sm[i]>n.haps/2) seqs[,i]<-1-seqs[,i];
}
}
data<-list(type="haplotype", n.seq=n.haps, l.seq=n.snp, pos=pos, snp.id=snp.id, names=names, seq=seqs);
cat(paste("\n\nRead ", n.haps, " sequences of length ", n.snp, "\n\n", sep=""));
return(data);
}
######################################################################
#Routine to compute HWE p-value for vector (L=3) of GT counts
######################################################################
hwe<-function(cts) {
n<-sum(cts);
if (n<2) {
cat("\n\n*** Error: sample size < 2 ***\n\n");
return();
}
f1<-(cts[2]+2*cts[3])/(2*n);
g1<-cts/sum(cts);
which.nz<-which(cts>0);
l1<-sum(cts[which.nz]*log(g1[which.nz]));
g0<-c((1-f1)^2, 2*f1*(1-f1), f1^2);
l0<-sum(cts[which.nz]*log(g0[which.nz]));
dl<-2*(l1-l0);
p<-1-pchisq(dl, 1);
cat("\nP-value for HWE = ", p, "\n\n");
}
#End of scripts
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.