R/test.submodel.valid.phyml.R

Defines functions .pairfreq .patternfreq

'test.submodel.valid.phyml' <-
function(inputfile, path_phyml, path_seqgen, model, gamma=TRUE, pinv=FALSE, nbootstrap=100)
{
    simfile<-paste(inputfile,".sim",sep="")
    if(is.na(match(model,dat.modelcode[,1]))) return("check the model's name. Use HKY instead of HKY85")
    modelcode = dat.modelcode[match(model,dat.modelcode[,1]),3:4]
    
    
    command <- paste(path_phyml, "-b0 -i", inputfile," -m",modelcode[1])
    if(modelcode[2]=="unequal") command = paste(command,"-f e")
    if(modelcode[2]=="equal") command = paste(command,"-f0.25,0.25,0.25,0.25")
    if(gamma==FALSE)    command <- paste(command, " -c1")
    if(pinv==TRUE)    command <- paste(command, "-ve")
    
    a=try(system(command))
    if(a==1) return("The model is incorrect. Check the model's name")
    
    fit = parse.phyml(paste(inputfile,"_phyml_stats.txt",sep=""))
    if(length(fit$gamma)==0) fit$gamma = 0
    if(length(fit$inv)==0) fit$inv = 0
    
    run.seqgen(path_seqgen,
    seed = floor(runif(1)*434646+46548778), basefreq = fit$basefreq, rate = fit$rates, seqlength = "100000",
    gamma = fit$alpha, inv=fit$inv, treefile=paste(inputfile,"_phyml_tree.txt",sep=""), saveformat = "phylip", outputfile=simfile)
    
    
    simdata<-read.dna.seq(simfile,format="phylip")
    name<-sort(simdata$name)
    nspecies<-length(name)
    simseq<-simdata$seq
    simseq<-tolower(simseq)
    simseq<-simseq[match(name,simdata$name),]
    simlength<-dim(simseq)[2]
    
    data<-read.dna.seq(inputfile,format="phylip")
    seq<-tolower(data$seq)
    seq<-seq[match(name,data$name),]
    seqlength<-dim(seq)[2]
    
    ######################################################
    # chisquare-test for base frequencies
    ######################################################
    base.test=matrix(0,nrow=nspecies,ncol=11)
    colnames(base.test) = c("species","obs_a","obs_c","obs_g","obs_t","exp_a","exp_c","exp_g","exp_t","teststat","p.value")
    base.test[,1] = name
    for(i in 1:nspecies)
    {
        obs<-c(sum(seq[i,]=="a"),sum(seq[i,]=="c"),sum(seq[i,]=="g"),sum(seq[i,]=="t"))
        base.test[i,2:5] <- obs
        base.test[i,6:9] <- fit$basefreq
        result<-chisq.test(obs,p=fit$basefreq,rescale.p=TRUE)
        base.test[i,10] <- round(result$statistic,5)
        base.test[i,11] <- round(result$p.value,5)
    }
    
    print("The Marginal Test for base frequencies is complete!")
    ######################################################
    # chisquare-test for doublet frequencies
    ######################################################
    pair_pvalue<-matrix(0,nspecies,nspecies)
    colnames(pair_pvalue) = name
    rownames(pair_pvalue) = name
    for(m in 1:(nspecies-1))
    for(n in (m+1):nspecies)
    {
        simseq1<-simseq[m,]
        simseq2<-simseq[n,]
        simpair<-.pairfreq(simseq1,simseq2)
        exppair<-simpair/sum(simpair)
        obsseq1<-seq[m,]
        obsseq2<-seq[n,]
        obspair<-.pairfreq(obsseq1,obsseq2)
        if(length(which(exppair==0))>0) obspair = obspair[-which(exppair==0)]
        if(length(which(exppair==0))>0) exppair = exppair[-which(exppair==0)]
        if(sum(obspair)==0){pair_pvalue[m,n]=NA;next;}
        pair_pvalue[m,n]<-chisq.test(obspair,p=exppair,rescale.p=TRUE, simulate.p.value=T)$p.value
    }
    
    print("The Marginal Test for doublet frequencies is complete!")
    ######################################################
    # chisquare-test for site patterns
    ######################################################
    newseq <- gsub("N", "?", seq)
    newseq <- gsub("-", "?", newseq)
    newseq <- gsub("O", "?", newseq)
    site_pattern <- apply(newseq, 2, function(x) paste(x, collapse = "", sep = ""))
    obs_freq <- table(site_pattern)
    pattern <- names(obs_freq)
    newobs_freq<-obs_freq
    
    new = .patternfreq(pattern, newseq)
    obs_pattern_freq <- new$freq
    obs_pattern <- new$pattern
    exp_pattern_freq = .patternfreq(obs_pattern, simseq)$freq
    teststat<-sum(abs(exp_pattern_freq-obs_pattern_freq))
    
    nbootstrap<-nbootstrap
    bootstat<-1:nbootstrap
    for(j in 1:nbootstrap)
    {
        bootsample<-simseq[,sample(1:simlength,seqlength)]
        freq = .patternfreq(obs_pattern, bootsample)$freq
        bootstat[j]<-sum(abs(exp_pattern_freq-freq))
    }
    sitepattern.pvalue = sum(bootstat>teststat)/nbootstrap
    
    z <- list(base.test=base.test, doublet.pvalue=pair_pvalue, sitepattern.pvalue=sitepattern.pvalue)
    z
}


.patternfreq<-function(pattern, seq)
{
    nspecies = dim(seq)[1]
    newobs_freq = 1:length(pattern)
    names(newobs_freq) = pattern
    for(i in 1:length(pattern))
    {
        str<-unlist(strsplit(pattern[i],split=""))
        index<-which(str=="?")
	if(length(index)==nspecies){
	    newobs_freq[i] <- NA
	    next
	}
        {if(length(index)>0){
            newp<-paste(str[-index],collapse="",sep="")
            news<-seq[-index,]
        }else{
            newp<-paste(str,collapse="",sep="")
            news<-seq
        }}
        if(length(index)<nspecies-1) news<-apply(news, 2, function(x) paste(x, collapse = "", sep = ""))
        nmissing<-length(grep("\\?",news))
        if(nmissing<length(news)*0.8) newobs_freq[i]<-length(grep(newp,news))/(length(news)-nmissing)
        else newobs_freq[i] <- NA
    }
    newobs<-newobs_freq[!is.na(newobs_freq)]
    newpattern<-names(newobs)
    
    z <- list(pattern=newpattern, freq=newobs)
    z
}

.pairfreq<-function(seq1,seq2)
{
    x<-paste(seq1,seq2,sep="")
    result<-c(sum(x=="aa"),sum(x=="ac"),sum(x=="ag"),sum(x=="at"),sum(x=="ca"),sum(x=="cc"),sum(x=="cg"),sum(x=="ct"),sum(x=="ga"),sum(x=="gc"),sum(x=="gg"),sum(x=="gt"),sum(x=="ta"),sum(x=="tc"),sum(x=="tg"),sum(x=="tt"))
    return(result)
}
lliu1871/phybase documentation built on April 21, 2024, 3:16 a.m.