vignettes/PyRate/pyrate_utilities.r

no.extension <- function(filename) { 
  if (substr(filename, nchar(filename), nchar(filename))==".") { 
    return(substr(filename, 1, nchar(filename)-1)) 
  } else { 
    no.extension(substr(filename, 1, nchar(filename)-1)) 
  } 
}


extract.ages <- function(file = NULL, replicates = 1, cutoff = NULL, random = TRUE){

if (is.null(file)) 
    stop("you must enter a filename or a character string\n")

rnd <- random
q <- cutoff
dat1 <- read.table(file, header=T, stringsAsFactors=F, row.names=NULL, sep="\t", strip.white=T)
fname <- no.extension(basename(file))
outfile <- paste(dirname(file), "/", fname, "_PyRate.py", sep="")

dat1[,1] <- gsub("[[:blank:]]{1,}","_", dat1[,1])

if (replicates > 1){
	rnd <- TRUE
}

if (any(is.na(dat1[,1:4]))){
	print(c(which(is.na(dat1[,1])),which(is.na(dat1[,2])),which(is.na(dat1[,3])),which(is.na(dat1[,4]))  )   )
	stop("the input file contains missing data in species names, status or ages)\n")
}

if (!is.null(q)){
		dat <- dat1[!(dat1[,4] - dat1[,3] >= q),]
		cat("\n\nExcluded ", 100-round(100*dim(dat)[1]/dim(dat1)[1]), "% occurrences")
		hist(dat1[,4] - dat1[,3])
	} else { 
		dat <- dat1 
}

if (length(dat) == 5){
	colnames(dat) <- c("Species", "Status", "min_age", "max_age", "trait")	
	} else {
	colnames(dat) <- c("Species", "Status", "min_age", "max_age")
}

dat$new_age <- "NA"
splist <- unique(dat[,c(1,2)])[order(unique(dat[,c(1,2)][,1])),]


if (any(is.element(splist$Species[splist$Status == "extant"], splist$Species[splist$Status == "extinct"]))){
	print(intersect(splist$Species[splist$Status == "extant"], splist$Species[splist$Status == "extinct"]))
	stop("at least one species is listed as both extinct and extant\n")
}

cat("#!/usr/bin/env python", "from numpy import * ", "",  file=outfile, sep="\n")

for (j in 1:replicates){
	times <- list()
	cat ("\nreplicate", j)
	
	dat[dat$min_age == 0,3] <- 0.001
	
	#if (any(dat[,4] < dat[,3])){
	#	cat("\nWarning: the min age is older than the max age for at least one record\n")
	#	cat ("\nlines:",1+as.numeric(which(dat[,4] < dat[,3])),sep=" ")
	#}
	
	if (isTRUE(rnd)){
			dat$new_age <- round(runif(length(dat[,1]), min=apply(dat[,3:4],FUN=min,1), max=apply(dat[,3:4],FUN=max,1)), digits=6)
		} else {
			for (i in 1:length(dat[,1])){
				dat$new_age[i] <- mean(c(dat[i,3], dat[i,4]))
			}				
		}

	dat2 <- subset(dat, select=c("Species","new_age"))
	taxa <- sort(unique(dat2$Species))

	for (n in 1:length(taxa)){
		times[[n]] <- dat2$new_age[dat2$Species == taxa[n]]
		if (toupper(splist$Status[splist$Species == taxa[n]]) == toupper("extant")){
			times[[n]] <- append(times[[n]], "0", after=length(times[[n]]))
		}
	}

	dat3 <- matrix(data=NA, nrow=length(times), ncol=max(sapply(times, length)))
	rownames(dat3) <- taxa

	for (p in 1:length(times)){
		dat3[p,1:length(times[[p]])] <- times[[p]]
	}

	cat(noquote(sprintf("\ndata_%s=[", j)), file=outfile, append=TRUE)

	for (n in 1:(length(taxa)-1)){
		rec <- paste(dat3[n,!is.na(dat3[n,])], collapse=",")
		cat(noquote(sprintf("array([%s]),", rec)), file=outfile, append=TRUE, sep="\n")
	}

	n <- n+1
	rec <- paste(dat3[n,!is.na(dat3[n,])], collapse=",")
	cat(noquote(sprintf("array([%s])", rec)), file=outfile, append=TRUE, sep="\n")

	cat("]", "", file=outfile, append=TRUE, sep="\n")
}


data_sets <- ""
names <- ""

if (replicates > 1){
	for (j in 1:(replicates-1)) {
		data_sets <- paste(data_sets, noquote(sprintf("data_%s,", j)))
		names <- paste(names, noquote(sprintf(" '%s_%s',", fname,j)))
		}

	data_sets <- paste(data_sets, noquote(sprintf("data_%s", j+1)))
	names <- paste(names, noquote(sprintf(" '%s_%s',", fname,j+1)))
} else {
	data_sets <- "data_1"
	names <- noquote(sprintf(" '%s_1'", fname))	
}

cat(noquote(sprintf("d=[%s]", data_sets)), noquote(sprintf("names=[%s]", names)), "def get_data(i): return d[i]", "def get_out_name(i): return  names[i]", file=outfile, append=TRUE, sep="\n")


tax_names <- paste(taxa, collapse="','")
cat(noquote(sprintf("taxa_names=['%s']", tax_names)), "def get_taxa_names(): return taxa_names", file=outfile, append=TRUE, sep="\n")


if ("trait" %in% colnames(dat)){
	datBM <- dat[,1]
	splist$Trait <- NA
	for (n in 1:length(splist[,1])){
		splist$Trait[n] <- mean(dat$trait[datBM == splist[n,1]], na.rm=T)
	}
	s1 <- "\ntrait1=array(["
	BM <- gsub("NaN|NA", "nan", toString(splist$Trait))
	s2 <- "])\ntraits=[trait1]\ndef get_continuous(i): return traits[i]"
	STR <- paste(s1,BM,s2)
	cat(STR, file=outfile, append=TRUE, sep="\n")
}

splistout <- paste(dirname(file), "/", fname, "_TaxonList.txt", sep="")
lookup <- as.data.frame(taxa)
lookup$status  <- "extinct"

write.table(splist, file=splistout, sep="\t", row.names=F, quote=F)
cat("\n\nPyRate input file was saved in: ", sprintf("%s", outfile), "\n\n")

}


fit.prior <- function(file = NULL, lineage = "root_age"){

require(fitdistrplus)

if (is.null(file)){
    stop("You must enter a valid filename.\n")
	}

dat <- read.table(file, header=T, stringsAsFactors=F, row.names=NULL, sep="\t")
fname <- no.extension(basename(file))
outfile <- paste(dirname(file), "/", lineage, "_Prior.txt", sep="")

lineage2 <- paste(lineage,"_TS", sep="")
if (!is.element(lineage2, colnames(dat))){
	stop("Lineage not found, please check your input.\n")
	}

time <- dat[,which(names(dat) == lineage2)]
time2 <- time-(min(time)-0.01)
gamm <- fitdist(time2, distr="gamma", method = "mle")$estimate 

cat("Lineage: ", lineage, "; Shape: ", gamm[1], "; Scale: ", 1/gamm[2], "; Offset: ", min(time), sep="", file=outfile, append=FALSE)
}




extract.ages.pbdb <- function(file = NULL,sep=",", extant_species = c(), replicates = 1, cutoff = NULL, random = TRUE){
	print("This function is currently being tested - caution with the results!")
	tbl = read.table(file=file,h=T,sep=sep,stringsAsFactors =F)
	new_tbl = NULL # ADD EXTANT SPECIES
	
	for (i in 1:dim(tbl)[1]){
		if (tbl$accepted_name[i] %in% extant_species){
			status="extant"
		}else{status="extinct"}
		species_name = gsub(" ", "_", tbl$accepted_name[i])
		new_tbl = rbind(new_tbl,c(species_name,status,tbl$min_ma[i],tbl$max_ma[i]))
	}
	colnames(new_tbl) = c("Species","Status","min_age","max_age")
	
	output_file = file.path(dirname(file),strsplit(basename(file), "\\.")[[1]][1])
	output_file = paste(output_file,".txt",sep="")
	write.table(file=output_file,new_tbl,quote=F,row.names = F,sep="\t")
	extract.ages(file=output_file,replicates = replicates, cutoff = cutoff, random = random)
}


extract.ages.tbl <- function(file = NULL,sep="\t", extant_species = c(), replicates = 1, cutoff = NULL, random = TRUE){
	tbl = read.table(file=file,h=T,sep=sep,stringsAsFactors =F)
		
	new_tbl = NULL # ADD EXTANT SPECIES
	
	for (i in 1:dim(tbl)[1]){
		if (tbl[i,1] %in% extant_species){
			status="extant"
		}else{status="extinct"}
		species_name = gsub(" ", "_", tbl[i,1])
		new_tbl = rbind(new_tbl,c(species_name,status,tbl[i,2:dim(tbl)[2]]))
	}
	if (dim(new_tbl)[2]==4){
		colnames(new_tbl) = c("Species","Status","min_age","max_age")
	}else{
		colnames(new_tbl) = c("Species","Status","min_age","max_age","trait")
	}
	
	
	output_file = file.path(dirname(file),strsplit(basename(file), "\\.")[[1]][1])
	output_file = paste(output_file,".txt",sep="")
	write.table(file=output_file,new_tbl,quote=F,row.names = F,sep="\t")
	extract.ages(file=output_file,replicates = replicates, cutoff = cutoff, random = random)
}
benmarwick/pyrater documentation built on May 7, 2019, 10:38 a.m.