#Functions are stored here
###Checks what species intersect?###
Iterate_intersect <- function(spdf_in, raster_size = numeric()){
require('sf')
output_matrix <- matrix(nrow = 1, ncol = 1, dimnames = list("row","Column"))
#Create A raster that divides the data into easier chunks based on raster size
#This is a square raster
x <- raster::raster(ncol = raster_size,
nrow = raster_size,
ext = raster::extent(spdf_in),
vals = 1)
#we then convert that into a sf object
x_sf <- as(x, "SpatialPolygonsDataFrame")%>%
st_as_sf()
#convert SPDF to sf
spdf_raw <- as(spdf_in,"sf")
sf::st_crs(spdf_raw) <- "+init=epsg:4326"
sf::st_crs(x_sf) <- "+init=epsg:4326"
#then we get which polygons are present in each spdf. this is a binary function
#that will output a list, where each item in the list is a vector containing the rows that
# from your sf object that are present in each grid cell
inter <- sf::st_intersects(x_sf[], spdf_raw[])
#Next we iterate through the list creating subsets of our spdf object for each grid cell
#then perform the matrix calculations. Note that this process sitll contains the geometry that
#goes outside the cells, but only contains geometries that intersect into each cell
length(inter)
#now so we dont do the same comparisons twice we will get a taxon list
taxon <- spdf_raw$binomial
taxon$ID <- rownames(taxon)
for(k in 1:length(inter)){
print(paste("working on section ", k))
spdf <- spdf_raw[inter[[k]],]
#intermatrix is intermediary output object
inter_matrix <- matrix(nrow = nrow(spdf), ncol = nrow(spdf),
dimnames = list(spdf$binomial,spdf$binomial))
for(i in 1:(nrow(spdf)-1)){
print(paste(i,"/",nrow(spdf)))
for(j in (i+1):nrow(spdf)){
print(paste(j,"/",nrow(spdf)))
print(paste("now doing ",k,": ", i," x ",j))
if(sf::st_is_valid(spdf[i,1])){
if(sf::st_is_valid(spdf[j,1])){
area_over <- sum(sf::st_area(suppressWarnings(sf::st_intersection(spdf[i,1], spdf[j,1]))))
area_smaller <- min(sum(st_area(spdf[i,1])),sum(st_area(spdf[j,1])))
#area_larger <- max(st_area(spdf[i,1]),st_area(spdf[j,1]))}
if(length(area_over)==0){
area_over <- 0
}else{
}
print(area_over)
inter_matrix[i,j] <- area_over/area_smaller
rownames(inter_matrix)[i] <- spdf[i,]$binomial
colnames(inter_matrix)[j] <- spdf[j,]$binomial
}else{
print(paste('failed on', spdf[j,]$binomial))
inter_matrix[i,j] <- paste('failed on', spdf[j,]$binomial)
rownames(inter_matrix)[i] <- spdf[i,]$binomial
colnames(inter_matrix)[j] <- spdf[j,]$binomial
}
}else{
print(paste('failed on', spdf[i,]$binomial))
inter_matrix[i,j] <- paste('failed on', spdf[i,]$binomial)
rownames(inter_matrix)[i] <- spdf[i,]$binomial
colnames(inter_matrix)[j] <- spdf[j,]$binomial
}
}
}
output_matrix <- merge(output_matrix,inter_matrix, all = TRUE)
rownames(output_matrix)<- colnames(output_matrix)
}
return(output_matrix)
}
### Find optimal number of divisions for Allopatry ###
optimal_div <- function(spdf_in, min,max) {
require('sf')
output_mat <- matrix(ncol = 2, nrow = 1)
#convert SPDF to sf
spdf_raw <- as(spdf_in,"sf")
taxon <- spdf_in$binomial
for (i in min:max){
#Create A raster that divides the data into easier chunks based on raster size
#This is a square raster
check_mat <- matrix(ncol = length(taxon), nrow = length(taxon),
dimnames = list(taxon,taxon))
x <- raster::raster(ncol = i,
nrow = i,
ext = raster::extent(spdf_in),
vals = 1)
#we then convert that into a sf object
x_sf <- as(x, "SpatialPolygonsDataFrame")%>%
st_as_sf()
sf::st_crs(spdf_raw) <- "+init=epsg:4326"
sf::st_crs(x_sf) <- "+init=epsg:4326"
#then we get which polygons are present in each spdf. this is a binary function
#that will output a list, where each item in the list is a vector containing the rows that
# from your sf object that are present in each grid cell
inter <- sf::st_intersects(x_sf[], spdf_raw[])
totalcalc <- 0
for (j in 1:length(inter)){
n_calc = 0
if (length(inter[[j]]>0)){
for (k in 1:length(inter[[j]])){
for (l in k:length(inter[[j]])){
if (is.na(check_mat[inter[[j]][k],inter[[j]][l]])){
check_mat[inter[[j]][k],inter[[j]][l]] <- 1
n_calc = n_calc + 1
}
}
}
totalcalc <- totalcalc + n_calc
}else{
totalcalc = totalcalc
}
}
output_mat <- rbind(output_mat, c(i,totalcalc))
}
output_mat <- data.frame("Num_Div" = output_mat[-1,1],
"Num_calc" = output_mat[-1,2])
return(list(output_mat,check_mat))
}
### Creates list of where species intersect with what chunks###
chunk_pres_ab <- function(spdf_in, raster_size = numeric(),species_spdf){
require('sf')
output_matrix <- matrix(nrow = 1, ncol = 1, dimnames = list("row","Column"))
#Create A raster that divides the data into easier chunks based on raster size
#This is a square raster
x <- raster::raster(ncol = raster_size,
nrow = raster_size,
ext = raster::extent(spdf_in),
vals = 1)
#we then convert that into a sf object
x_sf <- as(x, "SpatialPolygonsDataFrame")%>%
st_as_sf()
#convert SPDF to sf
spdf_raw <- as(spdf_in,"sf")
sf::st_crs(spdf_raw) <- "+init=epsg:4326"
sf::st_crs(x_sf) <- "+init=epsg:4326"
#then we get which polygons are present in each spdf. this is a binary function
#that will output a list, where each item in the list is a vector containing the rows that
# from your sf object that are present in each grid cell
inter <- sf::st_intersects(x_sf[], spdf_raw[])
#Next we iterate through the list creating subsets of our spdf object for each grid cell
#then perform the matrix calculations. Note that this process sitll contains the geometry that
#goes outside the cells, but only contains geometries that intersect into each cell
length(inter)
sp_inter <- sf::st_intersects(x_sf[],spdf_species[])
return(list(inter,sp_inter))
}
###Calculate the instability index from Mesquite
instability_index <- function(phy_list = as.list()){
#Need to get a dataframe that is taxa by distance
#the Index for each taxon in in the set of trees
#that is summed across all distances between trees
#The data matrix can be rows of each species and the columnes
#can be the same species replicated for all trees
require(ape)
I_mat <- matrix()
for(i in 1:length(phy_list)){
j = i+1
if (j>length(phy_list)){
break()
}
#create Dix and Diy and reorder them for consistency
#Need to figure out if rowname sort is required but lol fuck it for now
Dix<- ape::cophenetic.phylo(phy_list[[i]])%>%
as.data.frame(.)%>%
select(order(colnames(.)))
Diy <- ape:: cophenetic.phylo(phy_list[[j]])%>%
as.data.frame(.)%>%
select(order(colnames(.)))
numer <- abs(Dix-Diy)
denom <- (Dix-Diy)^2
comp_mat <- (numer/denom) %>%
rename_all(.,function(x) paste0(colnames(.),"_phylo_",j))
I_mat <- cbind(I_mat,comp_mat)
}
index <- sapply(I_mat[,-1], function (x) as.numeric(gsub("NaN",0,x)))
index <- rowSums(index)
names(index) <- rownames(I_mat)
return(index)
}
### Calculate the Consensus Fork Index
calc_CFI <- function(phy_list,
ranges = c(.5,.75,.95)){
require(ape)
out_list <- list()
list_names <- vector()
for(i in 1:length(ranges)){
con_tree <- consensus(phy_list,p = ranges[i])
boot <- prop.clades(con_tree,phy_list)
CFI <- Nnode(con_tree)/(length(con_tree$tip.label)-2)
out_list <- append(out_list,CFI)
list_names <- append(list_names,paste0("CFI_",gsub("0.","",
as.character(ranges[i]))))
}
names(out_list) <- list_names
return(out_list)
}
#Detect rogue taxa and highlight. Uses the custom function instability_index() and calc_CFI()
detect_rogue_taxa <- function(phy_list = as.list(), #Feed in a list of bootstrap trees
method = "percent", #need a series of options
percent_threshold = .10,
drop_threshold = .1){
require(ape)
require(tidyverse)
CFI_full <- calc_CFI(phy_list)
insta_index <- instability_index(phy_list)
switch(method,
"percent" = {
#percent removes tips based on percent_threshold
#and their instability index scores
#i.e. a percent_threshold of .1 would remove the 10% of worst scoring tips
insta_index <-data.frame(index = sort(insta_index),
tips = names(insta_index))
num_end <- ceiling((1-percent_threshold) * nrow(insta_index))
tips_kept <- insta_index[1:num_end,]
tips_removed <- insta_index[num_end+1:nrow(insta_index),]
tree_sub = list()
for (i in 1:length(phy_list)){
tree_sub[[i]] <- drop.tip(phy_list[[i]],tips_removed$tips)}
return(list(insta_index,tips_kept, tips_removed,tree_sub,num_end))
},
"no_drop" = {
#no drop drops no taxa and just returns the indeces
return(insta_index)},
"iterative" = {
#iterative drops the worst performing taxa
#then recalculates CFI
#then selectively drops taxa based on drop_threshold
#need to add a way to not go through every single taxa
insta_index <-data.frame(index = sort(insta_index),
tips = names(insta_index))
tips_remove <- data.frame(taxa = as.character(),
CFI_50 = as.numeric(),
CFI_75 = as.numeric(),
CFI_95 = as.numeric())
for(i in 1:nrow(insta_index))
{
#sub_tree <- drop.tip(phy_list[[i]],insta_index[i,2])
tree_sub <- list()
for (j in 1:length(phy_list)){
tree_sub[[j]] <- drop.tip(phy_list[[j]],insta_index[i,2])
}
CFI_sub <- calc_CFI(tree_sub)
CFI_diff_50 <- CFI_sub[[1]] - CFI_full[[1]]
CFI_diff_75 <- CFI_sub[[2]] - CFI_full[[2]]
CFI_diff_95 <- CFI_sub[[3]] - CFI_full[[3]]
if(CFI_diff_95 >= drop_threshold |
CFI_diff_75 >= drop_threshold |
CFI_diff_50 >= drop_threshold){
tips_remove <- rbind(tips_remove,
data.frame(insta_index[i,2],
CFI_diff_50,
CFI_diff_75,
CFI_diff_95))
}
}
return(list(tips_remove,CFI_full))
}
)
return(list(insta_index,tips_kept, tips_removed,tree_sub,num_end))
}
iterate_genbank <- function(search_terms, by = 1000,
seq_length_max = 30000,
extra_term = ""){
require(rentrez)
require(tidyverse)
#Custom Sequence Function that ensures last sequences
# If there are 369 sequences and you go by 100s, then the last 69 would not be included
seq0 <- function(from = 1, to = 1, by = 1, incLast = TRUE){
out = do.call(what = seq, args = list(from, to, by))
if (incLast & to%%by != 0){
out = c(out, tail(out, 1) + by)
}
return(out)
}
#iterate through each search term in list
for(i in 1:length(search_terms)){
term <- paste0("('",search_terms[i],"'","[Organism] OR ", search_terms[i],
"[All Fields])"," AND (1 [SLEN] : ",seq_length_max,"[SLEN])",extra_term)
search_res <- entrez_search(db="nuccore", term=term,use_history = TRUE)
#printing to make it look nicer. Adding progress bar as well
print(paste0("Now Quering: ",search_terms[i], ". ", search_res$count, " records found"))
pb <- txtProgressBar(min = 0, max = search_res$count, style = 3, width = 30, char = "\U1F438")
#Ifelse to fetch if few sequences
if(search_res$count<by){
fetch <- entrez_fetch(db = "nuccore", #id = search_res$ids[1:length(search_res$ids)],
rettype = "fasta", web_history = search_res$web_history)
write(fetch, paste0(search_terms[i],"_FASTA.fasta"), sep = "\n")
setTxtProgressBar(pb,search_res$count)
}else{
#else is for larger amount of sequences
start = by
fetch <- entrez_fetch(db = "nuccore", #id = search_res$ids[1:length(search_res$ids)],
rettype = "fasta", retmax = by,
web_history = search_res$web_history)
write(fetch, paste0(search_terms[i],"_FASTA.fasta"), sep = "\n")
setTxtProgressBar(pb,by)
#starting to pull sequences every by
for(j in seq0(from = start,to = search_res$count, by = by)){
setTxtProgressBar(pb,j)
if(j==by){next}else{
fetch <- entrez_fetch(db = "nuccore", #id = search_res$ids[start:j],
rettype = "fasta", web_history = search_res$web_history,
retmax = by, retstart = start)
write(fetch, paste0(search_terms[i],"_FASTA.fasta"),sep = "\n", append = TRUE)
start = j
}
}
}
#finish off the ProgressBar
setTxtProgressBar(pb,search_res$count)
close(pb)
}
}
sp_list_amnh <- function(data){
library(tidyverse)
library(stringr)
library(magrittr)
data_out <-
data %>%
mutate("ID" = 1:nrow(data)) %>%
mutate('Raw' = trimws(data[,1]))%>%
#select('trimmed')%>%
mutate('level' = substr(.$Raw,1,str_locate(.$Raw,":")-1))%>%
mutate('value' = (trimws(str_split(.$Raw,":",n = 2, TRUE)[,2])))%>%
mutate('Order' = ifelse(level == 'Order',value,'-'),
'Superfamily' = ifelse(level == 'Superfamily',value,'-'),
'Family' = ifelse(level == 'Family',value,'-'),
'Subfamily' = ifelse(level == 'Subfamily',value,'-'),
'Genus' = ifelse(level == 'Genus',value,'-'),
'Species' = ifelse(level == 'Species',value,'-'))%>%
mutate('binomial' = ifelse(level == 'Species',
(substr(Species,1,
(str_locate(Species,"^([^ ]+[ ]+[^ ]+)[ ]")[,2]))),Species))%>%
mutate(binomial = str_trim(binomial))%>%
select(ID,Raw, level, Order, Superfamily,Family,Subfamily,Genus,Species,binomial)
return(list(data_out,unique(data_out$binomial)))
}
# Presence absence matrix with 2 object input. Takes SF objects
sf_pres_ab <- function(x,y,x.unit,y.unit){
require(sf)
require(tidyverse)
#add in checks for if function is sf or not
#add in checks that the units exist
inter <-st_intersects(x,y)
#outputs a list where each item in list is X
#the row number of y is given within each item x
mat <- matrix(nrow = nrow(x),ncol = nrow(y))
rownames(mat) <- pull(x,x.unit) #name the rows based on user input
colnames(mat) <- pull(y,y.unit) #name the cols based on user input
for(i in 1:length(inter)){
for(j in 1:length(inter[[i]])){
mat[i,inter[[i]][j]] = 1
}
}
mat[is.na(mat)] <- 0
return(mat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.