#' Filter rare response variables from the data
#'
#' @details This function allows you to remove response units (OTUs or SNPs or
#' species) from your response data as a preprocessing step. Suitable when the
#' response is a binary outcome.
#'
#' @param X is a data.frame with rows as sites or individuals or populations and
#' columns as loci or species OTUs.
#' @param lower is the lower threshold value in which response varialkes are
#' removed from the data.frame.
#' @param higher is the upper threshold value in which response varialkes are
#' removed from the data.frame.
#'
#' @returns A filtered tibble.
#'
#' @examplesIf FALSE
#' X <- filterRareCommon(Responsedata, lower = 0.4, higher = 0.7)
#'
#' @export
filterRareCommon <- function(X,
lower = lower,
higher = higher){
n = ncol(X)
r = nrow(X)
Xt <- as.data.frame(t(X))
Xt$Xsum <- rowSums(Xt[1:n,] )
FilterNoCommon <- Xt %>%
dplyr::filter(.data$Xsum < (r * higher))
FilterNoRare <- FilterNoCommon %>%
dplyr::filter(.data$Xsum > (r * lower))
#remove 'new' sort column
FilterNoRare$Xsum <- NULL
X <- tibble::as_tibble(t(FilterNoRare))
X
}
#' Conversion to single column per locus from plink file via LEA functionality
#'
#' @details Function to import SNP data from a plink format into a format
#' suitable for MrIML predicts (presence/absence of an alelle for each locus).
#' Currently if there is missing data (NAs) it either imputes them as the mode
#' or leaves them. A histogram is also produced of the missing data.
#'
#' @param pedfile A file location.
#' @param mapfile A file location.
#'
#' @returns A tibble.
#'
#' @examplesIf FALSE
#' snps <- readSnpsPed("FILE_NAME.plink.ped", "FILE_NAME.plink.map.map")
#' X <- filterRareCommon(snps, lower = 0.4, higher = 0.7)
#'
#' @export
readSnpsPed <- function(pedfile, mapfile){
if (missing(pedfile))
stop("missing plink '.ped' file")
if (missing(mapfile))
stop("missing plink '.map' file")
# Isolate SNP matrix from .ped file and and append sample IDs as row names
snpobj <- utils::read.table(
pedfile,
row.names = 2,
na.strings = c("0", "-9"),
stringsAsFactors = FALSE
)
snpobj <- snpobj[-(1:5)]
# Extract locus IDs from .map file, create col names and append to SNP matrix
lnames <- utils::read.table(mapfile)[,2]
locnames1 <- paste0(lnames, ".1")
locnames2 <- paste0(lnames, ".2")
locnames3 <- c(rbind(locnames1, locnames2))
colnames(snpobj) <- locnames3
if (any(is.na(snpobj))){
graphics::hist(
colSums(is.na(as.matrix(snpobj))),
main = "Missing genotypes per SNP",
xlab = "No. missing genotpyes"
)
Sys.sleep(1)
impute <- readline(
paste("This dataset contains NAs. Please select an imputation method",
"(type a number): 0. none, 1. mode ")
)
if (impute == "1"){
# Function for finding the major allele for each column
Mode <- function(x, na.rm = FALSE) {
if (na.rm) x = x[!is.na(x)]
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
for(i in 1:ncol(snpobj)){
# Replace NAs with the major allele at a given locus
snpobj[i] <- tidyr::replace_na(
snpobj[i],
Mode(snpobj[i], na.rm = TRUE)
)
# Re-code the major allele as "0"
snpobj[i] <- replace(
snpobj[i],
snpobj[i] == Mode(snpobj[i], na.rm = TRUE),
"0"
)
# Re-code the minor allele as "1"
snpobj[i] <- replace(
snpobj[i],
snpobj[i] %in% c("A", "G", "T", "C"),
"1"
)
}
}
} else {
for(i in 1:ncol(snpobj)){
# Re-code the major allele as "0"
snpobj[i] <- replace(
snpobj[i],
snpobj[i] == Mode(snpobj[i], na.rm = TRUE),
"0"
)
# Re-code the minor allele as "1"
snpobj[i] <- replace(
snpobj[i],
snpobj[i] %in% c("A", "G", "T", "C"),
"1"
)
}
}
data.frame(snpobj, rownames.force = TRUE) %>%
tibble::as_tibble()
}
#' Calculates resistance components from a list of pairwise resistance surfaces
#'
#' @details Outputs a data frame of significant resistance components for each
#' matrix in the target folder. These data can be combined with non-pairwise
#' matrix data.
#'
#' @param foldername A \code{character} this is the location where the
#' resistance surfaces are stored.
#' @param p_val A \code{numeric} this sets the significance threshold for axes
#' in explaining variance in the original resistance matrix based on redundancy
#' analysis. In effect this filters out axes that don't explain variance.
#' @param cl A parallel argument to be passed to [vegan::capscale()] if parallel
#' compute is wanted.
#'
#' @returns A data frame.
#'
#' @examplesIf FALSE
#' Y <- resist_components(filename = 'FILE_PATH', p_val = 0.01)
#'
#' @export
resist_components <- function (foldername = foldername,
p_val = p_val,
cl = NULL){
# Require the 'vegan' package to be installed
if (!requireNamespace("vegan", quietly = TRUE)) {
message(
paste0("The 'vegan' package is required for this function. Would you ",
"like to install it? (yes/no)")
)
response <- readline()
if (tolower(response) == "yes") {
utils::install.packages("vegan")
} else {
stop(
paste0("The 'vegan' package is needed for this function. Please ",
"install it to proceed.")
)
}
}
# Require the 'ape' package to be installed
if (!requireNamespace("ape", quietly = TRUE)) {
message(
paste0("The 'ape' package is required for this function. Would you ",
"like to install it? (yes/no)")
)
response <- readline()
if (tolower(response) == "yes") {
utils::install.packages("ape")
} else {
stop(
paste0("The 'ape' package is needed for this function. Please ",
"install it to proceed.")
)
}
}
files <- list.files(paste(foldername))
n_matrix <- length(files)
#for (i in 1:length(files)){
final_d <- lapply(
seq(1, n_matrix),
function(i) {
# ! Need to be csv in a folder within your working directory
data_resist <- utils::read.csv(paste0("./", foldername,'/', files[i]))
#remove row information. It is important that the matrix is symmetric
data_resist[1] <- NULL
# Colnames should be names of the population/sites/etc
siteData<- as.data.frame(names(data_resist))
names(siteData) <- c('Site') # For identifiability
#names need to match in each matrix
data_resist <- (data_resist / max(data_resist))
res <- ape::pcoa(data_resist)
# variance explained by pcoa 1
l1 <- round(res$values$Relative_eig[1], 2)
# variance explained by pcoa 2 etc
l2 <- round(res$values$Relative_eig[2], 2)
n_axes <- ncol(res$vectors)
pcdat <- as.data.frame(res$vectors)
##########Plot##################
PcoA2D_AT <- res$vectors[,1:2]
# So here simply the Euclidean distance is used, see distances above
PcoA2D_AT <- as.data.frame(PcoA2D_AT)
names(PcoA2D_AT)[1:2] <- c('PCoA1', 'PCoA2')
#plot it
Tr_PcoA <- ggplot2::ggplot( # Would be good to make this 3D at some point
PcoA2D_AT,
ggplot2::aes(x = .data$PCoA1, y = .data$PCoA2, label = siteData$Site)
) +
ggplot2::geom_point(size =2) +
ggplot2::geom_text(
col = 'black',
size = 4,
check_overlap = TRUE
) +
ggplot2::labs(
title = paste('PCoA', gsub('.csv','', files[i])),
y = paste0(
"PCoA-2 (",
paste0((l2 * 100), "%"),
" variance explained)"
),
x = paste0(
"PCoA-1 (",
paste0((l1 * 100), "%"),
" variance explained)"
)
) +
ggplot2::theme_bw()
print(Tr_PcoA)
#------------------------------------------
#Select only significant axes
#------------------------------------------
sigPCs <- NULL
for(j in 1:n_axes){
vec <- pcdat[j]
axisName <- names(vec)
names(vec) <- c('pc_axis')
## Basic dbRDA Analysis
vare.cap <- vegan::capscale(
data_resist ~ vec$pc_axis,
parallel = cl
)
sig <- stats::anova(vare.cap) #999 permutations
pval <- sig$`Pr(>F)`
sigPCs[[j]] <- c(axisName, pval[1])
}
sigPC <- as.data.frame(do.call(rbind, sigPCs))
names(sigPC) <- c('Axis', 'P-value')
sigPC$`P-value` <- as.numeric(as.character(sigPC$`P-value`))
#select significant pcs
threholdScore <- sigPC %>%
dplyr::filter(.data$`P-value`<= p_val)
#------------------------------------------
#select informative axes
ReducedAxis <- pcdat %>%
subset(select = threholdScore$Axis)
# add site data
ReducedAxisDF <- cbind(siteData, ReducedAxis)
rownames(ReducedAxisDF) <- ReducedAxisDF$Site
ReducedAxisDF$Site <-NULL
#add resistance surface names
names(ReducedAxisDF) <- paste(
gsub('.csv','', files[i]),
names(ReducedAxisDF),
sep='_'
)
ReducedAxisDF
})
#save as a data frame
as.data.frame(do.call(cbind, final_d))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.