#' AsgeneDB: A curated orthology arsenic metabolism gene database and computational tool for metagenome annotation
#' @description A manually curated arsenic functional gene database (AsgeneDB) and R package (Asgene package) are developed for rapid and accurate metagenomic analysis.
#' @param analysis Choose the target you want to analyze,abundance or taxonomy; type = "character"; default = "abundance"
#' @param workdir Specify directory for sequence file location;type = "character"; default = "./"
#' @param method Specify the database searching tool you plan to use, currently diamond, usearch and blast are supported; type = "character"; default = "diamond"
#' @param toolpath Specify directory for searching tool location; type = "character"; default = "./"
#' @param search_parameters Define metagenomic comparison parameters; type = "character"; default = "-e 1e-4 -p 28 --query-cover 80 --id 50"
#' @param seqtype Specify your sequence type, nucl or prot; type = "character"; default = "nucl"
#' @param filetype Specify the extensions of your sequence files, e.g., fastq, fastq.gz, fasta,fasta.gz, fq, fq.gz, fa, fa.gz; type = "character"; default = "fasta"
#' @param PE Specifies whether your metagenomic data were PE files, noting that PE files are named "_R1 with extension" or "_R2 with extension" (e.g., XINWEI_R1.fasta and XINWEI_R2.fasta); type = "logical"; default = TRUE
#' @param output Specify the directory for the final result file; type = "character"; default = "./"
#' @param test.data Example datasets are provided as input and output to help users better understand this package; type = "logical"; default = FALSE
#'
#' @return merge.data1
#' @export
#'
#' @examples
#' library(Asgene)
#' Asgene(analysis = "abundance", workdir = "./", method = "diamond", toolpath = "./", search_parameters = "-e 1e-4 -p 28 --query-cover 80 --id 50", seqtype = "nucl", filetype = "fasta", PE = TRUE, out = "./", test.data = FALSE)
#' Asgene(analysis = "abundance", workdir = "./", method = "diamond", toolpath = "./", search_parameters = "-e 1e-4 -p 28 --query-cover 80 --id 50",seqtype = "prot", output = "./", test.data = TRUE)
Asgene <- function(analysis = "abundance", workdir = "./", method = "diamond", toolpath = "./", search_parameters = "-e 1e-4 -p 28 --query-cover 80 --id 50", seqtype = "nucl", filetype = "fasta", PE = TRUE, output = "./", test.data = FALSE) {
if (test.data == FALSE){
setwd("./")
# Install dependent packages
if (!requireNamespace("dplyr", quietly = TRUE)) {
install.packages("dplyr")
}
if (!requireNamespace("seqinr", quietly = TRUE)) {
install.packages("seqinr")
}
library(dplyr)
library(seqinr)
# Call comparison tool
if (method == "diamond") {
system("mkdir sample_file")
write.fasta(sequences = AsgeneDB, names =names(AsgeneDB), file.out = 'AsgeneDB')
system(paste(toolpath, "diamond makedb --in AsgeneDB --db ./AsgeneDB", sep = ""))
file <- list.files(path = workdir, pattern = filetype)
for (i in file) {
file_1 <- paste(workdir, i, sep = "")
out <- gsub(filetype, "diamond", i)
out <- paste("./sample_file/", out, sep = "")
if (seqtype == "nucl") {
system(paste(toolpath, "diamond blastx ", search_parameters, " -d ./AsgeneDB.dmnd -q ", file_1, " -o ", out, sep = ""))
}
if (seqtype == "prot") {
system(paste(toolpath, "diamond blastp ", search_parameters, " -d ./AsgeneDB.dmnd -q ", file_1, " -o ", out, sep = ""))
}
print(paste("Comparison of ",i," successful!!",sep = ""))
}
}
else if (method == "usearch") {
if (grepl("gz", filetype)) {
stop("Only fastq and fasta files are supported by usearch!")
} else {
system("mkdir sample_file")
file <- list.files(path = workdir, pattern = filetype)
for (i in file) {
file_1 <- paste(workdir, i, sep = "")
out <- gsub(filetype, "usearch", i)
out <- paste("./sample_file/", out, sep = "")
system(paste(toolpath, "usearch -usearch_global ", file_1, " -db AsgeneDB ", search_parameters, " -blast6out ", out, sep = ""))
print(paste("Comparison of ",i," successful!!",sep = ""))
}
}
} else if (method == "blast") {
if (grepl("gz|fastq|fq", filetype, perl = TRUE)) {
stop("Only fasta files are supported by blast program!")
} else {
system("mkdir sample_file")
file <- list.files(path = workdir, pattern = filetype)
system(toolpath, "makeblastdb -dbtype prot -input_type fasta -in AsgeneDB -out ./AsgeneDB")
for (i in file) {
file_1 <- paste(workdir, i, sep = "")
out <- gsub(filetype, "blast", i)
out <- paste("./sample_file/", out, sep = "")
if (seqtype == "nucl") {
system(paste(toolpath, "blastx -db ./AsgeneDB ", search_parameters, " -query ", file_1, " -out ", out, sep = ""))
}
if (seqtype == "prot") {
system(paste(toolpath, "blastp -db ./AsgeneDB ", search_parameters, " -query ", file_1, " -out ", out, sep = ""))
}
print(paste("Comparison of ",i," successful!!",sep = ""))
}
}
} else {
stop("Specify the database searching tool you plan to use!")
}
# Merge metagenomic PE files (take union)
list <- read.table("./sampleinfo.txt", sep = "\t", header = F)
if (PE == TRUE) {
system("mkdir sample_merge")
for (i in list[, 1]) {
info1 <- file.info(file = paste("./sample_file/", i, "_R1.", method, sep = ""))
info2 <- file.info(file = paste("./sample_file/", i, "_R2.", method, sep = ""))
if (info1$size != 0 && info2$size != 0) {
file_R1 <- read.table(file = paste("./sample_file/", i, "_R1.", method, sep = ""), sep = "\t")
file_R1 <- data.frame(file_R1)
file_R2 <- read.table(file = paste("./sample_file/", i, "_R2.", method, sep = ""), sep = "\t")
file_R2 <- data.frame(file_R2)
file_total <- rbind(file_R1, file_R2)
file_total_2 <- aggregate(file_total[, 3] ~ file_total[, 1], data = file_total, FUN = "max")
file_total_2 <- as.data.frame(file_total_2)
names(file_total_2)[1] <- "V1"
names(file_total_2)[2] <- "V3"
file_total_2 <- merge(file_total_2, file_total, by = c("V1", "V3"))
file_total_2 <- file_total_2 %>% filter(!duplicated(V1))
write.table(file_total_2, file = paste("sample_merge/", i, ".", method, sep = ""), sep = " ", quote = FALSE, row.names = FALSE, col.names = F)
print(i)
} else {
next
}
}
}
# count the abundance values of each sample and standardize
if (analysis == "abundance") {
if (PE == TRUE) {
file.name <- list.files(path = "./sample_merge/", pattern = method)
} else {
file.name <- list.files(path = "./sample_file/", pattern = method)
}
for (i in file.name) {
if (PE == TRUE) {
file <- read.table(file = paste("./sample_merge/", i, sep = ""), sep = " ", header = F)
} else {
info <- file.info(paste("./sample_file/", i, sep = ""))
if (info$size != 0) {
file <- read.table(file = paste("./sample_file/", i, sep = ""), sep = "\t", header = F)
} else {
next
}
}
i <- gsub(paste(".", method, sep = ""), "", i)
file_1 <- mutate(file, v = i)
# add total reads
list <- data.frame(list)
names(list)[1] <- "v"
t <- list[grep(i, list$v), ]
file_2 <- merge(t, file_1, by = "v")
if (PE == TRUE) {
file_2 <- file_2[, c(1, 2, 3, 5)]
} else {
file_2 <- file_2[, c(1, 2, 3, 4)]
}
asgene_map <- asgene.map
asgene_map <- as.data.frame(asgene_map)
file_2 <- as.data.frame(file_2)
names(file_2)[4] <- "pi"
names(asgene_map)[1] <- "pi"
names(asgene_map)[2] <- "gene"
result_g <- merge(file_2, asgene_map, by = "pi")
result_g <- result_g[, -6]
names(result_g)[3] <- "totalreads"
protein_length_1 <- protein_length
protein_length_1 <- as.data.frame(protein_length_1)
names(protein_length_1)[1] <- "pi"
result <- merge(result_g, protein_length_1, by = "pi")
result <- as.data.frame(result)
names(result)[6] <- "length"
result_x <- result
result_x <- result_x[order(result_x$gene), ]
result_y <- dplyr::count(result_x, result_x$pi)
result_y <- as.data.frame(result_y)
names(result_y)[1] <- "pi"
result_v <- merge(result_x, result_y, by = "pi")
result_v1 <- result_v[!duplicated(result_v$pi), ]
# Formula=SUM protein{[mapped reads/(total reads*protein length*3)]*10^9}
result_v1$totalreads <- as.numeric(result_v1$totalreads)
result_df <- data.frame(result_v1)
result_df <- mutate(result_df, c = result_v1$n * 10^9 / (result_v1$length * 3))
result_df <- mutate(result_df, d = c / result_v1$totalreads)
# group_by_sum
result_df1 <- aggregate(result_df$d, by = list(result_df$gene), sum)
result_df1 <- as.data.frame(result_df1)
names(result_df1) <- c("gene", as.character(i))
system("mkdir sample_gene_abundance")
write.table(result_df1, file = paste("./sample_gene_abundance/", i, ".csv", sep = ""), sep = ",", quote = FALSE, row.names = FALSE)
print(paste("sample","(",i,") ","of arsenic metabolic genes abundance analysis successful!!",sep = ""))
}
print("All samples of arsenic metabolic genes abundance analysis successful!!")
abundance_csv <- list.files(path = "./sample_gene_abundance/", pattern = ".csv")
n <- length(abundance_csv)
merge.data <- read.csv(file = paste("./sample_gene_abundance/", abundance_csv[1], sep = ""), header = T, sep = ",")
merge.data1 <- data.frame(merge.data)
for (i in 2:n) {
new.data <- read.csv(file = paste("./sample_gene_abundance/", abundance_csv[i], sep = ""), sep = ",", header = T)
merge.data1 <- merge(merge.data1, new.data, by = "gene", all = T)
}
write.csv(merge.data1, file = paste(output, "sample_abundance.csv", sep = ""), row.names = F)
print("All samples of arsenic metabolic genes abundance statistics successful!!")
}
# count functional gene drive species of all samples
if (analysis == "taxonomy") {
if (PE == TRUE) {
list <- list.files(path = "./sample_merge/", pattern = method)
} else {
list <- list.files(path = "./sample_file/", pattern = method)
}
system("mkdir sample_gene_tax")
for (i in list) {
if (PE == TRUE) {
a <- read.table(file = paste("./sample_merge/", i, sep = ""), sep = " ", header = F)
} else {
a <- read.table(file = paste("./sample_file/", i, sep = ""), sep = "\t", header = F)
}
a <- as.data.frame(a)
if (PE == TRUE) {
names(a)[1] <- "reads_id"
names(a)[3] <- "protein_id"
a <- a[, c(1, 3)] %>% filter(!duplicated(a[, 1]))
}
else {
names(a)[1] <- "reads_id"
names(a)[2] <- "protein_id"
a <- a[, c(1, 2)] %>% filter(!duplicated(a[, 1]))
}
i <- gsub(".diamond", "", i)
a1 <- mutate(a, "sample" = i)
sample_gene_tax_pathway <- merge(a1, id_gene_tax_pathway, by = "protein_id")
write.table(sample_gene_tax_pathway, file = paste("./sample_gene_tax/", i, ".csv", sep = ""), sep = ",", quote = FALSE, row.names = FALSE)
}
a <- list.files(path = "./sample_gene_tax/", pattern = ".csv")
n <- length(a)
merge.data <- read.csv(file = paste("./sample_gene_tax/", a[1], sep = ""), header = T, sep = ",")
merge.data1 <- data.frame(merge.data)
for (i in 2:n) {
new.data <- read.csv(file = paste("./sample_gene_tax/", a[i], sep = ""), sep = ",", header = T)
merge.data1 <- merge(merge.data1, new.data, all = T)
}
write.csv(merge.data1, file = paste(output, "sample_gene_tax_pathway.csv", sep = ""), row.names = F)
print("All samples of arsenic metabolic species statistics successful!!")
}
}
if(test.data==TRUE){
# Install dependent packages
if (!requireNamespace("dplyr", quietly = TRUE)) {
install.packages("dplyr")
}
if (!requireNamespace("seqinr", quietly = TRUE)) {
install.packages("seqinr")
}
library(dplyr)
library(seqinr)
setwd("./")
system("mkdir test_data")
write.fasta(sequences = testdata1, names =names(testdata1), file.out = './test_data/testdata1')
write.fasta(sequences = testdata2, names =names(testdata2), file.out = './test_data/testdata2')
write.fasta(sequences = testdata3, names =names(testdata3), file.out = './test_data/testdata3')
# Call comparison tool
if (method == "diamond") {
system("mkdir sample_file")
write.fasta(sequences = AsgeneDB, names =names(AsgeneDB), file.out = 'AsgeneDB')
system(paste(toolpath, "diamond makedb --in AsgeneDB --db ./AsgeneDB", sep = ""))
for (i in c("testdata1","testdata2","testdata3")) {
file_1 <- paste(workdir,"test_data/", i, sep = "")
out <- paste(i,".diamond",sep = "")
out <- paste("./sample_file/", out, sep = "")
if (seqtype == "nucl") {
system(paste(toolpath, "diamond blastx ", search_parameters, " -d ./AsgeneDB.dmnd -q ", file_1, " -o ", out, sep = ""))
}
if (seqtype == "prot") {
system(paste(toolpath, "diamond blastp ", search_parameters, " -d ./AsgeneDB.dmnd -q ", file_1, " -o ", out, sep = ""))
}
print(paste("Comparison of ",i," successful!!",sep = ""))
}
}
else if (method == "usearch") {
if (grepl("gz", filetype)) {
stop("Only fastq and fasta files are supported by usearch!")
} else {
system("mkdir sample_file")
for (i in c("testdata1","testdata2","testdata3")) {
file_1 <- paste(workdir,"test_data/", i, sep = "")
out <- paste(i,".usearch",sep = "")
out <- paste("./sample_file/", out, sep = "")
system(paste(toolpath, "usearch -usearch_global ", file_1, " -db AsgeneDB ", search_parameters, " -blast6out ", out, sep = ""))
print(paste("Comparison of ",i," successful!!",sep = ""))
}
}
} else if (method == "blast") {
if (grepl("gz|fastq|fq", filetype, perl = TRUE)) {
stop("Only fasta files are supported by blast program!")
} else {
system("mkdir sample_file")
system(toolpath, "makeblastdb -dbtype prot -input_type fasta -in AsgeneDB -out ./AsgeneDB")
for (i in c("testdata1","testdata2","testdata3")) {
file_1 <- paste(workdir,"test_data/", i, sep = "")
out <- paste(i,".blast",sep = "")
out <- paste("./sample_file/", out, sep = "")
if (seqtype == "nucl") {
system(paste(toolpath, "blastx -db ./AsgeneDB ", search_parameters, " -query ", file_1, " -out ", out, sep = ""))
}
if (seqtype == "prot") {
system(paste(toolpath, "blastp -db ./AsgeneDB ", search_parameters, " -query ", file_1, " -out ", out, sep = ""))
}
print(paste("Comparison of ",i," successful!!",sep = ""))
}
}
} else {
stop("Specify the database searching tool you plan to use!")
}
# Merge metagenomic PE files (take union)
list <- testdata_info
# count the abundance values of each sample and standardize
if (analysis == "abundance") {
file.name <- list.files(path = "./sample_file/", pattern = method)
for (i in file.name) {
info <- file.info(paste("./sample_file/", i, sep = ""))
if (info$size != 0) {
file <- read.table(file = paste("./sample_file/", i, sep = ""), sep = "\t", header = F)
} else {
next
}
i <- gsub(paste(".", method, sep = ""), "", i)
file_1 <- mutate(file, v = i)
# add total reads
list <- data.frame(list)
names(list)[1] <- "v"
t <- list[grep(i, list$v), ]
file_2 <- merge(t, file_1, by = "v")
file_2 <- file_2[, c(1, 2, 3, 4)]
asgene_map <- asgene.map
asgene_map <- as.data.frame(asgene_map)
file_2 <- as.data.frame(file_2)
names(file_2)[4] <- "pi"
names(asgene_map)[1] <- "pi"
names(asgene_map)[2] <- "gene"
result_g <- merge(file_2, asgene_map, by = "pi")
result_g <- result_g[, -6]
names(result_g)[3] <- "totalreads"
protein_length_1 <- protein_length
protein_length_1 <- as.data.frame(protein_length_1)
names(protein_length_1)[1] <- "pi"
result <- merge(result_g, protein_length_1, by = "pi")
result <- as.data.frame(result)
names(result)[6] <- "length"
result_x <- result
result_x <- result_x[order(result_x$gene), ]
result_y <- dplyr::count(result_x, result_x$pi)
result_y <- as.data.frame(result_y)
names(result_y)[1] <- "pi"
result_v <- merge(result_x, result_y, by = "pi")
result_v1 <- result_v[!duplicated(result_v$pi), ]
# Formula=SUM protein{[mapped reads/(total reads*protein length*3)]*10^9}
result_v1$totalreads <- as.numeric(result_v1$totalreads)
result_df <- data.frame(result_v1)
result_df <- mutate(result_df, c = result_v1$n * 10^9 / (result_v1$length * 3))
result_df <- mutate(result_df, d = c / result_v1$totalreads)
# group_by_sum
result_df1 <- aggregate(result_df$d, by = list(result_df$gene), sum)
result_df1 <- as.data.frame(result_df1)
names(result_df1) <- c("gene", as.character(i))
system("mkdir sample_gene_abundance")
write.table(result_df1, file = paste("./sample_gene_abundance/", i, ".csv", sep = ""), sep = ",", quote = FALSE, row.names = FALSE)
print(paste("sample","(",i,") ","of arsenic metabolic genes abundance analysis successful!!",sep = ""))
}
print("All samples of arsenic metabolic genes abundance analysis successful!!")
abundance_csv <- list.files(path = "./sample_gene_abundance/", pattern = ".csv")
n <- length(abundance_csv)
merge.data <- read.csv(file = paste("./sample_gene_abundance/", abundance_csv[1], sep = ""), header = T, sep = ",")
merge.data1 <- data.frame(merge.data)
for (i in 2:n) {
new.data <- read.csv(file = paste("./sample_gene_abundance/", abundance_csv[i], sep = ""), sep = ",", header = T)
merge.data1 <- merge(merge.data1, new.data, by = "gene", all = T)
}
write.csv(merge.data1, file = paste(output, "sample_abundance.csv", sep = ""), row.names = F)
print("All samples of arsenic metabolic genes abundance statistics successful!!")
}
# count functional gene drive species of all samples
if (analysis == "taxonomy") {
list <- list.files(path = "./sample_file/", pattern = method)
system("mkdir sample_gene_tax")
for (i in list) {
a <- read.table(file = paste("./sample_file/", i, sep = ""), sep = "\t", header = F)
a <- as.data.frame(a)
names(a)[1] <- "reads_id"
names(a)[2] <- "protein_id"
a <- a[, c(1, 2)] %>% filter(!duplicated(a[, 1]))
i <- gsub(".diamond", "", i)
a1 <- mutate(a, "sample" = i)
sample_gene_tax_pathway <- merge(a1, id_gene_tax_pathway, by = "protein_id")
write.table(sample_gene_tax_pathway, file = paste("./sample_gene_tax/", i, ".csv", sep = ""), sep = ",", quote = FALSE, row.names = FALSE)
print(paste("sample","(",i,") ","of arsenic metabolic species analysis successful!!",sep = ""))
}
a <- list.files(path = "./sample_gene_tax/", pattern = ".csv")
n <- length(a)
merge.data <- read.csv(file = paste("./sample_gene_tax/", a[1], sep = ""), header = T, sep = ",")
merge.data1 <- data.frame(merge.data)
for (i in 2:n) {
new.data <- read.csv(file = paste("./sample_gene_tax/", a[i], sep = ""), sep = ",", header = T)
merge.data1 <- merge(merge.data1, new.data, all = T)
}
write.csv(merge.data1, file = paste(output, "sample_gene_tax_pathway.csv", sep = ""), row.names = F)
print("All samples of arsenic metabolic species statistics successful!!")
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.