#' @title Quality check graphics for DNA Barcodes
#'
#' @name get_GenBank_seqs_with_efecth
#'
#' @description Obtain a full sequences, CDS sequence and protein files from
#' a list of NCBI accession number
#'
#' @param seq_genbank_ids_user A list of NCBI accession numbers
#' @param is_nucleotide_user Logical. True indicates if the accession numbers
#' are nucleotides data. False indicates if data are protein data
#' @param get_cds_user Logical. Indicates if CDS information is obtain from
#' NCBI
#' @param organize_name_user A list of characters or names to include in the
#' sequence header.
#' @param force_name_to_genes_user Character to rename the gene name in a
#' sequence. default is NULL.
#' @param file_out_user character to add in the output file.
#' @param out_directory_user Output directory.
#'
#' @return
#' three fasta files: full sequence, CDS sequence and protein information.
#' a data frame with sequences information in text format
#' @export
#'
#'@importFrom utils "write.table"
#'
#' @examples
#'get_genes <- list( c("KU494314",
#' "MK369820",
#' "KC520685",
#' "DQ502716",
#' "KM024310"))
#'
#'file_out <- c("COX1")
#'
#'for(i in 1:length(file_out)) {
#' get_GenBank_seqs_with_efecth (seq_genbank_ids_user = get_genes[[i]],
#' is_nucleotide_user = TRUE,
#' get_cds_user = TRUE,
#' organize_name_user = c("species","gene","accession_number"),
#' force_name_to_genes_user = NULL,
#' file_out_user = file_out[i],
#' out_directory_user = NULL)
#' }
#' rm(i)
#'
#'
#'
get_GenBank_seqs_with_efecth <- function(seq_genbank_ids_user,
is_nucleotide_user = TRUE,
get_cds_user = TRUE,
organize_name_user = c("species","gene",
"accession_number"),
force_name_to_genes_user = NULL,
file_out_user = NULL,
out_directory_user = NULL) {
# user input
seq_genbank_ids <- seq_genbank_ids_user
is_nucleotide <- is_nucleotide_user
get_cds <- get_cds_user
organize_name <- organize_name_user
force_name_to_genes <- force_name_to_genes_user
file_out <- file_out_user
if(is.null(file_out)) { file_out <- paste0("sequences_N_")
} else {
file_out <- paste0(file_out, "_sequences_N_")
}
## outdir arguments
if(is.null(out_directory_user)) {
out_directory <- getwd()
} else {
setwd(out_directory_user)
out_directory <- getwd()
}
# build html request
line_1 <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db="
if(is_nucleotide) {
line_2 <- "nucleotide&id="
line_4 <- ifelse(get_cds,"&rettype=fasta_cds_na&retmode=text",
"&rettype=fasta&retmode=text")
} else {
line_2 <- "protein&id="
line_4 <- ifelse(get_cds,"&rettype=fasta_cds_aa&retmode=text",
"&rettype=fasta&retmode=text")
}
if(is_nucleotide) {
line_2 <- "nucleotide&id="
line_4B <- "&rettype=gb&retmode=text"
} else {
line_2 <- "protein&id="
line_4B <- "&rettype=gb&retmode=text"
}
if(is_nucleotide) {
line_2 <- "nucleotide&id="
line_4C <- "&rettype=fasta&retmode=text"
} else {
line_2 <- "protein&id="
line_4C <- "&rettype=fasta&retmode=text"
}
# get genbank
all_entries <- list()
cat("\n *** processing sequences: \n")
counter <- 0
for(i in 1:length(seq_genbank_ids)) {
# i <- 8
one_html_seq <- paste0(line_1,line_2,seq_genbank_ids[i],line_4)
Sys.sleep(0.5)
one_seq_vector <- try(scan(file = one_html_seq, what = "",
sep = "\n", quiet = TRUE), silent = TRUE)
if(class(one_seq_vector) == "try-error") {
cat(paste0("accession_name: ",
seq_genbank_ids[i]," -- CANNOT BE RETRIEVED\n"))
next
}
counter <- counter + 1
Sys.sleep(0.5)
one_html_seq_full <- paste0(line_1,line_2,seq_genbank_ids[i],line_4C)
Sys.sleep(0.5)
one_seq_vector_full <- scan(file = one_html_seq_full, what = "",
sep = "\n", quiet = TRUE)
Sys.sleep(0.5)
one_html_seq_gb <- paste0(line_1,line_2,seq_genbank_ids[i],line_4B)
Sys.sleep(0.5)
one_seq_vector_gb <- scan(file = one_html_seq_gb, what = "",
sep = "\n", quiet = TRUE)
Sys.sleep(0.5)
# build header
header_vector <- one_seq_vector[1]
seq_vector <- paste0(one_seq_vector[2:length(one_seq_vector)], collapse="")
header_full <- one_seq_vector_full[1]
seq_full <- paste0(one_seq_vector_full[2:length(one_seq_vector_full)],
collapse="")
# get length
seq_bp <- nchar(seq_vector)
# get species
species_name <- gsub(" +ORGANISM +", "", grep("ORGANISM", one_seq_vector_gb,
value = TRUE))
species_name <- gsub(" ", "_", species_name)
species_name <- ifelse(is.na(species_name), "unk_species", species_name)
# get gene
gene_name <- grep("/gene=", one_seq_vector_gb, value = TRUE)[1]
gene_name <- gsub(" ", "", gene_name)
gene_name <- gsub('/gene=', "", gene_name)
gene_name <- gsub('"', "", gene_name)
gene_name <- gsub("\\", "", gene_name, fixed=TRUE)
gene_name <- ifelse(is.na(gene_name), "unkgene", gene_name)
if(gene_name == "unkgene") {
gene_name_2 <- gsub(".*gene=(.+)\\].*", "\\1", header_vector)
gene_name_2 <- sub("\\].*", "", gene_name_2)
gene_name <- ifelse(is.na(gene_name_2), "unkgene", gene_name_2)
}
# accession_number
accession_name <- gsub("ACCESSION", "", grep("ACCESSION", one_seq_vector_gb,
value = TRUE))
accession_name <- gsub(" ", "", accession_name)
accession_name <- ifelse(is.na(accession_name), "unkaccession", accession_name)
# accession_protein_number
protein_id <- grep("/protein_id=", one_seq_vector_gb, value = TRUE)[1]
protein_id <- gsub(" ", "", protein_id)
protein_id <- gsub('/protein_id=', "", protein_id)
protein_id <- gsub('"', "", protein_id)
protein_id <- gsub("\\", "", protein_id, fixed=TRUE)
protein_id <- ifelse(is.na(protein_id), "unkprotein", protein_id)
if(protein_id == "unkgene") {
protein_id_2 <- gsub(".*protein_id=(.+)\\].*", "\\1",
header_vector)
protein_id_2 <- sub("\\].*", "", protein_id_2)
protein_id <- ifelse(is.na(protein_id_2), "unkgene",
protein_id_2)
}
# get protein sequence if present
start_trans <- grep("translation=", one_seq_vector_gb)
end_trans <- grep("ORIGIN ", one_seq_vector_gb, fixed = TRUE)[1]
if(length(start_trans) == 0 ) {
complete_prot <- NA
} else if (length(end_trans) == 0) {
complete_prot <- NA
} else if(end_trans > start_trans) {
line_start <- one_seq_vector_gb[start_trans]
start_line <- gsub(".*translation=(.+)\\.*", "\\1", line_start)
start_line <- gsub("[^[:alnum:]]","",start_line)
start_trans_2 <- start_trans+1
end_trans_2 <- end_trans-1
body_trans <- one_seq_vector_gb[start_trans_2:end_trans_2]
body_trans <- gsub("[^[:alnum:]]","",body_trans)
complete_prot <- paste0(start_line,body_trans, collapse ="")
} else {
complete_prot <- NA
}
# build data.frame
one_entry <- data.frame(species_name = species_name,
gene_name = gene_name,
accession_name = accession_name,
seq_bp = seq_bp,
protein_id = protein_id,
nuc_sequence = seq_vector,
full_sequence = seq_full,
prot_sequence = complete_prot,
stringsAsFactors = FALSE)
cat(paste0(species_name, " -- ", gene_name, " -- ", accession_name, " -- ",
seq_bp, " bp \n"))
all_entries[[counter]] <- one_entry
}
rm(i)
all_entries_df <- do.call(rbind, all_entries)
################################### create fasta output
if(!is.null(force_name_to_genes)) {
all_entries_df$gene_name <- force_name_to_genes
}
## build names
name_order_vector <- list()
for(i in 1:length(organize_name)) {
# i <- 1
if(organize_name[i] == "species") {
name_order_vector[[i]] <- all_entries_df$species_name}
if(organize_name[i] == "gene") {
name_order_vector[[i]] <- all_entries_df$gene_name}
if(organize_name[i] == "accession_number") {
name_order_vector[[i]] <- all_entries_df$accession_name}
}
rm(i)
## merge names
name_end <- do.call(cbind,name_order_vector)
name_end <- apply(name_end,1,paste,collapse="_")
name_end_cds <- paste0(">",name_end)
############# nbpar -- cds
# remove rows with no nucleotides
all_entries_cds <- subset(all_entries_df, select=c(accession_name,nuc_sequence))
# if na cds
no_na_cds <- !all_entries_cds[,2] == "NANA"
# final names and cds
name_cds_end <- name_end_cds[no_na_cds]
all_entries_cds_end <- all_entries_cds[no_na_cds,]
if(length(name_cds_end) == 0) {
sequence_fasta_cds <- NULL
} else {
# build fasta cds
nbpar <- sapply(all_entries_cds_end$nuc_sequence, length)
sequence_fasta_cds <- vector(length = length(all_entries_cds_end$nuc_sequence) + length(name_cds_end))
id <- 1
for (i in seq(along = nbpar[-1])) id <- c(id, tail(id, 1) + nbpar[i] + 1)
# cds
sequence_fasta_cds[id] <- name_cds_end
sequence_fasta_cds[-id] <- all_entries_cds_end$nuc_sequence
# write file full fasta files
file_out_fasta_cds_name <- paste0(file_out,
length(all_entries_cds_end$nuc_sequence),
"_queries_nucleotide_cds.fasta")
sequence_fasta_cds_out <- cat(sequence_fasta_cds,
file = file_out_fasta_cds_name, sep = "\n")
cat("\n ******* nucleotide sequences CDS in fasta format have been written to this file - ", file_out_fasta_cds_name, "\n")
}
############# nbpar -- full
name_end_full <-paste0(name_end_cds,"_full")
nbpar <- sapply(all_entries_df$full_sequence, length)
sequence_fasta_full <- vector(length = length(all_entries_df$full_sequence) + length(name_end_full))
id <- 1
for (i in seq(along = nbpar[-1])) id <- c(id, tail(id, 1) + nbpar[i] + 1)
# full nucleotide
sequence_fasta_full[id] <- name_end_full
sequence_fasta_full[-id] <- all_entries_df$full_sequence
# write file full fasta files
file_out_fasta_full_name <- paste0(file_out,
length(all_entries_df$full_sequence),
"_queries_nucleotide_full.fasta")
sequence_fasta_full_out <- cat(sequence_fasta_full,
file = file_out_fasta_full_name, sep = "\n")
cat("\n ******* nucleotide sequences FULL in fasta format have been written to this file - ", file_out_fasta_full_name, "\n")
############# nbpar -- protein
name_protein <- paste0(name_end_cds,"_protein")
# remove rows with no proteins
all_entries_prot <- subset(all_entries_df, select=c(accession_name,
prot_sequence))
# if na proteins
no_na_proteins <- !is.na(all_entries_prot[,2])
# final names and protein
name_protein_end <- name_protein[no_na_proteins]
all_entries_prot_end <- all_entries_prot[no_na_proteins,]
if(length(name_protein_end) == 0) {
sequence_fasta_protein <- NULL
} else {
# build fasta protein
nbpar <- sapply(all_entries_prot_end$prot_sequence, length)
sequence_fasta_protein <- vector(length = length(all_entries_prot_end$prot_sequence) + length(name_protein_end))
id <- 1
for (i in seq(along = nbpar[-1])) id <- c(id, tail(id, 1) + nbpar[i] + 1)
# protein
sequence_fasta_protein[id] <- name_protein_end
sequence_fasta_protein[-id] <- all_entries_prot_end$prot_sequence
# write file full fasta files
file_out_fasta_prot_name <- paste0(file_out,
length(all_entries_prot_end$prot_sequence),
"_queries_protein.fasta")
sequence_fasta_prot_out <- cat(sequence_fasta_protein,
file = file_out_fasta_prot_name, sep = "\n")
cat("\n ******* PROTEIN sequences in fasta format have been written to this file - ", file_out_fasta_prot_name, "\n")
}
############ out dataframe
file_out_df_fasta_name <- paste0(file_out, nrow(all_entries_df),
"_GenBank_queries_df.txt")
utils::write.table(all_entries_df, file = file_out_df_fasta_name, sep = "\t",
row.names = FALSE)
############ processing section: DONE
setwd(out_directory)
return(all_entries_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.