Nothing
#' Download genome (fasta), annotation (GTF) and contaminants
#'
#' This function automatically downloads (if files not already exists)
#' genomes and contaminants specified for genome alignment.
#' Will create a R transcript database (TxDb object) from the annotation. \cr
#' It will also index the genome for you\cr
#' If you misspelled something or crashed, delete wrong files and
#' run again.\cr
#' Do remake = TRUE, to do it all over again.
#'
#' If you want custom genome or gtf from you hard drive, assign it
#' after you run this function, like this:\cr
#' annotation <- getGenomeAndAnnotation(GTF = FALSE, genome = FALSE)\cr
#' annotation["genome"] = "path/to/genome.fasta"\cr
#' annotation["gtf"] = "path/to/gtf.gtf"
#' @param organism scientific name of organism, Homo sapiens,
#' Danio rerio, Mus musculus, etc. See \code{biomartr:::get.ensembl.info()}
#' for full list of supported organisms.
#' @param output.dir directory to save downloaded data
#' @param db database to use for genome and GTF,
#' default adviced: "ensembl" (will contain haplotypes, large file!).
#' Alternatives: "refseq" (primary assembly) and "genbank" (mix)
#' @param GTF logical, default: TRUE, download gtf of organism specified
#' in "organism" argument. If FALSE, check if the downloaded
#' file already exist. If you want to use a custom gtf from you hard drive,
#' set GTF = FALSE,
#' and assign: \cr annotation <- getGenomeAndAnnotation(gtf = FALSE)\cr
#' annotation["gtf"] = "path/to/gtf.gtf".\cr
#' Only db = "ensembl" allowed for GTF.
#' @param genome logical, default: TRUE, download genome of organism
#' specified in "organism" argument. If FALSE, check if the downloaded
#' file already exist. If you want to use a custom gtf from you hard drive,
#' set GTF = FALSE,
#' and assign: \cr annotation <- getGenomeAndAnnotation(genome = FALSE)\cr
#' annotation["genome"] = "path/to/genome.fasta".\cr
#' Will download the primary assembly for ensembl
#' @param merge_contaminants logical, default TRUE. Will merge
#' the contaminants specified into one fasta file, this considerably
#' saves space and is much quicker to align with STAR than each contamint
#' on it's own. If no contaminants are specified, this is ignored.
#' @param phix logical, default FALSE, download phix sequence to filter
#' out with. Phix is used as a contaminant genome.
#' Only use if illumina sequencing. Phix is used in Illumina sequencers for
#' sequencing quality control. Genome is: refseq, Escherichia virus phiX174
#' @param ncRNA logical or character, default FALSE (not used, no download),
#' ncRNA is used as a contaminant genome.
#' If TRUE, will try to find ncRNA sequences from the gtf file, usually represented as
#' lncRNA (long noncoding RNA's). Will let you know if no ncRNA sequences were found in
#' gtf.\cr If not found try character input:\cr
#' Alternatives: "auto" or manual assign like "human".
#' If "auto" will try to find ncRNA file on NONCODE from organism,
#' Homo sapiens -> human etc. "auto" will not work for all,
#' then you must specify the name used by
#' NONCODE, go to the link below and find it.
#' If not "auto" / "" it must be a character vector
#' of species common name (not scientific name) Homo sapiens is human,
#' Rattus norwegicus is rat etc, download ncRNA sequence to filter out with.
#' From NONCODE online server, if you cant find
#' common name see: http://www.noncode.org/download.php/
#' @param tRNA logical or character, default FALSE (not used, no download),
#' tRNA is used as a contaminant genome.
#' If TRUE, will try to find tRNA sequences from the gtf file, usually represented as
#' Mt_tRNA (mature tRNA's). Will let you know if no tRNA sequences were found in
#' gtf. If not found try character input:\cr
#' if not "" it must be a character vector to valid path of mature
#' tRNAs fasta file to remove as contaminants on your disc. Find and download
#' your wanted mtRNA at: http://gtrnadb.ucsc.edu/, or run trna-scan on
#' you genome.
#' @param rRNA logical or character, default FALSE (not used, no download),
#' rRNA is used as a contaminant genome.
#' If TRUE, will try to find rRNA sequences from the gtf file, usually represented as
#' rRNA (ribosomal RNA's). Will let you know if no rRNA sequences were found in
#' gtf. If not found you can try character input:\cr If "silva" will download silva SSU & LSU
#' sequences for all species (250MB file) and use that. If you want a smaller file go to
#' https://www.arb-silva.de/ \cr
#' If not "" or "silva" it must be a character vector to valid path of mature
#' rRNA fasta file to remove as contaminants on your disc.
#' @param gunzip logical, default TRUE, uncompress downloaded files
#' that are zipped when downloaded, should be TRUE!
#' @param remake logical, default: FALSE, if TRUE remake everything specified
#' @param assembly_type a character string specifying from which assembly type
#' the genome shall be retrieved from (ensembl only, else this argument is ignored):
#' Default is
#' \code{assembly_type = "primary_assembly")}.
#' This will give you all no copies of any chromosomes.
#' As an example, the primary_assembly fasta genome in human is only a few GB uncompressed.\cr
#' \code{assembly_type = "toplevel")}.
#' This will give you all multi-chromosomes (copies of the same chromosome with small variations).
#' As an example the toplevel fasta genome in human is over 70 GB uncompressed.
#' To get primary assembly with 1 chromosome variant per chromosome:
#' @importFrom biomartr getGTF getGenome getENSEMBLInfo
#' @importFrom Rsamtools indexFa
#' @importFrom R.utils gunzip
#' @importFrom utils download.file
#' @importFrom AnnotationDbi saveDb
#' @importFrom Biostrings DNAStringSet
#' @importFrom Biostrings writeXStringSet
#' @importFrom Biostrings readDNAStringSet
#' @return a named character vector of path to genomes and gtf downloaded,
#' and additional contaminants if used. If merge_contaminants is TRUE, will not
#' give individual fasta files to contaminants, but only the merged one.
#' @family STAR
#' @export
#' @examples
#' output.dir <- "/Bio_data/references/zebrafish"
#' #getGenomeAndAnnotation("Danio rerio", output.dir)
#'
#' ## Get Phix contamints to deplete during alignment
#' #getGenomeAndAnnotation("Danio rerio", output.dir, phix = TRUE)
#'
getGenomeAndAnnotation <- function(organism, output.dir, db = "ensembl",
GTF = TRUE, genome = TRUE,
merge_contaminants = TRUE, phix = FALSE,
ncRNA = FALSE, tRNA = FALSE, rRNA = FALSE,
gunzip = TRUE, remake = FALSE,
assembly_type = "primary_assembly") {
# Pre checks
finished.file <- paste0(output.dir, "/outputs.rds")
if (file.exists(finished.file) & !remake) {
message("Loading premade Genome files,
do remake = TRUE if you want to run again")
return(readRDS(finished.file))
}
if (!(assembly_type %in% c("toplevel", "primary_assembly")))
stop("Please select one the available assembly types: \ntoplevel, primary_assembly")
conts <- as.character(c(phix, ncRNA, tRNA, rRNA))
any_contaminants <- any(!(conts %in% c("", FALSE)))
# Start process
dir.create(output.dir, recursive = TRUE)
organism <- gsub(" ", "_", organism)
## Go through all contaminants:
if (!(tRNA %in% c("", FALSE, TRUE))) {
if (!file.exists(tRNA)) stop(paste("local tRNA file is given and does not exist:",
tRNA))
}
if (!(rRNA %in% c("", FALSE, TRUE, "silva"))) {
if (!file.exists(rRNA)) stop(paste("local rRNA file is given and does not exist:",
tRNA))
} else if (rRNA == "silva") rRNA <- get_silva_rRNA()
phix <- get_phix_genome(phix, output.dir, gunzip)
ncRNA <- get_noncoding_rna(ncRNA, output.dir, organism, gunzip)
# Get species fasta genome and gtf
genome <- get_genome_fasta(genome, output.dir, organism,
assembly_type, db, gunzip)
gtf <- get_genome_gtf(GTF, output.dir, organism, assembly_type, gunzip,
genome)
if (any_contaminants) {
# Find which contaminants to find from gtf:
conts <- as.character(c(phix, ncRNA, tRNA, rRNA))
names(conts) <- c("phix", "ncRNA", "tRNA", "rRNA")
non_gtf_contaminants <- conts[!(conts %in% c(TRUE, FALSE, ""))]
gtf_contaminants <- conts[conts == TRUE]
if (length(gtf_contaminants) > 0) {
if (is.logical(gtf) | is.logical(genome))
stop("gtf or genome not specified, so impossible to find gtf contaminants!")
# Make fasta file of those contaminants
total_seqs <- DNAStringSet()
gtf.imp <- importGtfFromTxdb(gtf)
txdb <- loadTxdb(paste0(gtf, ".db"))
tx <- loadRegion(txdb)
# Loop through the gtf contaminants
for (t in names(gtf_contaminants)) {
valids <- gtf.imp[grep(x = gtf.imp$transcript_biotype, pattern = t)]
if (length(gtf.imp) == 0) {
warning(paste("Found no transcripts in gtf of type:", t))
next
}
valid_tx <- tx[unique(valids$transcript_id)]
seqs <- txSeqsFromFa(valid_tx, genome, TRUE, TRUE)
path.cont <- paste0(output.dir, "/", t, ".fasta")
writeXStringSet(seqs, path.cont)
total_seqs <- c(total_seqs, seqs)
gtf_contaminants[t] <- path.cont
}
}
}
if (merge_contaminants & any_contaminants) {
message("Merging contaminant genomes:")
for (cont in non_gtf_contaminants)
total_seqs <- c(total_seqs, readDNAStringSet(cont))
all_contaminants <- paste(c(names(non_gtf_contaminants),
names(gtf_contaminants)), collapse = "_")
all_cont <- paste0(output.dir, "/", "merged_contaminants_", all_contaminants, ".fasta")
writeXStringSet(total_seqs, filepath = all_cont)
output <- c(gtf, genome, all_cont)
names(output) <- c("gtf", "genome", "contaminants")
} else {
output <- c(gtf, genome, phix, ncRNA, tRNA, rRNA)
names(output) <- c("gtf", "genome", "phix", "ncRNA", "tRNA", "rRNA")
}
output <- output[!(output %in% c("", "TRUE","FALSE"))]
message("All data downloaded and ready at:")
message(output.dir)
saveRDS(object = output, finished.file)
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.