# bedtools functions ------------------------------------------------------
#' Extract regions from a fasta file
#'
#' Wrapper for bedtools getfasta.
#'
#' @param bed_file Path to bed file with locations of regions to extract.
#' bed file is a tab-separated file with columns for chromosome (e.g., chr1),
#' start position (e.g., 1), and end position (e.g., 10), in that order.
#' No column headers are used.
#' @param fasta_file Path to file in fasta format to extract regions from.
#' @param out_fasta_file Path to write extracted regions (in fasta format).
#' @param ... Other arguments. Not used by this function, but meant to be used
#' by \code{\link[drake]{drake_plan}} for tracking during workflows.
#' @return List; output of processx::run(). Externally, a fasta file will be
#' written to the path specified by `out_fasta_file`.
#' @author Joel H Nitta, \email{joelnitta@@gmail.com}
#' @examples
#' \dontrun{
#' # First write gene, intron, and exon bed regions out as tsv files
#'
#' temp_dir <- tempdir()
#'
#' find_bed_regions(
#' gff3_file = system.file("extdata", "Arabidopsis_thaliana_TAIR10_40_small.gff3",
#' package = "baitfindR", mustWork = TRUE),
#' source_select = "araport11",
#' out_type = "write_all",
#' out_dir = temp_dir,
#' prefix = "arabidopsis"
#' )
#'
#' # Extract genes.
#' extract_regions_from_fasta(
#' bed_file = fs::path(temp_dir, "arabidopsis_introns"),
#' fasta_file = system.file("extdata", "Arabidopsis_thaliana_TAIR10_40_small.fasta",
#' package = "baitfindR", mustWork = TRUE),
#' out_fasta_file = fs::path(temp_dir, "arabidopsis_gene_seqs.fasta")
#' )
#' }
#' @export
extract_regions_from_fasta <- function (bed_file, fasta_file, out_fasta_file, ...) {
# Check input
assertthat::assert_that(assertthat::is.string(bed_file))
bed_file <- fs::path_abs(bed_file)
assertthat::assert_that(assertthat::is.readable(bed_file))
assertthat::assert_that(assertthat::is.string(fasta_file))
fasta_file <- fs::path_abs(fasta_file)
assertthat::assert_that(assertthat::is.readable(fasta_file))
assertthat::assert_that(assertthat::is.string(out_fasta_file))
out_fasta_file <- fs::path_abs(out_fasta_file)
assertthat::assert_that(assertthat::is.dir(fs::path_dir(out_fasta_file)))
bed <- readr::read_tsv(bed_file, col_names = c("chr", "start", "end"), col_types = "cdd")
checkr::check_data(
bed,
values = list(
chr = "a",
start = 1,
end = 1
),
order = TRUE)
if (!(check_bed_genome_names(fasta_file, bed))) {
stop ("Names don't match between bed file and fasta file headers")
}
# Run bedtools getfasta
processx::run(
command = "bedtools",
args = c(
"getfasta",
"-fi", fasta_file,
"-bed", bed_file,
"-fo", out_fasta_file
),
echo = TRUE
)
}
#' Mask regions in a fasta file.
#'
#' Wrapper for bedtools maskfasta.
#'
#' All regions of the `fasta_file` specified by the `bed_file` will be
#' replaced ("hard-masked") with 'N's.
#'
#' The bed file is a tab-separated file with columns for chromosome (e.g., chr1),
#' start position (e.g., 1), and end position (e.g., 10), in that order.
#' No column headers are used.
#'
#' @param bed_file Path to bed file with locations of regions to mask.
#' @param fasta_file Path to unmasked fasta file.
#' @param out_fasta_file Path to write masked fasta file.
#' @param ... Other arguments. Not used by this function, but meant to be used
#' by \code{\link[drake]{drake_plan}} for tracking during workflows.
#' @return List; output of processx::run(). Externally, a fasta file will be
#' written to the path specified by `out_fasta_file`.
#' @author Joel H Nitta, \email{joelnitta@@gmail.com}
#' @examples
#' \dontrun{
#' # First write genes, introns, and exons out as tsv files
#'
#' temp_dir <- tempdir()
#' find_bed_regions(
#' gff3_file = system.file("extdata", "Arabidopsis_thaliana_TAIR10_40_small.gff3", package = "baitfindR", mustWork = TRUE),
#' source_select = "araport11",
#' out_type = "write_all",
#' out_dir = temp_dir,
#' prefix = "arabidopsis"
#' )
#'
#' # Now mask the genome, using the bed file and genome fasta file.
#' mask_genome(
#' bed_file = "temp_dir/test_introns",
#' fasta_file = "data_raw/Arabidopsis_thaliana.TAIR10.dna.toplevel.renamed.fasta",
#' out_fasta_file = "temp_dir/test_masked"
#' )
#' }
#' @export
mask_regions_in_fasta <- function (bed_file, fasta_file, out_fasta_file, ...) {
# Check input
assertthat::assert_that(assertthat::is.string(bed_file))
bed_file <- fs::path_abs(bed_file)
assertthat::assert_that(assertthat::is.readable(bed_file))
assertthat::assert_that(assertthat::is.string(fasta_file))
fasta_file <- fs::path_abs(fasta_file)
assertthat::assert_that(assertthat::is.readable(fasta_file))
assertthat::assert_that(assertthat::is.string(out_fasta_file))
out_fasta_file <- fs::path_abs(out_fasta_file)
assertthat::assert_that(assertthat::is.dir(fs::path_dir(out_fasta_file)))
bed <- readr::read_tsv(bed_file, col_names = c("chr", "start", "end"), col_types = "cdd")
checkr::check_data(
bed,
values = list(
chr = "a",
start = 1,
end = 1
),
order = TRUE)
if (!(check_bed_genome_names(fasta_file, bed))) {
stop ("Names don't match between bed file and fasta file headers")
}
# Run bedtools maskfasta
processx::run(
command = "bedtools",
args = c(
"maskfasta",
"-fi", fasta_file,
"-bed", bed_file,
"-fo", out_fasta_file
),
echo = TRUE
)
}
#' Clean up data from a gff file and
#' convert to bed format
#'
#' Helper function for `find_bed_regions`. Merges overlapping regions and sorts
#' regions.
#'
#' @param region Dataframe; list of gene regions in "bed" format. Must include
#' the following columns in order: `chr` ('chromosome', character), `start`
#' (start position, numeric), and `end` (end position, numeric).
#' @param check.chr Logical; should coordinates be checked for chromosomal
#' format with "chr" prefix?
#' @param verbose Logical; should `bedr` functions output all messages?
#'
#' @return Dataframe in "bed" format.
#'
clean_gff <- function (region, check.chr = FALSE, verbose = FALSE) {
region %>%
# Check if region is valid
dplyr::filter(bedr::is.valid.region(., check.chr = check.chr, verbose = verbose)) %>%
# Collapse overlapping regions
bedr::bedr.merge.region(check.chr = check.chr, verbose = verbose) %>%
# Sort regions
bedr::bedr.sort.region(check.chr = check.chr, verbose = verbose) %>%
# Convert to bed format
bedr::convert2bed(check.chr = check.chr, verbose = verbose)
}
#' Find genes, exons, and introns in a gff3 file
#'
#' If tsv files are written out by selecting "write_all" for `out_type`,
#' they will overwrite any existing files with the same name in `out_dir`.
#'
#' @param gff3_file Path to input file in `gff3` format.
#' @param source_select Character vector; only use regions from these
#' sources. Must match values in `source` column of gff3 file. Optional.
#' @param gene_label String; value used to indicate genes in gff3 file.
#' Must match at least one value in `type` column of gff3 file. Default "gene".
#' @param exon_label String; value used to indicate exons in gff3 file.
#' Must match at least one value in `type` column of gff3 file. Default "exon".
#' @param verbose Logical; should `bedr` functions output all messages?
#' @param out_type Type of output to return:
#' "genes": dataframe in "bed" format of genes.
#' "introns": dataframe in "bed" format of introns.
#' "exons": dataframe in "bed" format of exons.
#' "write_all": write tab-separated files for each of `genes`, `introns`, and
#' `exons` to `out_dir`. The hash digest of the combined genes, introns, and
#' exons will be returned.
#' @param prefix String; prefix to attach to tsv files if `out_type` is
#' "write_all".
#' @param out_dir Directory to write tsv files if `out_type` is "write_all".
#' @param ... Other arguments. Not used by this function, but meant to
#' be used by \code{\link[drake]{drake_plan}} for tracking during workflows.
#' @return Dataframe or character.
#' @author Joel H Nitta, \email{joelnitta@@gmail.com}
#' @examples
#' # Find genes
#'
#' arabidopsis_gff_file <- system.file("extdata", "Arabidopsis_thaliana_TAIR10_40_small.gff3", package = "baitfindR", mustWork = TRUE)
#'
#' genes <- find_bed_regions(
#' gff3_file = arabidopsis_gff_file,
#' source_select = "araport11",
#' out_type = "genes"
#' )
#' head(genes)
#'
#' # Find introns
#' introns <- find_bed_regions(
#' gff3_file = arabidopsis_gff_file,
#' source_select = "araport11",
#' out_type = "introns"
#' )
#' head(introns)
#'
#' # Find exons
#' exons <- find_bed_regions(
#' gff3_file = arabidopsis_gff_file,
#' source_select = "araport11",
#' out_type = "exons"
#' )
#' head(exons)
#'
#' \dontrun{
#' # Write genes, introns, and exons out as tsv files
#' temp_dir <- tempdir()
#' find_bed_regions(
#' gff3_file = arabidopsis_gff_file,
#' source_select = "araport11",
#' out_type = "write_all",
#' out_dir = temp_dir,
#' prefix = "arabidopsis"
#' )
#' }
#' @export
find_bed_regions <- function (gff3_file,
source_select = NULL,
gene_label = "gene", exon_label = "exon",
verbose = FALSE,
prefix = NULL, out_dir = NULL,
out_type = c("genes", "introns", "exons", "write_all"),
...) {
# Check input
assertthat::assert_that(assertthat::is.readable(gff3_file))
assertthat::assert_that(assertthat::is.string(gene_label))
assertthat::assert_that(assertthat::is.string(exon_label))
assertthat::assert_that(assertthat::is.string(out_type))
assertthat::assert_that(is.logical(verbose))
assertthat::assert_that(out_type %in% c("genes", "introns", "exons", "write_all"),
msg = "'out_type' must be one of 'genes', 'introns', 'exons', or 'write_all'")
# Read in gff3 file as dataframe
gff3 <- ape::read.gff(gff3_file) %>%
dplyr::mutate(chr = as.character(seqid))
# Keep only annotations from selected source
if (!is.null(source_select)) {
assertthat::assert_that(is.character(source_select))
assertthat::assert_that(all(source_select %in% gff3$source))
gff3 <- dplyr::filter(gff3, source %in% source_select)
}
# Extract and clean up genes
genes <- gff3 %>% dplyr::filter(type == gene_label) %>%
dplyr::select(chr, start, end) %>%
clean_gff(verbose = verbose)
# Extract and clean up exons
exons <- gff3 %>% dplyr::filter(type == exon_label) %>%
dplyr::select(chr, start, end) %>%
clean_gff(verbose = verbose)
# Introns are genes - exons
introns <- bedr::bedr.subtract.region(
genes,
exons,
remove.whole.feature = FALSE,
check.chr = FALSE,
verbose = verbose)
# Write out all regions and return hash of genes + exons + introns
if (out_type == "write_all") {
out_dir <- fs::path_abs(out_dir)
assertthat::assert_that(assertthat::is.writeable(out_dir))
assertthat::assert_that(assertthat::is.string(prefix))
all_regions <- list(genes = genes,
exons = exons,
introns = introns)
all_regions %>%
purrr::set_names(fs::path(out_dir, paste0(prefix, "_", names(.)))) %>%
purrr::iwalk(readr::write_tsv, col_names = FALSE)
return(digest::digest(all_regions))
}
# Or, return a particular result type.
return(switch(
out_type,
genes = genes,
exons = exons,
introns = introns
))
}
# Check that genome fasta headers and
# "chromosome" names in bed file match.
# (This must be true for
# bedtools maskfasta to work).
check_bed_genome_names <- function (fasta_file, bed) {
# find all sequence headers in fasta file
seq_names <-
readr::read_lines(fasta_file) %>%
purrr::keep(~ grepl(">", .x)) %>%
purrr::map(~ gsub(">", "", .x))
# make sure "chr" (chromosome) names of bed file are all in
# fasta sequence headers
chr <- unique(bed$chr)
all(chr %in% seq_names)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.