#' GRanges to BED
#'
#' Saves an GRanges-object as a BED-file and returns the file location.
#'
#' @param GR GRanges-object.
#' @param bed_fname Path to where file will be saved. Must end in ".bed". Defaults to tempfile().
#' @param trackline logical: Whether a dummy trackline should be included in the .bed file
#' @return Path to file.
#' @author Malte Thodberg
#' @details Saves a GRanges object as a BED-file. Names are set to increasing integers and scores are set to feature widths (these will be overwritten).
#' @seealso \code{\link{tempdir}} \code{\link{tempfile}}
#' @export
GR_to_BED <- function(GR, bed_fname=NULL, trackline=FALSE){
# # Get bed
# bed <- data.frame(seqnames(GR),
# start(GR)-1,
# end(GR),
# 1:length(GR),
# width(GR),
# strand(GR))
# Add some info
GR$name <- 1:length(GR)
GR$score <- width(GR)
# Random file name
if(is.null(bed_fname)){
bed_fname <- tempfile(pattern="temp_bed_file", fileext=".bed")
}
if(trackline == FALSE){
rtracklayer::export(object=GR, con=bed_fname)
}else{
rtracklayer::export(object=GR, con=bed_fname, trackLine=methods::new("BasicTrackLine"))
}
# # Write to file
# write.table(x=bed,
# file=bed_fname,
# quote=FALSE,
# sep="\t",
# row.names=FALSE,
# col.names=FALSE)
# Return name of file
bed_fname
}
#' Parse known motifs from Homer
#'
#' Parse known motifs form a Homer analysis from output_dir. Should not normally be called by the user.
#'
#' @param output_dir Path to output dir for Homer analysis. Defaults to tempdir()
#' @return A dataframe of known motif results.
#' @author Malte Thodberg
#' @details Parses from the "knownResults.txt" file of a Homer analysis.
#' @seealso \code{\link{call_homer}} \code{\link{tempdir}}
#' @import magrittr
#' @export
parse_known <- function(output_dir=tempdir()){
# Read simple table
known_motifs <- utils::read.table(file=file.path(output_dir, "knownResults.txt"), sep="\t", header=T, comment.char="")
# Reformat
colnames(known_motifs) <- c("Name", "Consensus", "Pval", "Log-pval", "Qval", "Tcount", "T", "Bcount", "B")
known_motifs %>%
dplyr::mutate(T=gsub(pattern="%", replacement="", x=T) %>% as.numeric %>% divide_by(100),
B=gsub(pattern="%", replacement="", x=B) %>% as.numeric %>% divide_by(100)) %>%
dplyr::select(-Tcount, -Bcount)
# Return
known_motifs
}
#' Parse de-novo motifs from Homer
#'
#' Parse de-novo motifs from a Homer analysis from output_dir. Should not normally be called by the user.
#' @param output_dir Path to output dir for Homer analysis. Defaults to tempdir()
#' @return A list containing a dataframe of de-novo motif results and a list of corresponding PWMs.
#' @author Malte Thodberg
#' @details Parses from the "homerResults/" folder and "homerMotifs.all.motifs" file of a Homer analysis and merges the results.
#' @seealso \code{\link{call_homer}} \code{\link{tempdir}}
#' @import magrittr
#' @export
parse_homer <- function(output_dir=tempdir()){
### Folder motifs
# Files
motif_fnames <- list.files(file.path(output_dir, "homerResults"), full.names=TRUE)
motif_fnames <- motif_fnames[grepl(pattern=".motif", x=motif_fnames, fixed=TRUE)]
# Seperate PWMS
homer_PWMs <- lapply(motif_fnames, utils::read.table, skip=1, col.names=c("A", "C", "G", "T"))
# Reformat info
folder_motifs <- lapply(motif_fnames, utils::read.table, nrows=1, sep="\t") %>%
Reduce(rbind, .) %>%
as.data.frame()
colnames(folder_motifs) <- c("Consensus", "Name", "Log-odds", "Log-pval", "Placeholder", "Occurence")
# Split names
folder_motifs <- tidyr::separate(folder_motifs, Name, into=c("Name", "Guess"), sep=",", extra="drop")
# Rename PWMs
names(homer_PWMs) <- folder_motifs$Name
### File motifs
# Read from file starting with >
heap <- readLines(file.path(output_dir, "homerMotifs.all.motifs"))
heap <- heap[grep(pattern=">", x=heap)]
# Reformat info
file_motifs <- heap %>%
stringr::str_split(pattern="\t") %>%
as.data.frame %>%
t %>%
as.data.frame
colnames(file_motifs) <- c("Consensus", "Name", "Log-odds", "Log-pval", "Placeholder", "Occurence", "Statistics")
rownames(file_motifs) <- NULL
file_motifs <- file_motifs %>%
dplyr::mutate(`Log-odds`=as.numeric(`Log-odds`),
`Log-pval`=as.numeric(`Log-pval`),
Placeholder=as.integer(Placeholder))
### Merge info
# Merge frames
homer_motifs <- dplyr::left_join(folder_motifs, file_motifs, by=c("Consensus", "Name", "Log-odds", "Log-pval", "Placeholder", "Occurence"))
# Split columns
homer_motifs <- tidyr::separate(homer_motifs, Occurence, into=c("T", "B", "P"), sep=",", extra="drop")
homer_motifs <- tidyr::separate(homer_motifs, Statistics, into=c("Tpos", "Tstd", "Bpos", "Bstd", "StrandBias", "Multiplicity"), sep=",", extra="drop")
# Reformat
homer_motifs <- homer_motifs %>%
dplyr::mutate(Consensus=gsub(pattern=">", replacement="", x=Consensus),
Guess=gsub(pattern="BestGuess:", replacement="", x=Guess),
T=str_split(string=T, pattern=":|\\(|\\)|%") %>% sapply(function(x) as.numeric(x[3]) / 100),
B=str_split(string=B, pattern=":|\\(|\\)|%") %>% sapply(function(x) as.numeric(x[3]) / 100),
P=gsub(pattern="P:", replacement="", x=P) %>% as.numeric,
Tpos=gsub(pattern="Tpos:", replacement="", x=Tpos) %>% as.numeric,
Tstd=gsub(pattern="Tstd:", replacement="", x=Tstd) %>% as.numeric,
Bpos=gsub(pattern="Bpos:", replacement="", x=Bpos) %>% as.numeric,
Bstd=gsub(pattern="Bstd:", replacement="", x=Bstd) %>% as.numeric,
StrandBias=gsub(pattern="StrandBias:", replacement="", x=StrandBias) %>% as.numeric,
Multiplicity=gsub(pattern="Multiplicity:", replacement="", x=Multiplicity) %>% as.numeric) %>%
dplyr::mutate(Orientation=ifelse(test=grepl(pattern="RV", x=motif_fnames), yes="reverse", no="forward"),
Length=sapply(homer_PWMs, nrow))
### Trim
final_motifs <- !is.na(homer_motifs$Tpos)
homer_motifs <- subset(homer_motifs, final_motifs, select=-Placeholder)
homer_PWMs <- homer_PWMs[final_motifs]
### Return
list(homer_motifs=homer_motifs, homer_PWMs=homer_PWMs)
}
#' Parse instances of de-novo motifs from Homer
#'
#' Parse de-novo motifs instances from a Homer analysis from output_dir. Should not normally be called by the user.
#' @param output_dir Path to output dir for Homer analysis. Defaults to tempdir()
#' @return A dataframe of the position of motif occurences.
#' @author Malte Thodberg
#' @details Parses from the "homerResults/motifInstances.tab" file of a Homer analysis and merges the results.
#' @seealso \code{\link{call_homer}} \code{\link{find_instances}} \code{\link{tempdir}}
#' @import magrittr
#' @export
parse_instances <- function(output_dir=tempdir()){
## Read and clean
# Read data
i <- utils::read.table(file.path(output_dir, "motifInstances.tab"),
sep="\t",
header=TRUE,
comment.char="",
quote="")
# Clean columns
i <- subset(i, select=c(1:5, 20:ncol(i)))
info_cols <- colnames(i)[1:7]
motif_cols <- colnames(i)[8:ncol(i)]
info_cols[1] <- "PeakID"
motif_cols <- stringr::str_split(motif_cols, pattern=fixed(".")) %>% sapply(., function(x) x[2])
colnames(i) <- c(info_cols, motif_cols)
# Clean rows
i <- dplyr::arrange(i, PeakID)
## Make into matrix
# Matrix of cumbersome strings
string_mat <- i[,8:ncol(i)]
# Only get position from strings
pos_mat <- apply(string_mat, 2, function(x) data.frame(raw=x) %>%
tidyr::separate(col=raw, into=c("pos", "consensus", "strand", "score"), extra="drop", sep=",|\\(|\\)") %>%
.$pos %>%
as.integer)
# Return
pos_mat
}
#' Motif Enrichment with Homer
#'
#' Call the findMotifsGenome.pl script from Homer directly from R.
#'
#' @param pos_file <#> (Genomic Ranges object)
#' @param genome <#> (Installed Homer genome) or (path to FASTA)
#' @param output_dir Path to output dir for Homer analysis. Defaults to tempdir()
#' @param mask (mask repeats/lower case sequence, can also add 'r' to genome, i.e. mm9r)
#' @param bg <background position file> (genomic positions to be used as background, default=automatic) removes background positions overlapping with target positions
#' @param chopify (chop up large background regions to the avg size of target regions)
#' @param len <#>[,<#>,<#>...] (motif length, default=8,10,12) [NOTE: values greater 12 may cause the programto run out of memory - in these cases decrease the number of sequences analyzed (-N), or try analyzing shorter sequence regions (i.e. -size 100)]
#' @param size <#> (fragment size to use for motif finding, default=200) or (i.e. -size -100,50 will get sequences from -100 to +50 relative from center) or given (uses the exact regions you give it)
#' @param S <#> (Number of motifs to optimize, default: 25)
#' @param mis <#> (global optimization: searches for strings with # mismatches, default: 2)
#' @param norevopp (don't search reverse strand for motifs)
#' @param rna (output RNA motif logos and compare to RNA motif database, automatically sets -norevopp)
#' @param mset <vertebrates|insects|worms|plants|yeast|all> (check against motif collects, default: auto)
#' @param bits (scale sequence logos by information content, default: doesn't scale)
#' @param mcheck <motif file> (known motifs to check against de novo motifs)
#' @param mknown <motif file> (known motifs to check for enrichment)
#' @param gc (use GC-percentage for sequence content normalization, now the default)
#' @param cpg (use CpG-percentage instead of GC-percentage for sequence content normalization)
#' @param noweight (no CG correction)
#' @param h (use hypergeometric for p-values, binomial is default)
#' @param N <#> (Number of sequences to use for motif finding, default=max(50k, 2x input)
#' @param local <#> (use local background, # of equal size regions around peaks to use i.e. 2)
#' @param redundant <#> (Remove redundant sequences matching greater than # percent, i.e. -redundant 0.5)
#' @param maxN <#> (maximum percentage of N's in sequence to consider for motif finding, default: 0.7)
#' @param maskMotif <motif file1> [motif file 2]... (motifs to mask before motif finding)
#' @param rand (randomize target and background sequences labels)
#' @param ref <peak file> (use file for target and background - first argument is list of peak ids for targets)
#' @param oligo (perform analysis of individual oligo enrichment)
#' @param dumpFasta (Dump fasta files for target and background sequences for use with other programs)
#' @param preparse (force new background files to be created)
#' @param preparsedDir <directory> (location to search for preparsed file and/or place new files)
#' @param keepFiles (keep temporary files)
#' @param fdr <#> (Calculate empirical FDR for de novo discovery #=number of randomizations)
#' @param nlen <#> (length of lower-order oligos to normalize in background, default: -nlen 3)
#' @param nmax <#> (Max normalization iterations, default: 160)
#' @param neutral (weight sequences to neutral frequencies, i.e. 25-percentage, 6.25-percentage, etc.)
#' @param olen <#> (lower-order oligo normalization for oligo table, use if -nlen isn't working well)
#' @param p <#> (Number of processors to use, default: 1)
#' @param e <#> (Maximum expected motif instance per bp in random sequence, default: 0.01)
#' @param cache <#> (size in MB for statistics cache, default: 500)
#' @param quickMask (skip full masking after finding motifs, similar to original homer)
#' @param minlp <#> (stop looking for motifs when seed logp score gets above #, default: -10)
#' @return List with output: command line used, knowm motifs, Homer motifs (de-novo) and Homer PWMs.
#' @author Malte Thodberg
#' @details
#' Simple R-wrapper for Homer's findMotifsGenome.pl.
#' Instead of flags, it uses R-arguments which are pasted to a Homer command. Flags that modify output format are not implemented:
#' -nomotif, -find, -enhancers, -enhancersOnly, -basic, -nocheck, -noknown, -nofacts, -opt, -peaks, -homer2.
#'
#' Saves all temporary files to output_dir. Note these files are only deleted upon closing the R-session, which can in some cases lead to files from previous runs being reloaded.
#' @seealso \code{\link{GR_to_BED}} \code{\link{tempdir}}
#' @export
call_homer <- function(pos_file, genome, output_dir=tempdir(),# Mandatory
mask=NULL, bg=NULL, chopify=NULL, len=NULL, size=NULL, S=NULL, mis=NULL, norevopp=NULL, rna=NULL, # Basic options
mset=NULL, bits=NULL, mcheck=NULL, mknown=NULL, #Known motif options
gc=NULL, cpg=NULL, noweight=NULL, # Sequence normalization options
h=NULL, N=NULL, local=NULL, redundant=NULL, maxN=NULL, maskMotif=NULL, rand=NULL, ref=NULL, oligo=NULL, dumpFasta=NULL, preparse=NULL, preparsedDir=NULL, keepFiles=NULL, fdr=NULL, #Advanced options
nlen=NULL, nmax=NULL, neutral=NULL, olen=NULL, p=NULL, e=NULL, cache=NULL, quickMask=NULL, minlp=NULL){ # Homer options
# GR to temporary file
pos_file <- GR_to_BED(GR=pos_file)
if(!is.null(bg)){
bg <- GR_to_BED(GR=bg)
}
# Build basic commond line
cline <- sprintf("findMotifsGenome.pl %s %s %s -nofacts",
pos_file, genome, output_dir)
if(!is.null(mask)){cline <- paste(cline, "-mask")}
if(!is.null(bg)){cline <- paste(cline, "-bg", bg)}
if(!is.null(chopify)){cline <- paste(cline, "-chopify")}
if(!is.null(len)){cline <- paste(cline, "-len", len)}
if(!is.null(size)){cline <- paste(cline, "-size", size)}
if(!is.null(S)){cline <- paste(cline, "-S", S)}
if(!is.null(mis)){cline <- paste(cline, "-mis")}
if(!is.null(norevopp)){cline <- paste(cline, "-norevopp")}
if(!is.null(rna)){cline <- paste(cline, "-rna")}
if(!is.null(mset)){cline <- paste(cline, "-mset", mset)}
if(!is.null(bits)){cline <- paste(cline, "-bits")}
if(!is.null(mcheck)){cline <- paste(cline, "-mcheck", mcheck)}
if(!is.null(mknown)){cline <- paste(cline, "-mknown", mknown)}
if(!is.null(gc)){cline <- paste(cline, "-gc")}
if(!is.null(cpg)){cline <- paste(cline, "-cpg")}
if(!is.null(noweight)){cline <- paste(cline, "-noweight")}
if(!is.null(h)){cline <- paste(cline, "-h")}
if(!is.null(N)){cline <- paste(cline, "-N", N)}
if(!is.null(local)){cline <- paste(cline, "-local", local)}
if(!is.null(redundant)){cline <- paste(cline, "-redundant", redundant)}
if(!is.null(maxN)){cline <- paste(cline, "-maxN", maxN)}
if(!is.null(maskMotif)){cline <- paste(cline, "-maskMotif", maskMotif)}
if(!is.null(rand)){cline <- paste(cline, "-rand")}
if(!is.null(ref)){cline <- paste(cline, "-ref", ref)}
if(!is.null(oligo)){cline <- paste(cline, "-oligo")}
if(!is.null(dumpFasta)){cline <- paste(cline, "-dumpFasta")}
if(!is.null(preparse)){cline <- paste(cline, "-preparse")}
if(!is.null(preparsedDir)){cline <- paste(cline, "-preparsedDir", preparsedDir)}
if(!is.null(keepFiles)){cline <- paste(cline, "-keepFiles")}
if(!is.null(fdr)){cline <- paste(cline, "-fdr", fdr)}
if(!is.null(nlen)){cline <- paste(cline, "-nlen", nlen)}
if(!is.null(nmax)){cline <- paste(cline, "-nmax", nmax)}
if(!is.null(neutral)){cline <- paste(cline, "-neutral")}
if(!is.null(olen)){cline <- paste(cline, "-olen", olen)}
if(!is.null(p)){cline <- paste(cline, "-p", p)}
if(!is.null(e)){cline <- paste(cline, "-e", e)}
if(!is.null(cache)){cline <- paste(cline, "-cache", cache)}
if(!is.null(quickMask)){cline <- paste(cline, "-quickMask")}
if(!is.null(minlp)){cline <- paste(cline, "-minlp", minlp)}
# # Add extra settings with values
# if(!is.null(bg)){cline <- paste(cline, "-bg", bg)}
# if(!is.null(len)){cline <- paste(cline, "-len", len)}
# if(!is.null(size)){cline <- paste(cline, "-size", size)}
# if(!is.null(S)){cline <- paste(cline, "-S", S)}
# if(!is.null(mis)){cline <- paste(cline, "-mis", mis)}
#
# if(!is.null(mset)){cline <- paste(cline, "-mset", mset)}
# if(!is.null(mcheck)){cline <- paste(cline, "-mcheck", mcheck)}
# if(!is.null(mknown)){cline <- paste(cline, "-mknown", mknown)}
#
# if(!is.null(N)){cline <- paste(cline, "-N", N)}
# if(!is.null(local)){cline <- paste(cline, "-local", local)}
# if(!is.null(redundant)){cline <- paste(cline, "-redundant", redundant)}
# if(!is.null(maxN)){cline <- paste(cline, "-maxN", maxN)}
# if(!is.null(maskMotif)){cline <- paste(cline, "-maskMotif", maskMotif)}
# if(!is.null(ref)){cline <- paste(cline, "-ref", ref)}
# if(!is.null(preparsedDir)){cline <- paste(cline, "-preparsedDir", preparsedDir)}
# if(!is.null(fdr)){cline <- paste(cline, "-fdr", fdr)}
#
# if(!is.null(nlen)){cline <- paste(cline, "-nlen", nlen)}
# if(!is.null(nlen)){cline <- paste(cline, "-nlen", nlen)}
# if(!is.null(p)){cline <- paste(cline, "-p", p)}
#
# # Add TRUE/FALSE settings.
# if(!is.null(mask)){cline <- paste(cline, "-mask")}
#
# if(!is.null(chopify)){cline <- paste(cline, "-chopify")}
# if(!is.null(norevopp)){cline <- paste(cline, "-norevopp")}
# if(!is.null(rna)){cline <- paste(cline, "-rna")}
#
# if(!is.null(bits)){cline <- paste(cline, "-bits")}
#
# if(!is.null(gc)){cline <- paste(cline, "-gc")}
# if(!is.null(cpg)){cline <- paste(cline, "-cpg")}
#
# if(!is.null(h)){cline <- paste(cline, "-h")}
# if(!is.null(rand)){cline <- paste(cline, "-rand")}
# if(!is.null(oligo)){cline <- paste(cline, "-oligo")}
# if(!is.null(dumpFasta)){cline <- paste(cline, "-dumpFasta")}
# if(!is.null(preparse)){cline <- paste(cline, "-preparse")}
# if(!is.null(keepFiles)){cline <- paste(cline, "-keepFiles")}
# Execute!
system(cline)
# Read results back into R
res <- list(command=cline)
res$known_motifs <- parse_known(output_dir=output_dir)
tmp <- parse_homer(output_dir=output_dir)
res$homer_motifs <- tmp$homer_motifs
res$homer_PWMs <- tmp$homer_PWMs
# Return cline
res
}
#' Find instances of de-novo Homer motifs
#'
#' Find instances of de-novo Homer motifs in a GRanges using Homer's annotatePeaks.pl script.
#'
#' @param pos_file <#> (Genomic Ranges object)
#' @param genome <#> (Installed Homer genome) or (path to FASTA)
#' @param output_dir Path to output dir for Homer analysis. Defaults to tempdir()
#' @return A tidy dataframe of instances of motifs.
#' @author Malte Thodberg
#' @details Must point to a Homer output directory to find a 'homerMotifs.all.motifs' file. Saves a the raw output file from annotatePeaks.pl as 'motifInstances.tab" in the same folder.
#' @seealso \code{\link{call_homer}} \code{\link{tempdir}}
#' @export
find_instances <- function(pos_file, genome, output_dir=tempdir()){
# GR to temporary file
pos_file <- GR_to_BED(GR=pos_file, trackline=TRUE)
# Look for motifs
pos_cline <- sprintf("annotatePeaks.pl %s %s -m %s -size given > %s",
pos_file,
genome,
file.path(output_dir, "homerMotifs.all.motifs"),
file.path(output_dir, "motifInstances.tab"))
# Execute!
system(pos_cline)
# Read back into R
parse_instances(output_dir=output_dir)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.