SequenceSearches.R
In universalmotif: Import, Modify, and Export Motifs with R

## ----setup, echo=FALSE--------------------------------------------------------
knitr::opts_chunk$set(collapse=TRUE, comment = "#>")
suppressPackageStartupMessages(library(universalmotif))
suppressPackageStartupMessages(library(Biostrings))
data(ArabidopsisPromoters)
data(ArabidopsisMotif)

## -----------------------------------------------------------------------------
library(universalmotif)
library(Biostrings)

## Create some DNA sequences for use with an external program (default
## is DNA):

sequences.dna <- create_sequences(seqnum = 500,
                                  freqs = c(A=0.3, C=0.2, G=0.2, T=0.3))
## writeXStringSet(sequences.dna, "dna.fasta")
sequences.dna

## Amino acid:

create_sequences(alphabet = "AA")

## Any set of characters can be used

create_sequences(alphabet = paste0(letters, collapse = ""))

## -----------------------------------------------------------------------------
library(universalmotif)

## Background of DNA sequences:
dna <- create_sequences()
get_bkg(dna, k = 1:2)

## Background of non DNA/RNA sequences:
qwerty <- create_sequences("QWERTY")
get_bkg(qwerty, k = 1:2)

## -----------------------------------------------------------------------------
library(universalmotif)
library(Biostrings)
data(ArabidopsisPromoters)

## Potentially starting off with some external sequences:
# ArabidopsisPromoters <- readDNAStringSet("ArabidopsisPromoters.fasta")

euler <- shuffle_sequences(ArabidopsisPromoters, k = 2, method = "euler")
markov <- shuffle_sequences(ArabidopsisPromoters, k = 2, method = "markov")
linear <- shuffle_sequences(ArabidopsisPromoters, k = 2, method = "linear")
k1 <- shuffle_sequences(ArabidopsisPromoters, k = 1)

## -----------------------------------------------------------------------------
o.letter <- get_bkg(ArabidopsisPromoters, 1)
e.letter <- get_bkg(euler, 1)
m.letter <- get_bkg(markov, 1)
l.letter <- get_bkg(linear, 1)

data.frame(original=o.letter$count, euler=e.letter$count,
  markov=m.letter$count, linear=l.letter$count, row.names = DNA_BASES)

o.counts <- get_bkg(ArabidopsisPromoters, 2)
e.counts <- get_bkg(euler, 2)
m.counts <- get_bkg(markov, 2)
l.counts <- get_bkg(linear, 2)

data.frame(original=o.counts$count, euler=e.counts$count,
  markov=m.counts$count, linear=l.counts$count,
  row.names = get_klets(DNA_BASES, 2))

## -----------------------------------------------------------------------------
library(universalmotif)

string <- "DASDSDDSASDSSA"

count_klets(string, 2)

shuffle_string(string, 2)

## -----------------------------------------------------------------------------
library(universalmotif)

get_klets(c("A", "S", "D"), 2)

## -----------------------------------------------------------------------------
library(universalmotif)
m1 <- create_motif("TATATATATA", nsites = 50, type = "PWM", pseudocount = 1)
m2 <- matrix(c(0.10,0.27,0.23,0.19,0.29,0.28,0.51,0.12,0.34,0.26,
               0.36,0.29,0.51,0.38,0.23,0.16,0.17,0.21,0.23,0.36,
               0.45,0.05,0.02,0.13,0.27,0.38,0.26,0.38,0.12,0.31,
               0.09,0.40,0.24,0.30,0.21,0.19,0.05,0.30,0.31,0.08),
             byrow = TRUE, nrow = 4)
m2 <- create_motif(m2, alphabet = "DNA", type = "PWM")
m1["motif"]
m2["motif"]

## -----------------------------------------------------------------------------
motif_pvalue(m2, pvalue = 0.001)

## -----------------------------------------------------------------------------
library(universalmotif)
library(Biostrings)
data(ArabidopsisPromoters)

## A 2-letter example:

motif.k2 <- create_motif("CWWWWCC", nsites = 6)
sequences.k2 <- DNAStringSet(rep(c("CAAAACC", "CTTTTCC"), 3))
motif.k2 <- add_multifreq(motif.k2, sequences.k2)

## -----------------------------------------------------------------------------
head(scan_sequences(motif.k2, ArabidopsisPromoters, RC = TRUE,
                    threshold = 0.9, threshold.type = "logodds"))

## -----------------------------------------------------------------------------
head(scan_sequences(motif.k2, ArabidopsisPromoters, use.freq = 2, RC = TRUE,
                    threshold = 0.9, threshold.type = "logodds"))

## -----------------------------------------------------------------------------
library(universalmotif)
library(Biostrings)

sequences <- DNAStringSet(rep(c("CAAAACC", "CTTTTCC"), 3))
motif <- create_motif(sequences, add.multifreq = 2:3)

## -----------------------------------------------------------------------------
library(universalmotif)
data(ArabidopsisMotif)
data(ArabidopsisPromoters)

enrich_motifs(ArabidopsisMotif, ArabidopsisPromoters, shuffle.k = 3,
              threshold = 0.001, RC = TRUE)

## -----------------------------------------------------------------------------
library(universalmotif)
data(ArabidopsisPromoters)

m1 <- create_motif("TTTATAT", name = "PartA")
m2 <- create_motif("GGTTCGA", name = "PartB")

## -----------------------------------------------------------------------------
m <- cbind(m1, m2)
m <- add_gap(m, gaploc = ncol(m1), mingap = 4, maxgap = 6)
m

## -----------------------------------------------------------------------------
scan_sequences(m, ArabidopsisPromoters, threshold = 0.4, threshold.type = "logodds")

## -----------------------------------------------------------------------------
library(universalmotif)
data(ArabidopsisMotif)
data(ArabidopsisPromoters)

hits <- scan_sequences(ArabidopsisMotif, ArabidopsisPromoters, RC = FALSE)

res <- motif_peaks(hits$start,
                   seq.length = unique(width(ArabidopsisPromoters)),
                   seq.count = length(ArabidopsisPromoters))

## Significant peaks:
res$Peaks

## -----------------------------------------------------------------------------
res$Plot

## ----eval=FALSE---------------------------------------------------------------
#  library(universalmotif)
#  
#  ## 1. Once per session: via `options()`
#  
#  options(meme.bin = "/path/to/meme/bin/meme")
#  
#  run_meme(...)
#  
#  ## 2. Once per run: via `run_meme()`
#  
#  run_meme(..., bin = "/path/to/meme/bin/meme")

## ----eval=FALSE---------------------------------------------------------------
#  library(universalmotif)
#  data(ArabidopsisPromoters)
#  
#  ## 1. Read sequences from disk (in fasta format):
#  
#  library(Biostrings)
#  
#  # The following `read*()` functions are available in Biostrings:
#  # DNA: readDNAStringSet
#  # DNA with quality scores: readQualityScaledDNAStringSet
#  # RNA: readRNAStringSet
#  # Amino acid: readAAStringSet
#  # Any: readBStringSet
#  
#  sequences <- readDNAStringSet("/path/to/sequences.fasta")
#  
#  run_meme(sequences, ...)
#  
#  ## 2. Extract from a `BSgenome` object:
#  
#  library(GenomicFeatures)
#  library(TxDb.Athaliana.BioMart.plantsmart28)
#  library(BSgenome.Athaliana.TAIR.TAIR9)
#  
#  # Let us retrieve the same promoter sequences from ArabidopsisPromoters:
#  gene.names <- names(ArabidopsisPromoters)
#  
#  # First get the transcript coordinates from the relevant `TxDb` object:
#  transcripts <- transcriptsBy(TxDb.Athaliana.BioMart.plantsmart28,
#                               by = "gene")[gene.names]
#  
#  # There are multiple transcripts per gene, we only care for the first one
#  # in each:
#  
#  transcripts <- lapply(transcripts, function(x) x[1])
#  transcripts <- unlist(GRangesList(transcripts))
#  
#  # Then the actual sequences:
#  
#  # Unfortunately this is a case where the chromosome names do not match
#  # between the two databases
#  
#  seqlevels(TxDb.Athaliana.BioMart.plantsmart28)
#  #> [1] "1"  "2"  "3"  "4"  "5"  "Mt" "Pt"
#  seqlevels(BSgenome.Athaliana.TAIR.TAIR9)
#  #> [1] "Chr1" "Chr2" "Chr3" "Chr4" "Chr5" "ChrM" "ChrC"
#  
#  # So we must first rename the chromosomes in `transcripts`:
#  seqlevels(transcripts) <- seqlevels(BSgenome.Athaliana.TAIR.TAIR9)
#  
#  # Finally we can extract the sequences
#  promoters <- getPromoterSeq(transcripts,
#                              BSgenome.Athaliana.TAIR.TAIR9,
#                              upstream = 1000, downstream = 0)
#  
#  run_meme(promoters, ...)

## ----eval=FALSE---------------------------------------------------------------
#  run_meme(sequences, output = "/path/to/desired/output/folder")

## ---- eval=FALSE--------------------------------------------------------------
#  motifs <- run_meme(sequences)
#  motifs.k23 <- mapply(add_multifreq, motifs$motifs, motifs$sites)

## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()