inst/doc/Overview.R

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  library('FastqCleaner')
#  launch_fqc()

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  download.file("https://goo.gl/hb4Kr9", "example_fastq.gz")

## ----eval=TRUE, echo=TRUE-----------------------------------------------------
### Examples

## ----include=FALSE------------------------------------------------------------
require("Biostrings")
require("ShortRead")
require("FastqCleaner")
set.seed(10)

## ----eval = FALSE-------------------------------------------------------------
#  require("Biostrings")
#  require("ShortRead")
#  require("FastqCleaner")

## ----eval=TRUE, echo=TRUE-----------------------------------------------------

# create sequences
set.seed(10) 
# nota that the use of set.seed before the call to the 
# random generators allows reproducibility of the
# examples

input <- random_seq(6, 43)
input

# create qualities of width 50
set.seed(10)
input_q <- random_qual(c(30,40), slength = 6, swidth = 50, 
encod = "Sanger")

# create names
input_names <- seq_names(length(input))


### FULL ADAPTER IN 3'
adapter <- "ATCGACT"

# Create sequences with adapter
my_seqs <- paste0(input, adapter)
my_seqs <- DNAStringSet(my_seqs)
my_seqs

# create ShortReadQ object
my_read <- ShortReadQ(sread = my_seqs, quality = input_q, id = input_names)

# trim adapter
filtered <- adapter_filter(my_read, Lpattern = adapter)
sread(filtered)

### PARTIAL ADAPTER IN 5'
adapter <- "ATCGACT"
subadapter <- subseq(adapter, 1, 4)

# Create sequences with adapter
my_seqs <- paste0(input, subadapter)
my_seqs <- DNAStringSet(my_seqs)
my_seqs

# create ShortReadQ object
my_read <- ShortReadQ(sread = my_seqs, quality = subseq(input_q, 1, 47), 
id = input_names)

# trim adapter
filtered <- adapter_filter(my_read, Rpattern = adapter)
sread(filtered)


## -----------------------------------------------------------------------------

# create  sequences of different width
set.seed(10)
input <- lapply(c(0, 6, 10, 16, 20, 26, 30, 36, 40), 
            function(x) random_seq(1, x))


# create repetitive "CG" sequences with length adequante 
# for a total length input +  CG = 40

CG <- lapply(c(20, 17, 15, 12, 10, 7, 5, 2, 0), 
            function(x) paste(rep("CG", x), collapse = ""))

# concatenate input and CG
input  <- mapply("paste", input, CG, sep = "")
input <- DNAStringSet(input)
input

# plot relative entropy (E, Shannon 1948)
H_plot <- function(x, H_max = 3.908135) {
    freq <- dinucleotideFrequency(x)
    freq  <- freq /rowSums(freq)
    H <- -rowSums(freq  * log2(freq), na.rm = TRUE)
    plot(H/H_max, type="l", xlab = "Sequence", ylab= "E")
    points(H/H_max, col = "#1a81c2", pch = 16, cex = 2)
}

H_plot(input)

## -----------------------------------------------------------------------------
# create qualities of widths 40
set.seed(10)
input_q <- random_qual(c(30,40), slength = 9, swidth = 40, 
    encod = "Sanger")

# create names
input_names <- seq_names(9)


# create ShortReadQ object
my_read <- ShortReadQ(sread = input, quality = input_q, id = input_names)

# apply the filter, 
filtered <- complex_filter(my_read)
sread(filtered)

H_plot(sread(filtered))

## -----------------------------------------------------------------------------

# create sequences, qualities and names of width 20
set.seed(10)
input <- random_seq(6, 20)
input

set.seed(10)
input_q <- random_qual(c(30,40), slength = 6, swidth = 20, 
    encod = "Sanger")

input_names <- seq_names(6)

# create ShortReadQ object
my_read <- ShortReadQ(sread = input, quality = input_q, id = input_names)

# apply the filter 
filtered3 <- fixed_filter(my_read, trim5 = 5)
sread(filtered3)

filtered5 <- fixed_filter(my_read, trim3 = 5)
sread(filtered5)

filtered3and5 <- fixed_filter(my_read, trim3 = 10, trim5 = 5)
sread(filtered3and5)


## -----------------------------------------------------------------------------

# create  ShortReadQ object width widths between 1 and 60
set.seed(10)
input <- random_length(10, widths = 1:60)
sread(input)

# apply the filter, removing sequences with length < 5 or length> 30
filtered <- length_filter(input, rm.min = 5, rm.max = 30)
sread(filtered)

## -----------------------------------------------------------------------------
# create 10 sequences of width 20
set.seed(10)
input <- random_seq(10, 20)
input

# inject N's
set.seed(10)
input <- inject_letter_random(input, how_many_seqs = 1:5,
    how_many = 1:10)
input

#'  
hist(letterFrequency(input, "N"), breaks = 0:10, 
    main  = "Ns Frequency", xlab = "# Ns",
    col = "#1a81c2")

## -----------------------------------------------------------------------------

# Create qualities, names and ShortReadQ object
set.seed(10)
input_q <- random_qual(10, 20)
input_names <- seq_names(10)
my_read <- ShortReadQ(sread = input, quality = input_q, id = input_names)

# Apply the filter 
filtered <- n_filter(my_read, rm.N = 3)
sread(filtered)
hist(letterFrequency(sread(filtered), "N"), 
    main = "Ns distribution", xlab = "",
    col = "#1a81c2")

## -----------------------------------------------------------------------------

# create 30 sequences of width 20, 15 with low quality and 15 with high quality
set.seed(10)
input <- random_seq(30, 20)

set.seed(10)
my_qual_H <- random_qual(c(30,40), slength = 15, swidth = 20,
    encod = "Sanger")

set.seed(10)
my_qual_L <- random_qual(c(5,30), slength = 15, swidth = 20, 
    encod = "Sanger")
input_q<- c(my_qual_H, my_qual_L)

input_names <- seq_names(30)
my_read <- ShortReadQ(sread = input, quality = input_q, id = input_names)

# Plot of average qualities
qual_plot <- function(x, cutoff) {
q <- alphabetScore(x) / width(x)
plot(q, type="l", xlab = "Sequence", ylab= "Average quality", ylim = c(0, 40))
points(q, col = "#1a81c2", pch = 16, cex = 2)
lines(seq_along(q), rep(cutoff, length(q)), type="l", col = "red", lty=2)
text(length(q), cutoff+2, cutoff)
}

#' Average qualities before
qual_plot(my_read, cutoff = 30)

## -----------------------------------------------------------------------------
# Apply the filter
filtered <- qmean_filter(my_read, minq = 30)

# Average qualities after
qual_plot(filtered, cutoff = 30)


## -----------------------------------------------------------------------------

# Generate random sequences
set.seed(10)
input <- random_length(30, 3:7)

# Remove sequences that contain the following patterns:
rm.seq  = c("TGGTC", "CGGT", "GTTCT", "ATA")
match_before <- unlist(lapply(rm.seq, function(x) grep(x, 
as.character(sread(input)))))
match_before

filtered <- seq_filter(input,rm.seq =  rm.seq)

# Verify that matching sequences were removed
match_after <- unlist(lapply(rm.seq, function(x) {
                grep(x, as.character(sread(filtered)))}))
match_after

## -----------------------------------------------------------------------------

# Create 6 sequences of width 20
set.seed(10)
input <- random_seq(6, 20)
input

# Create Phred+33 qualities of width 15 and paste to qualities of length 
# 5 used for the tails.
# for three of the sequences, put low qualities in tails
set.seed(10)
my_qual <- random_qual(c(30,40), slength = 6, swidth = 15, 
    encod = "Sanger")
set.seed(10)
tails <-   random_qual(c(30,40), slength = 6, swidth = 5, 
    encod = "Sanger")

# Low quality tails in sequences 2, 3 & 4
set.seed(10)
tails[2:4] <- random_qual(c(3, 20), slength = 3, swidth = 5,
    encod = "Sanger")
my_qual <- paste0(my_qual, tails)
input_q <- BStringSet(my_qual)
input_q

# Watch qualities before filtering
as.matrix(PhredQuality(input_q))

# Create names and ShortReadQ object
input_names <- seq_names(6)
my_read <- ShortReadQ(sread = input, quality = input_q, id = input_names)

# Apply the filter 
filtered <- trim3q_filter(my_read, rm.3qual = 28)
sread(filtered)


## -----------------------------------------------------------------------------

# Create duplicated sequences
s <- random_seq(10, 10)
s <- sample(s, 30, replace = TRUE)

# Create a ShortReadQ object
q <- random_qual(30, 10)
n <- seq_names(30)
my_read <- ShortReadQ(sread = s, quality = q, id = n)

# Check presence of duplicates
isUnique(as.character(sread(my_read)))

# Apply the filter
filtered <- unique_filter(my_read)
isUnique(as.character(sread(filtered)))

Try the FastqCleaner package in your browser

Any scripts or data that you put into this service are public.

FastqCleaner documentation built on Nov. 8, 2020, 5:05 p.m.