# Author: Dominic Ritler (DR)
# Contact: dominic.ritler@students.unibe.ch
# Version: 0.1.3
#
# Header: DamID pipeline all in one. (Adapter removal, bowtie, analysis)
################################################################################
## functions ##
################################################################################
fuse.chr <- function(query.gr) {
  # Fuse all chromosomes listed in a GRanges object by adding the summed
  # length of all previous chromosomes to each GATC position of the current
  # chromosome, yielding one continuous coordinate axis over all chromosomes.
  #
  # Args:
  #   query.gr: (GRanges) GRanges object with seqlengths set
  #
  # Return:
  #   numeric vector with the GATC start coordinates of all chromosomes
  #   concatenated onto a single genome-wide axis
  # length of every chromosome run present in the object
  chr.leng <- seqlengths(query.gr)[seqnames(query.gr)@values]
  if (length(chr.leng) > 1) {
    # offset of each chromosome = cumulative length of all previous ones.
    # (Replaces the old `sum(chr.leng[1:i - 1])` loop, which only worked
    # because R silently drops the 0 index produced by the `:` operator
    # precedence slip: `1:i - 1` is `(1:i) - 1`, i.e. 0:(i-1).)
    startplot <- c(0, cumsum(chr.leng)[-length(chr.leng)])
    # expand the per-chromosome offset to one value per range
    coordinates <- Rle(startplot, runLength(seqnames(query.gr)))
    # shift every start position by its chromosome offset
    coordinates <- as.vector(coordinates) + as.vector(start(query.gr))
  } else {
    # single chromosome: the start positions already form the axis
    coordinates <- as.vector(start(query.gr))
  }
  return(coordinates)
}
read.sam.files.to.grange <- function(file.name) {
  # Import a sam/bam file and convert its reads into a GRanges object.
  #
  # Args:
  #   file.name: (String) the name of the bam file to import
  #
  # Return:
  #   new genomicRange with the reads from the sam/bam file
  # read the alignments, then coerce them to GRanges in one step
  aligned.reads <- readGAlignments(file.name)
  return(as(aligned.reads, "GRanges"))
}
find.restriction.sites.overlaps3 <- function(bam.grange,
                                             res.site.grages,
                                             strand="*", genom, chrom.nam,
                                             restsite="GATC") {
  # Map sequencing reads onto restriction (GATC) sites: a read counts for a
  # plus-strand site when the read starts exactly at the site, and for a
  # minus-strand site when the read ends exactly at the site.
  #
  # Args:
  #   bam.grange: (GRange) grange object with all the reads from sequencing
  #   res.site.grages: (list of granges) GATC granges (all, plus, minus, frag)
  #   strand: (chr) "+", "-", "*" (both strands combined) or
  #           "all" to return all three as a list
  #   genom: (string) the name of the genome (not used inside this function)
  #   chrom.nam: (vector of strings) the chromosome names to keep
  #   restsite: (string) restriction site string (not used inside this function)
  #
  # Return:
  #   vector with the number of matching reads per restriction site;
  #   if strand == "all" a list with 3 vectors is returned (all, plus, minus)
  # restrict the plus/minus site granges to the requested chromosomes
  gatc.plus <- res.site.grages[[2]]
  gatc.minus <- res.site.grages[[3]]
  rest.sites.plus <- gatc.plus[which(seqnames(gatc.plus) %in% chrom.nam)]
  rest.sites.minus <- gatc.minus[which(seqnames(gatc.minus) %in% chrom.nam)]
  # reads starting at a plus site / ending at a minus site
  olaps.start <- findOverlaps(bam.grange, rest.sites.plus, type = "start")
  olaps.end <- findOverlaps(bam.grange, rest.sites.minus, type = "end")
  # tabulate hits per site index (+ = plus strand only, - = minus strand only,
  # * = both strands combined, all = list with all/plus/minus)
  if (strand == "+") {
    freq <- table(subjectHits(olaps.start))
  } else if (strand == "-") {
    freq <- table(subjectHits(olaps.end))
  } else if (strand == "*") {
    freq <- table(c(subjectHits(olaps.start), subjectHits(olaps.end)))
  } else if (strand == "all") {
    freq.all <- table(c(subjectHits(olaps.start), subjectHits(olaps.end)))
    freq.plu <- table(subjectHits(olaps.start))
    freq.min <- table(subjectHits(olaps.end))
  } else {# any other value: warn and fall back to the "*" behaviour
    message(paste("The strand information", strand,
                  "is not valid: use '*', '+' or '-'. now '*' was used!"))
    freq <- table(c(subjectHits(olaps.start), subjectHits(olaps.end)))
  }
  # expand the hit tables to one count per site; sites without hits stay 0
  # because only indices present in the table are overwritten.
  # rownames() on a 1-D table yields its dimnames, i.e. the character site
  # indices, which are converted back to integer positions below.
  if (strand == "all") {
    # plus and minus site vectors have the same length (GATC is a
    # palindrome), so rest.sites.plus also sizes the "all" vector
    scor.all <- rep(0,length(rest.sites.plus))#minus and plus GATC is palindrome
    scor.all[as.vector(as.integer(rownames(freq.all)))] <- freq.all
    scor.plu <- rep(0,length(rest.sites.plus))
    scor.plu[as.vector(as.integer(rownames(freq.plu)))] <- freq.plu
    scor.min <- rep(0,length(rest.sites.minus))
    scor.min[as.vector(as.integer(rownames(freq.min)))] <- freq.min
    scores <- list(scor.all, scor.plu, scor.min)
  } else {
    scores <- rep(0,length(rest.sites.plus))
    scores[as.vector(as.integer(rownames(freq)))] <- freq
  }
  return(scores)
}
add.metadata.to.grange <- function(metadata, grange, m.name) {
  # Append one named metadata column at the end of a GRanges object's
  # existing metadata.
  #
  # Args:
  #   metadata: (list) number of counts per GATC site in the GRange
  #   grange: (GRange) the grange object to extend
  #   m.name: (string) the name of the new metadata column
  #
  # Return:
  #   GRange including the new metadata column at the end of the metadata.
  # wrap the new values in a one-column data frame carrying the wanted name
  extra.col <- data.frame(metadata)
  names(extra.col) <- m.name
  # append the column to the current metadata and write everything back
  elementMetadata(grange) <- cbind(elementMetadata(grange), extra.col)
  return(grange)
}
make.empty.genomicRange.bin3 <- function(animal, chrom.names,
                                         bin.size=100000, output.mesage=TRUE) {
  # Make an empty GRanges object that tiles the chosen chromosomes with
  # fixed-size bins (each bin is one genomic range entry). The final bin of a
  # chromosome is shorter when the chromosome length is not an exact multiple
  # of bin.size; chromosomes shorter than one bin become a single bin.
  #
  # Args:
  #   animal: (BSgenome) BSgenome object of a species
  #   chrom.names: (vector of strings) the names of the chromosomes
  #   bin.size: (int) length of the bins (length of each genomicRange entry)
  #   output.mesage: (bool) if progress should be reported
  #
  # Return:
  #   new genomicRange object with one entry per bin
  #
  # accumulate per-chromosome granges here (NULL marks the first iteration;
  # an exists() test would also match variables in the calling environment)
  bin.grange <- NULL
  for (chr in chrom.names) {
    chr.length <- as.integer(seqlengths(animal)[chr])
    # number of complete bins and the genomic length they cover
    # (bin.size=100, chrom.length=310 -> 3 full bins covering 300 bp)
    full.ranges <- chr.length %/% bin.size
    comp.length <- full.ranges * bin.size
    if (chr.length >= bin.size) {
      # grange containing only the complete bins
      tmp.gr <- GRanges(seqnames = chr,
                        ranges = IRanges(start = seq(from = 1,
                                                     to = comp.length,
                                                     by = bin.size),
                                         width = bin.size,
                                         names = paste(chr,
                                                       seq(1, comp.length,
                                                           bin.size),
                                                       sep = ":")),
                        strand = "*", seqinfo = seqinfo(animal))
      # append the trailing partial bin only when there is a remainder.
      # (Bug fix: the old test `comp.length + bin.size != chr.length` was
      # always TRUE because comp.length + bin.size always exceeds chr.length,
      # so chromosomes that are an exact multiple of bin.size received a
      # spurious zero-width bin.)
      if (chr.length > comp.length) {
        last.gr <- GRanges(seqnames = chr,
                           ranges = IRanges(start = comp.length + 1,
                                            width = chr.length - comp.length,
                                            names = paste(chr,
                                                          comp.length + 1,
                                                          sep = ":")),
                           strand = "*", seqinfo = seqinfo(animal))
        tmp.gr <- c(tmp.gr, last.gr)
      }
    } else {
      # chromosome shorter than one bin: a single bin spanning all of it
      tmp.gr <- GRanges(seqnames = chr,
                        ranges = IRanges(start = 1,
                                         width = chr.length,
                                         names = paste(chr, 1, sep = ":")),
                        strand = "*", seqinfo = seqinfo(animal))
    }
    # first chromosome starts the result, later ones are concatenated
    if (is.null(bin.grange)) {
      bin.grange <- tmp.gr
    } else {
      bin.grange <- append(bin.grange, tmp.gr)
    }
    # report progress if wanted
    if (output.mesage) {
      message(paste(format(Sys.time(), "%H:%M:%S:"),
                    "Chromosome ... ", chr, " done!"))
    }
  }
  return(bin.grange)
}
################################################################################
## packages management ##
################################################################################
# These functions are obsolete when using the DamIDseq R package.
# Only BSgenomes when changing the species may need downloading packages
# from bioconductor
install.biolite.packages <- function(bc.install.names) {
  # Install Bioconductor packages via the legacy biocLite installer.
  # NOTE(review): biocLite was retired with Bioconductor 3.9; on current R
  # versions BiocManager::install() replaces this mechanism — confirm before
  # relying on it.
  #
  # Args:
  #   bc.install.names: (string) names of the packages to be installed from
  #                     bioconductor
  #
  # Return:
  #   none (install packages if not installed)
  # fetch the installer script, update core packages, then install the
  # requested packages
  source("http://bioconductor.org/biocLite.R")
  biocLite()
  biocLite(bc.install.names)
}
install.r.packages <- function(r.install.names) {
  # Thin wrapper around install.packages() for installing CRAN packages.
  #
  # Args:
  #   r.install.names: (string) names of the R packages to be installed from
  #                    CRAN
  #
  # Return:
  #   none (install packages if not installed)
  install.packages(r.install.names)
}
test.installed.bc.packages <- function(bc.package.names) {
  # Determine which of the given Bioconductor packages are missing and
  # install only those; report when nothing is missing.
  #
  # Args:
  #   bc.package.names: (vector of string) packages names for the pipeline
  #
  # Return:
  #   none (install packages if not installed)
  # keep only the names absent from every installed library
  missing.pkgs <- bc.package.names[!(bc.package.names %in%
                                       rownames(installed.packages()))]
  if (length(missing.pkgs) != 0) {
    install.biolite.packages(missing.pkgs)
  } else {
    print("all bioconductor packages installed")
  }
}
test.installed.r.packages <- function(r.package.names) {
  # Determine which of the given CRAN packages are missing and install only
  # those; report when nothing is missing.
  #
  # Args:
  #   r.package.names: (vector of string) packages needed for the analysis
  #
  # Return:
  #   none (install packages if not installed)
  # keep only the names absent from every installed library
  missing.pkgs <- r.package.names[!(r.package.names %in%
                                      rownames(installed.packages()))]
  if (length(missing.pkgs) != 0) {
    install.r.packages(missing.pkgs)
  } else {
    print("all R packages installed")
  }
}
load.pakage.main <- function(chr.packages, bio.packages) {
  # All-in-one: install (if needed) and load Bioconductor and CRAN packages.
  # Passing "" for either argument skips that group.
  #
  # Args:
  #   chr.packages: (vector of string) all package names for R CRAN
  #   bio.packages: (vector of string) all package names for bioconductor
  #
  # Return:
  #   none (install packages if not installed and load packages)
  # Bug fix: `if (bio.packages != "")` raises an error in R >= 4.2 when a
  # vector of several package names is supplied (condition of length > 1);
  # identical() compares against the "" sentinel as a whole.
  if (!identical(bio.packages, "")) {
    # install missing bioconductor packages, then attach all of them
    test.installed.bc.packages(bio.packages)
    lapply(bio.packages, require, character.only = TRUE)
  }
  if (!identical(chr.packages, "")) {
    # install missing CRAN packages, then attach all of them
    test.installed.r.packages(chr.packages)
    lapply(chr.packages, require, character.only = TRUE)
  }
}
################################################################################
## binning reads in GRanges ##
################################################################################
find.all.overlaps.and.sum.metadata3 <- function(query.gr, subject.gr,
                                                met.data=T, messag=T,
                                                sum.bin=T) {
  # Bin the metadata of query.gr into the ranges of subject.gr: find all
  # overlaps between the two granges and, for every subject (bin) range,
  # either sum (sum.bin=T) or average (sum.bin=F) the metadata values of the
  # overlapping query (GATC) ranges.
  #
  # Args:
  #   query.gr: (GRange) query grange (e.g. GATC sites with read counts)
  #   subject.gr: (GRange) subject grange (e.g. fixed-size bins)
  #   met.data: (bool) F = matrix will be returned
  #             T = subject.gr will be returned with the binned metadata
  #   messag: (bool) if a progress message should be printed
  #   sum.bin: (bool) if T values in a bin are summed, if F the mean is taken
  #
  # Return:
  #   If met.data=F: matrix with one row per bin and one column per sample
  #   If met.data=T: the bin grange with that matrix attached as metadata
  if (messag) {
    # the width of the first subject range is reported as the bin size
    bin.siz <- width(subject.gr[1])
    message(paste(format(Sys.time(), "%H:%M:%S:"),
                  "find overlaps and sum up metadata entries for bin size:",
                  bin.siz, "bp"))
  }
  # number of metadata columns (= number of samples)
  met.rows <- dim(elementMetadata(query.gr))[2]
  # result matrix: one row per bin, one column per sample
  bin.meta <- matrix(NA, length(subject.gr), met.rows) # sample matrix
  # Find overlaps between the GATC grange and the bin grange.
  # NOTE: a GATC site spanning a bin boundary is counted in both bins
  # (for example bin 1 ends with "GA", bin 2 starts with "TC");
  # type="within" cannot be used as it would exclude those sites entirely.
  overlaps <- findOverlaps(query.gr, subject.gr, ignore.strand = TRUE)
  # run-length encode the subject hits: runValue = bin index,
  # runLength = number of GATC sites in that bin
  # (relies on findOverlaps returning hits with equal bin indices adjacent)
  overlaps.table <- Rle(subjectHits(overlaps))
  # partition the bin-ordered GATC hits into one contiguous slice per bin
  # (start, end = index positions into the overlap hits)
  gatc.site.index <- PartitioningByWidth(width(overlaps.table))
  # choose the per-bin aggregation function
  if (sum.bin) {
    sum.or.mean <- viewSums
  } else {
    sum.or.mean <- viewMeans
  }
  # aggregate every sample column into the bins
  for (i in 1:met.rows) {
    # bins without any GATC site keep 0 reads
    reads.per.bin <- rep(0, length(subject.gr))
    # Views() slices the hit metadata per bin; viewSums/viewMeans collapse
    # each slice to one value, written at that bin's index
    reads.per.bin[runValue(overlaps.table)] <- sum.or.mean(Views(Rle(
      elementMetadata(query.gr)[queryHits(overlaps), i]), gatc.site.index))
    # store the aggregated column in the sample matrix
    bin.meta[,i] <- reads.per.bin
  }
  # carry over the sample (column) names from the query metadata
  colnames(bin.meta) <- names(elementMetadata(query.gr))
  # return either the bin grange with metadata attached or the bare matrix
  if (met.data) {
    elementMetadata(subject.gr) <- bin.meta
    return(subject.gr)
  } else {
    return(bin.meta)
  }
}
################################################################################
# function to process already mapped reads starting with read Granges #
################################################################################
make.full.gatc.grange.from.read.grange2 <- function(file.nam,
                                                    save.nam="gran.all",
                                                    messag=T, genom, chr.na,
                                                    restsite="GATC",
                                                    res.site.grages) {
  # Combine several pipeline steps when starting from saved read granges
  # (instead of sam/bam files): map every sample's reads onto the GATC
  # granges, attach the counts as metadata columns, save all results to disk
  # and also derive the per-fragment counts.
  #
  # Args:
  #   file.nam: (data.frame) data frame with 2 or 3 columns:
  #             path: the path to the saved read grange file
  #             name: the name of the sample
  #             group: optional! 's' for sample or 'c' for control
  #   save.nam: (string) path/name prefix used for all saved result granges
  #   messag: (bool) if a progress message should be printed
  #   genom: (string) the name of the BSgenome object
  #   chr.na: (vector of string) the names of the chromosomes
  #   restsite: (string) the restriction site sequence
  #   res.site.grages: (list of granges) all GATC granges
  #                    (all, plus, minus, fragment)
  #
  # Return:
  #   list of 4 granges with read counts as metadata:
  #   (all sites, plus sites, minus sites, fragments)
  # start from the empty GATC granges
  gatc.all.m <- res.site.grages[[1]] # for all GATC sites
  gatc.plus.m <- res.site.grages[[2]] # for plus sites
  gatc.minus.m <- res.site.grages[[3]] # for minus sites
  gatc.frag.m <- res.site.grages[[4]] # empty fragment grange
  # print message if asked
  if (messag) {
    message(paste(format(Sys.time(), "%H:%M:%S:"),
                  "Load grange object and start mapping to restriction sites"))
  }
  # per sample: load the read grange and add its counts to the GATC granges
  for (i in 1:length(file.nam[,1])) {
    # column 1 = path of the saved read grange object
    read.grange <- load.restriction.sites(toString(file.nam[i,1]))
    # counts per GATC site for all/plus/minus strands in one call
    mapped.to.gatc <- find.restriction.sites.overlaps3(read.grange,
                                                       res.site.grages,
                                                       strand = "all", genom,
                                                       chrom.nam = chr.na,
                                                       restsite = restsite)
    # column 2 = sample name, used as the new metadata column name
    gatc.all.m <- add.metadata.to.grange(mapped.to.gatc[[1]], gatc.all.m,
                                         toString(file.nam[i,2]))
    gatc.plus.m <- add.metadata.to.grange(mapped.to.gatc[[2]], gatc.plus.m,
                                          toString(file.nam[i,2]))
    gatc.minus.m <- add.metadata.to.grange(mapped.to.gatc[[3]], gatc.minus.m,
                                           toString(file.nam[i,2]))
  }
  # save the three per-site granges (all / plus / minus)
  save.GRange.object(gatc.all.m, save.nam)
  save.nam.p <- paste(save.nam, "plus", sep = "-")
  save.GRange.object(gatc.plus.m, save.nam.p)
  save.nam.m <- paste(save.nam, "minus", sep = "-")
  save.GRange.object(gatc.minus.m, save.nam.m)
  # derive and save counts per GATC fragment (not per GATC site)
  frag.gran <- make.fragmet.counts(list(gatc.plus.m, gatc.minus.m),
                                   genom.data = genom, resr.site = restsite,
                                   gatc.fragm = gatc.frag.m)
  save.nam.f <- paste(save.nam, "fragment", sep = "-")
  save.GRange.object(frag.gran, save.nam.f)
  # since 2016.01.20 the fragment grange is returned as well
  return(list(gatc.all.m, gatc.plus.m, gatc.minus.m, frag.gran))
}
make.log.fold.change <- function(granges, ids, norm.tr=TRUE) {
  # Normalize reads and calculate the fold change for every sample/control
  # pair ((sample_i / sum(sample_i)) / (control_i / sum(control_i))), then
  # return the log2 of the mean fold change over all pairs.
  # With norm.tr=FALSE the normalization by total read number is skipped
  # (intended for un-binned data that contain many zeros).
  #
  # Args:
  #   granges: (granges) granges with all reads in metadata
  #   ids: (list of 2) list with two vectors: sample ids and control ids
  #   norm.tr: (bool) whether to normalize by the total number of reads
  #
  # Return:
  #   vector with the mean log2 fold change per range
  # sample and control metadata column ids
  sampl <- ids[[1]]
  contr <- ids[[2]]
  meta.datas <- elementMetadata(granges)
  # one result column per sample/control pair (ncol = n/2)
  res.matrix <- matrix(NA, nrow = length(meta.datas[,1]),
                       ncol = (length(meta.datas[1,])/2))
  if (norm.tr) {
    # loop over all sample/control pairs (equal counts checked upstream)
    for (i in seq_along(sampl)) {
      # totals used for normalization
      norm.sam <- sum(meta.datas[,sampl[i]])
      norm.con <- sum(meta.datas[,contr[i]])
      # set zero controls to 1 so we never divide by zero
      temp.cont.1 <- meta.datas[,contr[i]]
      temp.cont.1[which(temp.cont.1 == 0)] <- 1
      # normalized fold change: (reads / total reads) / (ctrl / total ctrl)
      res.matrix[,i] <- (meta.datas[,sampl[i]] / norm.sam) /
        (temp.cont.1 / norm.con)
    }
  } else { # for data that are not binned and therefore contain many zeros
    for (i in seq_along(sampl)) {
      t.cont.1 <- meta.datas[,contr[i]]
      t.samp.1 <- meta.datas[,sampl[i]]
      # positions where the sample / the control / both are zero
      sample.nul <- which(t.samp.1 == 0)
      contr.null <- which(t.cont.1 == 0)
      both.null <- intersect(sample.nul, contr.null)
      # smallest non-zero sample value; zero controls get half of it so
      # the ratio stays finite without losing the 0/x information
      min.va <- min(t.samp.1)
      if (min.va == 0) {
        min.va <- min(t.samp.1[t.samp.1 != min(t.samp.1)])
      }
      t.cont.1[contr.null] <- min.va / 2
      # smallest non-zero control value after the substitution above
      min.va <- min(t.cont.1)
      if (min.va == 0) {
        min.va <- min(t.cont.1[t.cont.1 != min(t.cont.1)])
      }
      # zero samples whose control is non-zero also get half the minimum.
      # (Bug fix: the old `sample.nul[-both.null]` used the zero *row
      # numbers* as negative *positional* indices into sample.nul — and
      # returned an empty vector whenever both.null was empty; setdiff()
      # removes the shared positions by value.)
      sample.nul.no.c.nul <- setdiff(sample.nul, both.null)
      t.samp.1[sample.nul.no.c.nul] <- min.va / 2
      # normalized reads: reads / control
      res.matrix[,i] <- t.samp.1 / t.cont.1
    }
  }
  # mean over all pairs, log2 transform, map -Inf (ratio 0) back to 0
  lo.fo.ca <- log2(rowMeans(res.matrix))
  lo.fo.ca[which(lo.fo.ca == -Inf)] <- 0
  return(lo.fo.ca)
}
################################################################################
# make and load grange with restriction site #
################################################################################
find.gatc.helper <- function(chr, spec, res.seq) {
  # Find all restriction-site matches in a single chromosome.
  #
  # Args:
  #   chr: (string) the name of the chromosome
  #   spec: (BSGenome) the genome
  #   res.seq: (string) restriction site string
  #
  # Return:
  #   the matchPattern() result for the chromosome (restriction site matches)
  # (the former `chr.res.match <- list()` pre-assignment was dead code and
  # has been removed; matchPattern's result replaced it immediately)
  return(matchPattern(res.seq, getSeq(spec, names = chr)))
}
make.restriction.site.grange.from.scratch <- function(species,
                                                      rest.seq="GATC",
                                                      save.names,
                                                      chro.nams) {
  # Build the (count-less) restriction-site granges from a BSgenome: all
  # sites, plus-strand sites, minus-strand sites and fragments, and save all
  # four to disk.
  #
  # Args:
  #   species: (BSgenomes) the genome of the species
  #   rest.seq: (string) the sequence of the restriction site
  #   save.names: (vector of 4 strings) paths to save the granges
  #               (order: all, plus, minus, fragment)
  #   chro.nams: (vector of strings or NULL) the names of wished chromosomes
  #              or NULL for all chromosomes
  #
  # Return:
  #   list of 4 granges: all sites, plus, minus, fragments
  # get the restriction sites out of the genome
  if (!is.null(chro.nams)) { # for a subset of chromosomes
    # extract the sites chromosome by chromosome;
    # the sapply approach is used because bsapply fails for a single chrom.
    res.sit.all <- sapply(X = chro.nams, FUN = find.gatc.helper,
                          spec = species, res.seq = rest.seq)
    # convert the per-chromosome match list into one GRanges
    res.sit.all <- as(as(res.sit.all, "RangesList"), "GRanges")
    # restrict the seqinfo to the chosen chromosomes
    seqinfo(res.sit.all) <- seqinfo(species)[chro.nams]
    # plus and minus variants are the same ranges with the strand flipped
    res.sit.plus <- res.sit.all
    strand(res.sit.plus) <- Rle("+", length(res.sit.plus))
    res.sit.minus <- res.sit.all
    strand(res.sit.minus) <- Rle("-", length(res.sit.minus))
  } else {# when all chromosomes are chosen
    # genome-wide pattern search (returns hits on both strands)
    res.sit <- vmatchPattern(rest.seq, species, fixed = FALSE)
    seqinfo(res.sit) <- seqinfo(species)
    # split the hits by strand
    res.sit.plus <- res.sit[which(strand(res.sit) == "+")]
    res.sit.minus <- res.sit[which(strand(res.sit) == "-")]
    # strand-less version: plus-strand ranges with the strand set to "*"
    res.sit.all <- res.sit.plus
    strand(res.sit.all) <- Rle("*", length(res.sit.all))
  }
  # fragments = intervals between consecutive restriction sites
  fragment.grang <- make.fragment.from.gatc.grange(res.sit.all)
  # save all four granges (all, plus, minus, fragments)
  save.GRange.object(res.sit.all, file.name = save.names[1])
  save.GRange.object(res.sit.plus, file.name = save.names[2])
  save.GRange.object(res.sit.minus, file.name = save.names[3])
  save.GRange.object(fragment.grang, file.name = save.names[4])
  # return the granges for use by the caller
  return(list(res.sit.all, res.sit.plus, res.sit.minus, fragment.grang))
}
make.restriction.site.grange <- function(species, rest.seq="GATC",
                                         save.location="granges",
                                         chr.nam=NULL) {
  # Load the cached restriction-site granges for a genome if all four cache
  # files exist; otherwise build them from scratch (which also caches them).
  #
  # Args:
  #   species: (BSgenomes) the genome of the species
  #   rest.seq: (string) the sequence of the restriction site
  #   save.location: (string) folder name to search for saved granges
  #   chr.nam: (vector of strings or NULL) chromosome names, NULL = all
  #
  # Return:
  #   list of 4 granges: GATC.all, plus only, minus only, fragments
  # build a unique cache name from organism, provider version and release
  prov <- providerVersion(species)
  relda <- releaseName(species)
  org <- organism(species)
  # encode the chromosome selection in the file name as well
  if (is.null(chr.nam)) {
    cr.n <- "all-chr"
  } else {
    cr.n <- toString(chr.nam)
  }
  # concatenate the name parts
  full.name <- paste(org, prov, relda, cr.n)
  # strip whitespace, dots and commas so the name is file-system safe
  full.name.nw <- gsub(" ", "", full.name, fixed = TRUE) # delete whitespace
  rele.name.n1 <- gsub(".", "", full.name.nw, fixed = TRUE) # delete dots
  rele.name.nw <- gsub(",", "", rele.name.n1, fixed = TRUE) # delete commas
  # one cache file per grange flavour
  sufix <- c("all.RData", "plus.RData", "minus.RData", "fragment.RData")
  # concatenate name and suffix and add the folder path
  names.all <- paste(rele.name.nw, sufix, sep = "-")
  path.all <- file.path(save.location, names.all)
  # rebuild only when not all four cached files are present
  exist.file <- file.exists(path.all)
  if (sum(exist.file) < 4) {
    message(paste(format(Sys.time(), "%H:%M:%S:"), "Find all", rest.seq,
                  "restriction sites in genome for:", cr.n))
    gatc.grangs <- make.restriction.site.grange.from.scratch(species = species,
                                                             rest.seq,
                                                             path.all, chr.nam)
  } else {# otherwise load the cached granges
    message(paste(format(Sys.time(), "%H:%M:%S:"), "Load", rest.seq,
                  "restriction sites for", cr.n))
    gatc.all <- load.restriction.sites(path.all[1])
    gatc.plus <- load.restriction.sites(path.all[2])
    gatc.minus <- load.restriction.sites(path.all[3])
    gatc.fragm <- load.restriction.sites(path.all[4])
    gatc.grangs <- list(gatc.all, gatc.plus, gatc.minus, gatc.fragm)
  }
  return(gatc.grangs)
}
normalize.grange.md <- function(gran.data) {
  # Normalize reads per bin by dividing each metadata column by its total
  # number of reads (the column sum).
  #
  # Args:
  #   gran.data: (grange) the grange containing all metadata
  #
  # Return:
  #   grange with normalized (by total read number) metadata
  #
  # metadata as plain matrix
  ful.met <- as.matrix(elementMetadata(gran.data))
  # divide every column by its sum in one vectorized step (replaces the old
  # per-column loop; dimnames are preserved).
  # NOTE(review): a column with 0 total reads still yields NaN here, exactly
  # as in the original implementation.
  new.met.norm <- sweep(ful.met, 2, colSums(ful.met), "/")
  # write the normalized values back as the grange metadata
  elementMetadata(gran.data) <- new.met.norm
  return(gran.data)
}
log10.transfom.grange.md <- function(gran.data) {
  # log10-transform all grange metadata values.
  #
  # Args:
  #   gran.data: (grange) the grange containing all metadata
  #
  # Return:
  #   grange with log10 transformed metadata
  #
  # metadata as plain matrix
  ful.met <- as.matrix(elementMetadata(gran.data))
  # log10() is vectorized over the whole matrix, so the former per-column
  # loop is unnecessary; zeros become -Inf exactly as before
  elementMetadata(gran.data) <- log10(ful.met)
  return(gran.data)
}
################################################################################
# make fragment not GATC site counts #
################################################################################
# GATC count = every sequencing read starting (plus) or ending (minus) with a
# GATC counts for this GATC site.
#
# fragment count = (fragment = distance between two GATC sites)
# count per fragment means the reads starting with a GATC (plus)
# and the next ending with a GATC (minus) count
# for one fragment.
make.fragment.from.gatc.grange <- function(gran) {
  # Expand the start/end positions of a GATC grange into a fragment grange:
  # a fragment spans from one GATC site to the end of the next one, plus one
  # leading fragment from position 1 and one trailing fragment up to the
  # chromosome end.
  # HELPER FUNCTION FOR: make.fragmet.counts
  #
  # Args:
  #   gran: (grange) grange with GATC site entries (ranges with width 4 bp)
  #
  # Return:
  #   new grange with fragments (distance between two GATC sites)
  # chromosome lengths for the trailing fragments
  chr.len <- seqlengths(gran)
  # accumulate per-chromosome fragment granges (NULL on first iteration;
  # bug fix: the old exists("return.gran") test also matched a variable of
  # that name in the calling/global environment, as the file's own comment
  # on the equivalent pattern in make.empty.genomicRange.bin3 notes)
  return.gran <- NULL
  for (chr in seqnames(gran)@values) {
    # ranges of the current chromosome only
    chr.gran <- gran[which(seqnames(gran) == chr)]
    ## expand ranges
    # the first fragment runs from position 1 to the first GATC site; it is
    # added separately because sites are only expanded to the right below
    first.gatc.pos <- chr.gran[1]
    start(first.gatc.pos[1]) <- 1
    # move every site's end to the end of the *next* site, turning entry i
    # into the fragment [start of site i, end of site i+1]
    new.end.pos <- end(chr.gran[-1])
    end(chr.gran[-length(chr.gran)]) <- new.end.pos
    # the last fragment runs from the last GATC site to the chromosome end
    end(chr.gran[length(chr.gran)]) <- chr.len[chr]
    # prepend the leading fragment
    ful.chr <- append(first.gatc.pos, chr.gran)
    # concatenate the chromosome results
    if (is.null(return.gran)) {
      return.gran <- ful.chr
    } else {
      return.gran <- append(return.gran, ful.chr)
    }
  }
  return(return.gran)
}
make.fragmet.counts <- function(grangs, genom.data, resr.site="GATC",
                                gatc.fragm) {
  # Turn read counts per GATC site (plus and minus strand granges) into read
  # counts per fragment: fragment i carries the plus counts of the GATC site
  # at its start and the minus counts of the GATC site at its end.
  #
  # Args:
  #   grangs: (list of GRange) GATC sites plus and minus with metadata
  #   genom.data: (BSgenome) genome of the species (not used inside here)
  #   resr.site: (string) restriction site string (not used inside here)
  #   gatc.fragm: (GRange) empty GATC fragment GRange
  #
  # Return:
  #   new grange with counts per fragment (plus and minus separated)
  # sample names from the plus and minus metadata
  p.nam <- names(elementMetadata(grangs[[1]]))
  m.nam <- names(elementMetadata(grangs[[2]]))
  # accumulate per-chromosome data frames (NULL on first iteration; bug fix:
  # the old exists("result.d.f") test also matched variables of that name in
  # the calling/global environment)
  result.d.f <- NULL
  for (chr in seqnames(grangs[[1]])@values) {
    # per-chromosome plus/minus counts
    chr.gran.p <- grangs[[1]][which(seqnames(grangs[[1]]) == chr)]
    chr.gran.m <- grangs[[2]][which(seqnames(grangs[[2]]) == chr)]
    plus.gran <- elementMetadata(chr.gran.p)
    minu.gran <- elementMetadata(chr.gran.m)
    # the first fragment has no plus site (NA row in front) and the last
    # fragment has no minus site (NA row at the end)
    plus.frag <- rbind(rep(NA, length(plus.gran[1,])), data.frame(plus.gran))
    minu.frag <- rbind(data.frame(minu.gran), rep(NA, length(minu.gran[1,])))
    # interleave the columns: sample1.p, sample1.m, sample2.p, sample2.m, ...
    result.metadata <- data.frame(plus.frag[,1], minu.frag[,1])
    result.names <- c(paste(p.nam[1], "p", sep = "."),
                      paste(m.nam[1], "m", sep = "."))
    # add the remaining samples when more than one is present
    if (length(plus.gran[1,]) > 1) {
      for (i in 2:length(plus.gran[1,])) {
        result.metadata <- data.frame(result.metadata, plus.frag[,i],
                                      minu.frag[,i])
        result.names <- c(result.names, paste(p.nam[i], "p", sep = "."),
                          paste(m.nam[i], "m", sep = "."))
      }
    }
    # assign the column names for any number of samples (bug fix: the old
    # code only set them inside the multi-sample loop, so a single-sample
    # grange kept the auto-generated data.frame column names)
    names(result.metadata) <- result.names
    # concatenate the chromosome results
    if (is.null(result.d.f)) {
      result.d.f <- result.metadata
    } else {
      result.d.f <- rbind(result.d.f, result.metadata)
    }
  }
  # write the fragment counts into a copy of the empty fragment grange
  return.md <- gatc.fragm # do not add anything to gatc.fragm itself
  elementMetadata(return.md) <- NULL # just to be safe
  elementMetadata(return.md) <- result.d.f
  return(return.md)
}
################################################################################
## save and load objects ##
################################################################################
save.GRange.object <- function(grange.ob, file.name="test") {
  # Save a GRange object to a .RData file, appending the ".RData" suffix
  # when it is missing from the given file name.
  #
  # Args:
  #   grange.ob: (GRange) the grange object you want to save.
  #   file.name: (string) the path and filename
  #
  # Return:
  #   none (it saves the object using the given path and name)
  # append ".RData" unless the name already contains it.
  # (Bug fix: the old grep(".RData", ...) treated the dot as a regex
  # wildcard and therefore also matched names like "myRData"; fixed = TRUE
  # matches the literal string.)
  if (!grepl(".RData", file.name, fixed = TRUE)) {
    file.name <- paste(file.name, ".RData", sep = '')
  }
  # save the object under the (possibly extended) name
  save(grange.ob, file = file.name)
}
load.restriction.sites <- function(object.names) {
  # Load a saved R object (.RData) and return it, appending the ".RData"
  # suffix to the file name when it is missing.
  #
  # Args:
  #   object.names: (string) path/name of the R object file to load.
  #
  # Return:
  #   imported R object
  # append ".RData" unless the name already contains it.
  # (Bug fix: the old grep(".RData", ...) treated the dot as a regex
  # wildcard; fixed = TRUE matches the literal string and avoids false
  # positives like "myRData".)
  if (grepl(".RData", object.names, fixed = TRUE)) {
    f.name <- object.names
  } else {
    f.name <- paste(object.names, ".RData", sep = '')
  }
  # load into a throwaway scope and return the (single) stored object
  obj <- local(get(load(f.name)))
  return(obj)
}
################################################################################
# save metadata values of granges to a table #
################################################################################
save.table.from.grange <- function(gran, file.pa) {
  # Write the metadata of a GRanges object (strand-less GATC reads),
  # together with chromosome and start position, to a table on disk.
  #
  # Args:
  #   gran: (GRange) grange with metadata
  #   file.pa: (string) path and name to save the table
  #
  # Return:
  #   none (save table to hard disk)
  # turn the grange metadata into a data frame, keeping the original names
  meta.cols <- data.frame(elementMetadata(gran))
  colnames(meta.cols) <- names(elementMetadata(gran))
  # chromosome name and start coordinate for every range
  position.cols <- data.frame("chr" = seqnames(gran),
                              "start.position" = start(gran))
  # positions first, then all metadata columns
  full.table <- cbind(position.cols, meta.cols)
  # persist the combined table to disk
  write.table(full.table, file.pa)
}
save.table.from.grange2 <- function(gran, gran.p, gran.m, file.pa) {
  # Write a table that interleaves, sample by sample, the combined metadata
  # with the plus-strand and minus-strand metadata of the matching granges
  # (columns: all.1, plus.1, minus.1, all.2, plus.2, minus.2, ...).
  #
  # Args:
  #   gran: (GRange) grange with metadata
  #   gran.p: (GRange) grange with metadata plus strand
  #   gran.m: (GRange) grange with metadata minus strand
  #   file.pa: (string) path and name to save the table
  #
  # Return:
  #   none (save table to hard disk)
  # metadata of the combined, plus-strand and minus-strand granges
  all.df <- data.frame(elementMetadata(gran))
  colnames(all.df) <- names(elementMetadata(gran))
  plus.df <- data.frame(elementMetadata(gran.p))
  colnames(plus.df) <- names(elementMetadata(gran.p))
  minus.df <- data.frame(elementMetadata(gran.m))
  colnames(minus.df) <- names(elementMetadata(gran.m))
  # column names: plain for the combined data, suffixed for the strands
  nm.all <- colnames(all.df)
  nm.plus <- paste(colnames(plus.df), "plus", sep = ".")
  nm.minus <- paste(colnames(minus.df), "minus", sep = ".")
  # build one three-column frame (all, plus, minus) per sample and bind
  # them side by side, so the columns come out interleaved per sample
  sample.frames <- lapply(seq_len(ncol(minus.df)), function(i) {
    triple <- data.frame(all.df[, i], plus.df[, i], minus.df[, i])
    colnames(triple) <- c(nm.all[i], nm.plus[i], nm.minus[i])
    triple
  })
  met.all.in.one <- do.call(cbind, sample.frames)
  # chromosome name and start coordinate for every range
  ind.df <- data.frame("chr" = seqnames(gran), "start.position" = start(gran))
  # positions first, then the interleaved metadata
  met.df.ful <- cbind(ind.df, met.all.in.one)
  # persist the combined table to disk
  write.table(met.df.ful, file.pa)
}
################################################################################
## function to plot ##
################################################################################
plot.all.chrom.in.one.2 <- function(grange, chr.nam, main.name="a plot",
                                    y.name="a y axis", y.lim=c(0,25000),
                                    file.name="resolution.png",
                                    plot.type="h",
                                    plot.size=c(12,8), met.dat.name="value",
                                    d.type="png", font.size=25, plot.col=1,
                                    log.t=F) {
  # Save a .png/.pdf image of GRange restriction-site resolutions by drawing
  # all chromosomes as one continuous plot, separated by red vertical lines.
  #
  # Args:
  #   grange: (GRange) with metadata
  #   chr.nam: (vector of strings) chromosome names (currently unused here;
  #            labels are taken from the grange itself)
  #   main.name: (string) title of the plot
  #   y.name: (string) name of the y axis
  #   y.lim: (vector of two int) y axis limits
  #   file.name: (string) .png or .pdf file name
  #   plot.type: (string) plot type e.g. "h" for histogram (r plot notation)
  #   plot.size: (vector of two int) width and height of the plot
  #              (if d.type is "pdf", width/height are in inches)
  #   met.dat.name: (string) name of the metadata column to be plotted
  #   d.type: (string or NA) output type ("png" or "pdf"); any other value
  #           (including NA) plots to the current device without saving
  #   font.size: (int) point size of the plot font
  #   plot.col: (int, string) the id or name for the plot color
  #   log.t: (bool) if TRUE values get log10 transformed
  #
  # Return:
  #   none, writes the plot to file (or the current device)
  # fuse all chromosomes into one continuous coordinate axis
  coordinates <- fuse.chr(grange)
  # get metadata from the grange, optionally log10 transformed
  if (log.t) {
    metadata <- log10(elementMetadata(grange)[,met.dat.name])
    y.lim[2] <- log10(y.lim[2]) # transform the upper y limit as well
    temp.min <- min(metadata) # lowest transformed value
    y.lim[1] <- floor(temp.min) # derive the lower y limit from the data
  } else {
    metadata <- elementMetadata(grange)[,met.dat.name]
  }
  # open the requested graphics device
  # (bug fix: guard against the documented NA value for d.type, which would
  # previously crash the scalar if condition)
  if (!is.na(d.type) && d.type == "png") {
    png(file.name, width = plot.size[1], height = plot.size[2])
  }
  if (!is.na(d.type) && d.type == "pdf") {
    pdf(file.name, width = plot.size[1], height = plot.size[2])
  }
  # maximal plot length (+1 to have a spacer, not really necessary)
  x.lim <- c(0, max(coordinates) + 1)
  # set the font size, remembering the previous graphics parameters
  opar <- par(ps = font.size)
  plot(coordinates, metadata, type = plot.type, ylim = y.lim, xlim = x.lim,
       ylab = y.name, main = main.name, col = plot.col)
  # vertical chromosome separators plus a red line after the last chromosome
  line.positions <- c(coordinates[start(seqnames(grange))],
                      max(coordinates[end(seqnames(grange))]))
  abline(v = line.positions, col = rgb(1,0,0,1/2)) # red half transparent
  # chromosome names taken from the grange
  c.nams <- seqnames(grange)@values
  # write names to plot (-last excludes the closing line position)
  text(x = line.positions[-length(line.positions)],
       y = rep(y.lim[2], length(c.nams)), c.nams, pos = 4) # pos=4 = right
  # restore the graphics parameters
  # (bug fix: the original evaluated the bare 'opar' list, which restored
  # nothing; par(opar) actually resets the font size)
  par(opar)
  # close the device when one was opened
  # (bug fix: scalar '||' instead of the vectorized '|' in the condition)
  if (!is.na(d.type) && (d.type == "png" || d.type == "pdf")) {
    dev.off()
  }
}
plot.correlation <- function(grange, samp, cont, type="a", timesp=0,
                             bin.name="gatc", exp.name="test", qs.fold.name) {
  # Plot the pairwise correlation of grange metadata columns (colored lower
  # triangle plus numeric upper triangle) and, for type "a", save the
  # correlation matrix to a file.
  #
  # Args:
  #   grange: (GRange) with metadata
  #   samp: (vector) index of sample column positions in grange metadata
  #   cont: (vector) index of control column positions in grange metadata
  #   type: (string) which columns to include:
  #         "a" = all, "s" = sample only, "c" = control only
  #   timesp: (timestamp) used in the saved correlation file name
  #   bin.name: (string) the name of the bin
  #   exp.name: (string) the name of the experiment
  #   qs.fold.name: (string) the name of the folder to save QC data
  #
  # Return:
  #   none, plot in current device
  # pick the metadata columns requested by 'type'
  if (type == "s") {
    met.data <- elementMetadata(grange)
    met.data <- met.data[,samp]
  } else if (type == "c") {
    met.data <- elementMetadata(grange)
    met.data <- met.data[,cont]
  } else {
    met.data <- elementMetadata(grange)
  }
  # pairwise correlation of all selected columns
  cor.gatc <- cor(as.matrix(met.data))
  # save correlation as file (only for type a)
  if (type == "a") {
    f.nam <- paste("correlation-", exp.name, "-", bin.name, "-",
                   timesp, ".txt", sep = "")
    rep.path <- file.path(qs.fold.name, f.nam)
    write.table(cor.gatc, file = rep.path, eol = "\r\n" )
  }
  # color scale limits: [-1,1] when negative correlations occur, else [0,1]
  if (min(cor.gatc) < 0) {
    cor.lim <- c(-1,1)
    # color panel for the correlations -1 to 1
    colpal <- colorRampPalette(c("#7F0001", "#FFA501", "#808081", "#045A8E",
                                 "#00007E", "#045A8D", "lightgray",
                                 "orange", "#7F0000"))(300)
  } else {
    cor.lim <- c(0,1)
    # color panel for the correlations 0 to 1
    colpal <- colorRampPalette(c("#00007F","#045A8D","lightgray", "orange",
                                 "#7F0000","#00007F","#045A8D","lightgray",
                                 "orange", "#7F0000"))(300)
  }
  # first plot draws the colored lower triangle
  # (fix: spell out 'hclust.method' instead of relying on R's partial
  # matching of the abbreviated argument name 'hclust')
  corrplot(cor.gatc, method = "color", type = "lower",
           hclust.method = "complete", cl.lim = cor.lim, col = colpal,
           order = "hclust", outline = FALSE, addgrid.col = "black",
           plotCI = "n", tl.pos = "lt", diag = TRUE)
  # second plot overlays the correlation values in the upper triangle
  corrplot(cor.gatc, method = "number", tl.pos = "n", type = "upper",
           diag = FALSE, hclust.method = "complete", order = "hclust",
           outline = FALSE, addgrid.col = "black", plotCI = "n", add = TRUE,
           cl.pos = "n", tl.cex = 8)
}
plot.normalized.gatc.number.one.vs.another.log <- function(x.data, y.data,
                                                           nams=c("a", "b"),
                                                           bin.id) {
  # Draw a log10/log10 smoothScatter of two normalized GATC count vectors,
  # overlay a linear fit, and print their correlation in a legend.
  #
  # Args:
  #   x.data: (vector) value of sample 1
  #   y.data: (vector) value of sample 2
  #   nams: (vector of 2) name of x and y sample
  #   bin.id: (int) length of the bin used for this data set
  #
  # Return:
  #   none, plot in current device
  # scale each sample so its values sum to 1
  x.norm <- x.data / sum(x.data)
  y.norm <- y.data / sum(y.data)
  # zeros would become -Inf under log10; mask them as NA for the fit only
  x.for.fit <- replace(x.norm, x.norm == 0, NA)
  y.for.fit <- replace(y.norm, y.norm == 0, NA)
  # linear model on the log-transformed, zero-free values
  fit <- lm(log10(y.for.fit) ~ log10(x.for.fit), na.action = "na.omit")
  # density scatterplot of the log-transformed normalized counts
  smoothScatter(log10(x.norm), log10(y.norm),
                main = paste("log10", nams[1], "vs", nams[2], bin.id, "bins"),
                ylab = paste("log10", nams[2]),
                xlab = paste("log10", nams[1]),
                pch = ".", xlim = c(-6,0), ylim = c(-6,0))
  # overlay the fitted regression line
  abline(fit, col = "red")
  # correlation (computed on the untransformed normalized values)
  legend("topleft", paste("cor =", round(cor(x.norm, y.norm), 3)))
}
################################################################################
## helper function to get genome ##
################################################################################
test.if.chrom.exist <- function(chr.names, bs.gen) {
  # Validate that every requested chromosome name occurs in the genome;
  # stop with an informative error message when one does not.
  #
  # Args:
  #   chr.names: (vector of strings) chromosome names; NULL means all
  #   bs.gen: (BSgenome) genome object whose seqnames are checked against
  #
  # Return:
  #   none; raises an error for unknown chromosome names
  # NULL means "use all chromosomes", so there is nothing to validate
  if (is.null(chr.names)) {
    return(invisible(NULL))
  }
  # flag each requested name that is present in the genome
  known <- chr.names %in% seqnames(bs.gen)
  if (!all(known)) {
    stop(paste("Chromosome name:",
               toString(chr.names[!known]),
               "not found in genome chromosome name set.",
               "Choose only chromosome listed next:",
               toString(seqnames(bs.gen)),
               "from", toString(bs.gen@pkgname)))
  }
}
################################################################################
## get genome form BSgenome ##
################################################################################
get.genome <- function(bs.genome.name="BSgenome.Celegans.UCSC.ce10",
                       chromosome.names=NULL) {
  # Get a genome from BSgenome: load the package (installing/downloading it
  # via the loader when needed) and return the genome object.
  #
  # Args:
  #   bs.genome.name: (string) the name of the package (species)
  #   chromosome.names: (vector of strings) chromosome names; NULL = all
  #
  # Return:
  #   the BSgenome object of the species if available
  #
  # TODO: (DR) forge genome from fasta ... home made genome library.
  # When the package is not installed locally, make sure it is at least
  # available from BSgenome before trying to load it.
  # (refactor: the original duplicated the load/get/validate/return sequence
  # in both the installed and the downloadable branch)
  if (!(bs.genome.name %in% rownames(installed.packages()))) {
    genome.list <- available.genomes()
    if (!(bs.genome.name %in% genome.list)) {
      stop(paste("Your genome", bs.genome.name, "is not supported"))
    }
  }
  # load the bsgenome package
  load.pakage.main("", bs.genome.name)
  # load the genome
  bs.genome <- getBSgenome(bs.genome.name, masked = FALSE)
  # test if chromosome names match given values if not set NULL(NULL=take all)
  test.if.chrom.exist(chromosome.names, bs.genome)
  return(bs.genome)
}
################################################################################
## read fastq.gz files ##
################################################################################
read.fastq <- function(file.path, fasta.for="fastq", seq.type="DNA") {
  # Read sequencing reads from a fastq (with quality strings) or fasta file.
  #
  # Args:
  #   file.path: (string) the path and filename of the read file
  #   fasta.for: (string) the format (fasta, fastq)
  #   seq.type: (string) the type of the data, DNA, RNA, aa, b for anything
  #             (not used by either reader call below)
  #
  # Return:
  #   XString-based object with the reads (plus ids and qualities for fastq)
  if (fasta.for == "fastq") {
    # fastq: keep read ids together with the quality information
    return(readFastq(file.path, withIds = TRUE))
  }
  # every other format is delegated to the DNAStringSet reader
  return(readDNAStringSet(filepath = file.path, format = fasta.for))
}
################################################################################
## make QC report ##
################################################################################
make.qc <- function(file.list, time.stamp, exp.name, rep.dir) {
  # Run quality control (fastqc-style, via QuasR's qQCReport) on all raw
  # read files and save the pdf report.
  #
  # Args:
  #   file.list: (data.frame) raw file list with 1=path, 2=name,
  #              and optionally 3=group; only column 1 is used here
  #   time.stamp: (Sys.time) timestamp for the name of the saved file
  #   exp.name: (string) the name of the experiment
  #   rep.dir: (string) name of the directory to put the reports in
  #
  # Return:
  #   none (save report to file)
  # coerce the path column to plain character paths
  input.paths <- sapply(file.list[,1], as.character)
  # assemble <experiment>-<timestamp>.pdf inside the report directory
  report.file <- file.path(rep.dir,
                           paste(exp.name, "-", time.stamp, ".pdf", sep = ""))
  # qQCReport writes the pdf itself (can also be used with multicore)
  ret.report <- qQCReport(input = input.paths, pdfFilename = report.file)
}
################################################################################
## cutadapt homemade script ##
################################################################################
make.cutadapt.fastq <- function(read.list, adapter="CGCGGCCGAG", error.a=1,
out.name="test", compr=T, res.site="GATC",
dir.name.ca) {
# Cut the adapter from the 5' end of each fastq read when the restriction
# site (e.g. GATC) directly follows the adapter. Handles both full adapter
# matches inside the read and adapters overlapping the 5' flank, keeps the
# matching quality strings in sync, and writes the trimmed reads plus two
# report files into dir.name.ca.
#
# Args:
# read.list: (ShortReadQ) all the reads from 1 sequencing sample
# adapter: (string) the adapter sequence
# error.a: (integer) the number of mismatches allowed in the adapter match
# out.name: (string) the base name for the saved output files
# compr: (bool) save the file T=compressed (gzip), F=uncompressed
# res.site: (string) the motif of the restriction site
# dir.name.ca: (string) the name of the folder for cut read files
#
# Return:
# vector of 2: the exported fastq file name and the number of reads in
# which an adapter was found (the trimmed reads themselves are written
# to disk, not returned)
# get timestamp to calculate the run time
start.time <- Sys.time()
# get number of raw reads
raw.read.length <- length(read.list)
# concatenate adapter + restriction site: this combined motif marks reads
# whose insert begins directly at a restriction site
adapter.p <- paste(adapter, res.site, sep = "")
# get quality strings
quality.list <- quality(read.list)@quality
# get sequence and also overwrite the read.list (RAM friendly)
read.list <- sread(read.list)
# test if all sequences have the same length and give warning if not
if (mean(width(read.list)) != width(read.list[1])) {
warning("Your read list contains multiple read lengths!")
}
# get length of every read
seq.length <- width(read.list)
# find adapter + GATC position using vmatchPattern (no flanking regions)
adapter.position <- vmatchPattern(adapter.p, read.list,
max.mismatch = error.a)
# get the end position of the matched adapter
adapter.end.pos <- endIndex(adapter.position)
# remove adapter.position (RAM friendly)
rm(adapter.position)
# change NULL to 0 as atomic vectors cannot have NULL
# (0 later leads to keeping the full-length read when no adapter is found)
adapter.end.pos[unlist(lapply(adapter.end.pos, is.null))] <- as.integer(0)
# vmatchPattern can return more than one match when the motif occurs
# several times in a DNA string; keep only the first match of each read
# (could also exclude reads with multiple matches)
adapter.end.pos.first <- lapply( adapter.end.pos, '[[', 1)
# remove adapter.end.pos (RAM friendly)
rm(adapter.end.pos)
# make an atomic integer vector from the list (faster downstream)
adapter.vect <- as.vector(adapter.end.pos.first, mode = "integer")
# remove adapter.end.pos.first (RAM friendly)
rm(adapter.end.pos.first)
# step back by the restriction-site length so the kept read starts with it
adapter.vect.r <- adapter.vect - nchar(res.site)
# set all adapter end positions to 0 if they are negative
# (if no adapter was found 0 + -4 = -4 therefore set it back to 0)
# 5' overlapping and the sequence starts with the restriction site(no adapter)
adapter.vect.r[which(adapter.vect.r < 0)] <- 0
# clamp 3' (right) overlaps to the read length: such reads keep no
# sequence starting with a restriction site, so they stay effectively
# uncut (only the adapter overlapped the 3' end)
adapter.vect.r[which(adapter.vect.r > seq.length)] <- seq.length
# use the adapter end index - restriction site to truncate the read sequences
# +1 because the positions run from 0..length here while subseq starts at 1
cut.read.list <- subseq(read.list, start = adapter.vect.r + 1) # for reads
cut.qual.list <- subseq(quality.list, start = adapter.vect.r + 1) # quality
# remove adapter.vect.r, read.list, and quality list (RAM friendly)
rm(adapter.vect.r, read.list, quality.list)
# keep only reads that actually got shortened (adapter found), with quality
cut.read.list.red <- cut.read.list[which(width(cut.read.list) != seq.length)]
cut.qual.list.red <- cut.qual.list[which(width(cut.read.list) != seq.length)]
# find adapter in 5' flanking region (overlapping)
# get uncut sequences and also get quality list for uncut reads
uncut.read.list <- cut.read.list[which(width(cut.read.list) == seq.length)]
uncut.qual.list <- cut.qual.list[which(width(cut.read.list) == seq.length)]
# make new seq.length list with only uncut reads
seq.le.u.c <- seq.length[which(width(cut.read.list) == seq.length)]
# use trimLRPatterns to map the adapter in the 5' flanking region
temp.trim <- trimLRPatterns(Lpattern = adapter.p, Rpattern = "",
uncut.read.list, ranges = T)
# step back by the restriction-site length so the read keeps its site
start(temp.trim) <- start(temp.trim) - nchar(res.site)
# remove all entries with negative starts and also remove adapter matching
# only the first 3 or 4 bases of the read.
start(temp.trim)[which(start(temp.trim) < 4)] <- 1
# 4 as only take sequences with match 3 bases on the adapter
# (increasing of this number truncates the max length of
# reads 4 = max read length 97bp 5=96bp)
# remove adapter sequence from sequences and also the quality data
flanking.adapt <- subseq(uncut.read.list, start = start(temp.trim))
flanking.quali <- subseq(uncut.qual.list, start = start(temp.trim))
# keep only sequences that were truncated by the flanking trim
flanking.adapt.red <- flanking.adapt[which(width(flanking.adapt)
!= seq.le.u.c)]
flanking.quali.red <- flanking.quali[which(width(flanking.adapt)
!= seq.le.u.c)]
# bundle trimmed reads and qualities; save compressed if compr==T
save.shre <- ShortReadQ(sread = c(cut.read.list.red, flanking.adapt.red),
quality = c(cut.qual.list.red, flanking.quali.red))
if (compr) {
export.name <- paste(out.name, "cut.fastq.gz", sep = "-")
export.path <- file.path(getwd(), dir.name.ca, export.name, fsep = "/")
# save compressed
writeFastq(save.shre, export.path, mode = 'w', compress = compr)
} else {
export.name <- paste(out.name, "cut.fastq", sep = "-")
export.path <- file.path(getwd(), dir.name.ca, export.name, fsep = "/")
# save uncompressed
writeFastq(save.shre, export.path, mode = 'w', compress = compr)
}
## prepare report
# get lengths of remaining sequences
seq.len.c <- table(width(cut.read.list.red)) # from full adapter alignment
seq.len.f <- table(width(flanking.adapt.red)) # from partial adapter alignment
# concatenate both tables.
seq.len <- c(seq.len.c, seq.len.f)
# calculate number of reads with and without adapter
no.adapter <- raw.read.length - (length(cut.read.list.red) +
length(flanking.adapt.red))
wi.adapter <- length(cut.read.list.red) + length(flanking.adapt.red)
# calculate percentage
adapt.presentage <- wi.adapter * 100 / raw.read.length
## save report file as .txt table.
# make data.frame for easy report saving
# NOTE(review): result.report is assembled but never written; the seq.len
# table is written to the file directly below instead
result.report <- as.data.frame(seq.len)
colnames(result.report) <- "count"
# make file name and set path
export.name.txt <- paste(out.name, "cut-read-lengths.txt", sep = "-")
export.path.txt <- file.path(getwd(),
dir.name.ca, export.name.txt, fsep = "/")
# write header
write.table(paste("length", "count", sep = "\t"),
file = export.path.txt, eol = "\r\n", col.names = F,
row.names = F, quote = F)
# add length table data to the file
write.table(seq.len, file = export.path.txt, sep = "\t",
eol = "\r\n", quote = F, col.names = F, append = T)
# get timestamp to calculate run time
end.time <- Sys.time()
duration <- difftime(end.time, start.time, units = "auto")
# write some stats (concatenate text and values for printing in file)
total.r <- paste("Total number of reads:", raw.read.length, sep = "\t")
total.r.l <- paste("length of reads in bp:", toString(unique(seq.length))
, sep = "\t")
total.r.a <- paste("Reads with adapter and GATC sequence:", wi.adapter,
"(%)", round(adapt.presentage, digits = 4), sep = "\t")
total.r.n <- paste("Remaining reads which didn't align to adapter and GATC",
no.adapter, sep = "\t")
total.rut <- paste("Time used for file import and adapter removal:",
format(duration), sep = "\t")
# concatenate report to single string
some.stats <- paste(total.r, total.r.l, total.r.a, total.r.n, total.rut,
sep = "\r\n")
# set name, path and write report (number and % of cut reads) to file
export.name.txt2 <- paste(out.name, "cut-report.txt", sep = "-")
export.path.txt2 <- file.path(getwd(), dir.name.ca, export.name.txt2,
fsep = "/")
write(some.stats, file = export.path.txt2)
# return the exported file name and the number of reads with adapter
return(c(export.name, wi.adapter))
}
make.cutadapt.fasta <- function(read.list, adapter="CGCGGCCGAG", error.a=1,
out.name="test", compr=T, res.site="GATC",
dir.name.ca) {
# Cut the adapter from the 5' end of each fasta read when the restriction
# site (e.g. GATC) directly follows the adapter, including adapters that
# overlap the 5' flank. Fasta counterpart of make.cutadapt.fastq (no
# quality strings). Writes the trimmed reads plus two report files into
# dir.name.ca.
#
# Args:
# read.list: (XStringSet) all the reads from 1 sequencing sample
# adapter: (string) the adapter sequence
# error.a: (integer) the number of mismatches allowed in the adapter match
# out.name: (string) the base name for the saved output files
# compr: (bool) T for compressed (gzip), F for uncompressed
# res.site: (string) the string of the restriction site
# dir.name.ca: (string) the name of the folder for cut read files
#
# Return:
# vector of 2: the exported fasta file name and the number of reads in
# which an adapter was found (the trimmed reads themselves are written
# to disk, not returned)
# get timestamp to calculate the run time
start.time <- Sys.time()
# concatenate adapter + restriction site: this combined motif marks reads
# whose insert begins directly at a restriction site
adapter.p <- paste(adapter, res.site, sep = "")
# test if all sequences have the same length and give warning if not
if (mean(width(read.list)) != width(read.list[1])) {
warning("Your read list contains multiple read lengths!")
}
# get length of the sequences
seq.length <- width(read.list)
# find adapter position using vmatchPattern (no flanking regions)
adapter.position <- vmatchPattern(adapter.p, read.list,
max.mismatch = error.a)
# get the end position of the matched adapter
adapter.end.pos <- endIndex(adapter.position)
# remove adapter.position (RAM friendly)
rm(adapter.position)
# change NULL to 0 as atomic vectors cannot have NULL
# (0 later leads to keeping the full-length read when no adapter is found)
adapter.end.pos[unlist(lapply(adapter.end.pos, is.null))] <- as.integer(0)
# vmatchPattern can return more than one match when the motif occurs
# several times in a DNA string; keep only the first match of each read
adapter.end.pos.first <- lapply( adapter.end.pos, '[[', 1)
# make an atomic integer vector from the list (faster downstream)
adapter.vect <- as.vector(adapter.end.pos.first, mode = "integer")
# step back by the restriction-site length so the kept read starts with it
adapter.vect.r <- adapter.vect - nchar(res.site)
# set all adapter end positions to 0 if they are negative
# (if no adapter was found 0 + -4 = -4 therefore set it back to 0)
# 5' overlapping and the sequence starts with the restriction site(no adapter)
adapter.vect.r[which(adapter.vect.r < 0)] <- 0
# clamp 3' overlaps to the read length: such reads keep no sequence
# starting with a restriction site, so they stay effectively uncut
adapter.vect.r[which(adapter.vect.r > seq.length)] <- seq.length
# use the adapter end index - restriction site to truncate the read sequences
# +1 because the positions run from 0..length here while subseq starts at 1
cut.read.list <- subseq(read.list, start = adapter.vect.r + 1)
# keep only reads that actually got shortened (adapter found)
cut.read.list.red <- cut.read.list[which(width(cut.read.list) != seq.length)]
# find adapter in 5' flanking region (overlapping)
# get uncut sequences
uncut.read.list <- cut.read.list[which(width(cut.read.list) == seq.length)]
# make new seq.length list with only uncut reads
seq.le.u.c <- seq.length[which(width(cut.read.list) == seq.length)]
# use trimLRPatterns to map the adapter to the 5' flanking region
temp.trim <- trimLRPatterns(Lpattern = adapter.p, Rpattern = "",
uncut.read.list, ranges = T)
# step back by the restriction-site length so the read keeps its site
start(temp.trim) <- start(temp.trim) - nchar(res.site)
# remove all entries with negative starts and also remove adapter matching
# only the first 3 or 4 bases of the read.
start(temp.trim)[which(start(temp.trim) < 4)] <- 1
# 4 as only take sequences with match 3 bases on the adapter
# (increasing of this number truncates the max length of
# reads 4 = max read length 97bp 5=96bp)
# remove adapter sequence from sequences
flanking.adapt <- subseq(uncut.read.list, start = start(temp.trim))
# keep only sequences that were truncated by the flanking trim
flanking.adapt.red <- flanking.adapt[which(width(flanking.adapt)
!= seq.le.u.c)]
# save fasta file compressed compr==T or uncompressed
if (compr) {
export.name <- paste(out.name, "cut.fasta.gz", sep = "-")
export.path <- file.path(getwd(), dir.name.ca, export.name, fsep = "/")
# save cut reads to compressed file
writeXStringSet(c(cut.read.list.red, flanking.adapt.red), export.path,
format = "fasta", compress = T)
} else {# uncompressed
export.name <- paste(out.name, "cut.fasta", sep = "-")
export.path <- file.path(getwd(), dir.name.ca, export.name, fsep = "/")
# save cut reads to uncompressed file
writeXStringSet(c(cut.read.list.red, flanking.adapt.red), export.path,
format = "fasta")
}
## prepare report
# get lengths of remaining sequence
seq.len.c <- table(width(cut.read.list.red)) # from full adapter alignment
seq.len.f <- table(width(flanking.adapt.red)) # from partial adapter alignment
# concatenate both tables.
seq.len <- c(seq.len.c, seq.len.f)
# calculate number of reads with and without adapter
no.adapter <- length(read.list) - (length(cut.read.list.red) +
length(flanking.adapt.red))
wi.adapter <- length(cut.read.list.red) + length(flanking.adapt.red)
# calculate percentage
adapt.presentage <- wi.adapter * 100 / length(read.list)
## save report file as .txt table.
# make data.frame for easy report saving
# NOTE(review): result.report is assembled but never written; the seq.len
# table is written to the file directly below instead
result.report <- as.data.frame(seq.len)
colnames(result.report) <- "count"
# make file name and set path
export.name.txt <- paste(out.name, "cut-read-lengths.txt", sep = "-")
export.path.txt <- file.path(getwd(), dir.name.ca, export.name.txt,
fsep = "/")
# write header
write.table(paste("length", "count", sep = "\t"),
file = export.path.txt, eol = "\r\n", col.names = F,
row.names = F, quote = F)
# add cut read length table data to the file
write.table(seq.len, file = export.path.txt, sep = "\t",
eol = "\r\n", quote = F, col.names = F, append = T)
# get timestamp for calculating runtime
end.time <- Sys.time()
duration <- difftime(end.time, start.time, units = "auto")
# write some stats (concatenate text and values for printing in file)
total.r <- paste("Total number of reads:", length(read.list), sep = "\t")
total.r.l <- paste("length of reads in bp:", toString(unique(seq.length))
, sep = "\t")
total.r.a <- paste("Reads with adapter and GATC sequence:", wi.adapter,
"(%)", round(adapt.presentage, digits = 4), sep = "\t")
total.r.n <- paste("Remaining reads which didn't align to adapter and GATC",
no.adapter, sep = "\t")
total.rut <- paste("Time used for file import and adapter removal:",
format(duration), sep = "\t")
# concatenate stats to single string
some.stats <- paste(total.r, total.r.l, total.r.a, total.r.n, total.rut,
sep = "\r\n")
# set name, path and write report to file
export.name.txt2 <- paste(out.name, "cut-report.txt", sep = "-")
export.path.txt2 <- file.path(getwd(), dir.name.ca, export.name.txt2,
fsep = "/")
write(some.stats, file = export.path.txt2)
# return the exported file name and the number of reads with adapter
return(c(export.name, wi.adapter))
}
################################################################################
## bowtie ##
################################################################################
make.bowtie <- function(fastq.files, multi.core=F, genome.name, pr.name,
                        dir.name.bt, m.hits=1) {
  # Map all cut sequences to the genome using bowtie (QuasR pipeline).
  #
  # Args:
  #   fastq.files: (string) path to the .txt sample file listing all the cut
  #                fastq files and a name per file (tab delimited)
  #   multi.core: (Boolean) TRUE enables multicore mapping
  #   genome.name: (string) the name of the BSgenome package
  #   pr.name: (string) the name of the project
  #   dir.name.bt: (string) the folder to store the .sam/.bam reads in
  #   m.hits: (int) the number of maximal hits allowed
  #
  # Return:
  #   save location of the mapped .sam/.bam file
  if (multi.core) {
    # one worker per available core
    num.of.cores <- detectCores()
    clusts <- makeCluster(num.of.cores)
    # bug fix: shut the cluster down even when qAlign fails; the original
    # called stopCluster() only on the success path, leaking workers
    on.exit(stopCluster(clusts), add = TRUE)
    # the bowtie mapping (multicore)
    align.proj <- qAlign(fastq.files, genome = genome.name,
                         aligner = "Rbowtie", maxHits = m.hits,
                         alignmentsDir = file.path(getwd(), dir.name.bt),
                         clObj = clusts, projectName = pr.name)
  } else {
    # the bowtie mapping without multicore
    align.proj <- qAlign(fastq.files, genome = genome.name,
                         aligner = "Rbowtie", maxHits = m.hits,
                         alignmentsDir = file.path(getwd(), dir.name.bt),
                         projectName = pr.name)
  }
  message(paste(format(Sys.time(), "%H:%M:%S:"), "All sequences mapped to",
                genome.name ))
  message(paste(format(Sys.time(), "%H:%M:%S:"),
                ".bam files are saved in 'bowtie' subfolder"))
  # return position of alignments (mapped .sam files)
  ret.d.f <- alignments(align.proj)
  return(ret.d.f[1])
}
################################################################################
## test input ##
################################################################################
test.input.data <- function(input.file, inputs) {
  # Validate all user-supplied pipeline inputs before any processing starts.
  #
  # Args:
  #   input.file: (data.frame) the path, names, and (optional) group of the
  #               read files; 2 columns (path, name) or 3 columns
  #               (path, name, group with 's' = sample / 'c' = control)
  #   inputs: (list) the pipeline options to check, in this fixed order:
  #           multi.core, exp.name, bin.len, adapt.seq, errors, input.format,
  #           qc, species, chr.names, restr.seq, normali, log10t
  #
  # Return:
  #   list(sample.ids, controll.ids) with the row indexes of samples and
  #   controls when a group column is present, otherwise NULL
  #
  # Raises (stop):
  #   an error describing the first invalid input that is found
  ## test if inputs are valid
  # test multi.core
  if (!is.logical(inputs[[1]])) {
    stop(paste("multi.core:", toString(inputs[[1]]),
               "must be Boolean (logical, TRUE or FALSE)"))
  }
  # test exp.name
  if (!is.character(inputs[[2]])) {
    stop(paste("exp.name:", toString(inputs[[2]]),
               "must be string: ('string', nostring)"))
  }
  # test bin.len
  if (!is.numeric(inputs[[3]])) {
    stop(paste("bin.len:", toString(inputs[[3]]),
               "must be int: (n,...,-3,-2,-1,0,1,2,3,..,n)"))
  }
  # test adapt.seq
  if (!is.character(inputs[[4]])) {
    stop(paste("adapt.seq:", toString(inputs[[4]]),
               "must be string: ('string', nostring)"))
  }
  # test errors
  if (!is.numeric(inputs[[5]])) {
    stop(paste("errors:", toString(inputs[[5]]),
               "must be int: (n,...,-3,-2,-1,0,1,2,3,..,n)"))
  }
  # test input.format: allowed are 'fastq', 'fasta' or anything starting
  # with 'b' (bam) or 'g' (grange)
  format.first.char <- substring(inputs[[6]], 1, 1)
  if (inputs[[6]] != "fasta" && inputs[[6]] != "fastq" &&
      format.first.char != "b" && format.first.char != "g") {
    stop(paste("input.format:", toString(inputs[[6]]),
               "must be 'fastq', 'fasta', 'b' or 'g' of type",
               "character string"))
  }
  # test qc
  if (!is.logical(inputs[[7]])) {
    stop(paste("qc:", toString(inputs[[7]]),
               "must be Boolean (logical, TRUE or FALSE)"))
  }
  # test species
  if (!is.character(inputs[[8]])) {
    stop(paste("species:", toString(inputs[[8]]),
               "must be string (character): ('string', nostring)"))
  }
  # test chr.names (NULL means: use all chromosomes)
  if (!is.null(inputs[[9]]) && !is.character(inputs[[9]])) {
    stop(paste("chr.names:", toString(inputs[[9]]),
               "must be vector of character: c('string1', 'string2')",
               "or NULL"))
  }
  # test restr.seq
  if (!is.character(inputs[[10]])) {
    stop(paste("restr.seq:", toString(inputs[[10]]),
               "must be string (character): ('string', nostring)"))
  }
  # test normali
  if (!is.logical(inputs[[11]])) {
    stop(paste("normali:", toString(inputs[[11]]),
               "must be Boolean (logical, TRUE or FALSE)"))
  }
  # test log10t
  if (!is.logical(inputs[[12]])) {
    stop(paste("log10t:", toString(inputs[[12]]),
               "must be Boolean (logical, TRUE or FALSE)"))
  }
  # test if raw.files have 3 or 2 columns (3rd column = sample/control group)
  if (ncol(input.file) == 2) {
    with.contr <- FALSE
  } else if (ncol(input.file) >= 3) {
    with.contr <- TRUE
  } else {
    stop("input list must have 3 or 2 columns: path to fastq file, name or",
         "id and group 'c' for control or 's' for sample if wished")
  }
  # test if a sample name begins with a number (not allowed, names are used
  # as R identifiers/column names later on)
  for (i in seq_len(nrow(input.file))) {
    first.letter <- substring(input.file[i, 2], first = 1, last = 1)
    if (!is.na(suppressWarnings(as.numeric(first.letter)))) {
      stop(paste("Name cannot start with a number! change", input.file[i, 2],
                 "first character to alphabetic: e.g. t.1234"))
    }
  }
  # collect row indexes of 's' samples and 'c' controls and test that the
  # two groups are equally sized
  if (with.contr) {
    sample.ids <- NULL
    controll.ids <- NULL
    for (i in seq_len(nrow(input.file))) {
      group.tag <- input.file[i, 3]
      if (group.tag == "s") {
        sample.ids <- c(sample.ids, i)
      } else if (group.tag == "c") {
        controll.ids <- c(controll.ids, i)
      } else {# only 's' or 'c' are valid group tags
        stop(paste("group column '", group.tag,
                   "' must be 's' for sample and 'c' for control", sep = ""))
      }
    }
    if (length(sample.ids) != length(controll.ids)) {
      stop(paste("number of samples 's'", length(sample.ids),
                 "must be the same as number of controls 'c'",
                 length(controll.ids), "!"))
    }
  }
  # test if all columns have the same number of rows (guards against a
  # malformed input .txt with missing fields)
  col1 <- length(input.file[, 1])
  col2 <- length(input.file[, 2])
  if (col1 != col2) {
    stop(paste("Your input .txt file has missing values: file paths, names:",
               col1, col2))
  }
  if (with.contr) {
    col3 <- length(input.file[, 3])
    if (col1 != col3) {
      stop(paste("Your input .txt file has missing values: groups:",
                 "number of paths", col1, "versus number of group", col3))
    }
  }
  # return index positions of samples and controls (NULL if no group column)
  if (with.contr) {
    return(list(sample.ids, controll.ids))
  }
  return(NULL)
}
################################################################################
## make empty bins ##
################################################################################
mapp.to.bins <- function(bin.len, genom.bs, chromo.names) {
  # Build an empty genomic-bin GRange of the requested bin size.
  #
  # Args:
  #   bin.len: (int) width of each bin in base pairs
  #   genom.bs: (BSGenome) genome object the bins are derived from
  #   chromo.names: (vector of string) chromosome names, or NULL for all
  #
  # Return:
  #   grange partitioned into bins of the chosen size
  message(paste(format(Sys.time(), "%H:%M:%S:"), "Make", bin.len,
                "bp bins for chromosome",
                toString(chromo.names)))
  # delegate the actual bin construction to the generic helper (quiet mode)
  empty.bins <- make.empty.genomicRange.bin3(animal = genom.bs,
                                             chrom.names = chromo.names,
                                             bin.size = bin.len,
                                             output.mesage = F)
  return(empty.bins)
}
################################################################################
## export bed files ##
################################################################################
export.bed.file <- function(gran.data, exp.name, s.c.id, norm.width=10,
                            frag = F, bed.index="test", write.out = T) {
  # Export a bedGraph (.bed) track of the log2 fold change for import into
  # the UCSC genome browser (https://genome.ucsc.edu/).
  #
  # Args:
  #   gran.data: (grange) the Grange object containing the reads information
  #   exp.name: (string) base name of the output file (<exp.name>.bed)
  #   s.c.id: (vector of 2 int) index of sample and control
  #   norm.width: (integer) size of the (currently disabled) smoothing window
  #   frag: (bool) T if a GATC fragment GRange is given, F for a GATC site
  #   bed.index: (string) track name written into the bed header
  #   write.out: (bool) if T the .bed file is written to disk
  #
  # Return:
  #   the grange carrying the log2 fold change as metadata
  if (frag) {
    # fragment granges carry a forward and a reverse count column per sample;
    # collapse every such column pair into a single per-fragment count
    read.mat <- as.matrix(elementMetadata(gran.data))
    read.mat[is.na(read.mat)] <- 0  # treat missing counts as zero
    fwd.cols <- seq(1, length(read.mat[1, ]), 2)
    pair.sums <- matrix(nrow = length(read.mat[, 1]),
                        ncol = length(fwd.cols))
    colnames(pair.sums) <- colnames(read.mat)[fwd.cols]
    for (k in seq_along(fwd.cols)) {
      # forward column plus its neighbouring reverse column
      pair.sums[, k] <- rowSums(read.mat[, c(fwd.cols[k], fwd.cols[k] + 1)])
    }
    elementMetadata(gran.data) <- pair.sums
  }
  # compute the log2 fold change (sample vs control) and attach it as the
  # only metadata column; chrI is not allowed so it must be chr1 instead
  lfc.matrix <- make.log.fold.change(gran.data, s.c.id, norm.tr = F)
  elementMetadata(gran.data) <- lfc.matrix
  # (a rolling-mean smoothing over norm.width sites used to run here; it is
  # currently disabled, which is why norm.width is otherwise unused)
  # name the metadata column 'score' as required by the bedGraph format
  names(elementMetadata(gran.data)) <- "score"
  bed.name <- paste(exp.name, ".bed", sep = "")
  if (write.out) {
    # hand-written bedGraph: one header line, then chr/start/stop/score rows
    track.info <- paste('track type=bedGraph name="', bed.index,
                        '" description="BedGraph format"',
                        ' visibility=full color=200,100,0 altColor=0,100,200',
                        ' priority=20', sep = "")
    write.table(track.info, file = bed.name, row.names = F, col.names = F,
                quote = F, eol = "\n")
    bed.data.frame <- data.frame("chr" = seqnames(gran.data),
                                 "start" = start(gran.data),
                                 "stop" = end(gran.data),
                                 "score" = elementMetadata(gran.data))
    write.table(bed.data.frame, file = bed.name, row.names = F, col.names = F,
                quote = F, eol = "\n", append = T)
  }
  return(gran.data)
}
################################################################################
## combining multiple function to one ##
################################################################################
write.bed.file <- function(gran.data, exp.name, bed.index="test") {
  # Save a grange with one metadata column as a UCSC bedGraph (.bed) file.
  #
  # Args:
  #   gran.data: (grange) grange with exactly 1 metadata column
  #   exp.name: (string) base name of the output file (<exp.name>.bed)
  #   bed.index: (string) track name written into the bed header
  #
  # Return:
  #   none; writes a .bed file as a side effect
  # the metadata column must be called 'score' for the bedGraph format
  names(elementMetadata(gran.data)) <- "score"
  out.file <- paste(exp.name, ".bed", sep = "")
  # header line describing the track for the genome browser
  header.line <- paste('track type=bedGraph name="', 'bin-', bed.index,
                       '" description="BedGraph format"',
                       ' visibility=full color=200,100,0 altColor=0,100,200',
                       ' priority=20', sep = "")
  write.table(header.line, file = out.file, row.names = F, col.names = F,
              quote = F, eol = "\n")
  # one row per range: chromosome, start, stop, score
  out.rows <- data.frame("chr" = seqnames(gran.data),
                         "start" = start(gran.data),
                         "stop" = end(gran.data),
                         "score" = elementMetadata(gran.data))
  write.table(out.rows, file = out.file, row.names = F, col.names = F,
              quote = F, eol = "\n", append = T)
}
################################################################################
## combining multiple function to one ##
################################################################################
from.fastq.to.grange <- function(raw.files, multi.core, exp.name, adapt.seq,
                                 errors, raw.file.name, timestampm, rep.dir,
                                 genome.name, chromosom.nams,
                                 restrict.site="GATC",
                                 dir.name.ca, dir.name.bt, dir.name.gr,
                                 res.site.grages, m.hits, bam.o.fastq) {
  # Run the cutadapt and bowtie steps (or import pre-mapped .bam files) and
  # build GRanges counting reads per restriction site (GATC).
  #
  # Args:
  #   raw.files: (data.f) data.frame with 3 or 2 cols:
  #              col 1: path to the fasta/q sequencing read files
  #              col 2: name of the sample
  #              col 3: (optional), 's', 'c' for control or sample
  #   multi.core: (bool) if T bowtie uses multiple cores
  #   exp.name: (string) name of the experiment
  #   adapt.seq: (string) adapter sequence
  #   errors: (integer) number of errors allowed in the adapter removal step
  #   raw.file.name: (string) the name/path to the read index file (raw.files)
  #   timestampm: (Sys.time) the current start timestamp (used in file names)
  #   rep.dir: (string) name of the directory to put in the QS reports
  #   genome.name: (string) the name of the genome (BSgenome)
  #   chromosom.nams: (vector or strings) the names of the chromosomes used
  #   restrict.site: (string) the restriction site sequence (mostly GATC :-)
  #   dir.name.ca: (string) the name of the folder for cut read files
  #   dir.name.bt: (string) the name of the folder for .bam/sam files
  #   dir.name.gr: (string) the name of the folder for GRange Rdata files
  #   res.site.grages: (list of grange) all GATC granges (all,plus,minus,frag)
  #   m.hits: (int) the number of maximal hits in bowtie mapping
  #   bam.o.fastq: (string): 'fastq/a' = fastq/a, 'b' = bam input file
  #
  # Return:
  #   list of 4 granges with read-count metadata: all sites, plus strand,
  #   minus strand, fragments
  # Side effects:
  #   saves cut fastq files, mapped .bam files, per-sample and combined
  #   GRange objects and a qc summary .txt table
  # test if input file format is fastq/a or bam:
  # 'f...' = raw reads (cut adapter + map), 'b...' = already mapped
  f.o.b <- substring(bam.o.fastq, 1, 1)
  if (f.o.b == "f") {
    ##############
    ## read fastq and cutadapt step
    # make empty vector to store the names and location of the cut fasta/q files
    cut.file.name <- rep(NA, length(raw.files[,1]))
    # make empty vectors to store how many reads are in raw files and cut files
    raw.file.reads <- rep(NA, length(raw.files[,1]))
    cut.file.reads <- rep(NA, length(raw.files[,1]))
    # read all fasta/q files (in raw.files) and cut adapter
    for (i in 1:length(raw.files[,1])) {
      message(paste(format(Sys.time(), "%H:%M:%S:"), "Reading file:", i, "of",
                    length(raw.files[,1]),"file:", toString(raw.files[i,2])))
      # read raw file (fasta or fastq depending on bam.o.fastq)
      temp.raw <- read.fastq(toString(raw.files[i,1]), fasta.for = bam.o.fastq)
      # store and print number of reads in raw file
      raw.file.reads[i] <- length(temp.raw)
      message(paste(format(Sys.time(), "%H:%M:%S:"), "Cut adapter for",
                    toString(raw.files[i,2]), bam.o.fastq,
                    "; Total reads:", raw.file.reads[i]))
      # remove adapter, dispatching on fastq (with qualities) vs fasta
      if (bam.o.fastq == "fastq") {# for fastq
        cut.file.ret <- make.cutadapt.fastq(temp.raw,
                                            out.name = toString(raw.files[i,2]),
                                            adapter = adapt.seq,
                                            error.a = errors,
                                            res.site = restrict.site,
                                            dir.name.ca = dir.name.ca)
      } else {# for fasta
        cut.file.ret <- make.cutadapt.fasta(temp.raw,
                                            out.name = toString(raw.files[i,2]),
                                            adapter = adapt.seq,
                                            error.a = errors,
                                            res.site = restrict.site,
                                            dir.name.ca = dir.name.ca)
      }
      # store file name and number of reads kept in the cut read file
      cut.file.name[i] <- cut.file.ret[1]
      cut.file.reads[i] <- as.integer(cut.file.ret[2])
      # print number of cut reads and the fraction kept
      message(paste(format(Sys.time(), "%H:%M:%S:"), cut.file.reads[i],
                    "out of",
                    raw.file.reads[i], "reads had adapter and start with",
                    restrict.site,
                    round(cut.file.reads[i] * 100 / raw.file.reads[i],
                          digits = 5), "%"))
    }
    # make tab separated list with the cut read files for bowtie and save it;
    # FileName/SampleName columns: presumably the sample file layout QuasR's
    # qAlign expects — confirm against the QuasR documentation
    bow.path <- file.path(getwd(), dir.name.ca, cut.file.name)
    bowtie.list <- data.frame("FileName" = bow.path, "SampleName" = raw.files[,2])
    # make file name and path for the bowtie import table
    bowtie.name <- paste("bowtie-import", timestampm, basename(raw.file.name),
                         sep = "-")
    bowtie.path <- file.path(getwd(), dir.name.bt, bowtie.name)
    # write cut read index file for QuasR import
    write.table(bowtie.list,
                file = bowtie.path, eol = "\r\n", col.names = T,
                row.names = F, quote = F, sep = "\t")
    ##############
    #### bowtie step
    # run rbowtie
    alig.paths <- make.bowtie(bowtie.path, multi.core = multi.core,
                              genome.name = genome.name, pr.name = exp.name,
                              dir.name.bt = dir.name.bt, m.hits = m.hits)
    # get path to bam files
    alig.paths.d.f <- data.frame(alig.paths)
    # get only path to sam/bam file (first column)
    sam.directories <- alig.paths.d.f[,1]
  } else {# if the input files are already mapped .bam files
    sam.directories <- as.vector(raw.files[,1])
    alig.paths.d.f <- data.frame(raw.files[,1], raw.files[,2])
    # no raw-read counts exist for pre-mapped input; 1 is used as a
    # placeholder so the percentage column below becomes reads * 100 / 1
    raw.file.reads <- 1
    # same placeholder for the cut-read counts
    cut.file.reads <- 1
  }
  # copy GATC site granges to leave the originals in res.site.grages untouched
  gatc.all.m <- res.site.grages[[1]] # for all GATC sites
  gatc.plus.m <- res.site.grages[[2]] # for plus sites
  gatc.minus.m <- res.site.grages[[3]] # for minus sites
  gatc.frag.m <- res.site.grages[[4]] # for fragments
  # make empty vector to store number of reads in each .bam file
  mapped.reads.in.sam.file <- rep(NA, length(alig.paths.d.f[,1]))
  # make empty vector to store number of valid (GATC-matching) reads per file
  gatc.star.reads.in.sam.file <- rep(NA, length(alig.paths.d.f[,1]))
  message(paste(format(Sys.time(), "%H:%M:%S:"),
                "Start mapping to restriction sites", sep = " "))
  # loop over all .bam files and accumulate read counts into the GRanges
  for (i in 1:length(alig.paths.d.f[,1])) {
    # import .bam file as a GRange of aligned reads
    sam.grang <- read.sam.files.to.grange(sam.directories[i])
    # count number of reads in grange
    mapped.reads.in.sam.file[i] <- length(sam.grang)
    # save each sample's read grange separately as GRange
    save.name <- paste(alig.paths.d.f[i,2], exp.name, timestampm, sep = "-")
    save.GRange.object(sam.grang, file.path(getwd(), dir.name.gr, save.name))
    # map reads to GATC sites (returns per-strand count vectors)
    mapped.to.gatc <- find.restriction.sites.overlaps3(sam.grang,
                                                       res.site.grages,
                                                       strand = "all",
                                                       genom = genome.name,
                                                       chrom.nam = chromosom.nams,
                                                       restsite = restrict.site)
    # sum reads up to get the number of valid reads (GATC matches)
    gatc.star.reads.in.sam.file[i] <- sum(mapped.to.gatc[[1]]) # for all reads
    # add this sample's counts as a metadata column to all/plus/minus GRanges
    gatc.all.m <- add.metadata.to.grange(mapped.to.gatc[[1]],
                                         gatc.all.m, alig.paths.d.f[i,2])
    gatc.plus.m <- add.metadata.to.grange(mapped.to.gatc[[2]],
                                          gatc.plus.m, alig.paths.d.f[i,2])
    gatc.minus.m <- add.metadata.to.grange(mapped.to.gatc[[3]],
                                           gatc.minus.m, alig.paths.d.f[i,2])
  }
  # save combined grange object (reads per GATC site, all samples)
  save.name <- paste(exp.name, timestampm, sep = "-")
  message(paste(format(Sys.time(), "%H:%M:%S:"), "Save", restrict.site,
                "mapped GRange object as:", save.name))
  save.GRange.object(gatc.all.m, file.path(getwd(), dir.name.gr, save.name))
  # save also plus and minus strand as GRange
  save.name.p <- paste(exp.name,"plus", timestampm, sep = "-")
  save.GRange.object(gatc.plus.m, file.path(getwd(), dir.name.gr, save.name.p))
  save.name.m <- paste(exp.name,"minus", timestampm, sep = "-")
  save.GRange.object(gatc.minus.m, file.path(getwd(), dir.name.gr, save.name.m))
  # build and save the fragment GRange from the plus/minus strand counts
  frag.gran <- make.fragmet.counts(list(gatc.plus.m, gatc.minus.m),
                                   genom.data = genome.name,
                                   resr.site = restrict.site,
                                   gatc.fragm = gatc.frag.m )
  save.name.f <- paste(exp.name,"fragment", timestampm, sep = "-")
  save.GRange.object(frag.gran, file.path(getwd(), dir.name.gr, save.name.f))
  # calculate % of valid reads to export the qc .txt file
  # (for pre-mapped input raw.file.reads is the placeholder 1, so the
  # 'Percentage' column is not meaningful in that case)
  pres.usfull.reads <- gatc.star.reads.in.sam.file * 100 / raw.file.reads
  rep.d.f <- data.frame("Raw.reads" = raw.file.reads,
                        "Cut.reads" = cut.file.reads,
                        "Mapped.reads" = mapped.reads.in.sam.file,
                        "GATC.reads" = gatc.star.reads.in.sam.file,
                        "Percentage" = pres.usfull.reads)
  # give row names from sample names
  row.names(rep.d.f) <- raw.files[,2]
  # save qc txt file
  qc.file.name <- paste(exp.name, "-", timestampm, ".txt", sep = "")
  qc.full.path <- file.path(rep.dir, qc.file.name)
  write.table(rep.d.f, file = qc.full.path, eol = "\r\n" )
  # return GATC sites granges with all metadata reads
  return(list(gatc.all.m, gatc.plus.m, gatc.minus.m, frag.gran))
}
################################################################################
## plot log GATC reads of each sample against all other ##
################################################################################
plot.normalized.gatc.number.one.vs.another.log.main <- function(ful.gra,
                                                                bin.name="gatc",
                                                                timest, main.n,
                                                                dir.name.res) {
  # Plot the log10 read number per GATC site / bin for every pair of samples
  # (X vs. Y scatter), paginated into a single pdf.
  #
  # Args:
  #   ful.gra: (grange) grange with one metadata column per sample
  #   bin.name: (string) the name of the bin (either 'gatc' or bin length)
  #   timest: (timestamp) identifier used in the plot file name
  #   main.n: (string) the name prefix of the plot file
  #   dir.name.res (string) the name of the result folder
  #
  # Return:
  #   NULL if fewer than 2 samples; otherwise plots are saved in dir.name.res
  # nothing to compare if ful.gra contains only one metadata column
  if (length(elementMetadata(ful.gra)[1,]) <= 1) {
    return(NULL)
  }
  # get sample names and the metadata as a plain matrix
  met.nam <- names(elementMetadata(ful.gra))
  ful.met <- as.matrix(elementMetadata(ful.gra))
  # make filename
  f.name <- paste(main.n, "-one-by-one-comp-", bin.name, "-", timest,
                  ".pdf", sep = "")
  f.path <- file.path(dir.name.res, f.name)
  # open the pdf device (A4 portrait in inches)
  pdf(file = f.path, width = 8.267, height = 11.692)
  # overall page parameters
  # NOTE(review): font.size.main is set but never used in this function
  font.size.main <- 9
  border.layout <- c(6,6,6,6)
  margin.layout <- c(5.1,4.1,4.1,2.1)
  # count how many plots are needed: with n samples there are
  # n*(n-1)/2 unordered pairs; tot.metad = n - 1 because the first sample is
  # compared against all others but not with itself
  tot.metad <- length(ful.met[1,]) - 1
  total.num <- (tot.metad * (tot.metad + 1)) / 2
  num.of.plots <- 12 # number of maximal plots per page
  num.of.tog <- 3 # number of plots in each row
  # calculate the number of pages needed to plot all comparisons
  num.of.pages <- ceiling(total.num / num.of.plots)
  # pair enumeration state: sample in.x is plotted against in.y, walking
  # through (1,2),(1,3),...,(1,n),(2,3),... in order
  up.nu <- tot.metad # plot count at which in.x must advance to the next sample
  in.x <- 1 # the index for the x axis plot
  in.y <- 2 # the index for the y axis plot
  de.step <- 1 # shrinking block size: sample k is paired with n-k partners
  # go over all pages and make the plots
  for (i in 1:num.of.pages) {
    par(oma = border.layout)
    par(mar = margin.layout)
    par(mfrow = c(num.of.plots / num.of.tog, num.of.tog))
    # first plot index on this page (1 on the first page)
    if (i == 1) {
      j.start <- 1
    } else {
      j.start <- ((i - 1) * num.of.plots) + 1
    }
    # last plot index on this page (clamped on the final page)
    if ((j.start + num.of.plots - 1) < total.num) {
      j.end <- j.start + num.of.plots - 1
    } else {
      j.end <- total.num
    }
    # loop to make num.of.plots plots on the current page
    for (j in j.start:j.end) {
      # scatter plot of sample in.x vs sample in.y
      plot.normalized.gatc.number.one.vs.another.log(ful.met[,in.x],
                                                     ful.met[,in.y],
                                                     nams = c(met.nam[in.x],
                                                              met.nam[in.y]),
                                                     bin.name)
      # advance the pair indexes
      if (j >= up.nu) {
        # all partners of in.x are plotted: move to the next x sample
        up.nu <- up.nu + (tot.metad - de.step)
        de.step <- de.step + 1
        in.x <- in.x + 1 # increase in.x
        in.y <- in.x + 1 # reset in.y to the sample after in.x
      } else {
        # stay on the same x sample, compare with the next y sample
        in.y <- in.y + 1 # always increase
      }
    }
    # plot title for this page
    title(main = "x vs y", outer = T)
  }
  dev.off() # close report pdf
}
################################################################################
## plot correlation ##
################################################################################
plot.results <- function(gran.data, exp.name, timest, bin.name = "gatc",
                         sap.cot.ids, norma = T, logt = T, lfc, dir.name.res,
                         qs.fold.name) {
  # Plot read distribution for each sample and sample correlations into one
  # pdf file.
  #
  # Args:
  #   gran.data: (grange) grange with metadata (one column per sample)
  #   exp.name: (string) the name of the experiment
  #   timest: (timestamp) the timestamp for the plot filename
  #   bin.name: (string) the name of the bin ('gatc' or the bin size)
  #   sap.cot.ids: (list of 2 vectors) the indexes of samples and controls
  #   norma: (bool) if data should be normalized (value / sum(all values))
  #   logt: (bool) if the plots should be log10 transformed
  #   lfc: (bool) if samples have controls or only samples
  #   dir.name.res (string) the name of the folder to store the results
  #   qs.fold.name: (string) the name of the qc-report folder, forwarded to
  #                 plot.correlation
  #
  # Return:
  #   a pdf file: with correlation for all files, correlation for samples and
  #               control separately if controls are available and the
  #               chromosome wide distribution of the reads for each sample
  # make normalization
  if (norma) {
    gran.data <- normalize.grange.md(gran.data)
  }
  # make log10 transformation on a copy; the correlation plot must use the
  # untransformed data (log10 of zero counts gives -Inf)
  if (logt) {
    gran.data.p <- log10.transfom.grange.md(gran.data)
  } else {
    gran.data.p <- gran.data
  }
  # make file name and path and start plotting (A4 portrait)
  f.name <- paste(exp.name, bin.name, timest, "docs.pdf", sep = "-")
  f.path <- file.path(dir.name.res, f.name)
  pdf(file = f.path, width = 8.267, height = 11.692)
  font.size.main <- 12
  border.layout <- c(6,6,6,6)
  margin.layout <- c(5.1,4.1,4.1,2.1) # default c(5,4,4,2)
  # plot correlation only if more than one metadata column is available
  if (length(elementMetadata(gran.data)[1,]) > 1 ) {
    # outer margin first page (correlation)
    par(oma = border.layout)
    par(mar = margin.layout)
    # page layout: one wide panel on top, two below
    layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
    # plot correlation for all samples: type = 'a'
    # sap.cot.ids[[1]] = sample.ids, sap.cot.ids[[2]] = contorl.ids
    plot.correlation(gran.data, sap.cot.ids[[1]], sap.cot.ids[[2]],type = "a",
                     timest, bin.name, exp.name, qs.fold.name)
    # if more than one sample/control pair exists, also plot the
    # samples-only and controls-only correlations
    # NOTE(review): in the two calls below qs.fold.name is passed
    # positionally into the slot that holds timest in the type = "a" call
    # above — confirm against plot.correlation's signature
    if (length(sap.cot.ids[[1]]) > 1 && lfc == T) {
      plot.correlation(gran.data, sap.cot.ids[[1]], sap.cot.ids[[2]],
                       type = "s", qs.fold.name)
      plot.correlation(gran.data, sap.cot.ids[[1]], sap.cot.ids[[2]],
                       type = "c", qs.fold.name)
      # make title for more than one sample control pair
      title(main = paste("Correlations: all, sample only, control only for",
                         bin.name, "bins"), outer = T)
    } else {
      # make title if only one sample control pair
      title(main = paste("Correlations for",
                         bin.name, "bins"), outer = T)
    }
  }
  # plot second page (GATC methylation distribution for each sample
  # separately), only 4 plots per page
  num.of.plots <- 4 # number of plots each page
  total.num <- length(elementMetadata(gran.data))
  # calculate number of pages
  num.of.pages <- ceiling(total.num/num.of.plots)
  # loop over all pages
  for (i in 1:num.of.pages) {
    par(oma = border.layout)
    par(mar = margin.layout)
    par(mfrow = c(num.of.plots,1))
    # first sample index on this page (1 on the first page)
    if (i == 1) {
      j.start <- 1
    } else {
      j.start <- ((i - 1) * num.of.plots) + 1
    }
    # last sample index on this page (clamped on the final page)
    if ((j.start + num.of.plots - 1) < total.num) {
      j.end <- j.start + num.of.plots - 1
    } else {
      j.end <- total.num
    }
    # loop to make num.of.plots plots on the current page
    for (j in j.start:j.end) {
      # get sample name, maximal and minimal values for the y axis
      met.nam <- names(elementMetadata(gran.data.p)[j])
      max.value <- max(elementMetadata(gran.data.p)[,j])
      min.test <- elementMetadata(gran.data.p)[,j] # remove the -inf entries
      min.test <- min.test[which(min.test != -Inf)]
      min.value <- min(min.test)
      # choose plot margins/labels depending on normalization and log mode
      if (norma) {
        # if normalized
        if (logt) { # if log transformed
          overshoot <- 0.5
          p.type <- "l"
          y.na <- "norm log10 reads"
          min.value <- min.value - 0.5
        } else {# if normalized but not log transformed
          overshoot <- 0.0001
          p.type <- "l" #"hist"
          y.na <- "norm reads"
          min.value <- 0
        }
      } else {
        # if not normalized
        if (logt) { # if log transformed
          overshoot <- 2 # upper plot margin
          p.type <- "l"
          y.na <- "log10 reads"
        } else {# if not normalized and not log transformed
          overshoot <- 50
          p.type <- "l"
          y.na <- "reads"
          min.value <- 0
        }
      }
      # plot the chromosomal distribution for sample j
      plot.all.chrom.in.one.2(gran.data.p, main.name = met.nam,
                              y.name = y.na, #c(0, 0.01)
                              y.lim = c(min.value, max.value + overshoot),
                              file.name = "asdf.png", plot.type = p.type,
                              plot.size = c(12,8), met.dat.name = met.nam,
                              d.type = "NA", font.size = font.size.main)
    }
    # plot title for this page
    title(main = paste("distribution of methylated GATC sites for", bin.name,
                       "bins"), outer = T)
  }
  dev.off() # close report pdf
}
plot.log.fold.change <- function(gran.bin, sap.cot.ids, f.path, exp.name) {
  # Plot the log2 fold change between all samples and all controls and save
  # it as a pdf.
  #
  # Args:
  #   gran.bin: (grange) grange with metadata
  #   sap.cot.ids: (list of two) sample and control index vectors
  #   f.path: (string) path and name of the resulting pdf
  #   exp.name: (string) name of the experiment
  #
  # Return:
  #   grange with a single 'lfc' metadata column (pdf written as side effect)
  # compute the log2 fold change between samples and controls
  lfc.values <- make.log.fold.change(gran.bin, sap.cot.ids)
  # copy the grange, drop its old metadata and attach the lfc column
  lfc.grange <- gran.bin
  elementMetadata(lfc.grange) <- NULL
  lfc.grange <- add.metadata.to.grange(lfc.values, lfc.grange,
                                       m.name = "lfc")
  # y-axis limits: ignore -Inf entries (bins with zero reads) for the minimum
  lfc.col <- elementMetadata(lfc.grange)[,1]
  y.max <- max(lfc.col)
  finite.lfc <- lfc.col[which(lfc.col != -Inf)]
  y.min <- min(finite.lfc)
  # open the pdf device and draw the chromosome-wide lfc track
  font.size.main <- 9
  pdf(file = f.path, width = 8.267, height = 4)
  plot.all.chrom.in.one.2(lfc.grange,
                          main.name = paste("log2 fold change", exp.name),
                          y.name = "log2 fold change",
                          y.lim = c((y.min - 0.5), (y.max + 0.5)),
                          file.name = "resolution.png", plot.type = "h",
                          plot.size = c(12,8), met.dat.name = "lfc",
                          d.type = "NA", font.size = font.size.main)
  dev.off() # close pdf
  # return the log2 fold change grange
  return(lfc.grange)
}
################################################################################
## main function ##
################################################################################
damid.seq <- function(raw.file.name, input.format="fastq", multi.core=F,
exp.name="damid-gatc-sites", bin.len=100000,
adapt.seq="CGCGGCCGAG", errors=1, qc=T,
species="BSgenome.Celegans.UCSC.ce10", chr.names = NULL,
restr.seq="GATC", normali=T, log10t=T, m.hits=1) {
# DamID-seq pipeline: from sequencing reads to result: main, default function
#
# Args:
# raw.file.name: (string) path to a .txt file with 3 or 2 columns.
# columns must be tab separated
# first row = header: you can choose your header freely
# column 1: path and name to the sequencing files
# column 2: name of the sample starting with a letter
# column 3: (optional) group 's'=sample and 'c'=control
# input.format: (string) if 'fa' / 'f' fastq sequencing reads are cut,
# mapped, if 'gr' / 'g' cut and mapping is skipped.
# Instead of fastq sequencing reads already mapped
# GRanges are listed in in raw.file.name
# if 'bam' / 'b' .bam file must be listed in the raw.file.name
# list as the mapping is scipped
# multi.core: (bool) if T all cores of the machine is used
# (some machine have trouble using multicore)
# exp.name: (string) freely chosen name of the experiment
# bin.len: (int) length of the bins
# adapt.seq: (string) the sequence of the adapter
# errors: (int) how many errors allowed in the cutadapt process.
# qc: (bool) if quality of sequencing reads should be tested (fastqc)
# species: (string) the name of the BSgenome package
# chr.names: (vector of strings) name of the chromosomes to be considered
# in the analysis. if NULL all chromosome in the genome are used
# restr.seq: (string) the restriction site sequence (should be GATC)
# normali: (bool) if the data should be normalized for the chromosomal
# distribution plots.
# log10t: (bool) if the number of reads should be log190 transformed in the
# chromosomal distribution plots.
# m.hits: (int) the number of maximal hits in bowtie mapping
# (1 = only unique hits)
#
# Return:
# list of GRanges (all, plus, minus, fragment, and also binned)
# get timestamp of pipeline start time
timestampm <- as.integer(as.numeric(Sys.time()))
# read raw.files .txt list
raw.files <- read.table(raw.file.name, header = T, sep = "\t")
# Test if the input file is valid (get the index of sample and control ids)
# sap.cot.ids is NULL if no log2 fold change should be made
# also test the rest of the input data
sap.cot.ids <- test.input.data(raw.files, list(multi.core, exp.name, bin.len,
adapt.seq, errors,
input.format, qc,
species, chr.names, restr.seq,
normali, log10t))
# set if log2 fold change should be made at the end of the pipeline
if (length(sap.cot.ids) == 0) {
lfc = F
} else {
lfc = T
}
# make integer if float was given for the number inputs (errors and bin size)
errors <- as.integer(errors)
bin.len <- as.integer(bin.len)
# test, load or install full genome data BSgenome
# This function also test the correctness of the user input (chromosome names)
# therefore this function has to be always first before make and work
# further in the pipeline.
bc.genome <- get.genome(species, chr.names)
## make all folders to store the resulting files
# make cutadapter folder
dir.name.ca <- "cutadapter"
dir.create(file.path(getwd(), dir.name.ca), showWarnings = FALSE)
# make directory to save alignments
dir.name.bt <- "bowtie"
dir.create(file.path(getwd(), dir.name.bt), showWarnings = FALSE)
# make grange folder
dir.name.gr <- "granges"
dir.create(file.path(getwd(), dir.name.gr), showWarnings = FALSE)
# qs folder name
qs.fold.name <- "qc-reports"
dir.create(file.path(getwd(), qs.fold.name), showWarnings = FALSE)
# make directory for results
dir.name.res <- "results"
dir.create(file.path(getwd(), dir.name.res), showWarnings = FALSE)
# make or load restriction sites (GATC) granges
res.site.grages <- make.restriction.site.grange(species = bc.genome,
rest.seq = restr.seq,
save.location = dir.name.gr,
# NOTE(review): this chunk is the tail of the pipeline's main driver
# function. Its signature, and the call whose final argument closes on
# the next line, begin above this excerpt — TODO confirm against the
# full file before refactoring.
chr.nam = chr.names)
# get all chromosome names if chr.names = NULL (fall back to every
# chromosome of the loaded BSgenome object bc.genome)
if (is.null(chr.names)) {
chr.names <- seqnames(bc.genome)
}
# make quality control of sequencing files (only when qc flag is set)
if (qc) {
make.qc(raw.files, timestampm, exp.name = exp.name, rep.dir = qs.fold.name)
}
# cut adapter, map reads using bowtie, save grange with reads per GATC
fir.in.chr <- substring(input.format, 1, 1) # get first character
# "f"(astq) or "b"(am) -> run the full read-processing pipeline;
# NOTE(review): scalar condition uses vectorized `|`; `||` would be the
# idiomatic choice for a length-1 test (behavior identical here).
if (fir.in.chr == "f" | fir.in.chr == "b") {
gatc.all.m.l <- from.fastq.to.grange(raw.files, multi.core, exp.name,
adapt.seq, errors, raw.file.name,
timestampm, rep.dir = qs.fold.name,
genome.name = species,
chromosom.nams = chr.names,
restrict.site = restr.seq,
dir.name.ca = dir.name.ca,
dir.name.bt = dir.name.bt,
dir.name.gr = dir.name.gr,
res.site.grages,
m.hits = m.hits,
bam.o.fastq = input.format)
} else {# otherwise load a previously saved GRanges object with the reads
# prepare path and name to save results
save.name.g <- paste(exp.name, timestampm, sep = "-")
file.save.name <- file.path(getwd(), dir.name.gr, save.name.g)
# import GRange with cut reads and make GATC GRange with metadata
gatc.all.m.l <- make.full.gatc.grange.from.read.grange2(raw.files,
save.nam = file.save.name,
genom = species,
restsite = restr.seq,
chr.na = chr.names,
res.site.grages = res.site.grages)
}
# unpack the returned list into the four per-strand/fragment GRanges;
# assumes list order: both strands, plus strand, minus strand, fragments
# — TODO confirm against the builder functions above.
gatc.all.m <- gatc.all.m.l[[1]]
gatc.plus.m <- gatc.all.m.l[[2]]
gatc.minus.m <- gatc.all.m.l[[3]]
gatc.fragm <- gatc.all.m.l[[4]]
## save GATC reads as table .txt
# save GATC sites table for non-R use of the data, plus and minus
# strands kept separate
file.naa <- paste("reads-", exp.name, "-gatc-reads-sep", timestampm, ".txt",
sep = "")
file.paa <- file.path(dir.name.res, file.naa)
save.table.from.grange2(gatc.all.m, gatc.plus.m, gatc.minus.m, file.paa)
# save table with GATC sites (both strands combined) for non-R use
file.na <- paste("reads-", exp.name, "-gatc-", timestampm, ".txt", sep = "")
file.pa <- file.path(dir.name.res, file.na)
save.table.from.grange(gatc.all.m, file.pa)
# save table with GATC fragments for non-R use of the data
file.na.f <- paste("fragment-reads", exp.name, "-gatc-", timestampm, ".txt",
sep = "")
file.pa.f <- file.path(dir.name.res, file.na.f)
save.table.from.grange(gatc.fragm, file.pa.f)
# make empty bin grange of width bin.len covering the chosen chromosomes
bin.grange <- mapp.to.bins(bin.len, genom.bs = bc.genome,
chromo.names = chr.names)
############
## make reads per bin out of reads per GATC site and save table
# sum GATC-level read counts (metadata columns) into the bins;
# NOTE(review): `T` used instead of `TRUE` — T is reassignable and
# should be spelled out (same for F below).
bin.grange <- find.all.overlaps.and.sum.metadata3(gatc.all.m, bin.grange,
met.data = T, messag = T )
# save table with binned read counts for non-R use of the data
file.na <- paste("reads-", exp.name, "-", toString(bin.len), "-",
timestampm, ".txt", sep = "")
file.pa <- file.path(dir.name.res, file.na)
save.table.from.grange(bin.grange, file.pa)
###########
## plot correlation, distribution of reads on the chromosome and lfc
message(paste(format(Sys.time(), "%H:%M:%S:"),
"plotting result and documentation"))
# plot data for GATC sites
plot.results(gatc.all.m, exp.name, timestampm, bin.name = "gatc", sap.cot.ids,
norma = normali, logt = log10t, lfc, dir.name.res, qs.fold.name)
# plot correlation if more than 1 sample is available
plot.normalized.gatc.number.one.vs.another.log.main(gatc.all.m,
bin.name = restr.seq,
timestampm, exp.name,
dir.name.res)
# plot data for defined bin sizes
plot.results(bin.grange, exp.name, timestampm, bin.name = toString(bin.len),
sap.cot.ids, norma = normali, logt = log10t, lfc, dir.name.res,
qs.fold.name)
# plot sample GATC reads comparison when multiple samples are available
plot.normalized.gatc.number.one.vs.another.log.main(bin.grange,
bin.name = toString(bin.len),
timestampm, exp.name,
dir.name.res)
##############
## plot log2 fold change and make bed file (only when lfc flag is set)
if (lfc) {
# make path and name for log2 fold change plot
f.name.r <- paste(exp.name, timestampm, "res.pdf", sep = "-")
f.path.r <- file.path(dir.name.res, f.name.r)
# call function to do log fold change
message(paste(format(Sys.time(), "%H:%M:%S:"), "Plot log2 fold change"))
# plot result and keep the per-bin log2 fold changes for the return value
lof2.f <- plot.log.fold.change(bin.grange, sap.cot.ids, f.path.r, exp.name)
## write .bed files
# NOTE(review): this first bed file exports GATC-level data
# (gatc.all.m) but its name is built from bin.len, so it is
# indistinguishable from the bin-level file below except for the
# "-bp-bin-" suffix — consider naming it with "gatc" instead.
file.na.b <- paste(exp.name, "-", toString(bin.len), "-",
timestampm, sep = "")
file.pa.b <- file.path(dir.name.res, file.na.b)
export.bed.file(gatc.all.m, file.pa.b, sap.cot.ids, norm.width = 10,
frag = F, bed.index = exp.name, write.out = T)
file.na.b <- paste(exp.name, "-", toString(bin.len), "-", bin.len,
"-bp-bin-", timestampm, sep = "")
file.pa.b <- file.path(dir.name.res, file.na.b)
# export bed file using bins
write.bed.file(lof2.f, file.pa.b, bed.index = paste(exp.name, "bin"))
}
# return the GATC-level, strand-separated, fragment and bin GRanges
# (plus the log2 fold changes when lfc was computed)
if (lfc) {
return(list("GATC" = gatc.all.m, "GATC.plus" = gatc.plus.m,
"GATC.minus" = gatc.minus.m, "GATC.frag" = gatc.fragm,
"bins" = bin.grange, "lfc" = lof2.f))
} else {
return(list("GATC" = gatc.all.m, "GATC.plus" = gatc.plus.m,
"GATC.minus" = gatc.minus.m, "GATC.frag" = gatc.fragm,
"bins" = bin.grange))
}
}
# NOTE(review): the two lines below are website-embed boilerplate
# (documentation-site scraping residue), not R code; commented out so
# the file parses. They carry no program content and can be removed.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.