### R code from vignette source 'Markov_Report_Guerra_Jaunatre.Rnw'
### Encoding: UTF-8
###################################################
### code chunk number 1: packages
###################################################
library(xtable)
library(ggplot2)
library(reshape)
# library(plyr)
# library(MASS)
# library(ResourceSelection)
# library(ggpubr)
# library(tidyr)
if("BeeMarkov" %in% installed.packages()) {
library(BeeMarkov)
}else{
# library(devtools)
# install_github("gowachin/BeeMarkov")
# library(BeeMarkov)
}
library(seqinr)
rerun = FALSE
###################################################
### code chunk number 2: working (eval = FALSE)
###################################################
## setwd("manuscrit")
##
###################################################
### code chunk number 3: transition
###################################################
transition <- function(file, # fichier
l_word = 1, # longueur des mots
n_seq = 1, # Nombre de sequences a analyser
log = TRUE,
type = "") {
seq <-seqinr::read.fasta(file)
Nseq <- length(seq)
if (Nseq < n_seq) { # check if user want to input too many seq in the matrix learning
stop(paste("n_seq is larger than the number of sequence in this file ( ", Nseq, " sequences for this file).", sep = ""))
}
l <- sapply(seq, length)
if (l_word > min(l) | l_word >= 10) {
stop("This is really too much for me, abort!!!")
}
tmp <-seqinr::count(seq[[1]], l_word) + 1 # add 1 occurence to have at least 1 obs
cat(paste(" ============ Training model M",type,l_word," ============ \n", sep = ""))
pb <- utils::txtProgressBar(min = 0, max = n_seq, style = 3)
for (i in 2:n_seq) {
utils::setTxtProgressBar(pb, i)
tmp <- tmp +seqinr::count(seq[[i]], l_word)
}
close(pb)
# # alternativ way, slower but prettier
# cat(paste(" ============ Training model M",type,l_word," ============ \n", sep = ""))
# l_count = function(Seq,n = l_word){count(Seq,n)}
# tmp <- rowSums(sapply(seq,l_count))
if (l_word>1) {
i <- 1
wind = 4
for(j in 0:((length(tmp)/wind)-1)){
tmp[(1+wind*j):(wind+wind*j)] <- tmp[(1+wind*j):(wind+wind*j)] * 4^(i-1) / sum(tmp[(1+wind*j):(wind+wind*j)] )
}
cat(' ============ Computing conditionnal probabilities ============ \n')
} else {
tmp = tmp / sum(tmp)
}
# possibility to compute without log
if (log) {
tmp = log(tmp)
}
return(tmp)
}
mP <-transition(file = "../raw_data/mus_cpg_app.fa", n_seq = 1160, l_word = 2, log = F)
mP <- matrix(mP, byrow = TRUE, ncol = 4, dimnames = list(unique(substr(names(mP),1,1)),unique(substr(names(mP),1,1))))
matable <- xtable(x = mP , label = "tab:trans")
# Notez les doubles \\ nécessaires dans R, c'est la "double escape rule"
print(matable, file = "fig/tab_trans.tex",size = "tiny", NA.string = "NA",
table.placement = "!t",
floating = FALSE,
caption.placement="top",
include.rownames = TRUE,
include.colnames = TRUE,
latex.environments = "center")
###################################################
### code chunk number 4: threshold
###################################################
quality <- function(file, # fichier
pos_training = NULL, # file to train with for positive
neg_training = NULL, # file to train with for negative
trans_pos = NULL, #transition matrice pos
trans_neg = NULL, #transition matrice pos
l_word_pos = 1, # word lenght for transition table positif
l_word_neg = 1, # word length for transition table negatif
n_train = 1, # number of sequences to train with
n_seq = 1, # number of sequences to analyse
quiet = FALSE
){
if(is.null(c(trans_pos,trans_neg))){
trans_pos <-transition(file = pos_training, n_seq = n_train, l_word = l_word_pos, type ="+")
trans_neg <- transition(file = neg_training, n_seq = n_train, l_word = l_word_neg, type = "-")
}
seq <-seqinr::read.fasta(file)
Nseq <- length(seq)
if(Nseq < n_seq){ # check if user want to input too many seq in the matrix learning
stop(paste("n_seq is larger than the number of sequence in this file ( ",Nseq," sequences for this file).", sep = ""))
}
result <- data.frame(VP = rep(FALSE,n_seq),
FN = rep(TRUE,n_seq),
pos = rep(NA,n_seq),
neg = rep(NA,n_seq)
)
p_init_pos = log(1/4^l_word_pos)
p_init_neg = log(1/4^l_word_neg)
if(!quiet) {cat(' ============ Computing sensi and speci for the test sequences ============ \n')
pb <- utils::txtProgressBar(min = 0, max = n_seq, style = 3)}
for(i in 1:n_seq){
if(!quiet) utils::setTxtProgressBar(pb, i)
n_word_pos <-seqinr::count(seq[[i]], l_word_pos)
n_word_neg <-seqinr::count(seq[[i]], l_word_neg)
result$pos[i] <- p_init_pos + sum( trans_pos * n_word_pos )
result$neg[i] <- p_init_neg + sum( trans_neg * n_word_neg )
if(result$pos[i] > result$neg[i]){
result$VP[i] <- TRUE ; result$FN[i] <- FALSE
} else {
result$VP[i] <- FALSE ; result$FN[i] <- TRUE
}
}
if(!quiet) close(pb)
tmp <- colSums(result[,1:2])
return(tmp)
}
threshold <- function(pos_test, # fichier
neg_test,
pos_training, # file to train with for positive
neg_training, # file to train with for negative
pos_seq = c(1:2),
neg_seq = c(1:2),
n_train = 1, # number of sequences to train with
n_seq = 1 # number of sequences to analyse
){
# choix =utils::menu(c("yes","no"),
# title = "You will launch long computation, do you wish to procede further ?")
# if (choix ==1) cat("\n ============ Go take a good coffee ============ \n\n")
# if (choix ==2) stop("You stopped the computations")
trans_pos <- list()
trans_neg <- list()
cat('\n ============ Training modeles ============ \n\n')
for(i in pos_seq){
trans_pos[[i]] <- transition(file = pos_training, n_seq = n_train, l_word = i, type = "+")
}
cat('\n')
for(j in neg_seq){
trans_neg[[j]] <- transition(file = neg_training, n_seq = n_train, l_word = j, type = "-")
}
cat('\n ============ Training modeles ============ \n\n')
sensi <- speci <- matrix(rep(0,length(pos_seq)*length(neg_seq)),ncol = length(pos_seq),nrow = length(neg_seq))
for(i in pos_seq){
for(j in neg_seq){
cat('\n')
cat(paste(" ============ Model M+ (",i,"/",j,") ============ \n", sep = ""))
pos <- quality(file = pos_test, # fichier
trans_pos = trans_pos[[i]], #transition matrice pos
trans_neg = trans_neg[[j]], #transition matrice neg
l_word_pos = i, # transition table positif
l_word_neg = j, # transition table negatif
n_train = n_train, # number of sequences to train with
n_seq = n_seq, # number of sequences to analyse
quiet = TRUE
)
cat(paste(" ============ Model M- (",i,"/",j,") ============ \n", sep = ""))
neg <- quality(file = neg_test, # fichier
trans_pos = trans_pos[[i]], #transition matrice pos
trans_neg = trans_neg[[j]], #transition matrice neg
l_word_pos = i, # transition table positif
l_word_neg = j, # transition table negatif
n_train = n_train, # number of sequences to train with
n_seq = n_seq, # number of sequences to analyse
quiet = TRUE
)
sensi[i,j] <- pos[1] / sum(pos)
speci[i,j] <- neg[2] / sum(neg)
}
}
cat(paste(" ============ Come back from your coffee ============ \n", sep = ""))
colnames(sensi) <- colnames(speci) <- paste("-",neg_seq, sep="")
rownames(sensi) <- rownames(speci) <- paste("+",pos_seq, sep="")
final <-list(sensi,speci) ; names(final) = c("sensi","speci")
return(final)
}
###################################################
### code chunk number 5: seuil
###################################################
if("final.RData" %in% list.files("fig/") && !rerun){
load("fig/final.RData")
}else{
final <- threshold("../raw_data/mus_cpg_test.fa", # fichier
"../raw_data/mus_tem_test.fa",
"../raw_data/mus_cpg_app.fa", # file to train with for positive
"../raw_data/mus_tem_app.fa", # file to train with for negative
pos_seq = c(1:6),
neg_seq = c(1:6),
n_train = 1160, # number of sequences to train with
n_seq = 1163 # number of sequences to analyse
)
save(final,file = "fig/final.RData")
}
table <- cbind(melt(final$sensi), melt(final$speci)[, 3])
colnames(table) <- c("M", "m", "Sensi", "Speci")
table$score <- table$Sensi + table$Speci
visual <- ggplot(table, aes(M, m)) +
geom_raster(aes(fill = score), hjust = 0.5, vjust = 0.5, interpolate = FALSE) +
geom_contour(aes(z = score)) + xlab("CpG+") + ylab("CpG-")
visual
ggsave('visual.pdf', plot = visual, device = "pdf", path = 'fig/',scale = 3, width = 7, height = 4, units = "cm", limitsize = T)
rm(visual)
table[which(table$tot == max(table$tot)),]
###################################################
### code chunk number 6: trans_mod
###################################################
l_c <- 1 / 1000
l_nc <- 1 / 125000
trans_mod <- log(matrix(c(
1 - l_c, l_nc,
l_c, 1 - l_nc
),
ncol = 2, nrow = 2 , dimnames = list(c("M+","M-"),c("M+","M-"))
))
matable <- xtable(x = trans_mod , label = "tab:trans_mod")
# Notez les doubles \\ nécessaires dans R, c'est la "double escape rule"
print(matable, file = "fig/trans_mod.tex",size = "tiny", NA.string = "NA",
table.placement = "!t",
floating = FALSE,
caption.placement="top",
include.rownames = TRUE,
include.colnames = TRUE,
latex.environments = "center")
###################################################
### code chunk number 7: viterbi
###################################################
viterbi <- function(file,
pos_training = "../raw_data/mus_cpg_app.fa", # file to train with for positive
neg_training = "../raw_data/mus_tem_app.fa", # file to train with for negative
l_word_pos = 1,
l_word_neg = 1,
n_train = 1160,
n_ana = 1,
l_c = 1000, # length coding
l_nc = 125000 # length non-coding
) {
trans_pos <- transition(file = pos_training, n_seq = n_train, l_word = l_word_pos, type = "+")
trans_neg <- transition(file = neg_training, n_seq = n_train, l_word = l_word_neg, type = "-")
seq <- seqinr::read.fasta(file)
if (n_ana > 1) stop("Analysis for multiple files is not implemented yet")
raw_seq <- seq[[1]]
long <- length(raw_seq)
beg <- max(l_word_pos, l_word_neg)
# compute p_initial and transition matrix for markov
pos_init <- neg_init <- log(0.5)
l_c <- 1 / l_c
l_nc <- 1 / l_nc
trans_mod <- log(matrix(c(
1 - l_c, l_nc,
l_c, 1 - l_nc
),
ncol = 2, nrow = 2
))
colnames(trans_mod) <- rownames(trans_mod) <- c("c", "nc")
# initialisation
ncol <- 7
proba <- matrix(rep(NA, long * ncol), ncol = ncol)
colnames(proba) <- c("M+", "M-", "model", "length", "rep_length", "begin", "end")
# time explode if it is not a matrix anymore !!!
# proba <- as.data.frame(proba)
# old version is v2 takes too long
# v1 = function(){
# trans_pos[which(names(trans_pos)==paste(raw_seq[(beg-l_word_pos+1):beg],collapse = ""))]
# }
# v2 = function(){
# min(count(raw_seq[(beg-l_word_pos+1):beg], l_word_pos) * trans_pos)
# }
# system.time(v1())
# system.time(v2())
# compute first base and initiate Viterbi
proba[beg, "M+"] <- trans_pos[which(names(trans_pos) == paste(raw_seq[(beg - l_word_pos + 1):beg], collapse = ""))] + pos_init
proba[beg, "M-"] <- trans_neg[which(names(trans_neg) == paste(raw_seq[(beg - l_word_neg + 1):beg], collapse = ""))] + neg_init
if (proba[beg, "M+"] > proba[beg, "M-"]) {
proba[beg, "model"] <- 1
} else {
proba[beg, "model"] <- 2
}
proba[beg, c("length", "rep_length")] <- 1
tmp <- proba[beg, c(6, 7)] <- beg
proba[1:beg - 1, "rep_length"] <- beg - 1
proba[1:beg - 1, "model"] <- 3
proba[1, "begin"] <- 1
proba[beg - 1, c(4, 7)] <- beg - 1
beg <- beg + 1
cat("\n ============ Viterbi is running ============ \n\n")
pb <- utils::txtProgressBar(min = beg, max = long, style = 3)
for (i in beg:long) {
utils::setTxtProgressBar(pb, i)
# proba d'avoir la base sous M
pM <- trans_pos[which(names(trans_pos) == paste(raw_seq[(i - l_word_pos + 1):i], collapse = ""))] + max(
proba[i - 1, "M+"] + trans_mod[1, 1],
proba[i - 1, "M-"] + trans_mod[2, 1]
)
# proba d'avoir la base sous m
pm <- trans_neg[which(names(trans_neg) == paste(raw_seq[(i - l_word_neg + 1):i], collapse = ""))] + max(
proba[i - 1, "M-"] + trans_mod[2, 2],
proba[i - 1, "M+"] + trans_mod[1, 2]
)
proba[i, "M+"] <- pM
proba[i, "M-"] <- pm
if (proba[i, "M+"] > proba[i, 2]) {
proba[i, "model"] <- 1
} else {
proba[i, "model"] <- 2
}
# length information
if (proba[i, "model"] == proba[i - 1, "model"]) {
proba[i, "length"] <- proba[i - 1, "length"] + 1 # increase part length
proba[i - 1, c(4, 7)] <- NA # erase length in previous ligne
} else {
proba[i, "length"] <- 1 # initiate new part length
proba[i - 1, "end"] <- i - 1 # put end value of precedent part
proba[c(tmp:(i - 1)), "rep_length"] <- proba[i - 1, "length"] # rep value of length for precedent part
proba[i, "begin"] <- tmp <- i # put begin value of the actual part
}
}
close(pb)
# closing table
proba[i, "end"] <- i # put end value of precedent part
proba[c(tmp:(i)), "rep_length"] <- proba[i, "length"] # rep value of length for precedent part
# add a column for the line number
proba <- cbind(c(1:dim(proba)[1]), proba)
colnames(proba)[1] <- "n"
return(proba)
}
###################################################
### code chunk number 8: smoothing
###################################################
smoothing <- function(seq,
l_word_pos = 5,
l_word_neg = 4,
smooth_win = 10,
reject_win = 1) {
beg <- max(l_word_pos, l_word_neg)
# finding ambiguous regions which are shorter than a certain windows
cat(" ============ Smoothing ============ \n")
seq <- cbind(seq, seq[, 4])
colnames(seq)[9] <- c("smoothed")
seq[which(seq[, "rep_length"] <= smooth_win), "smoothed"] <- 3
# old version is v1 takes too long
# v1 = function(){
# cat(" ============ Smoothing boucle ============ \n")
# pb <- utils::txtProgressBar(min = 1, max = max(seq[, 1]), style = 3)
# for (i in 1:dim(seq)[1]) {
# utils::setTxtProgressBar(pb, i)
# if (seq[i, 6] <= smooth_win) {
# seq[i, "smoothed"] <- 3
# }
# }
# close(pb)
# }
# v2 = function(){
# seq[which(seq[, 6] <= smooth_win), "smoothed"] <- 3}
# system.time(v1())
# system.time(v2())
# unified ambiguous regions
seq <- cbind(seq, seq[, c(5:8)])
colnames(seq)[10:13] <- paste("S_", colnames(seq[, c(10:13)]), sep = "")
seq[beg, "S_length"] <- 1
tmp <- seq[beg, c("S_begin", "S_end")] <- beg
seq[-c(1:beg), c(10:13)] <- NA
beg <- beg + 1
cat("\n ============ Smoothing length ============ \n")
pb <- utils::txtProgressBar(min = beg - 1, max = dim(seq)[1], style = 3)
for (i in beg:dim(seq)[1]) {
utils::setTxtProgressBar(pb, i)
if (seq[i, "smoothed"] == seq[i - 1, "smoothed"]) {
seq[i, "S_length"] <- seq[i - 1, "S_length"] + 1 # increase part length
seq[i - 1, c("S_length", "S_end")] <- NA # erase length in previous ligne
} else {
seq[i, "S_length"] <- 1 # initiate new part length
seq[i - 1, "S_end"] <- i - 1 # put end value of precedent part
seq[c(tmp:(i - 1)), "S_rep_length"] <- seq[i - 1, "S_length"] # rep value of length for precedent part
seq[i, "S_begin"] <- tmp <- i # put begin value of the actual part
}
}
close(pb)
# closing table
seq[i, 13] <- i # put end value of precedent part
seq[c(tmp:(i)), 11] <- seq[i, 10] # rep value of length for precedent part
# to reject some region and put arbitrary models on them
if(reject_win > 1){
colnames(seq)[10:13] <- paste("R_", colnames(seq[, c(10:13)]), sep = "")
head(seq)
cat(" ============ Rejecting ============ \n")
# finding regions of length < reject_win between tho regions of same model. Changing the model to surrounding
solo_l <-seq[which(seq[,"R_S_length"] %in% unique(seq[, "R_S_rep_length"])),c("smoothed","R_S_rep_length")]
for(i in 2:(dim(solo_l)[1]-1)){
if(solo_l[i-1,1]==solo_l[i+1,1] && solo_l[i,2] < reject_win && solo_l[i,1] == 3) {solo_l[i,1] <- solo_l[i-1,1]}
}
seq[,"smoothed"] <- rep(solo_l[,1],solo_l[,2])
beg <- beg - 1
seq[beg, "R_S_length"] <- 1
tmp <- seq[beg, c("R_S_begin", "R_S_end")] <- beg
seq[-c(1:beg), c(10:13)] <- NA
beg <- beg + 1
seq[-c(1:beg), c(10:13)] <- NA
cat("\n ============ Smoothing length after reject ============ \n")
pb <- utils::txtProgressBar(min = beg - 1, max = dim(seq)[1], style = 3)
for (i in beg:dim(seq)[1]) {
utils::setTxtProgressBar(pb, i)
if (seq[i, "smoothed"] == seq[i - 1, "smoothed"]) {
seq[i, "R_S_length"] <- seq[i - 1, "R_S_length"] + 1 # increase part length
seq[i - 1, c("R_S_length", "R_S_end")] <- NA # erase length in previous ligne
} else {
seq[i, "R_S_length"] <- 1 # initiate new part length
seq[i - 1, "R_S_end"] <- i - 1 # put end value of precedent part
seq[c(tmp:(i - 1)), "R_S_rep_length"] <- seq[i - 1, "R_S_length"] # rep value of length for precedent part
seq[i, "R_S_begin"] <- tmp <- i # put begin value of the actual part
}
}
close(pb)
# closing table
seq[i, 13] <- i # put end value of precedent part
seq[c(tmp:(i)), 11] <- seq[i, 10] # rep value of length for precedent part
}
return(seq)
}
###################################################
### code chunk number 9: plot_resume
###################################################
graph <- function(seq,colors = c("blue","red","green"),nrow = 3,ycol = 4){
par(mfrow = c(nrow,1), mar = c(3,2,1,1))
# c(5, 4, 4, 2)
long <- max(seq[,1])/nrow
for(i in 0:(nrow-1)){
plot(x = seq[((1+long*i):(long+long*i)),1],
y = seq[((1+long*i):(long+long*i)),ycol],
col = colors[seq[((1+long*i):(long+long*i)),9]], xlab = " ", ylab = " ")
}
par(mfrow = c(1,1))
}
resume <- function(seq){
beg <- seq[which(!is.na(seq[,12])),12]
end <- seq[which(!is.na(seq[,13])),13]
long <- seq[which(!is.na(seq[,10])),10]
model <- seq[which(!is.na(seq[,10])),9]
length(beg) ; length((end)) ; length(long) ; length(model)
table <- data.frame(beg,end,long,model)
return(table)
}
###################################################
### code chunk number 10: mus1
###################################################
if("mus1.RData" %in% list.files("fig/") && !rerun){
load("fig/mus1.RData")
}else{
mus1 <- viterbi(file = "../raw_data/mus1.fa",
l_word_pos = 5,
l_word_neg = 4
)
save(mus1,file = "fig/mus1.RData")
}
mus1 <- smoothing(mus1,5,4,60, 40)
jpeg("fig/mus1_s.jpg", width = 836, height = 496)
# 2. Create the plot
graph(mus1,nrow = 3, ycol = 9)
# 3. Close the file
dev.off()
mus1_t <- resume(mus1)
matable <- xtable(x = mus1_t , label = "tab:mus1_t")
# Notez les doubles \\ nécessaires dans R, c'est la "double escape rule"
print(matable, file = "fig/mus1_t.tex",size = "tiny", NA.string = "NA",
table.placement = "!t",
floating = FALSE,
caption.placement="top",
include.rownames = TRUE,
include.colnames = TRUE,
latex.environments = "center")
###################################################
### code chunk number 11: mus2
###################################################
if("mus2.RData" %in% list.files("fig/") && !rerun){
load("fig/mus2.RData")
}else{
mus2 <- viterbi(file = "../raw_data/mus2.fa",
l_word_pos = 5,
l_word_neg = 4
)
save(mus2,file = "fig/mus2.RData")
}
mus2 <- smoothing(mus2,5,4,60, 40)
jpeg("fig/mus2_s.jpg", width = 836, height = 496)
# 2. Create the plot
graph(mus2,nrow = 3, ycol = 9)
# 3. Close the file
dev.off()
mus2_t <- resume(mus2)
matable <- xtable(x = mus2_t , label = "tab:mus2_t")
# Notez les doubles \\ nécessaires dans R, c'est la "double escape rule"
print(matable, file = "fig/mus2_t.tex",size = "tiny", NA.string = "NA",
table.placement = "!t",
floating = FALSE,
caption.placement="top",
include.rownames = TRUE,
include.colnames = TRUE,
latex.environments = "center")
###################################################
### code chunk number 12: mus3
###################################################
if("mus3.RData" %in% list.files("fig/") && !rerun){
load("fig/mus3.RData")
}else{
mus3 <- viterbi(file = "../raw_data/mus3.fa",
l_word_pos = 5,
l_word_neg = 4
)
save(mus3,file = "fig/mus3.RData")
}
mus3 <- smoothing(mus3,5,4,60, 40)
jpeg("fig/mus3_s.jpg", width = 836, height = 496)
# 2. Create the plot
graph(mus3,nrow = 3, ycol = 9)
# 3. Close the file
dev.off()
mus3_t <- resume(mus3)
matable <- xtable(x = mus3_t , label = "tab:mus3_t")
# Notez les doubles \\ nécessaires dans R, c'est la "double escape rule"
print(matable, file = "fig/mus3_t.tex",size = "tiny", NA.string = "NA",
table.placement = "!t",
floating = FALSE,
caption.placement="top",
include.rownames = TRUE,
include.colnames = TRUE,
latex.environments = "center")
###################################################
### code chunk number 13: github_install (eval = FALSE)
###################################################
## # NOT RUN
## library(devtools)
## install_github("gowachin/BeeMarkov")
## library(BeeMarkov)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.