#' Reference class to store the results from sequence similarity analyses, and block detection runs for a given triplet.
#' @name SimilarityScan
#' @field TableFile A length 1 character vector storing the temporary filepath of the datatable of sequence similarity table.
#' @field Table Accessor function for the datatable of sequence similarity that is stored on temporary files.
#' @field WindowSizeUsed Single integer value, stores the size of the sliding window used for the scan.
#' @field StepSizeUsed Single integer value, stores the size of the sliding window used for the scan.
SimilarityScan <- setRefClass("SimilarityScan",
fields = list(
TableFile = "character",
Table = function(value){
if(missing(value)){
read.table(TableFile)
} else {
write.table(value, file = TableFile)
}
},
WindowSizeUsed = "numeric",
StepSizeUsed = "numeric"),
methods = list(
initialize =
function(HCDir){
"Method initializes the object, generates temporary filenames for the sequence similarity table."
TableFile <<- tempfile(pattern = "SSTable", tmpdir = HCDir)
blankTable()
},
blankTable =
function(){
"Method clears the SS analysis results table."
Table <<- data.frame(WindowCenter = NA, WindowStart = NA, WindowEnd = NA,
ActualCenter = NA, ActualStart = NA, ActualEnd = NA,
AB = NA, AC = NA, BC = NA)
},
tableIsBlank =
function(){
"Returns TRUE, if the SS analysis table is blank and no results are contained in it."
return(all(is.na(Table)))
},
finalize =
function(){
"Called when the object is destroyed, makes sure to delete the file saved in the system's temporary directory."
unlink(TableFile)
}
)
)
#' Reference class to store and manage triplet data.
#' @name Triplet
#' @field ContigNames character vector of length 3, stores the dna names of which the triplet was made.
#' @field ContigNumbers integer vector of length 3, stores the indecies of the sequences that made the triplet.
#'
Triplet <- setRefClass("Triplet",
fields = list(
ContigNames = "character",
ContigPairs = "list",
InformativeDNALength = "numeric",
FullDNALength = "numeric",
ScanData = "ANY",
Blocks = "list"
),
methods = list(
initialize =
function(sequencenames, fullseqlength, HCDir){
"Initializer function, creates an instance of triplet."
ContigNames <<- sequencenames
FullDNALength <<- fullseqlength
ContigPairs <<- combn(ContigNames, 2, simplify=F)
ScanData <<- SimilarityScan$new(HCDir)
},
readSettings =
function(dna, settings){
InformativeDNALength <<- unique(width(dna))
ScanData$StepSizeUsed <<- settings$StepSize
ScanData$WindowSizeUsed <<- settings$WindowSize
},
noScanPerformed =
function(){
"Returns TRUE, if the SS analysis table is blank and the informative sites are not known. This is indicative that a scan of the file has not been done yet."
return(ScanData$tableIsBlank() && length(InformativeDNALength) == 0)
},
blocksNotFound =
function(){
"Returns TRUE, if the tables for block are blank and no block findng has been done."
return(length(Blocks) == 0)
},
blocksNotDated =
function(){
"Returns TRUE, if the blocks detected have not been tested for significance or had a divergence time estimated."
bools <- unlist(lapply(Blocks, function(y) all(unlist(lapply(y, function(x) all(is.na(x$SNPs)) && all(is.na(x$CorrectedSNPs)) && all(is.na(x$P_Value)) && all(is.na(x$P_Threshold)) && all(is.na(x$fiveAge)) && all(is.na(x$fiftyAge)) && all(is.na(x$ninetyFiveAge)) )))))
return(all(bools))
},
# Method for putative block detection.
putativeBlockFind =
function(parameters){
"DOCSTRING TO BE COMPLETE"
if(noScanPerformed()){stop("No sequence similarity scan data is available for this triplet - can't identify blocks.")}
if(parameters$AutoThresholds == TRUE) {
message("Using the autodetect thresholds method...")
message("Deciding on suitable thresholds...")
thresholds <- autodetect.thresholds(ScanData, parameters)
# Results in a list of thresholds for AB, AC and BC.
} else {
thresholds <- list(parameters$ManualThresholds, parameters$ManualThresholds, parameters$ManualThresholds)
}
names(thresholds) <- unlist(lapply(combn(ContigNames, 2, simplify=F), function(x) paste(x, collapse=":")))
message("Now beginning Block Search...")
Blocks <<- lapply(1:3, function(i) block.find(ScanData$Table[,c( 1:6, 6+i )], thresholds[[i]]))
names(Blocks) <<- names(thresholds)
},
blockDate =
function(dnaobj, parameters){
"Block Dating method, estimates the ages of blocks detected based on how many mutations are observed in a block and ."
message("Now dating blocks")
ab.blocks <- lapply(Blocks[[1]], function(x) date.blocks(x, dnaobj, parameters$MutationRate, ContigPairs[[1]], parameters$PValue, parameters$BonfCorrection, parameters$DateAnyway, parameters$MutationCorrection))
ac.blocks <- lapply(Blocks[[2]], function(x) date.blocks(x, dnaobj, parameters$MutationRate, ContigPairs[[2]], parameters$PValue, parameters$BonfCorrection, parameters$DateAnyway, parameters$MutationCorrection))
bc.blocks <- lapply(Blocks[[3]], function(x) date.blocks(x, dnaobj, parameters$MutationRate, ContigPairs[[3]], parameters$PValue, parameters$BonfCorrection, parameters$DateAnyway, parameters$MutationCorrection))
out.blocks <- list(ab.blocks, ac.blocks, bc.blocks)
names(out.blocks) <- names(Blocks)
Blocks <<- out.blocks
},
tabulateBlocks = function(){
"Tabulates the blocks for a triplet."
message(paste("Tabulating blocks for the triplet", paste(ContigNames, collapse=":")))
# Check that the tables are present, if they aren't, turn them into blank data.frames.
temps <- lapply(1:3, function(i) do.call(rbind, Blocks[[i]]))
SS <- lapply(1:3, function(i) floor(as.numeric(rownames(temps[[i]]))))
pair <- lapply(1:3, function(i) rep(names(Blocks)[[i]], nrow(temps[[i]])))
temp2 <- do.call(rbind, temps)
if(blocksNotDated()){
temp2 <- cbind(temp2, data.frame(fiveAge = rep(NA, times=nrow(temp2)), fiftyAge = rep(NA, times=nrow(temp2)), ninetyfiveAge = rep(NA, times=nrow(temp2)), SNPnum = rep(NA, times=nrow(temp2)), PValue = rep(NA, times=nrow(temp2)), PThresh = rep(NA, times=nrow(temp2)), MeanAge = rep(NA, times=nrow(temp2)), CorrectedSNPs = rep(NA, times=nrow(temp2))))
}
temp2["SequencePair"] <- unlist(pair)
temp2["SequenceSimilarityThreshold"] <- unlist(SS)
if(nrow(temp2) > 0){
temp2["Triplet"] <- paste(ContigNames, collapse=":")
} else {
temp2["Triplet"] <- character()
}
return(temp2)
},
plotTriplet = function(plottingSettings){
if("Lines" %in% plottingSettings$What && "Bars" %in% plottingSettings$What){
plottedbars <- plotBars(plottingSettings)
plottedlines <- plotLines(plottingSettings)
together <- arrangeGrob(plottedbars, plottedlines, ncol=1)
class(together) <- c("HC_LB_Plot", class(together))
return(together)
} else {
if("Lines" %in% plottingSettings$What){
plottedlines <- plotLines(plottingSettings)
class(plottedlines) <- c("HC_L_Plot", class(plottedlines))
return(plottedlines)
}
if("Bars" %in% plottingSettings$What){
plottedbars <- plotBars(plottingSettings)
class(plottedbars) <- c("HC_B_Plot", class(plottedbars))
return(plottedbars)
}
}
},
plotLines =
function(plottingSettings){
"Method plots a lineplot using ggplot2 of the sequence similarity data from the scan."
if(noScanPerformed()){stop("No sequence similarity scan has been performed for this triplet.")}
combo <- unlist(lapply(combn(ContigNames, 2, simplify=FALSE), function(x) paste(x, collapse=":")))
data <- ScanData$Table
plotting.frame <- data.frame(basepos = rep(data$ActualCenter,3),
yvalues = c(data$AB, data$AC, data$BC),
factors = rep(1:3, each = nrow(data)))
plot <- ggplot(plotting.frame, aes(x=basepos, y=yvalues)) + geom_line(aes(colour=factor(factors)), show_guide=plottingSettings$Legends, size=0.8) +
ylim(0,100) +
scale_colour_manual(name = "Pairwise Comparrisons", labels=c(combo[1], combo[2], combo[3]),values=c("yellow","purple","cyan")) +
xlab("Base Position") +
ylab("% Sequence Similarity")
plot <- applyPlottingParams(plot, plottingSettings, title = paste("Sequence Similarity Between Sequences for Triplet ", ContigNames[1], ":", ContigNames[2], ":", ContigNames[3], sep=""))
return(plot)
},
plotBars =
function(plottingSettings){
"Method plots the heatmap based graphic of bars, from the sequence similarity scan data."
if(noScanPerformed()){stop("No sequence similarity scan has been performed for this triplet.")}
# Generate the reference colour palette.
colourPalette <- expand.grid(A = seq(0, 100, by = 1), B = seq(0, 100, by = 1))
colourPalette$RefA <- rgb(green = colourPalette$A, red = 100, blue = colourPalette$B, maxColorValue = 100)
colourPalette$RefB <- rgb(green = 100, red = colourPalette$A, blue = colourPalette$B, maxColorValue = 100)
colourPalette$RefC <- rgb(green = colourPalette$B, red = colourPalette$A, blue = 100, maxColorValue = 100)
# Now figure out the scale and data to go into each vertical bar: TODO - Put this in a function.
div <- FullDNALength / plottingSettings$MosaicScale
frame <- data.frame(bpstart = seq(from = 1, to = FullDNALength, by = div),
bpend = seq(from=div, to = FullDNALength, by = div))
frame$bpX <- round(frame$bpstart + (div / 2))
scanTable <- ScanData$Table
AB <- round(apply(frame, 1, function(x) vertbar_create(scanTable, x, 7)))
AC <- round(apply(frame, 1, function(x) vertbar_create(scanTable, x, 8)))
BC <- round(apply(frame, 1, function(x) vertbar_create(scanTable, x, 9)))
rm(scanTable)
frame$AB <- AB
frame$AC <- AC
frame$BC <- BC
frame$X <- 1:nrow(frame)
rm(AB, AC, BC)
if(any(is.nan( as.numeric(frame$AB))) || any(is.nan( as.numeric(frame$AC))) || any(is.nan( as.numeric(frame$BC)))){
warning("\nNot a numbers (NaNs)! have been detected in the plotting frame.\n
The common cause of this is a small alignment or few informative sites in the data,
with a too high MosaicScale parameter.\nThis usually happens at either end of the
bars and the NaNs will be dealt with my filling them in black.\n\nTo get rid of them use a lower MosaicScale parameter.")
}
A_mix <- apply(frame, 1, function(x) col_deter(x[c(4,5)], colourPalette[,c(1,2,3)]))
B_mix <- apply(frame, 1, function(x) col_deter(x[c(4,6)], colourPalette[,c(1,2,4)]))
C_mix <- apply(frame, 1, function(x) col_deter(x[c(5,6)], colourPalette[,c(1,2,5)]))
frame$A_mix <- A_mix
frame$B_mix <- B_mix
frame$C_mix <- C_mix
rm(A_mix, B_mix, C_mix)
plottingFrame <- data.frame(X = frame$X, Y = rep(c(3, 2, 1), each = plottingSettings$MosaicScale), colour = c(frame$A_mix, frame$B_mix, frame$C_mix))
bars <- ggplot(plottingFrame, aes(x = X, y = as.factor(Y))) +
geom_raster(aes(fill = colour)) + scale_fill_identity() +
xlab("Approximate Base Position") +
ylab("Sequence Name") +
scale_x_continuous(breaks = c(seq(from = 1, to = plottingSettings$MosaicScale, by = plottingSettings$MosaicScale / 10), plottingSettings$MosaicScale), labels = c(frame$bpX[seq(from = 1, to = plottingSettings$MosaicScale, by = plottingSettings$MosaicScale / 10)], max(frame$bpX))) +
scale_y_discrete(labels = c(ContigNames[3], ContigNames[2], ContigNames[1]))
bars <- applyPlottingParams(bars, plottingSettings, title = paste("Sequence Similarity Between Sequences for Triplet ", ContigNames[1], ":", ContigNames[2], ":", ContigNames[3], sep=""))
if(plottingSettings$Legends == T){
legend <- readPNG(system.file("extdata/rgblegend.png", package="HybridCheck"), TRUE)
if (names(dev.cur()) == "windows"){
transparent <- legend[,,4] == 0
legend <- as.raster(legend[,,1:3])
legend[transparent] <- NA
}
legendgrob <- grid::rasterGrob(image=legend)
bars <- arrangeGrob(bars, legendgrob, widths = c(1, 0.13), ncol = 2)
}
return(bars)
}
)
)
vertbar_create <- function(sequenceSimilarityTable, plottingFrameRow, whichComparrison){
bool1 <- sequenceSimilarityTable$ActualStart <= plottingFrameRow[2]
bool2 <- plottingFrameRow[1] <= sequenceSimilarityTable$ActualEnd
index <- which(bool1 == bool2)
return(mean(sequenceSimilarityTable[index, whichComparrison]))
}
col_deter <- function(invalues, reference){
if(any(is.nan(as.numeric(invalues)))){
cols <- "#000000"
} else {
cols <- reference[(reference[, 1] == as.numeric(invalues[1])) & reference[,2] == as.numeric(invalues[2]), 3]
}
return(cols)
}
#' Reference class storing all triplets in a HC analysis.
#' @name Triplets
#' @description The Triplets reference class stores and manages operations over many Triplet objects.
Triplets <- setRefClass("Triplets",
fields = list(
triplets = "list"
),
methods = list(
initialize =
function(dna){
"Initializes the object."
triplets <<- list()
},
tripletsGenerated =
function(){
"Returns TRUE if the triplet generations have been prepared for."
return(length(triplets) > 0)
},
matchNames =
function(selection){
if(!tripletsGenerated()){stop("No triplets have been prepared yet.")}
return(unlist(lapply(triplets, function(x) sum(selection %in% x$ContigNames))))
},
deleteAllTriplets =
function(){
"Removes all current triplet data completely."
message("Deleting all triplets data.")
triplets <<- list()
},
generateTriplets =
function(dna, csettings, basefile){
"Initializes all triplet objects based on the combination settings"
if(csettings$Modified){
if(tripletsGenerated()){
deleteAllTriplets()
}
message("Initializing new triplets data.")
seqlength <- dna$getFullLength()
seqnames <- dna$getSequenceNames()
triplets <<- lapply(csettings$AcceptedCombinations, function(x) Triplet$new(sequencenames = c(x[1], x[2], x[3]), fullseqlength = seqlength, basefile))
csettings$Modified <- FALSE
}
},
# updateTriplets =
# function(dna, settings, basefile){
# got <- getAllNames()
# to.remove <- which(!unlist(lapply(got, function(x) x %in% settings$AcceptedCombinations)))
# to.add <- which(!unlist(lapply(settings$AcceptedCombinations, function(x) x %in% got)))
# triplets <<- triplets[-to.remove]
# triplets <<- append(triplets, makeTriplets(settings$AcceptedCombinations[to.add], dna, basefile))
# },
#
# makeTriplets =
# function(selection, dna, basefile){
# seqlength <- dna$getFullLength()
# return(lapply(selection, function(x) Triplet$new(sequencenumbers = x, sequences = c(x[1], x[2], x[3]), fullseqlength = seqlength, basefile)))
# },
scanTriplets =
function(tripletSelections, dna, scansettings){
if(!tripletsGenerated()){stop("No triplets have been prepared yet.")}
tripletsToScan <- getTriplets(tripletSelections)
for(tripletToScan in tripletsToScan){
scan.similarity(dna, tripletToScan, scansettings)
}
},
findBlocks =
function(tripletSelections, findSettings){
if(!tripletsGenerated()){stop("No triplets have been prepared yet.")}
tripletsToFindIn <- getTriplets(tripletSelections)
for(triplet in tripletsToFindIn){
triplet$putativeBlockFind(findSettings)
}
},
dateBlocks =
function(tripletSelections, dateSettings, dna){
if(!tripletsGenerated()){stop("No triplets have been prepared yet.")}
tripletsToDate <- getTriplets(tripletSelections)
for(triplet in tripletsToDate){
triplet$blockDate(dna, dateSettings)
}
},
plotTriplets = function(tripletSelections, plotSettings){
tripletsToPlot <- getTriplets(tripletSelections)
return(lapply(tripletsToPlot, function(x) x$plotTriplet(plotSettings)))
},
tabulateBlocks = function(tripletSelections, neat){
if(!tripletsGenerated()){stop("No triplets have been prepared yet.")}
tripletsToTabulate <- getTriplets(tripletSelections)
listedTabulates <- lapply(tripletsToTabulate, function(x) x$tabulateBlocks())
collected <- do.call(rbind, listedTabulates)
output <- data.frame(collected$Triplet, collected$SequencePair, collected$SequenceSimilarityThreshold, collected$Length,
collected$First, collected$Last, collected$FirstBP, collected$LastBP, collected$ApproxBpLength, collected$SNPs, collected$CorrectedSNPs, collected$fiveAge, collected$fiftyAge,
collected$ninetyFiveAge, collected$P_Value, collected$P_Threshold)
if(neat){
output <- output[,-c(4,5,6)]
names(output) <- c("Triplet", "Sequence_Pair","Sequence_Similarity_Threshold","First_BP_Position", "Last_BP_Position", "Approximate_Length_BP", "Number_of_SNPs", "Corrected_Number_of_SNPs", "p=0.05_Age", "p=0.5_Age","p=0.95_Age","P_Value", "P_Thresh")
}
return(output)
},
show =
function(){
cat("A total of ")
},
tripletCombinations =
function(){
return(unlist(lapply(triplets, function(x){paste0(x$ContigNames, collapse=", ")})))
},
htmlSummary =
function(){
"Prints a HTML summary of all the different triplet combinations."
tripnames <- paste0(lapply(triplets, function(x){paste0(paste0(x$ContigNames, collapse=", "), " <br> ", collapse="")}), collapse="")
output <- paste0("<h2>TripletCombinations</h2>")
return(tripnames)
},
getTriplets =
function(selections){
"Returns a list of references to triplets according to user selection."
if(!is.list(selections)){
selections <- list(selections)
}
selections <- unique(selections)
if(!is.null(selections) && length(selections) > 0){
ind <- numeric()
if("ALL" %in% selections){
ind <- 1:length(triplets)
} else {
if("NOT.SCANNED" %in% selections){
ind <- c(ind, which(unlist(lapply(triplets, function(x) x$noScanPerformed()))))
selections <- selections[which(selections != "NOT.SCANNED")]
}
if("NOT.SEARCHED" %in% selections){
ind <- c(ind, which(unlist(lapply(triplets, function(x) x$blocksNotFound()))))
selections <- selections[which(selections != "NOT.SEARCHED")]
}
if("NOT.DATED" %in% selections){
ind <- c(ind, which(unlist(lapply(triplets, function(x) x$blocksNotDated()))))
selections <- selections[which(selections != "NOT.DATED")]
}
if(any(unlist(lapply(selections, length)) != 3)){stop("Selections must provide a vector of 3 sequence names.")}
if(any(unlist(lapply(selections, function(x) !is.character(x))))){stop("Selections must be of class character.")}
allNames <- do.call(rbind, getAllNames())
ind <- c(ind, unlist(lapply(selections, function(x) which(allNames[,1] %in% x & allNames[,2] %in% x & allNames[,3] %in% x))))
}
ind <- unique(ind)
return(triplets[ind])
} else {
stop("No selection of triplet was provided.")
}
},
getAllNames =
function(){
"Returns the names of the sequences in each triplet as a list."
return(lapply(triplets, function(x) x$ContigNames))
},
getAllIndexes =
function(){
"Returns the indexes of the sequences (according to their rows in the HC sequence object) in each triplet as a list."
return(lapply(triplets, function(x) x$ContigIndexes))
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.