#' Create dataframe from two dist objects
#' Created dataframe from distance matrices calculated for different regions of sequence alignment
#' @param dist1 distance matrix for region1 in sequence alignment, object of class `dist`
#' @param dist2 distance matrix for region2 in sequence alignment, object of class `dist`
#' @return dataframe with columns "row, col, value.x, value.y" - where row and col contain sequence names, x.value contains distances in region1, y.value contains distances in region2.
#' @import spaa
#' @noRd
dist_to_df = function(dist1, dist2){
#convert dist object to dataframe
d1 = dist2list(dist1)
d2 = dist2list(dist2)
d = merge(d1, d2, by=c("row", "col"))
return(d)
}
#' Pairwise nucleotide Distance Correspondence Plot
#'
#' Each dot corresponds to a pair of nucleotide distances between
#' the same pair of genomes in two genomic regions with alignment positions `st1-e1` and `st2-e2` (see axis).
#' Returns list with ggplot, matrices of distances between pairs of sequences calculated for
#' `st1-e1` and `st2-e2` regions
#' @param dna_object list of DNA sequences produced by `read.dna` function of `ape` package (`as.character = TRUE` mode)
#' @param st1 start position of genome region 1
#' @param e1 end position of genome region 1
#' @param st2 start position of genome region 2
#' @param e2 end position of genome region 2
#' @return list with ggplot object with PDC plot for two regions, dataframe with distances in region 1 and region 2
#' @export
#' @import ape
#' @import ggplot2
#' @examples
#' \dontrun{
#' alignment = read.dna(path/to/file, format="fasta", as.character=TRUE)
#' alignment[alignment=='-'] <- NA
#' plot_PDCP(alignment, 1, 500, 600, 1000)}
plot_PDCP = function(dna_object, st1,e1,st2,e2){
# subalignments for st1-e1 and st2-e2 regions
dna_sl1=dna_object[1:length(dna_object[,1]), seq(from = st1, to = e1, by=1)]
dna_sl2=dna_object[1:length(dna_object[,1]), seq(from = st2, to = e2, by=1)]
# distance matrices for each region
dna_sl_dist1 <-dist.gene(dna_sl1, method = "percentage", pairwise.deletion = TRUE)
dna_sl_dist2 <-dist.gene(dna_sl2, method = "percentage", pairwise.deletion = TRUE)
# adding random noise to distances matrices' values
dist1= as.vector(dna_sl_dist1) + rnorm(length(dna_sl_dist1),mean = 0,sd= 0.0001)
dist2= as.vector(dna_sl_dist2) + rnorm(length(dna_sl_dist2),mean = 0,sd= 0.0001)
# pairwise nucleotide distance correpondence plot
dist_plot=ggplot(data.frame(dist1,dist2),aes(dist1,dist2))+stat_bin2d(binwidth = 0.002)+
scale_fill_gradientn(colours=c("blue","red"))+ theme(legend.justification=c(1,0), legend.position=c(1,0))+
xlab(paste(toString(st1),toString(e1),sep=":"))+ylab(paste(toString(st2),toString(e2),sep=":"))
#+ geom_smooth(method='lm',formula=y~x)
dist_df = dist_to_df(dna_sl_dist1, dna_sl_dist2)
return(list(dist_plot, dist_df))
}
#' Control for Pairwise nucleotide Distance Correspondence Plot
#'
#' Plots control for PDC plot. Control plot visualizes the correspondence of p-distances in subalignments with odd and even positions of original alignment.
#' @param dna_object list of DNA sequences produced by `read.dna` function of `ape` package (`as.character = TRUE` mode)
#' @param st1 start position of genome region 1
#' @param e1 end position of genome region 1
#' @param st2 start position of genome region 2
#' @param e2 end position of genome region 2
#' @return list with ggplot object with control plot for two regions, dataframe with distances in concatenated odd and even positions of alignment
#' @export
#' @import ape
#' @import ggplot2
#' @examples
#' \dontrun{
#' alignment = read.dna(path/to/file, format="fasta", as.character=TRUE)
#' alignment[alignment=='-'] <- NA
#' plot_PDCP_control(alignment, 1, 500, 600, 1000)}
plot_PDCP_control = function(dna_object, st1,e1,st2,e2){
# join alignment slices for control plot
aln_control = cbind(dna_object[,st1:e1], dna_object[,st2:e2])
# subalignments for odd and even sites
al_odd=aln_control[1:length(aln_control[,1]), seq(from = 1, to = length(aln_control[1,]), by=2)]
al_even=aln_control[1:length(aln_control[,1]), seq(from = 2, to = length(aln_control[1,]), by=2)]
# distance matrices for each region
dna_sl_dist1_odd <-dist.gene(al_odd, method = "percentage", pairwise.deletion = TRUE)
dna_sl_dist2_even <-dist.gene(al_even, method = "percentage", pairwise.deletion = TRUE)
# adding random noise to distances matrices' values
dist1_odd = as.vector(dna_sl_dist1_odd) + rnorm(length(dna_sl_dist1_odd),mean = 0,sd= 0.0001)
dist2_even = as.vector(dna_sl_dist2_even) + rnorm(length(dna_sl_dist2_even),mean = 0,sd= 0.0001)
dist_plot=ggplot(data.frame(dist1_odd,dist2_even),aes(dist1_odd,dist2_even))+stat_bin2d(binwidth = 0.002)+
scale_fill_gradientn(colours=c("blue","red"))+ theme(legend.justification=c(1,0), legend.position=c(1,0))+
xlab(paste(toString(st1),toString(e1),sep=":"))+ylab(paste(toString(st2),toString(e2),sep=":"))
#+ geom_smooth(method='lm',formula=y~x)
dist_df = dist_to_df(dna_sl_dist1_odd, dna_sl_dist2_even)
return(list(dist_plot, dist_df))
}
#' Pairwise nucleotide Distance Correspondence Plot with control
#'
#' This function plots PDC plot and its control on the same figure. Each dot corresponds to a pair of nucleotide distances between
#' the same pair of genomes in two genomic regions - with alignment positions `st1-e1` and `st2-e2` (see axis).
#' Returns list with ggplot and dataframe with pairwise distances calculated for `st1-e1` and `st2-e2` regions
#' @param dna_object list of DNA sequences produced by `read.dna` function of `ape` package (`as.character = TRUE` mode)
#' @param st1 start position of genome region 1
#' @param e1 end position of genome region 1
#' @param st2 start position of genome region 2
#' @param e2 end position of genome region 2
#' @return list with ggplot object with PDC and control plot for two regions, dataframe with distances in two regions of alignment and in concatenated odd and even positions of alignment
#' @export
#' @import ape
#' @import ggplot2
#' @examples
#' \dontrun{
#' alignment = read.dna(path/to/file, format="fasta", as.character=TRUE)
#' alignment[alignment=='-'] <- NA
#' plot_PDCP_with_control(alignment, 1, 500, 600, 1000)}
plot_PDCP_with_control = function(dna_object, st1,e1,st2,e2){
# join alignment slices for control plot
aln_control = cbind(dna_object[,st1:e1], dna_object[,st2:e2])
# subalignments for odd and even sites
al_odd=aln_control[1:length(aln_control[,1]), seq(from = 1, to = length(aln_control[1,]), by=2)]
al_even=aln_control[1:length(aln_control[,1]), seq(from = 2, to = length(aln_control[1,]), by=2)]
# distance matrices for each region
dna_sl_dist1_odd <-dist.gene(al_odd, method = "percentage", pairwise.deletion = TRUE)
dna_sl_dist2_even <-dist.gene(al_even, method = "percentage", pairwise.deletion = TRUE)
# adding random noise to distances matrices' values
dist1_odd = as.vector(dna_sl_dist1_odd) + rnorm(length(dna_sl_dist1_odd),mean = 0,sd= 0.0001)
dist2_even = as.vector(dna_sl_dist2_even) + rnorm(length(dna_sl_dist2_even),mean = 0,sd= 0.0001)
# alignment slices
dna_sl1=dna_object[1:length(dna_object[,1]), seq(from = st1, to = e1, by=1)]
dna_sl2=dna_object[1:length(dna_object[,1]), seq(from = st2, to = e2, by=1)]
# distance matrices for each region
dna_sl_dist1 <-dist.gene(dna_sl1, method = "percentage", pairwise.deletion = TRUE)
dna_sl_dist2 <-dist.gene(dna_sl2, method = "percentage", pairwise.deletion = TRUE)
# adding random noise to distances matrices' values
dist1= as.vector(dna_sl_dist1) + rnorm(length(dna_sl_dist1),mean = 0,sd= 0.0001)
dist2= as.vector(dna_sl_dist2) + rnorm(length(dna_sl_dist2),mean = 0,sd= 0.0001)
df1 = data.frame(dist1, dist2, factor="real")
colnames(df1) = c('dist1', 'dist2')
df2 = data.frame(dist1_odd, dist2_even, factor="control")
colnames(df2) = c('dist1', 'dist2')
df = rbind(df1, df2)
colnames(df) = c('dist1', 'dist2', 'cond')
PDCP = ggplot(data = df, aes(dist1, dist2)) + geom_point(aes(color=cond)) + scale_color_manual(values = c("dimgrey", "dodgerblue4"))+
theme(legend.justification=c(1,0), legend.position=c(1,0)) +
xlab(paste(toString(st1),toString(e1),sep=":"))+ylab(paste(toString(st2),toString(e2),sep=":"))+
guides(color = guide_legend(title = ""))
return(list(PDCP, df))
}
#' Find sequence pairs which pairwise genetic distances follow the condition
#' Returns dataframe with names of sequences pairs which pairwise nucleotide distances lay between values
#'`val11-val12` in genomic region 1 and values `val21-val22` in genomic region 2.
#' @param distM1 matrix with pairwise nucleotide distances between sequences in genome region 1
#' @param val11 the lowest value in distance interval in region 1
#' @param val12 the highest value in distance interval in region 1
#' @param distM2 matrix with pairwise nucleotide distances between sequences in genome region 2
#' @param val21 the lowest value in distance interval in region 2
#' @param val22 the highest value in distance interval in region 2
#' @return dataframe with the following columns: "name1 name2 distance_in_region1 distance_in_region2"
#' @export
find_recomb_names <- function(distM1, val11, val12, distM2, val21, val22){
#positions of matrix for region 1 with values between val11 and val12
b1 = find_dist_slice(distM1,val11, val12)
#positions of matrix for region 2 with values between val21 and val22
b2 = find_dist_slice(distM2,val21, val22)
#intersection of b1 and b2
b= intersect(b1,b2)
len_b = length(distM1[1,])
#returns positions of rows in matrix
r = sapply(b,function(x){if(x%%len_b==0){return(len_b)} else{return(x%%len_b)}})
#returns positions of columns in matrix
c = sapply(b,function(x){if(x%%len_b==0){return(x%/%len_b)} else{return(x%/%len_b+1)}})
#values of rows and columns
c_names <- sapply(c,function(x) {colnames(distM1)[x]})
r_names <- sapply(r,function(x) {rownames(distM1)[x]})
#print(c_names)
#print(r_names)
df = cbind(c_names,r_names)
#sorts names in alphabetical order
sort_str = function(x){
if (x["c_names"]>x["r_names"]) return(as.vector(x))
else {return(c(x["r_names"],x["c_names"]))}
}
sorted = data.frame(unique(t(apply(df,1,sort_str))),stringsAsFactors = FALSE)
colnames(sorted) = c(1,2)
# values in matrices
x = apply(sorted, 1, function(x){distM1[which(rownames(distM1) == x[1]),which(colnames(distM1) == x[2])]})
y = apply(sorted, 1, function(x){distM2[which(rownames(distM1) == x[1]),which(colnames(distM1) == x[2])]})
sorted = cbind(sorted,x,y)
return(sorted)
}
#'Returns positions in matrix which values are between val1 and val2
#' @noRd
find_dist_slice<-function(distM, val1, val2){
b = which((distM>=val1)&(distM<=val2))
#print(which((distM>val1)&(distM<val2),arr.ind=T))
return(b)
}
#'Plots pairwise distance comparison plots for different pairs of regions
#'
#'dna_object - list of DNA sequences (class DNAbin)
#'step - step for dna regions/ window for regions is equal to step
#'method - method of calculation distances ("pdist", "JC", "Kimura", "TN")
#'fig_dir - path to directory for saving figures
#'name_fig - prefix for figure name
#'created directory fig_dir/step and saves pairwise distance comparison plot there
#' @noRd
plot_dist = function(dna_object, step, method, fig_dir, name_fig){
length_aln = length(dna_object[1,]) #length of alignment
num_seq = length(dna_object[,1]) # number od sequences in alignment
starts = seq(from=1, to = length_aln, by = step) #start positions of regions
ends = seq(from=step, to = length_aln, by = step) #end positions of regions
if (length_aln%%step>0){ends=c(ends,length_aln)}
#positions of genomes regions
df_intervals = cbind(starts,ends)
#list with distance matrices for each region
dist_matrices = list()
for (i in 1:nrow(df_intervals)){
#slice of alignment
slice = dna_object[1:num_seq, seq(from = df_intervals[i,"starts"], to = df_intervals[i,"ends"], by=1)]
#dist_matrices[[i]] = dist.dna(slice, as.matrix = TRUE, model = "JC69")
#calculation of distance matrix
if (method == "pdist"){dist_matrices[[i]] = dist.gene(slice, method = "percentage", pairwise.deletion = TRUE)}
else {
if (method == "JC"){dist_matrices[[i]] = dist.dna(slice, as.matrix = TRUE, model = "JC69")}
if (method == "Kimura"){dist_matrices[[i]] = dist.dna(slice, as.matrix = TRUE, model = "K80")}
if (method == "TN"){dist_matrices[[i]] = dist.dna(slice, as.matrix = TRUE, model = "K80")}
#else{print("Unknown method")}
}
}
#list with ggplots of distance comparison plots for different pairs of regions
ggplots = list()
k=1
for(i in 1:(nrow(df_intervals)-1)){
for(j in (i+1):nrow(df_intervals)){
#xlab name
rname1 = paste(toString(df_intervals[i,"starts"]),"-",toString(df_intervals[i,"ends"]),sep="")
#ylab name
rname2 = paste(toString(df_intervals[j,"starts"]),"-",toString(df_intervals[j,"ends"]),sep="")
#dist1 = dist_matrices[[i]][lower.tri(dist_matrices[[i]],diag = FALSE)]+ rnorm(length(dist_matrices[[i]][lower.tri(dist_matrices[[i]],diag = FALSE)]), mean=0, sd = 0.0001)
#dist2 = dist_matrices[[j]][lower.tri(dist_matrices[[j]],diag = FALSE)]+ rnorm(length(dist_matrices[[j]][lower.tri(dist_matrices[[j]],diag = FALSE)]), mean=0, sd = 0.0001)
#adding random noise to values
if (method == "pdist"){
dist1 = as.vector(dist_matrices[[i]]+ rnorm(length(dist_matrices[[i]]), mean=0, sd = 0.0001))
dist2 = as.vector(dist_matrices[[j]] + rnorm(length(dist_matrices[[j]]), mean=0, sd = 0.0001))
}
else{
dist1 = dist_matrices[[i]][lower.tri(dist_matrices[[i]],diag = FALSE)]+ rnorm(length(dist_matrices[[i]][lower.tri(dist_matrices[[i]],diag = FALSE)]), mean=0, sd = 0.0001)
dist2 = dist_matrices[[j]][lower.tri(dist_matrices[[j]],diag = FALSE)]+ rnorm(length(dist_matrices[[j]][lower.tri(dist_matrices[[j]],diag = FALSE)]), mean=0, sd = 0.0001)
}
ggplots[[k]] = ggplot(data.frame(dist1,dist2),aes(dist1,dist2))+
stat_bin2d(binwidth = 0.005)+scale_fill_gradientn(colours=c("blue","red"))+
theme(legend.justification=c(1,0), legend.position=c(1,0))+
xlab(rname1)+ylab(rname2)
ggsave(ggplots[[k]],file= paste(fig_dir,name_fig,rname1,"_",rname2,".png",sep="") )
xirk=k+1
}
}
}
#' Calculate Pairwise Distance Deviation Matrix
#'
#' Returns pairwise distance deviation matrix. To build such matrix, the sliding window is moved along the alignment, and for each window the pairwise genetic distances between sequences are calculated. Then for each pair of windows the linear regression for pairwise genetic distances is built and the root mean square error of linear regression is estimated.
#' @param dna_object list of DNA sequences produced by `read.dna` function of `ape` package (`as.character = TRUE` mode)
#' @param window window size (length of genomic regions)
#' @param step step size for sliding process
#' @param method method of calculation distances ("pdist", "JC", "Kimura", "TN")
#' @param modification pairwise deletion of positions with gaps (`modification='pairwise'`) or not (`modification='NA'`)
#' @return matrix
#' @export
#' @import ape
#' @import sjstats
#' @examples
#' \dontrun{
#' library(colorRamps)
#' library(gplots)
#' alignment = read.dna(path/to/file, format="fasta", as.character=TRUE)
#' alignment[alignment=='-'] <- NA
#' pddmatrix = calc_PDDmatrix(alignment, 1, 500, 600, 1000)
#' heatmap.2(as.matrix(pddmatrix), Rowv = FALSE, Colv = "Rowv",
#' dendrogram = 'none', col=matlab.like, main="PDD matrix", tracecol=NA)}
calc_PDDmatrix = function(dna_object, step, window, method, modification=NA){
length_aln = length(dna_object[1,]) #length of alignment
num_seq = length(dna_object[,1]) # number of sequences in alignment
starts = seq(from=0, to=length_aln-window, by = step) # start positions of genomic regions
starts[1]=1
ends = seq(from=window, to = length_aln, by = step) # end positions of genomic regions
if (length_aln%%step>step){ends=c(ends,length_aln)}
df_intervals = cbind(starts,ends) #intervals
#names = apply(df_intervals, 1, function(x){paste(toString(x[1]),toString(x[2]),sep="_")})
#dataframe to store RMSE values of each comparison
rmse_df = data.frame(matrix(ncol=length(starts), nrow = length(starts)))
colnames(rmse_df)=starts
rownames(rmse_df)=starts
cat("Total number of intervals", nrow(df_intervals))
#list of distance matrices for each pair of genomic regions
dist_matrices = list()
for (i in 1:nrow(df_intervals)){
cat("\r", "Calculating distances in interval", i)
slice = dna_object[1:num_seq, seq(from = df_intervals[i,"starts"], to = df_intervals[i,"ends"], by=1)]
#dist_matrices[[i]] = dist.dna(slice, as.matrix = TRUE, model = "JC69")
if (method == "pdist"){
if (modification=="pairwise"){
dist_matrices[[i]] = dist.gene(slice, method = "percentage", pairwise.deletion = TRUE)}
else {
dist_matrices[[i]] = dist.gene(slice, method = "percentage", pairwise.deletion = FALSE)}
}
else {
if (method == "JC"){dist_matrices[[i]] = dist.dna(slice, as.matrix = TRUE, model = "JC69")}
if (method == "Kimura"){dist_matrices[[i]] = dist.dna(slice, as.matrix = TRUE, model = "K80")}
if (method == "TN"){dist_matrices[[i]] = dist.dna(slice, as.matrix = TRUE, model = "K80")}
#else{print("Unknown method")}
}
}
n = nrow(df_intervals)
print(length(dist_matrices))
for (i in 1:n){
cat("\r", "Calculating rmse in row", i)
for (j in i:n){
#fits pairwise distance comparison plots linear model, calculates rmse
rmse_i_j = (rmse(lm(dist_matrices[[j]]~dist_matrices[[i]])) + rmse(lm(dist_matrices[[i]]~dist_matrices[[j]]))) /2.0
#rmse_i_j = rmse(lm(dist_matrices[[j]]~dist_matrices[[i]]))
rmse_df[i,j] = rmse_i_j
rmse_df[j,i] = rmse_i_j
}
}
#print(rmse_df)
#colnames(rmse_df)
return(rmse_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.