#' Generates a similarity matrix
#' @description Takes a path to a directory that contains
#' the [CHR]_filt.snps files and processes each to generate
#' a similarity matrix using a given similarityFun. If no
#' similarityFun is given, it generates one using the default
#' Jaccard metric.
#'
#' @param similarityFun Similarity function with two parameters, 'i' and 'j'.
#' @param rm_nocov If no filtering process took place prior, setting this to TRUE
#' will remove all SNPs where no sample have any coverage (Default=FALSE)
#' @param sample_matrix character: path to csv or vcf file to run similarity match on
#' @param matchmode character: Either 'autosome' or 'chrM' (Default=autosome)
#' @param rem_ref boolean: Removes reference matches in chrM sample matching (0/0 and
#' 0/0) (Default=TRUE)
#' @param samples character: Comma-separated list of ordered samples for the matrix
#'
#' @importFrom utils read.csv
#' @importFrom stats na.omit
#' @importFrom assertthat assert_that
#' @return
#' Returns a list of similarity matrices, matrix of number of SNPs found between
#' two samples, and a matrix of number of heterozygous SNPs between two samples
#' @export
getSampleSimilarity <- function(sample_matrix, samples=NULL, matchmode='autosome',
similarityFun=NULL, rm_nocov=FALSE, rem_ref=TRUE){
assert_that(file.exists(sample_matrix), msg="sample_matrix must be a file path containing the samples to match")
## Read in the VCF or CSV file
mat <- switch(matchmode,
"autosome"={
mat <- read.csv(sample_matrix, header = TRUE)
if(!is.null(samples)) colnames(mat) <- strsplit(samples, ",")[[1]]
mat
},
"chrM"={
mega_vcf = read.csv(sample_matrix, sep = "\t", comment.char = "#")
mat = mega_vcf[,10:ncol(mega_vcf)]
if(!is.null(samples)) colnames(mat) <- strsplit(samples, ",")[[1]]
mat
},
stop("matchmode must be either 'autosome' or 'chrM'"))
## Run the similarity checks
mats <- switch(matchmode,
"autosome"=.autosomeMatch(mat, rm_nocov, similarityFun),
"chrM"=.chrmMatch(mat, rem_ref, similarityFun),
stop("matchmode must be either 'autosome' or 'chrM'"))
return(mats)
}
#' Sim-N plotter
#' @description Plot the similarity and n-matrix
#'
#' @param sim_mat Similarity matrix
#' @param n_mat Matrix of N's, same size and format as sim_mat
#' @param mid_diag boolean: Set diagonal of similarity matrix to midpoint (Default=FALSE)
#' @param midpoint Midpoint value of colour range (Default=0.5)
#'
#' @import ggplot2
#' @importFrom reshape2 melt
#' @export
plotSampleSimilarity <- function(sim_mat, n_mat, midpoint=0.5, mid_diag=FALSE){
# Order the matrix based on hclusts
cl <- hclust(dist(sim_mat))
# Remove the sim=1 of intersects by setting to midpoint
if(mid_diag){
diag(sim_mat) <- midpoint
}
# Melt and combine similarity and n-matrices
m_sim <- melt(sim_mat[cl$order,cl$order])
m_n <- melt(n_mat[cl$order,cl$order])
m_sim_n <- merge(m_sim, m_n, by=c('Var1', 'Var2'))
colnames(m_sim_n) <- c('Var1', 'Var2', 'similarity', 'n')
m_sim_n$Var1 <- factor(m_sim_n$Var1)
m_sim_n$Var2 <- factor(m_sim_n$Var2)
# Heatmap visualization of two-factor clustered matrix
ggplot(m_sim_n, aes_string(y='Var1', x='Var2')) +
geom_point(aes_string(colour='similarity', size='n')) +
scale_color_gradient2(low='blue', mid='gray', high='red',
midpoint=midpoint) +
theme_bw()
}
.autosomeMatch <- function(mat, rm_nocov, similarityFun){
mat <- as.matrix(mat)
storage.mode(mat) <- 'integer'
mat <- as.data.frame(mat)
if(rm_nocov){
# Removes SNPs where no samples have coverage for it
mat[mat==0] <- NA # Remove no coverage SNPs
nonhet_idx <- apply(mat,1,function(i) {
i <- na.omit(as.integer(i))
any(i==2) | (any(i==1) & any(i==3))
})
mat[!nonhet_idx,] <- NA # Removes SNPs that have no het cov
}
if(is.null(similarityFun)){
similarityFun <- function(i,j){
# Heterozygous SNPs in i and j
# Divided by
# SNPs heterozygous in i or j +
# SNPs that are ref in i and alt in j (vice versa)
ihet <- i==2
jhet <- j==2
#ihom <- (i==1 | i==3)
# i[which(ihom)] != j[which(ihom)]
sum(ihet & jhet, na.rm=T) /
(sum(ihet | jhet, na.rm=T)) #+ sum(i[which(ihom)] != j[which(ihom)], na.rm=T))
}
}
getN <- function(i,j){ sum(!is.na(i) & !is.na(j)) }
getHet <- function(i,j){ sum((i==2) & (j==2),na.rm=T) }
simmat <- allbyall(mat, margin=2, fun=similarityFun)
nmat <- allbyall(mat, margin=2, fun=getN)
hetmat <- allbyall(mat, margin=2, fun=getHet)
list("sim"=simmat,
"n"=nmat,
"het"=hetmat)
}
.chrmMatch <- function(mat, rem_ref, similarityFun){
# Parse haplotype caller into the first column (i.e. 0/0, or 1/1, or 0/1).
sample_geno <- apply(mat, 2, .getGT)
sample_count <- .cleanGenotype(as.matrix(sample_geno))
colnames(sample_count) <- colnames(mat)
## Read all the samples and count the genotypes####
if(is.null(similarityFun)){
similarityFun <- function(i,j){
# Checks that each samples SNP matches the other samples SNP
comp_idx <- c(1:length(i))
if(rem_ref){
#print("Removing SNPs which are ref in both samples")
refs <- i == '0/0' & j=='0/0'
comp_idx <- comp_idx[-which(refs)]
}
m <- sum(i[comp_idx] == j[comp_idx])
n <- length(comp_idx)
jacc <- sum(m)/n
jacc
# return(c(jacc,n))
}
}
getN <- function(i,j){
comp_idx <- c(1:length(i))
if(rem_ref){
#print("Removing SNPs which are ref in both samples")
refs <- i == '0/0' & j=='0/0'
comp_idx <- comp_idx[-which(refs)]
}
return(length(comp_idx))
}
jacc_mat <- allbyall(sample_count, margin=2, fun=similarityFun)
n_mat <- allbyall(sample_count, margin=2, fun=getN)
list("sim"=jacc_mat,
"n"=n_mat)
}
.getGT <- function(x, gt_idx=1){
split_gt <- strsplit(as.vector(x), split = ":")
sapply(split_gt, function(i) i[[gt_idx]])
}
.cleanGenotype <- function(b) {
# GT genotype, encoded as alleles values separated by either of ”/” or “|”,
# e.g. The allele values are 0 for the reference allele (what is in the
# reference sequence), 1 for the first allele listed in ALT, 2 for the
# second allele list in ALT and so on. For diploid calls examples could be
# 0/1 or 1|0 etc. For haploid calls, e.g. on Y, male X, mitochondrion,
# only one allele value should be given. All samples must have GT call
# information; if a call cannot be made for a sample at a given locus, ”.”
# must be specified for each missing allele in the GT field (for example
# ./. for a diploid). The meanings of the separators are:
stopifnot(any(class(b)=='matrix'))
b <- gsub("\\|", "/", b) # convert phased to unphased
b[b == '.'] <- "0/0" # convert No-call to HOMREF
# Deeper sanity checks
splgt <- sapply(strsplit(as.vector(b), "/"), function(i) i[1:2])
class(splgt) <- 'integer'
# CHECK_1: Flips 1/0 to 0/1
hetidx <- which(splgt[2,] != splgt[1,])
rev_hetidx <- splgt[1,hetidx] > splgt[2,hetidx]
if(any(rev_hetidx)){
for(i in which(rev_hetidx)){
splgt[,hetidx[i]] <- rev(splgt[,hetidx[i]])
}
}
# CHECK_2: ...
if(FALSE){
}
genotype <- matrix(apply(splgt, 2, paste, collapse="/"), ncol=ncol(b))
return(genotype)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.