# For help see <http://sift.bii.a-star.edu.sg/www/SIFT_help.html>
#' Remove identical sequences in a list alignment
#'
#' Given an alignment of sequences (as a list),
#' Remove sequences that have 100% identity to query (first element)
#'
#' @param aln alignment list of sequences
#' @return alignment list with identical sequences removed
removeIdenticalSeqs <- function(aln){
id = sapply(aln, percentIdentity, aln[[1]]) >= 100 #%
# Always keep query
id[1] = FALSE
return(aln[!id])
}
#' Compute percent identity between two sequences
#'
#' @param comp sequence being compared
#' @param seq suery sequence
#'
#' @return percent identity between both sequences
percentIdentity <- function(comp, seq){
ident = sum(seq == comp)
ident = ident - sum(comp == 'X')
round( ident / length(seq ) * 100)
}
#' SIFT version of median
#'
#' The way medians are computed in sift C code is a bit different
#' This implements the median function used by sift
#'
#' @param arr vector of numeric values
#'
#' @return median value
medianOfArray <- function(arr){
n = length(arr)
if(n == 1) return(array)
arr = sort(arr)
z = length(arr)/2
if( (n %% 2) == 0 ){
return(arr[z])
} else {
return((arr[floor(z)] + arr[ceiling(z)])/2)
}
}
#' Check lengths of sequences are equal
#'
#' This checks that the lengths of each sequence in the
#' alignment is equal to the length of the query
#'
#' @param aln alignment list of sequences
#'
#' @return will throw error if there is at least one unequal sequence length
#' otherwise, will return length of the query
checkUnequalSequenceLengths <- function(aln){
# Get lengths of all sequences
seq_len = sapply(aln, length)
# Stop if they're not all the same as the query length
if(! all(seq_len == seq_len[1])) stop('Unequal length of sequences')
# Return number of positions
seq_len[1]
}
#' Compute matrix of letters for the alignment
#'
#' This takes all n sequences, of m positions and constructs
#' a matrix containing sequence amino acid x position
#' amino acids are represented as integers 1-20, 21 non-AA character
#'
#' @param aln alignment list of sequences
#' @param alphabet character vector of alphabet (amino acids used)
#' @param npos number of positions in the alignment
#'
#' @return matrix of amino acid indices for each sequence/position
letterMatrix <- function(aln, alphabet, npos){
# Unlist all residues
split = unlist( aln, F, F )
# Convert to numeric
split = as.numeric( factor(split, levels = alphabet) )
split[is.na(split)] = length(alphabet) + 1
# Reformat into a matrix with correct number of rows/columns
aln_mat = t( matrix(split, npos, length(split)/npos ) )
aln_mat
}
#' Get median information content for each position
#'
#' @param aln_mat alignment matrix generated from letterMatrix
#' @param frq precomputed amino acid frequency in the proteome
#' @param cores number of cores for parallel processing (default: 1)
#'
#' @return Numeric vector with median information content for each position
medianInfoContent <- function(aln_mat, frq, cores=1){
# Used for information content calculation
ln2 = logb(2.0)
hmax = logb(20) #20 amino caids
# Position profiles / what sequences to keep for each position
position_profiles = apply(aln_mat, 2, function(x) (x<21)+0)
# Convert to sequence
position_profiles_seq = apply(position_profiles, 2, paste0, collapse='')
# Get unique profiles / avoid running duplicates
unique_profiles = which(!duplicated(position_profiles_seq, MARGIN = 2))
# Match all positions back to the unique one
info_content_ind = match(position_profiles_seq,
position_profiles_seq[unique_profiles])
if(cores == 1){
# Single core using vapply (faster than sapply)
median_info_content = vapply(unique_profiles, FUN.VALUE = double(1),
positionInfoContent, aln_mat, frq, FALSE)
}else{
# Parallel process using mclapply
median_info_content = mclapply(unique_profiles, mc.cores = cores,
FUN = positionInfoContent, aln_mat, frq, FALSE)
median_info_content = simplify2array(median_info_content)
}
#positionInfoContent(1, aln_mat, F)
median_info_content = median_info_content[info_content_ind]
}
#' Get median information content for a given position
#'
#' Get sequences that have no gap or X in position i,
#' compute median information content accross all positions
#'
#' @param position alignment position of interest
#' @param aln_mat alignment matrix generated from letterMatrix
#' @param frq precomputed amino acid frequency in the proteome
#' @param small_sample_correction TRUE if small sample correction is applied
#' to information content calculation (not implemented currently, sift has it but doesn't use it)
#' @return information content value between 0 (conserved) to log2(20) (least conserved)
positionInfoContent <- function(position=1, aln_mat, frq, small_sample_correction=F){
# Used for information content calculation
ln2 = logb(2.0)
hmax = logb(20) #20 amino caids
# Number of positions
npos = ncol( aln_mat )
# Which sequences have valid residues
ind = aln_mat[,position] < 21
# Number of sequences we're computing info content on
nseqs = sum(ind)
# Handle case where we have 0 amino acids (gaps or non amino acids)
if(nseqs == 0) return(NA)
# Handle case where we have one sequence
if(nseqs == 1) return(log2(20))
# Extract sub matrix
aln_mat_curr = aln_mat[ind,]
# writeLines(sprintf('position = %s', position))
# Get pb weights
ptm <- proc.time()
pb_weights = pbWeights(aln_mat_curr)
# writeLines( sprintf('pbWeights took %f seconds', (proc.time() - ptm)['elapsed']) )
# Make PWM
ptm <- proc.time()
# Make PWM for sequences with information
pwm = makePWM( aln_mat_curr, pb_weights, frq )
# writeLines( sprintf('makePWM took %f seconds', (proc.time() - ptm)['elapsed']) )
ptm <- proc.time()
# Compute information content for each position
info_per_pos = sapply(1:npos, function(i){
pos_data = pwm[,i]
dtemp = pos_data / sum(pos_data, na.rm=T)
r = hmax
r = r + sum( dtemp * logb(dtemp), na.rm=T )
r / ln2
})
# writeLines( sprintf('infoContent took %f seconds', (proc.time() - ptm)['elapsed']) )
m = medianOfArray(info_per_pos)
# writeLines(sprintf('median_info=%s', m))
# writeLines('_______')
m
}
#' Compute a weight for each amino acid at each position
#'
#' These weights are used to compute a sequence weight
#' denoting how much information each sequence is contributing
#'
#' @param aln_mat alignment matrix generated from letterMatrix
#' @param return_all if TRUE, returns intermediate variables computed (pfm, diff_aas, etc), otherwise just weight matrix
#'
#' @return numeric matrix of n sequences x m positions, with a weight for each cell
pbWeights <- function(aln_mat, return_all=FALSE){
# Number of sequences/positions
npos = ncol( aln_mat )
nseq = nrow( aln_mat )
# Construct data table with position, amino acid, and freq (to be grouped)
pb_data = data.table(pos=rep(1:npos, each = nseq),
aa=as.integer(aln_mat), freq=1)
# Compute amino acid frequency (group by pos and aa)
pb_counts = pb_data[, lapply(.SD, sum), by=c('pos', 'aa')]
# Pick valid amino acids non gaps and non X (1 to 20)
pb_counts_valid = pb_counts[pb_counts$aa < 21,]
# Compute number of different amino acids per position
diff_aas = ftable(factor(pb_counts_valid$pos, 1:npos))
# Compute pb_weight
pb_counts$pb = 1 / (pb_counts$freq * diff_aas[ pb_counts$pos ])
pb_counts = as.matrix( pb_counts[pb_counts$aa < 21,] )
# Construct pfm / pb matrix from data and assign counts
pb_mat = pfm = matrix(0, 21, npos)
pfm[ pb_counts[,2:1] ] = pb_counts[,3]
pb_mat[ pb_counts[,2:1] ] = pb_counts[,4]
# Loop over each position and get pb_weights per sequence (used to compute sequence weight)
dat = vapply(1:ncol(aln_mat), FUN.VALUE = numeric( nseq ), FUN=function(pos){
pb_mat[aln_mat[,pos],pos] # pb weight
})
if(return_all){
return( list(pb_mat = dat, pfm = pfm,
diff_aas = as.integer(diff_aas),
pb_counts=pb_counts, npos=npos, nseq=nseq) )
}
dat
}
#' Make PWM using pb weights and amino acid frequencies
#'
#' Computes a weight matrix of 20 amino acids x n positions
#' Each cell is the frequency of the corresponding amino acid in that position * weight
#'
#' @param aln_mat alignment matrix generated from letterMatrix
#' @param pb_weights weights computed from pbWeights function
#' @param frq precomputed amino acid frequency in the proteome
#'
#' @return weight matrix of 20 amino acids x number of positions
makePWM <- function(aln_mat, pb_weights, frq){
# Number of positions and sequences
npos = ncol( aln_mat )
nseq = nrow( aln_mat )
# Get weights for each sequence
seq_weights = rowSums(pb_weights, na.rm=T)
# Data for building pwm: position, amino acid and sequence weight (from pb_weights)
pwm_df = data.frame(pos=rep(1:npos, each = nseq),
aa=as.integer(aln_mat),
seq_w = rep(seq_weights, times = npos))
# Assign background amino acid frequencies
pwm_df$bg_freq = frq[ pwm_df$aa ]
# Compute new weight
pwm_df$new_weight = pwm_df$seq_w / pwm_df$bg_freq
# Construct data table (much faster to aggregate)
dt = as.data.table( pwm_df[,c('pos', 'aa', 'new_weight')] )
# Aggregate weights table by position and amino acid
dt = dt[, lapply(.SD, sum), by=c('pos', 'aa')]
# Choose only valid amino acids
dt = as.matrix(dt[dt$aa < 21,])
# Construct pwm and assign weights
pwm = matrix(0, 21, npos)
pwm[ dt[,2:1] ] = dt[,3]
pwm
}
#' Normalize pb weights matrix
#'
#' This takes wight matrix produced by pbWeights and normalizes
#' by number of sequences in alignment / sum of all weights
#'
#' @param weights weights computed from pbWeights function
#' @return normalized pb weight matrix
normalizeWeights <- function(weights){
w_sum = sum(weights)
num_sequences = nrow(weights)
factor = num_sequences / w_sum
weights = factor * weights
return(weights)
}
#' Normalizes weights computed by pbWeights
#'
#' This normalizes the pbWeights weight matrix using normalizeWeights
#' then groups by amino acid for 20 x number of position matrix
#' This is used to compute sume of weights for each position
#'
#' @param pb_weights weights computed from pbWeights function
#' @param aln_mat alignment matrix generated from letterMatrix
#' @return list of two elements: (1) matrix of normalized weights
#' grouped by amino acids (20 x number of positions)
#' (2) numeric vector of the sum of weights per positions
pbWeightsNorm <- function(pb_weights, aln_mat){
# Number of positions/sequences
npos = ncol(aln_mat)
nseq = nrow(aln_mat)
# Normalize sequence weights
pb_weights_norm = normalizeWeights(pb_weights$pb_mat)
# Get new sequence weights
seq_weights = rowSums(pb_weights_norm, na.rm=T)
# Construct data table (for fast aggregation)
dt = data.table(pos = rep( 1:npos, each = nseq ),
aa = as.integer( aln_mat ),
freq = 1,
seq_w = rep( seq_weights, times = npos ))
# Group sum weights by position and amino acid
dt = dt[, lapply(.SD, sum), by=c('pos', 'aa')]
# Remove non amino acid residues
dt_mat = as.matrix( dt[dt$aa < 21] )
# Make into weight matrix
# cnt (list) is now sum_seq_weights (matrix, columns)
grouped_seq_weights = matrix(0, 21, npos)
grouped_seq_weights[ dt_mat[,2:1] ] = dt_mat[,4]
# totcnt -> sum_seq_weights
sum_seq_weights = apply(grouped_seq_weights, 2, sum)
list(grouped = grouped_seq_weights[1:20,],
sum = sum_seq_weights)
}
#' Compute epsilon values
#'
#' Used for computing final sift scores
#'
#' @param pb_weights see pbWeights
#' @param pb_weights_norm see pbWeightsNorm
#' @param rank_matrix precomputed rank matrix for amino acids (BLOSUM62)
getEpsilonValues <- function(pb_weights, pb_weights_norm, rank_matrix){
# similarity_dependent_scale_0
dtemp2 = apply(pb_weights_norm$grouped, 2, function(pos_data){
# Get heaviest weight amino acid
max_aa = which.max(pos_data)
# Get rank with other amino acids
rank = rank_matrix[max_aa,]
w = pos_data / sum(pos_data)
sum( rank * pos_data / sum(pos_data) )
})
# Compute epsilon values
# different amino acids per position
diff_aas = pb_weights$diff_aas
eps = integer(length(diff_aas))
ind = diff_aas > 1
eps[ind] = exp(dtemp2[ind])
eps
}
#' Computes final matrix of sift scores returned to the user
#'
#' @param pb_weights see pbWeights
#' @param pb_weights_norm see pbWeightsNorm
#' @param eps epsilon values
#' @param diri precomputed dirichelet components, used to compute pseudocounts
#'
#' @return numeric matrix with amino acid rows x number of positions containing sift scores
makeSiftMatrix <- function(pb_weights, pb_weights_norm, eps, diri){
# Total number of amino acids per columns (totreg)
num_aa = apply(pb_weights$pfm, 2, sum)
npos = pb_weights$npos
# Seq weights
sum_seq_weights = pb_weights_norm$sum
grouped_seq_weights = pb_weights_norm$grouped
# Construct final sift matrix of siftr scores
sift_matrix = vapply(1:npos, FUN.VALUE = double(20), FUN = function(i){
d = pseudo_diri(pos=i, diri, grouped_seq_weights, sum_seq_weights)
fi = grouped_seq_weights[,i] + (eps[i] * d$pseudocounts)
ind = sum_seq_weights[i] > d$totreg
if(any(ind)) fi = fi / (sum_seq_weights[i] + eps[i])
fi / max(fi)
})
sift_matrix
}
#' Helper function for pseudo_diri, adds two logs
#'
#' @param lx first value
#' @param ly second value
addLogs <- function(lx, ly) {
if (lx > ly) return (lx + log(1.0 + exp(ly - lx)))
else return (ly + log(1.0 + exp(lx - ly)));
}
#' Get dirichelet pseudocounts for position i
#'
#' @param pos position
#' @param diri precomputed dirichelet components, used to compute pseudocounts
#' @param grouped_seq_weights see pbWeightsNorm
#' @param sum_seq_weights see pbWeightsNorm
#'
#' @return pseudocounts for amino acids of position i
pseudo_diri <- function(pos, diri, grouped_seq_weights, sum_seq_weights){
# Get diri objects
diri_alpha = diri$alpha
altot = diri$altot
q = diri$q
# Number of components
ncomp = length(altot)
# Weights of amino acids for this position (cnt)
seq_weights_pos = grouped_seq_weights[1:20,pos]
# Sum of seq_weights_pos (totcnt)
sum_seq_weights_pos = sum_seq_weights[pos]
# Keep weights that have a weight
nonzero_weights = seq_weights_pos > 0
seq_weights_pos = seq_weights_pos[nonzero_weights]
# compute equation (3), Prob(n|j)
probn = sapply(1:ncomp, function(j){
lg = lgamma( sum_seq_weights[pos] + 1.0 ) + lgamma(altot[j])
lg = lg - lgamma(sum_seq_weights[pos] + altot[j])
dtemp = lgamma(seq_weights_pos + diri_alpha[nonzero_weights, j])
dtemp = dtemp - lgamma(seq_weights_pos + 1.0)
dtemp = dtemp - lgamma(diri_alpha[nonzero_weights, j])
lg + sum(dtemp)
})
denom = log(q[1]) + probn[1]
dtemp = log(q[-1]) + probn[-1]
for(j in 1:(ncomp-1)) denom = addLogs(denom, dtemp[j])
probj = log(q) + probn - denom
aa_reg = sapply(1:20, function(aai){
reg = exp(probj) * diri_alpha[aai,]
sum(reg)
})
aa_totreg = sum(aa_reg)
aa_reg = aa_reg / aa_totreg
pseudocounts = aa_reg
# Return pseudo counts
return(list(pseudocounts=pseudocounts,
totreg=aa_totreg))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.