R/compare_alignments.R

Defines functions PS SP list_pairs SPprep valid_alignments degap_alignment prepare_alignment_matrix import_alignment compare_alignments

Documented in compare_alignments

#' Compare alternative multiple sequence alignments
#'
#' @param ref   The reference MSA (in fasta, clustal, msf, phylip, mase or AAbin format)
#' @param com   The MSA to compare (in fasta, clustal, msf, phylip or mase format)
#' @param SP    Optionally also compute sum of pairs scores (default=FALSE)
#' @param CS    Optionally also compute total column score (default=FALSE)
#'
#' @return Generates an object of class "pairwise alignment comparison" (PAC), providing the optimal pairwise column
#' alignment of two alternative MSAs of the same sequences, and summary statistics of the differences between them.
#' The input alignments must be in the formats fasta, clustal, msf, phylip, mase or the AAbin format generated by
#' the ape package.
#' The details of the PAC output components are as follows:
#' \itemize{
#'  \item {reference_P}          {The numbered character matrix of the reference alignment}
#'  \item {comparison_Q}         {The numbered character matrix of the comparison alignment}
#'  \item {results_R}            {The results summary matrix (containing column averages of match, gapcon, merge, split, shift)}
#'  \item {similarity_S}         {The similarity matrix between the reference and comparison alignment columns}
#'  \item {dissimilarity_D}      {The dissimilarity matrix between the reference and comparison (containing match, gapcon, merge, split, shift)}
#'  \item {dissimilarity_simple} {The dissimilarity matrix with categories stacked into a single 2D matrix}
#'  \item {columnmatch}          {The column of the comparison alignment with the highest final match score}
#'  \item {cys}                  {The proportion of cysteines (relevant for cysteine rich proteins)}
#'  \item {reflen}               {The length of the reference alignment}
#'  \item {comlen}               {The length of the comparison alignment}
#'  \item {refcon}               {The consensus sequence of the reference alignment}
#'  \item {comcon}               {The consensus sequence of the comparison alignment}
#'  \item {similarity_score}     {The overall similarity score}
#'  \item {column_score}         {The proportion of columns that are fully identical between the reference and comparison MSAs and related data (optional)}
#'  \item {sum_of_pairs}         {The sum of pairs score and related data (optional)}
#' } 
#' 
#' @export
#' @examples
#' # Using example data
#' data(reference_alignment)
#' data(comparison_alignment)
#' PAC <- compare_alignments(reference_alignment,comparison_alignment)
#' 
#' \dontrun{
#' # Using fasta files from your harddrive
#' PAC <- compare_alignments(file.choose(),file.choose())
#'
#' # Using fasta files from your harddrive
#' library('ape')
#' data(woodmouse)
#' AA <- trans(woodmouse, 2)
#' PAC <- compare_alignments(AA,AA)
#' }
#'
#' @note The `compare_alignments` compares two alternative multiple sequence alignments (MSAs) of the same sequences.
#' The alternative alignments must contain the same sequences. The function classifies similarities and differences
#' between the two MSAs. It produces the "pairwise alignment comparison" object required as the first step any other
#' package functions. The function converts the MSAs into matrices of sequence characters labelled by their occurrence
#' number in the sequence (e.g. to distinguish between the first and second cysteines of a sequence). It then compares
#' the two MSAs to determine which columns have the highest similarty between the reference and comparison MSAs to
#' generate a similarity matrix (excluding conserved gaps). From this matrix, the comparison alignment column with
#' the similarity to each reference alignment column is used to calculate further statistics for dissimilarity matrix,
#' summarised for each reference MSA column in the results matrix. Lastly, it calculates the overall similarity score
#' between the two MSAs.
#'
compare_alignments <- function(ref,com,SP=FALSE,CS=FALSE){
  
  if (!is.data.frame(ref)){
    import_alignment(ref) -> ref
  }
  if (!is.data.frame(com)){
    import_alignment(com) -> com
  }
  # Check that all sequences are same present in both alignments even if different order
  if (!valid_alignments(ref,com)){
    stop("both alignments must contain the same sets of sequences, even if they are in a different order")
  }
  # Degap both alignments
  ref.degap <- degap_alignment(ref)
  com.degap <- degap_alignment(com)

  # If any sequences differ between ref and con then...
  if (!all(ref.degap==com.degap)){
    # Order full com alignment alphabetically
    com.alpha <- com[,order(com.degap)]
    # Reorder com.alpha by the order of ref.degap
    com <- com.alpha[,as.factor(ref.degap)]
  }
  
  ###########################################
  # Replacing letters with letter+occurance #
  ###########################################
  ref2 <- prepare_alignment_matrix(ref)
  com2 <- prepare_alignment_matrix(com)
  
  if (!is.data.frame(ref)){
    names <- row.names(as.matrix(seqinr::read.fasta(ref)))
  } else {
    names <- colnames(ref)
  }
  # Replacing "-" with NA in the test alignment means
  # that gaps don't count towards column matching score
  com[com=="-"]  <-NA
  com2[com2=="-"]<-NA
  
  ##################################
  # Alignment identity calculation #
  ##################################
  
  # res_list contains $results, $cat, $means 
  res_list = rcpp_align(ref2,com2)
  results  = res_list$results
  cat      = res_list$cat
  means    = res_list$means
  catnames = c("Match","Gapcon","Merge","Split","Shift")
  
  row.names(results)<-c("ColumnMatch",  # 1
                        "NonGap",       # 2
                        "Cys",          # 3
                        "RawMatch",     # 4
                        "Gapcon",       # 5
                        "Merge",        # 6
                        "Split",        # 7
                        "Shift",        # 8
                        "Match")        # 9

  # Format P and Q matrices for output
  ref3 <- t(ref2)
  com3 <- t(com2)
  rownames(ref3) <- names
  rownames(com3) <- names
  # Restore "-"s to comparison alignment
  com3[is.na(com3)]<-"-"
  
  # Create dissimilarity (matrix D) from simplified dissimilarity (res_list$cat)
  dissimilarity_D <- array(dim      = c(ncol(ref), # rows
                                        nrow(ref), # columns
                                        5),        # stacks
                           dimnames = list(names,
                                           NULL,
                                           catnames))
                                           
  dissimilarity_D[,,1] <- 1*t(cat=="M") # "Match"
  dissimilarity_D[,,2] <- 1*t(cat=="g") # "Gapcon"
  dissimilarity_D[,,3] <- 1*t(cat=="m") # "Merge"
  dissimilarity_D[,,4] <- 1*t(cat=="s") # "Split"
  dissimilarity_D[,,5] <- 1*t(cat=="x") # "Shift"

  # Write category averages to results (R matrix)
  results_R           <- t(colMeans(dissimilarity_D))
  rownames(results_R) <- catnames

  # For each column of ref, which column of com is most similar
  columnmatch <- as.vector(res_list$results[1,]) 
 
  # Ref cysteine occurance
  cys <- as.vector((t(rowMeans(ref=="c"))))
 
  # Count alignment columns
  reflen <- nrow(ref)                              
  comlen <- nrow(com)
  
  # Alignment consensus sequences
  refcon <- seqinr::consensus(t(ref))
  comcon <- seqinr::consensus(t(com))

  # Column scores
  column_score=NA
  if (CS==TRUE){
    columnwise.column.score  <- (results_R[1,]==1)*1
    column.score             <- sum(columnwise.column.score)/reflen
    column_score <- list(columnwise.column.score = columnwise.column.score,
                         column.score            = column.score)
  }

  # Final mean identity score
  similarity_score <- mean(cat=="M")/(1-mean(cat=="g"))
  
  ################
  # Sum of pairs #
  ################
  sum_of_pairs <- NULL
  
  if (SP==TRUE){
    
    P <- SPprep(ref3)
    ref.pairs     <- apply(t(P),1,list_pairs)
    ref.pairs.all <- unlist(ref.pairs)
    Q <- SPprep(com3)
    com.pairs     <- apply(t(Q),1,list_pairs)
    com.pairs.all <- unlist(com.pairs)
    
    SPSs <- NULL
    for(x in 1:length(com.pairs)){
      SPSs <- append(SPSs,SP(com.pairs[[x]],ref.pairs.all))
    }
    SPSs[is.nan(SPSs)] <- 0
    columnwise.SPS     <- SPSs
    columnwise.CS      <- SPSs==1
    
    sum.of.pairs.score         <- SP(ref.pairs.all,com.pairs.all)
    reverse.sum.of.pairs.score <- PS(ref.pairs.all,com.pairs.all)
    column.score               <- sum(columnwise.SPS==1)/comlen
    
    sum_of_pairs <- list(sum.of.pairs.score         = sum.of.pairs.score,
                         reverse.sum.of.pairs.score = reverse.sum.of.pairs.score,
                         columnwise.SPS             = columnwise.SPS,
                         column.score               = column.score,
                         columnwise.CS              = columnwise.CS,
                         ref.pairs                  = ref.pairs,
                         com.pairs                  = com.pairs,
                         ref.pairs.all              = ref.pairs.all,
                         com.pairs.all              = com.pairs.all)
  }


  # Create final object
  list(reference_P          = ref3,
       comparison_Q         = com3,
       results_R            = results_R,
       similarity_S         = means,
       dissimilarity_D      = dissimilarity_D,
       dissimilarity_simple = t(cat),
       columnmatch          = columnmatch,
       cys                  = cys,
       reflen               = reflen,
       comlen               = comlen,
       refcon               = refcon,
       comcon               = comcon,
       similarity_score     = similarity_score,
       column_score         = column_score,
       sum_of_pairs         = sum_of_pairs)
}


import_alignment <- function(alignment,format=NULL){

  # if AAbin (ape package)
  if(class(alignment)=="AAbin"){
    if(var(sapply(alignment,length))==0){
      output <- data.frame(t(as.character(as.matrix(alignment))))
    }else{
      warning("all seq must be equal length")
    }
    return(output)
  }
    
  # default fmt
  fmt <- "fasta"
  
  # if clustal
  if( tools::file_ext(alignment)=="clustal"
     |tools::file_ext(alignment)=="CLUSTAL"
     |tools::file_ext(alignment)=="aln"
     |tools::file_ext(alignment)=="ALN"
     |tools::file_ext(alignment)=="clust"
     |tools::file_ext(alignment)=="clus"){
    fmt <- "clustal"
  }
  
  # if msf
  if( tools::file_ext(alignment)=="msf"
     |tools::file_ext(alignment)=="MSF"){
    fmt <- "msf"
  }
  
  # if mase
  if( tools::file_ext(alignment)=="mase"
     |tools::file_ext(alignment)=="MASE"){
    fmt <- "mase"
  }
  
  # if phylip
  if( tools::file_ext(alignment)=="phylip"
     |tools::file_ext(alignment)=="PHYLIP"
     |tools::file_ext(alignment)=="phy"
     |tools::file_ext(alignment)=="PHY"){
    fmt <- "phylip"
  }
  
  # format override
  if(!is.null(format)){
    fmt <- format
  }
  
  # import
  temp <- seqinr::read.alignment(alignment,format=fmt)
  # fix names
  temp$nam <- do.call("rbind", lapply(strsplit(temp$nam," "),"[[", 1))
  # reformat to data frame
  output <- data.frame(strsplit(gsub("[\r\n]","",unlist(temp$seq)),split = ""))
  colnames(output) <- temp$nam
  
  return(output)
}


prepare_alignment_matrix <- function(alignment){
  mat2 <- rcpp_prepare_alignment_matrix(as.matrix(alignment))
  # Remove extra space and de-number gaps
  gsub(x = mat2, pattern = " ",     replacement = "")  -> mat2
  gsub(x = mat2, pattern = "[-].*", replacement = "-") -> mat2
  mat2
}


degap_alignment <- function(final){
  # Remove gaps to convert alignment to list of strings
  gsub("-","",do.call("paste",c(data.frame(t(final)),sep="")))
}


valid_alignments <- function(ref,com){
  ref.degap <- degap_alignment(ref)
  com.degap <- degap_alignment(com)
  all(sort(ref.degap)==sort(com.degap))
}


# Fully unique identities fro all residues in MSA
SPprep <- function(x){
  matrix(paste(row.names(x),x,sep = "|"),nrow = nrow(x))
}


# Full list of all pairs in an alignment column
list_pairs <- function(x){
  data <- x[grep(pattern = "\\|[^-]" , x)]
  tryCatch(do.call(paste,
                   as.data.frame(t(utils::combn(data,2)),stringsAsFactors=FALSE)),
           error=function(e) NULL)
}


# Sum of pairs
SP <- function(reference,comparison){
  length(intersect(comparison,reference))/length(reference)
}


# Reverse sum of pairs
PS <- function(reference,comparison){
  length(intersect(comparison,reference))/length(comparison)
}
TS404/AlignStat documentation built on Oct. 13, 2021, 4:13 p.m.