R/DCA.R

Defines functions dca return_alignment Compute_True_Frequencies sampledHammingdist with_pc mapkey ReturnW bp_link compute_mu compute_di Compute_Results read_stockholm_alignment codon_msa_to_binary_matrix codon_seq_to_binary_vec codon_letter_to_binary_vec aa_msa_to_binary_matrix aa_seq_to_binary_vec aa_letter_to_binary_vec

Documented in dca read_stockholm_alignment

#DCA.R
#Copyright (C) 2015 Etai Jacob, etai.jacob@gmail.com
#'
#' \code{dca} Direct Coupling Analysis (DCA) for amino acid and codon sequences
#'
#' @param inputfile file containing the alignment in FASTA format
#' @param pseudocount_weight relative weight of pseudo count
#' @param theta threshold for sequence id in reweighting
#' @param seqid_or_excluded_indices Three options:
#'                 1) A single numeric value indicates the sequence index in the MSA to use as a reference
#'                   (1 is the default - An MSA column is either all dot+lower case or dash+upper case,
#'                    by very definition of the output of HMMer.
#'                 2) A string indicates the name or id of the sequence in the msa to use as a reference
#'                 3) A vector of intergers indicates the columns to exclude from analysis
#'                    (no use of reference sequence in this case).
#' @param nuc TRUE for codon based analysis and FALSE (default) for amino acids based analysis.
#' @param fileType input file type
#' @param outputfile output file name to wrie results
#'
#' @return a list containing data produced in this function and the dca results.
#'          The results is composed by N(N-1)/2
#'          (N = length of the sequences) rows and 4 columns:
#'          residue i (column 1), residue j (column 2),
#'          MI(i,j) (Mutual Information between i and j), and
#'          DI(i,j) (Direct Information between i and j).
#'          Note: all insert columns are removed from the alignment.
#'
#' @seealso \url{https://github.com/etaijacob/AA2CODON} if you want to generate the input files
#'
#' @details
#'   SOME RELEVANT VARIABLES RETURNED:
#'   N        number of residues in each sequence (no insert)
#'   M        number of sequences in the alignment
#'   Meff     effective number of sequences after reweighting
#'   q        equal to 21 (20 aminoacids + 1 gap)
#'   align    M x N matrix containing the alignmnent
#'   Pij_true N x N x q x q matrix containing the reweigthed frequency
#'            counts.
#'   Pij      N x N x q x q matrix containing the reweighted frequency
#'            counts with pseudo counts.
#'   C        N(q-1) x N(q-1) matrix containing the covariance matrix.
#'
#' @examples
#' cma.res <- dca(codon_file_name, seqid = 1, nuc = T, fileType = "fasta")
#'
#' @export
dca <- function(inputfile, pseudocount_weight = 0.5, theta = 0.2,
                seqid_or_excluded_indices = 1, nuc = F,
                fileType="fasta", outputfile = NULL) {

  cat("return_alignment.\n")
  print(system.time(msa <- return_alignment(inputfile, seqid = seqid_or_excluded_indices, nuc = nuc, fileType = fileType)))
  cat("Compute_True_Frequencies.\n")
  print(system.time(tfs <-  Compute_True_Frequencies(msa$Z, msa$M, msa$N, msa$q, theta)))
  cat(sprintf("### N = %d M = %d Meff = %.2f q = %d\n", msa$N, msa$M, tfs$Meff, msa$q))
  cat("with_pc\n")
  print(system.time(pc <- with_pc(tfs$Pij_true, tfs$Pi_true, pseudocount_weight, msa$N, msa$q)))
  cat("Compute_C\n")
  print(system.time(C <- Compute_C(pc$Pij, pc$Pi, msa$N, msa$q)))
  cat("InvC.\n")
  #print(system.time(invC <- solve(C)))
  #inverse via the Choleski decomposition performs faster than "solve"
  print(system.time(invC <- chol2inv(chol(C))))
  cat("Compute_Results.\n")
  print(system.time(results <- Compute_Results(pc$Pij, pc$Pi, tfs$Pij_true, tfs$Pi_true, invC, msa$N, msa$q, fnameout = outputfile)))

  return(list(msa = msa,
              tfs = tfs,
              pc = pc,
              C = C,
              invC = invC,
              results = results))
}


#all characters should be an uppercase and AAs or Nucs only
#mind that this function treats aa differently than nuc file:
#in nuc file it excludes all gaps '---' and '...' and 'XXX' where in aa file only '.'.
#Best thing is to give as input the sequence indices to exclude in seqid parameter.
return_alignment <- function(inputfile="data/PF00074_seed.sth.aligndprots",
                             seqid = 1, nuc = F, fileType = c("fasta", "sth"), doUnique=T) {
  # reads alignment from inputfile, removes inserts and converts into numbers
  cat("return_alignment..\n")
  #require(seqinr)
  attributes(seqid) <- list(type="Reference sequence index")
  if(fileType == "fasta") {
    msa <- read.alignment(inputfile, format = "fasta", forceToLower = F)
    msa$seq <- toupper(msa$seq)
    cat(sprintf("Original MSA length is: %d.\n", length(msa$seq)))
  } else if(fileType == "sth") {
    msa <- read_stockholm_alignment(inputfile)
    cat(sprintf("Original MSA length is: %d.\n", length(msa$seq)))
  }

  columnidxs <- NULL
  if(length(seqid) > 1 & is.numeric(seqid)) {
    cat("seqid is a numeric vector\n")
    columnidxs <- seqid
    attributes(seqid) <- list(type="Excluded columns")

  } else if(is.numeric(seqid)) {
    if(seqid > length(msa$nam)) {
      cat(sprintf("%d is greater than %d. Taking the first as ref.\n", seqid, length(msa$nam)))
      seqid <- 1
    }
  } else {
    print(seqid)
    myid <- grep(seqid, msa$nam)[1]

    if(length(myid) == 0) {
      cat(sprintf("%s does not exists in MSA. Taking the first seq as ref.\n", seqid))
      seqid <- 1
    } else {
      cat(sprintf("Ref id found: %d out of %d possibilities\n", myid, length(grep(seqid, msa$nam))))
      seqid <- myid
      if(doUnique) {
        if(fileType == "fasta") {
          msa$nam  <- c(msa$nam[seqid], msa$nam)
          msa$seq  <- c(msa$seq[seqid], msa$seq)
          msa$nb  <- c(msa$nb[seqid], msa$nb)
          msa$com  <- c(msa$com[seqid], msa$com)
        } else if(fileType == "sth") {
          msa$name  <- c(msa$name[seqid], msa$name)
          msa$seq  <- c(msa$seq[seqid], msa$seq)
        }
        seqid <- 1
      }
    }
  }
  if(doUnique) {
    if(fileType == "sth") {
      msa <- msa[!duplicated(msa$seq),]
    } else if(fileType == "fasta") {
      inidxs <- which(!duplicated(msa$seq))
      msa$nb  <- msa$nb[inidxs]
      msa$nam <- msa$nam[inidxs]
      msa$seq <- msa$seq[inidxs]
      msa$com <- msa$com[inidxs]
    }
  }
  cat(sprintf("After redundancy (unique) MSA length is: %d.\n", length(msa$seq)))
  cat(sprintf("Ref sequence is %s after removal of duplicates.\n", paste(seqid, collapse = ",")))
  if(nuc == FALSE) {
    #return(msa)
    tmp <- do.call(rbind, lapply(strsplit((msa$seq), ""), function(x) factor(x, levels = aas)))
    #print(dim(tmp))
    #return(tmp)
    if(is.null(columnidxs)) {
      if(fileType == "sth") { #excluding only '.' and lower case aas from analysis as defined by HMMER
        excludeidxs <- which(tmp[seqid, ] == 1 | is.na(tmp[seqid, ]))
      } else if(fileType == "fasta") { #exclude gaps of ref sequence from analysis
        excludeidxs <- which(tmp[seqid, ] == 22)
        #print(tmp[seqid,])
      }
      if(length(excludeidxs) > 0)
        tmp <- tmp[, -excludeidxs] #exclude gaps/dots from analysis
    } else {
      if(max(columnidxs) > dim(tmp)[2]) {
        cat(sprintf("ERROR - there are only %d columns where input column idx max val is %s.\n", dim(tmp)[2], max(columnidxs)))
        return(NA)
      }
      tmp <- tmp[, -columnidxs] #exclude gaps (in hammer means '.' and lower case letters) from analysis
    }
    tmp[ tmp > 21 | is.na(tmp) ] <- 1 # 'X' and '-' turn to be 1

  } else {
    tmp <- do.call(rbind, strsplit((msa$seq), ""))
    im <- matrix(1:dim(tmp)[2], ncol=dim(tmp)[2]/3)
    tmp <- do.call(rbind, lapply(1:dim(tmp)[1],
                                 function(z) sapply(lapply(1:dim(im)[2],
                                                           function(x) tmp[z, ][im[,x]]),
                                                    function(x) factor(paste(x, collapse=""), levels=codons))))
    if(is.null(columnidxs)) {
      tmp <- tmp[, -which(tmp[seqid, ] == 1 | is.na(tmp[seqid, ]))] #exclude gaps from analysis
    } else {
      if(max(columnidxs) > dim(tmp)[2]) {
        cat(sprintf("ERROR - there are only %d columns where input column idx max val is %s.\n", dim(tmp)[2], max(columnidxs)))
        return(NA)
      }
      tmp <- tmp[, -columnidxs] #exclude gaps from analysis

    }
    print(class(tmp))
    tmp[ tmp > 65 | is.na(tmp) ] <- 1 # 'xxx' and '...' turn to be 1
  }
  rownames(tmp) <- msa$nam
  align_full <- tmp
  M <- dim(align_full)[1]

  N <- dim(align_full)[2]
  q <- max(align_full)
  print(align_full[1:12,1:12])
  return(list(N=N, M=M, q=q, Z=align_full, seqid = seqid, raw.msa = msa))
}


# Two different implementations:
#   1. One for RW
#   TODO - 2. One without - further optimizations can be done (using table)
Compute_True_Frequencies <- function(align,M,N,q,theta, useSampling=F) {
  # computes reweighted frequency counts
  # the number of nonredundant sequences is measured as the effective sequence number Meff after reweighting (see Methods).
  # The comparison to results
  # without reweighting and to reweighting at 80%
  cat("Compute_True_Frequencies..\n")
  W <- rep(1, M)

  if( theta > 0.0 ) {
    if(useSampling) {
      W <- 1/(1 + colSums(sampledHammingdist(align, Ns = 240) < theta))
    } else {
      W <- 1/(1 + colSums(hammingdist(align) < theta))
    }
  }
  Meff <- sum(W)

  cat(sprintf("Allocating: %d, %d withb Meff = %f\n", N, q, Meff))

  #   Pi_true <- array(0, c(N, q))

  #    for(j in 1:M)
  #      for(i in 1:N)
  #        Pi_true[i, align[j,i]] <- Pi_true[i,align[j,i]] + W[j]

  Pi_true <- AccPi_true(align, W, N, q, M)
  Pi_true <- Pi_true/Meff

  cat("Going for main loop..\n")
  # Pij_true <- array(0, c(N, N, q, q))

  #   for(l in 1:M)
  #     for(i in 1:(N-1))
  #       for(j in (i+1):N) {
  #         Pij_true[i, j, align[l,i],align[l,j]] <- Pij_true[i, j, align[l, i], align[l, j]] + W[l]
  #         Pij_true[j, i, align[l,j],align[l,i]] <- Pij_true[i, j, align[l, i], align[l, j]]
  #       }

  Pij_true = AccPij_true(align, W, N, q, M)
  Pij_true <- Pij_true/Meff

  scra <- diag(q)
  for(i in 1:N)
    for(alpha in 1:q)
      for(beta in 1:q)
        Pij_true[i, i, alpha, beta] <- Pi_true[i, alpha] * scra[alpha, beta]

  return(list(Pij_true=Pij_true, Pi_true= Pi_true, Meff = Meff, W = W))
}



#Sampled Hamming distance, the percentage of coordinates that differ in a sample of Ns comparisons.
#Input X: matrix with n rows vectors
#Output m: each column j represents a Ns sampled distances from seq j
sampledHammingdist <- function(X, Ns) {
  cat("sampledHammingdist..\n")
  n <- nrow(X)
  if(n < Ns) {
    print("is less")
    m <- matrix(nrow=n, ncol=n)
    for(i in seq_len(n))
      for(j in seq(i, n))
        m[j, i] <- m[i, j] <- sum(X[i,] != X[j,])
    return(m/ncol(X))
  } else {
    print("is bigger")
    getmydist <- function(i) {
      sapply(sample(seq(1, n)[-i], Ns), function(j) sum(X[i,] != X[j,]))
    }
    m <- sapply(seq_len(n), getmydist)/ncol(X)
    return(m)
  }
}


with_pc <- function(Pij_true, Pi_true, pseudocount_weight, N, q) {
  # adds pseudocount
  cat("with_pc..\n")
  Pij <- (1 - pseudocount_weight) * Pij_true + pseudocount_weight / q / q * array(1, c(N, N, q, q))
  Pi <- (1 - pseudocount_weight) * Pi_true + pseudocount_weight / q * array(1, c(N, q))

  scra <- diag(q)
  for(i in 1:N)
    Pij[i, i, , ] <- (1 - pseudocount_weight) * Pij_true[i, i, ,] + pseudocount_weight / q * scra

  return(list(Pij = Pij, Pi = Pi))
}


mapkey <- function(i, alpha, q) {
  A <- (q-1) * (i-1) + alpha
  return(A)
}




ReturnW <- function(C, i, j, q) {
  # extracts coupling matrix for columns i and j
  #cat("ReturnW..\n")
  W <- array(1, c(q,q))
  W[1:(q-1), 1:(q-1)] <- exp( -C[mapkey(i, 1:(q-1), q), mapkey(j, 1:(q-1), q)] )
  #print(W)
  return(W)
}

bp_link <- function(i, j, W, P1, q) {
  # computes direct information
  #cat("bp_link..\n")
  mus <- compute_mu(i, j, W, P1, q)
  DI <- compute_di(i, j, W, mus$mu1, mus$mu2, P1)
  return(DI)
}


compute_mu <- function(i, j, W, P1, q) {
  #cat("compute_mu..\n")
  epsilon <- 1e-4
  diff <- 1.0
  mu1 <- array(1,q)/q
  mu2 <- array(1,q)/q
  pi <- P1[i, ]
  pj <- P1[j, ]

  while ( diff > epsilon ) {
    scra1 <- as.numeric(mu2 %*% t(W))
    scra2 <- as.numeric(mu1 %*% W)

    new1 <- as.vector(pi / scra1)
    new1 <- as.vector(new1 / sum(new1))

    new2 <- as.vector(pj / scra2)
    new2 <- as.vector(new2 / sum(new2))

    diff <- max( abs( new1 - mu1 ), abs( new2 - mu2 ) )
    #print(diff)
    mu1 <- new1
    mu2 <- new2
  }
  #cat(sprintf("%d, %d: %f %f\n", i, j, mu1, mu2))
  return(list(mu1 = mu1, mu2 = mu2))
}


compute_di <- function(i, j, W, mu1,mu2, Pia) {
  # computes direct information
  #cat("compute_di..\n")
  tiny <- 1.0e-100
  Pdir <- W * ((as.matrix(mu1)) %*% t(as.matrix(mu2))) #direction of vector is the opposite than matlab

  #Pdir <- W * as.vector(t(mu1) %*% mu2)
  #print(Pdir)
  Pdir <- Pdir / sum(Pdir)
  #print(Pdir)
  #print(dim(Pdir))
  Pfac <- as.matrix(Pia[i,]) %*% t(as.matrix(Pia[j, ])) #direction of vector is the opposite than matlab
  #print(Pfac)
  #cat(sprintf("%d, %d: %f %f %f\n", i, j, Pdir, Pfac, (Pdir+tiny)/(Pfac+tiny)))
  DI <- sum(diag( t(Pdir) %*% log( (Pdir+tiny)/(Pfac+tiny) ) ))
  #print(log(Pdir+tiny)/(Pfac+tiny))
  return(DI)
}


Compute_Results <- function(Pij, Pi, Pij_true, Pi_true, invC, N, q, fnameout = NULL) {
  # computes and prints the mutual and direct informations
  cat("Compute_Results..\n")
  df <- NULL

  nns <- utils::combn(N, 2)
  df <- do.call(rbind,
                lapply(1:dim(nns)[2],
                       function(x) {
                         MI <- calculate_mi_and_omes(nns[1, x] - 1, nns[2, x] - 1, Pij_true, Pi_true, q, N)
                         W_mf <- ReturnW(invC, nns[1, x], nns[2, x], q)
                         DI_mf_pc <- bp_link(nns[1, x], nns[2, x], W_mf, Pi, q)
                         return(data.frame(i = nns[1, x], j = nns[2, x], MI = MI[1], OMES = MI[2], DI = DI_mf_pc))
                       }))

  cat(sprintf("Results dim: (%d, %d)", dim(df)[1], dim(df)[2]))
  if(!is.null(fnameout)) {
    cat(sprintf("Writing output to file: %s.\n", fnameout))
    utils::write.table(df, file = fnameout, sep = "\t", row.names = F, quote = F)
  }
  return(df)
}



#' Reads an sth file
#'
#' \code{read_stockholm_alignment} reads a stockholm file format
#' Only lines which do not start with a # are considered
#' @param file sth file name
#' @return a dataframe with name and seq columnns
#'
#' @export
read_stockholm_alignment <- function(file="Y://PFAM2/data/sth/seed/PF00013_v27_seed.sth") {
  mm <- readLines(file)
  mm <- do.call(rbind, strsplit(mm[-grep("^#", mm)], "\\s+", perl = T))
  mm <- data.frame(name=mm[,1], seq=toupper(mm[,2]), row.names = mm[,1], stringsAsFactors = F)
  return(mm)
}

codon_msa_to_binary_matrix <- function(nmat) {
  tt <- lapply(1:dim(nmat)[1], function(x) codon_seq_to_binary_vec(nmat[x,]))
  tt <- do.call(rbind, tt)
  rownames(tt) <- rownames(nmat)
  return(tt)
}

codon_seq_to_binary_vec <- function(sequence) {
  unlist(lapply(sequence, codon_letter_to_binary_vec))
}

codon_letter_to_binary_vec <- function(codon) {
  (1:65==codon)+0
}


aa_msa_to_binary_matrix <- function(pmat) {
  tt <- lapply(1:dim(pmat)[1], function(x) aa_seq_to_binary_vec(pmat[x,]))
  tt <- do.call(rbind, tt)
  rownames(tt) <- rownames(pmat)
  return(tt)
}

aa_seq_to_binary_vec <- function(sequence) {
  unlist(lapply(sequence, aa_letter_to_binary_vec))
}

aa_letter_to_binary_vec <- function(aa) {
  (1:21==aa)+0
}


aas <- toupper(c(".","A", "C", "D", "E", "F", "G", "H", "I",
                 "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W",
                 "Y", "-", "X"))

aas.v2 <- toupper(c("-","A", "C", "D", "E", "F", "G", "H", "I",
                    "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W",
                    "Y", ".", "X"))

codons <- toupper(c("---", "ttt", "ttc", "tta", "ttg", "tct", "tcc", "tca", "tcg", "tat", "tac",
                    "taa", "tag", "tgt", "tgc", "tga", "tgg", "ctt", "ctc", "cta",
                    "ctg", "cct", "ccc", "cca", "ccg", "cat", "cac", "caa", "cag",
                    "cgt", "cgc", "cga", "cgg", "att", "atc", "ata", "atg", "act",
                    "acc", "aca", "acg", "aat", "aac", "aaa", "aag", "agt", "agc",
                    "aga", "agg", "gtt", "gtc", "gta", "gtg", "gct", "gcc", "gca",
                    "gcg", "gat", "gac", "gaa", "gag", "ggt", "ggc", "gga", "ggg", "...", "xxx"))
etaijacob/CMA documentation built on Dec. 27, 2019, 4:17 p.m.