R/CLggseqlogo-heights.R

Defines functions CLmatrix_to_heights CLmakePFM CLcomputeBits CLfindNamespace CLguessSeqType CLletterMatrix .RNA_NAMESPACE_CL .DNA_NAMESPACE_CL .AA_NAMESPACE_CL CLprobability_method_pfm CLbits_method_pfm

### From package ggseqplot

################################### patch ###################################

# Shannon entropy method,
#   with pfm as input instead of the list of sequences.
#
# Inputs :
#   pfm       matrix with positions in columns, nt, codons or similar in rows
#   N         number of nt, codons, etc
#   nseqs     number of sequences
#   seq_type  type of sequences : "DNA", "RNA" or "AA"
# Ouputs :
#   matrix to heights object
CLbits_method_pfm <- function(pfm, N, nseqs, seq_type, decreasing, ...){
  # Get ic
  ic = CLcomputeBits(pfm, N, nseqs)
  if(all(ic == 0)){
    warning('All positions have zero information content perhaps due to too few input sequences. Setting all information content to 2.')
    ic = (ic * 0)+2
  }
  heights = t(t(pfm) * ic)

  #seq_type = attr(pfm, 'seq_type')
  CLmatrix_to_heights(heights, seq_type, decreasing)
}

# Probability method,
#   with pfm as input instead of the list of sequences.
#
# Inputs :
#   pfm       matrix with positions in columns, nt, codons or similar in rows
#   N         number of nt, codons, etc
#   nseqs     number of sequences
#   seq_type  type of sequences : "DNA", "RNA" or "AA"
# Ouputs :
#   matrix to heights object
CLprobability_method_pfm <- function(pfm, N, nseqs, seq_type, decreasing, ...){
  CLmatrix_to_heights(pfm, seq_type, decreasing)
}

################################### original R code ###################################

# Namespaces
.AA_NAMESPACE_CL = function() c('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V')
.DNA_NAMESPACE_CL = function() c('A', 'T', 'G', 'C')
.RNA_NAMESPACE_CL = function() c('A', 'U', 'G', 'C')

# Generate letter matrix from vector of sequences
#
# @param input vector of sequences
CLletterMatrix <- function(input){
  # Ensure kmers are the same length characters
  seq.len = sapply(input, nchar)
  num_pos = seq.len[1]
  if(! all(seq.len == num_pos)) stop('Sequences in alignment must have identical lengths')

  # Construct matrix of letters
  split = unlist( sapply(input, function(seq){strsplit(seq, '')}) )

  t( matrix(split, seq.len, length(split)/num_pos) )
}

# Guess sequence type based on letter matrix
#
# @param sp letters
CLguessSeqType <- function(sp){
  # Ensure we have something
  if(length( intersect(sp, c(.AA_NAMESPACE_CL(), .DNA_NAMESPACE_CL(),.RNA_NAMESPACE_CL())) ) == 0)
    stop('Could not get guess seq_type. Please explicitly define sequence type or use "other" with custom namespaces.')

  dat = setdiff(intersect(sp, .AA_NAMESPACE_CL()), c(.DNA_NAMESPACE_CL(),.RNA_NAMESPACE_CL()))
  if(length(dat) > 0){
    return('AA')
  }else if('U' %in% sp){
    return('RNA')
  }
  return('DNA')
}


# Find namespace
#
# @param letter_mat Matrix of latters
# @param seq_type Sequence type
# @param namespace Alphabet
CLfindNamespace <- function(letter_mat, seq_type, namespace){

  # Get all letters in our alignment
  sp = as.character(letter_mat)

  # Other namespace
  if(seq_type == "other"){
    if(is.null(namespace))
      stop('seq_type of "other" must have a defined namespace')

    namespace = as.character(namespace)
    # Get unique
    namespace = unique( unlist(strsplit(namespace, '')) )


    # Validate
    non_alphanumeric = grepl('[^a-zA-Z0-9\u03b1\u03b2\u0393\u03b3\u0394\u03b4\u03b5\u03b6\u03b7\u03b8\u0398\u03b9\u03ba\u039b\u039b\u03bc\u039e\u03be\u03a0\u03c0\u03c1\u03c3\u03c4\u03c5\u03a6\u03c6\u03c7\u03c8\u03a8\u03a9\u03c9]', namespace)
    if( any( non_alphanumeric ) )
      stop('All letters in the namespace must be alphanumeric')

    # Ensure there is something in each column
    # apply(letter_mat, 2, function(column_letters){
    #   int = intersect(namespace, column_letters)
    #   if(length(int) == 0)
    #     stop('The alignment has no letters in namespace match aligned sequences in at least one column')
    # })

  }else{
    if(!is.null(namespace))
      stop('For custom namespaces please set seq_type to "other"')

    # Guess sequence type
    if(seq_type == "auto")
      seq_type = CLguessSeqType(sp)

    # Get predefined namespace
    namespace = get( sprintf('.%s_NAMESPACE', toupper(seq_type)) )()
  }

  return(list(seq_type = toupper(seq_type),
              namespace = namespace))
}

# Calcualte bits
#
# @param pwm Position weight matrix
# @param N Number of letters in namespace
# @param Nseqs Number of sequences in PWM
CLcomputeBits <- function(pwm, N=4, Nseqs=NULL){
  Nseqs = attr(pwm, 'nongapped')
  H_i = - apply(pwm, 2, function(col) sum(col * log2(col), na.rm=TRUE))
  e_n = 0
  if(!is.null(Nseqs)) e_n = (1/logb(2)) * (N-1)/(2*Nseqs)

  R_i = log2(N) - (H_i  + e_n)
  # Set any negatives to 0
  R_i = pmax(R_i, 0)
  return(R_i)
}

# Construct relative frequency matrix
# @param seqs aligned sequences as vector
# @param seq_type sequence type
# @param namespace letters used for matrix construction
# @param keep_letter_mat Keep letter matrix for some height methods
CLmakePFM <- function(seqs, seq_type='auto', namespace=NULL, keep_letter_mat=FALSE){

  letter_mat = NA
  if(is.matrix(seqs)){
    # Process matrix
    if(is.null(rownames(seqs))) stop('Matrix must have letters for row names')

    num_pos = ncol(seqs)

    # Get namespace
    ns = CLfindNamespace(rownames(seqs), seq_type, namespace)
    namespace = ns$namespace
    seq_type = ns$seq_type

    nseqs = NULL

    bg_prob = NA
    pfm_mat = seqs
    pfm_mat = apply(pfm_mat, 2, function(x) x / sum(x, na.rm=TRUE))

    missing_rows = setdiff(namespace, rownames(pfm_mat))

    if(length(missing_rows) > 0){
      miss = matrix(rep(0, length(missing_rows) * ncol(pfm_mat)), nrow=length(missing_rows), dimnames = list(missing_rows))
      pfm_mat = rbind(pfm_mat, miss)
    }

    pfm_mat = pfm_mat[namespace,]

  }else{
    # Process sequences

    # Number of positions in alignment
    num_pos = nchar(seqs[1])
    # Number of sequences
    nseqs = length(seqs)
    # Letter matrix
    letter_mat = CLletterMatrix(seqs)


    # Get namespace
    ns = CLfindNamespace(letter_mat, seq_type, namespace=namespace)
    namespace = ns$namespace
    seq_type = ns$seq_type

    # Construct PWM
    pfm_mat = apply(letter_mat, 2, function(pos.data){
      # Get frequencies
      t = table(pos.data)
      # Match to aa
      ind = match(namespace, names(t))
      # Create column
      col = t[ind]
      col[is.na(col)] = 0
      names(col) = namespace
      # Do relative frequencies
      col = col / sum(col)
      col
    })

    mat = matrix((letter_mat %in% namespace), nrow=nrow(letter_mat))
    attr(pfm_mat, 'nongapped') = apply(mat, 2, sum)
    attr(pfm_mat, 'nseqs') = nseqs
  }

  # Number of letters in ns
  N = length(namespace)

  # Assign seq type and namespace as attributes
  attr(pfm_mat, 'seq_type') = seq_type
  attr(pfm_mat, 'namespace') = namespace

  # Non-gapped columns
  if(seq_type == 'aa') namespace = c(namespace, 'X', 'B', 'Z')

  # Information content
  attr(pfm_mat, 'bits') = CLcomputeBits(pfm_mat, N, nseqs)

  # Assign AA names to rows/pos col
  rownames(pfm_mat) = namespace
  colnames(pfm_mat) = 1:num_pos

  if(keep_letter_mat) return(list(letter_mat = letter_mat, pfm=pfm_mat))

  return(pfm_mat)
}



######################
# Matrix to heights
######################

# General function to convert matrix of heights to polygon data frame
# @param mat matrix of heghts
# @param seq_type sequence type
# @decreasing Sets order of letters, high to low or low to high
CLmatrix_to_heights <- function(mat, seq_type, decreasing=TRUE){

  mat[is.infinite(mat)] = 0

  if(any(duplicated(rownames(mat)))) stop('Matrix input must have unique row names')

  dat = lapply(1:ncol(mat), function(i){
    vals = mat[,i]

    pos = sort( vals[vals >= 0], decreasing = decreasing)
    neg = sort(vals[vals < 0], decreasing = !decreasing)
    #vals = sort(vals, decreasing = T)
    cs_pos = cumsum( pos )
    cs_neg = cumsum( neg )

    df_pos = df_neg = NULL

    if(length(pos) > 0)
      df_pos = data.frame(letter=names(pos), position=i,  y0=c(0, cs_pos[-length(cs_pos)]),
                          y1=cs_pos, stringsAsFactors = FALSE)

    if(length(neg) > 0)
      df_neg = data.frame(letter=names(neg), position=i, y0=cs_neg, y1=c(0, cs_neg[-length(cs_neg)]),
                          stringsAsFactors = FALSE)

    rbind(df_pos, df_neg)
  })

  dat = do.call(rbind, dat)

  # Adjust y spacing
  space_factor = 0.004
  y_pad = max(dat$y1) * space_factor
  dat$y0 = dat$y0 + y_pad
  dat = subset(dat, dat$y1 > dat$y0)

  # Dummy points to make sure full plot is drawn
  # Make sure position 1 and n have a dummy empty letter missing
  dummy = data.frame(letter=dat$letter[1], position=NA, y0=0, y1=0)

  # Missing first position
  if(dat$position[1] != 1){
    dummy$position = 1
    dat = rbind( dummy, dat )
  }

  # Missing last position
  if(dat$position[nrow(dat)] != ncol(mat)){
    dummy$position = ncol(mat)
    dat = rbind( dat, dummy )
  }

  rownames(dat) = NULL

  attr(dat, 'seq_type') = seq_type

  dat
}



#######################
# Two sample logo functions - method not used currently
#######################
# t_test = function(a, b){
#   x = tryCatch({
#     return( t.test(a, b, var.equal = T)$p.value )
#   } , error=function(e) return(1) )
#   x
# }
#
# binom_test = function(a, b){
#   binom.test(sum(a), length(a), sum(b)/length(b))$p.value
# }
#
# # ttest pvalue calculation reimplemented from TSL code
# ttest_p_value <- function(k1, n1, k2, n2)  {
#   mean1 = k1 / n1;
#   mean2 = k2 / n2;
#
#   var1_mult = (k1*(1-mean1)*(1-mean1)) + ((n1-k1)*mean1*mean1);
#   var2_mult = (k2*(1-mean2)*(1-mean2)) + ((n2-k2)*mean2*mean2);
#
#   df   = n1 + n2 - 2;
#   svar = (var1_mult + var2_mult) / df;
#   t    = (mean1-mean2) / sqrt(svar*(1.0/n1 + 1.0/n2));
#   return( 2*pt(t, df, lower=FALSE) )
# }
#
# # Convert to matrix of 1s and 0s
# to_matrix = function (x, seq){
#   X <- matrix(0, length(x), length(seq), dimnames = list(names(x), seq))
#   for (i in 1:length(seq)) X[x == seq[i], i] <- 1
#   return(X)
# }
#
# twosamplelogo_method <- function(fg, bg, fix_pos=NULL, test='t.test', pval_thresh=0.05, ...){
#   if(!is.character(fg) | !is.character(bg))
#     stop('Foreground and background sequences must be character vectors')
#
#   if(!identical(unique(nchar(fg)), unique(nchar(bg))))
#     stop('Foreground sequences must have same width as background')
#
#   fg_obj = CLmakePFM(fg, keep_letter_mat=T, ...)
#
#   namespace = attr(fg_obj$pfm, 'namespace')
#   seq_type = attr(fg_obj$pfm, 'seq_type')
#
#   # Pass sequence type and namespace - avoid double guessing
#   bg_obj = CLmakePFM(bg, keep_letter_mat=T, seq_type = 'other', namespace = namespace)
#
#   # Difference in relative frequencies
#   pfm_diff = fg_obj$pfm - bg_obj$pfm
#
#   # Get letter matrices
#   fg_lm = fg_obj$letter_mat
#   bg_lm = bg_obj$letter_mat
#
#   pv_mat = sapply(1:ncol(fg_lm), function(i){
#     p = to_matrix(fg_lm[,i], namespace)
#     n = to_matrix(bg_lm[,i], namespace)
#
#     np = nrow(p)
#     nn = nrow(n)
#
#     #pv = sapply(1:ncol(p), function(j) binom_test(p[,j], n[,j]) )
#     pv = sapply(1:ncol(p), function(j) ttest_p_value(sum(p[,j]), np, sum( n[,j] ), nn) )
#     names(pv) = names(p)
#     pv
#   })
#
#   # Set things below threshold to zero
#   pfm_diff[ pv_mat >= pval_thresh ] = 0
#   pfm_diff = pfm_diff * 100
#
#   #fix_pos = 1
#   if(!is.null(fix_pos)){
#     i = apply(fg_obj$pfm[,fix_pos,drop=F], 2, which.max)
#     ind = matrix(c(i, fix_pos), ncol=2)
#     x = pfm_diff
#     x[x < 0] = 0
#     pfm_diff[ind] = max( apply(x, 2, sum) )
#   }
#
#   # Make heights
#   hh = CLmatrix_to_heights(pfm_diff, seq_type)
#   hh
# }


# plogo <- function(fg, bg, pval_thresh=0.05){

#   fg_obj = CLmakePFM(fg, keep_letter_mat=T, NO REL FREQ, ...)

#   namespace = attr(fg_obj$pfm, 'namespace')
#   seq_type = attr(fg_obj$pfm, 'seq_type')

#   # Pass sequence type and namespace - avoid double guessing
#   bg_obj = CLmakePFM(bg, keep_letter_mat=T, seq_type = seq_type, namespace = namespace)


#   # -log( binom.test(1, 100, 0.01, alternative = 'g')$estimate /
#   #   binom.test(1, 100, 0.01, alternative = 'l')$estimate )

# }
carinelegrand/RiboVIEW documentation built on July 17, 2020, 3:02 p.m.