R/matrix.R

Defines functions sift.supermatrices sift supermatrices_get backbone_sequences_select drop_tips supermatrix_get sequences_filter sequences_select pmissing.gapmatrix pmissing gapmatrix_get

Documented in drop_tips gapmatrix_get pmissing.gapmatrix sequences_filter sequences_select sift sift.supermatrices supermatrices_get supermatrix_get

#' @name gapmatrix_get
#' @title Generate a gaps matrix
#' @description Return a matrix of TRUE and FALSE corresponding to '-' in a
#' alignment.
#' @param alignment alignment object
#' @return gapmatrix
gapmatrix_get <- function(alignment) {
  res <- alignment == '-'
  rownames(res) <- rownames(alignment)
  class(res) <- 'gapmatrix'
  res
}

pmissing <- function(x) {
  UseMethod('pmissing', x)
}

#' @rdname gapmatrix_get
#' @param x gapmatrix object
pmissing.gapmatrix <- function(x) {
  sum(x)/(nrow(x) * ncol(x))
}

#' @name sequences_select
#' @title Select sequences from alignments
#' @description Return a alignment of just sequences corresponding to
#' nms.
#' @param alignment_list alignment list object
#' @param nms names of tips to be selected
#' @return alignment_list
sequences_select <- function(alignment_list, nms) {
  alignment_get <- function(i) {
    alignment <- alignment_list[[i]]
    nbp <- ncol(alignment)
    res <- matrix(data = '-', nrow = length(nms), ncol = nbp)
    rownames(res) <- nms
    present <- rownames(alignment)[rownames(alignment) %in% nms]
    res[present, ] <- alignment[present, ]
    class(res) <- 'alignment'
    res
  }
  res <- lapply(seq_along(alignment_list), alignment_get)
  names(res) <- names(alignment_list)
  class(res) <- 'alignment_list'
  res
}

#' @name sequences_filter
#' @title Filter sequecnes from alignments
#' @description Return a filtered alignment. Columns will be dropped
#' that have fewer proportion of non-gaps then cutoff and genes will be dropped
#' that are less than min_nbps long.
#' @param alignment_list alignment list object
#' @param cutoff Proportion of non-gaps
#' @param min_nbps Minimum sequence length
#' @return alignment_list
sequences_filter <- function(alignment_list, cutoff = 0.9, min_nbps = 200) {
  calc <- function(alignment) {
    gapmatrix <- gapmatrix_get(alignment = alignment)
    pclmn <- 1 - (colSums(gapmatrix)/nrow(gapmatrix))
    keep_clmns <- pclmn >= cutoff
    alignment <- alignment[ ,keep_clmns]
    class(alignment) <- 'alignment'
    alignment
  }
  alignment_list <- lapply(X = alignment_list, FUN = calc)
  nbps <- vapply(X = alignment_list, FUN = ncol, FUN.VALUE = integer(1))
  res <- alignment_list[nbps >= min_nbps]
  class(res) <- 'alignment_list'
  res
}

#' @name supermatrix_get
#' @title Convert list of alignments into a supermatrix
#' @description Return a supermatrix object from a list of alignments. The
#' supermatrix object is smaller.
#' @param alignment_list alignment list object
#' @return supermatrix
supermatrix_get <- function(alignment_list) {
  stick_together <- function(i) {
    res <- unlist(lapply(X = alignment_list, FUN = function(x) x[i, ]))
    paste0(res, collapse = '')
  }
  nbps <- vapply(X = alignment_list, FUN = ncol, FUN.VALUE = integer(1))
  ntips <- nrow(alignment_list[[1]])
  res <- lapply(seq_len(ntips), stick_together)
  names(res) <- rownames(alignment_list[[1]])
  gene_nms <- names(alignment_list)
  if (is.null(gene_nms)) {
    gene_nms <- as.character(seq_along(alignment_list))
  }
  attr(res, 'genes') <- gene_nms
  attr(res, 'nbps') <- nbps
  class(res) <- 'supermatrix'
  res
}

#' @name drop_tips
#' @title Drop tips from a supermatrix
#' @description Return a supermatrix object with tips dropped that have more
#' non-gaps than cutoff.
#' @param supermatrix supermatrix object
#' @param cutoff Proportion of non-gap cutoff
#' @return supermatrix
drop_tips <- function(supermatrix, cutoff = 0.5) {
  nmissing <- vapply(X = gregexpr(pattern = '-', text = supermatrix),
                     FUN = length, FUN.VALUE = integer(1))
  nbps <- sum(attr(supermatrix, 'nbps'))
  pmissing <- 1 - (nmissing/nbps)
  keep_tips <- pmissing >= cutoff
  res <- supermatrix[keep_tips]
  attr(res, 'genes') <- attr(supermatrix, 'genes')
  attr(res, 'nbps') <- attr(supermatrix, 'nbps')
  class(res) <- 'supermatrix'
  res
}

backbone_sequences_select <- function(groups, alignment_list) {
  backbone_alignment_list <- alignment_list
  to_drop <- NULL
  for (grp_id in names(groups)) {
    nms <- groups[[grp_id]]
    for (i in seq_along(backbone_alignment_list)) {
      al <- backbone_alignment_list[[i]]
      pull <- rownames(al) %in% nms
      if (all(pull)) {
        to_drop <- c(i, to_drop)
        next
      }
      if (any(pull)) {
        pssbls <- rownames(al)[pull]
        if (length(pssbls) > 1) {
          # select the one with fewest gaps
          selected <- pssbls[which.min(rowSums(al[pssbls, ] == '-'))]
        } else {
          selected <- pssbls
        }
        rownames(al)[rownames(al) == selected] <- grp_id
        al <- al[!rownames(al) %in% pssbls, ]
        class(al) <- 'alignment'
        backbone_alignment_list[[i]] <- al
      }
    }
  }
  backbone_alignment_list <- backbone_alignment_list[-1 * to_drop]
  class(backbone_alignment_list) <- 'alignment_list'
  nms <- unique(unlist(lapply(X = backbone_alignment_list, FUN = rownames)))
  sequences_select(alignment_list = backbone_alignment_list, nms = nms)
}

#' @name supermatrices_get
#' @title Create supermatrices from list of alignments and groups object
#' @description Return a supermatrices object from a list of alignments
#' and a groups object.
#' @param groups groups object, determines names of monophyletic groups
#' @param alignment_list list of alignments
#' @param column_cutoff Min. prop. of non-gaps in an alignment column
#' @param tip_cutoff Min. prop. of non-gaps in an alignment row
#' @param min_ntips Min. no. of tips in a supermatrix
#' @param min_ngenes Min. no. of genes in a supermatrix
#' @param min_nbps Min. length (in base pairs) for a gene in a supermatrix
#' @return supermatrices
#' @export
supermatrices_get <- function(alignment_list, groups = NULL, column_cutoff = .5,
                              tip_cutoff = column_cutoff, min_ntips = 5,
                              min_ngenes = 5, min_nbps = 200) {
  res <- list()
  all_tips <- character(0)
  if (is.null(groups)) {
    nms <- unique(unname(unlist(lapply(X = alignment_list, FUN = rownames))))
    groups <- list('all' = nms)
    class(groups) <- 'groups'
    grp_ids <- 'all'
  } else {
    groups <- groups[names(groups) != 'unmatched']
    grp_ids <- names(groups)
    grp_ids <- c(grp_ids, 'backbone')
  }
  for (grp_id in grp_ids) {
    if (grp_id == 'backbone') {
      selected <- backbone_sequences_select(groups = groups,
                                            alignment_list = alignment_list)
    } else {
      nms <- groups[[grp_id]]
      # select sequences from alignments
      selected <- sequences_select(nms = nms, alignment_list = alignment_list)
    }
    # filter selected sequences
    filtered <- sequences_filter(alignment_list = selected,
                                 cutoff = column_cutoff, min_nbps = min_nbps)
    if (length(filtered) == 0) {
      next
    }
    # merge into supermatrix
    supermatrix <- supermatrix_get(alignment_list = filtered)
    # drop tips
    supermatrix <- drop_tips(supermatrix = supermatrix, cutoff = tip_cutoff)
    if (length(supermatrix) >= min_ntips &
        length(attr(supermatrix, 'genes')) >= min_ngenes) {
      res[[grp_id]] <- supermatrix
      # record tips
      all_tips <- c(all_tips, names(supermatrix))
    }
  }
  attr(res, 'tips') <- all_tips
  class(res) <- 'supermatrices'
  res
}

# TODO: update docs
#' @rdname supermatrices_get
#' @export
sift <- function(x, ...) {
  UseMethod('sift', x)
}

#' @rdname supermatrices_get
#' @param x supermatrices
#' @param keep Names of matrices to keep, character vector.
#' @param drop Names of matrices to drop, character vector
#' @return supermatrices
#' @export
sift.supermatrices <- function(x, keep = NULL, drop = NULL) {
  check <- function(nms) {
    pull <- nms %in% names(x)
    if (!all(pull)) {
      nms[!pull]
      msg <- paste0('Names not found:\n',
                    paste0(char(nms[!pull]), collapse = ', '))
      stop(msg)
    }
    names(x) %in% nms
  }
  if (!is.null(keep)) {
    x <- x[check(nms = keep)]
  }
  if (!is.null(drop)) {
    x <- x[!check(nms = drop)]
  }
  class(x) <- 'supermatrices'
  x
}
AntonelliLab/gaius documentation built on June 17, 2020, 8:07 a.m.