R/iprSelectedPositions.R

Defines functions testAlignedForUnalignedAAPos replaceSanitizedWithOriginalIDs domainsForPos alignedForUnalignedAAPos unalignedAAforAlignedAAPos iprWithSelectedAA readMinimumInterProScanResultTable readInterProScanResultTable

Documented in alignedForUnalignedAAPos domainsForPos iprWithSelectedAA readInterProScanResultTable readMinimumInterProScanResultTable replaceSanitizedWithOriginalIDs testAlignedForUnalignedAAPos unalignedAAforAlignedAAPos

#' Parses the result file of InterProScan into a data.frame.
#'
#' @param path.2.iprscan.res The file path to the argument InterProScan result
#' table.
#'
#' @return A data.frame with 14 columns.
#' @export
#' 
readInterProScanResultTable <- function(path.2.iprscan.res) {
    read.table(path.2.iprscan.res, sep = "\t", stringsAsFactors = FALSE, 
        fill = TRUE, comment.char = "", quote = "", na.string = "")
}


#' Uses function 'readInterProScanResultTable(path.2.iprscan.res)' and retains
#' only two columns: 1. Gene Identifier, and 2. InterPro Identifier.
#'
#' @param path.2.iprscan.res The file path to the argument InterProScan result
#' table.
#' @param gene.col The column identifier for the column in which to find the
#' gene identifiers.
#' @param ipr.col The column identifier for the column in which to find the
#' InterPro identifiers.
#'
#' @export
#' @return An instance of base::data.frame
readMinimumInterProScanResultTable <- function(path.2.iprscan.res, 
    gene.col = 1, ipr.col = 12) {
    ipr.df <- readInterProScanResultTable(path.2.iprscan.res)
    if (!is.null(ipr.df) && !is.na(ipr.df) && nrow(ipr.df) > 0) {
        ipr.df <- ipr.df[which(!is.na(ipr.df[, gene.col]) & !is.na(ipr.df[, 
            ipr.col]) & ipr.df[, ipr.col] != "NA"), c(gene.col, ipr.col)]
        colnames(ipr.df) <- paste("V", 1:2, sep = "")
    }
    unique(ipr.df)
}


#' Identifies conserved protein domains (InterPro) overlapping with the
#' conserved homologous amino acid (codon) 'sel.aa'.
#'
#' @param sel.aa The index of the conserved amino acid under selection in the
#' corresponding multiple sequence alignment (MSA)
#' @param gene The gene accession / ID as used in 'msa.fasta'
#' @param msa.fasta The result of
#' \code{seqinr::read.fasta(path_2_AAs_MSA.fasta, seqtype='AA', as.string=TRUE,
#' strip.desc=TRUE)}.
#' @param iprscan.tbl The result of calling
#' readInterProScanResultTable(path_2_interproscan_result_table.tsv)
#'
#' @return  A character vector of matching InterPro entries.
#' @export
#' 
iprWithSelectedAA <- function(sel.aa, gene, msa.fasta, iprscan.tbl) {
    sel.aa.unaligned.pos <- unalignedAAforAlignedAAPos(gene, sel.aa, 
        msa.fasta)
    if (!is.na(sel.aa.unaligned.pos)) 
        domainsForPos(gene, sel.aa.unaligned.pos, iprscan.tbl) else NA
}


#' Computes the position in the unaligned sequence of a character indicated by
#' \code{sel.aa}.
#'
#' @param  gene The gene accession / ID as used in 'msa.fasta'
#' @param  sel.aa The index of the homologous amino acid subject to selection
#' (integer coordinate)
#' @param msa.fasta The result of
#' \code{seqinr::read.fasta(path_2_AAs_MSA.fasta, seqtype='AA', as.string=TRUE,
#' strip.desc=TRUE)}.
#' @param gap.char The character or regular expression used to identify non
#' sequence characters in the aligned sequences. Default is
#' \code{getOption('GeneFamilies.gap.char', '-')}.
#' @param gap.char.matching.fixed boolean indicating the value to pass to
#' \code{grepl} and \code{gsub} argument \code{fixed=}. Set to \code{TRUE} if
#' \code{gap.char} is literal and atomic. Default is
#' getOption("GeneFamilies.gap.char.matching.fixed", TRUE).
#'
#' @return  An integer; either NA if the position is a non sequence character,
#' or the corresponding un-aligned position.
#' @export
unalignedAAforAlignedAAPos <- function(gene, sel.aa, msa.fasta, gap.char = getOption("GeneFamilies.gap.char", 
    "-"), gap.char.matching.fixed = getOption("GeneFamilies.gap.char.matching.fixed", 
    TRUE)) {
    aa.algn.seq <- msa.fasta[[gene]][[1]]
    aa.algn.char <- substr(aa.algn.seq, sel.aa, sel.aa)
    if (grepl(gap.char, aa.algn.char, fixed = gap.char.matching.fixed)) {
        NA
    } else {
        nchar(gsub(gap.char, "", substr(aa.algn.seq, 1, sel.aa), fixed = gap.char.matching.fixed))
    }
}

#' Computes the position of the unaligned character at argument position
#' 'char.pos' in the aligned sequence of argument 'gene'.
#'
#' @param gene The gene accession / ID as used in 'msa.fasta'
#' @param char.pos The index of the homologous amino acid subject to selection
#' (integer coordinate)
#' @param msa.fasta The result of
#' \code{seqinr::read.fasta(path_2_AAs_MSA.fasta, seqtype='AA', as.string=TRUE,
#' strip.desc=TRUE)}.
#' @param gap.char The character used to identify non sequence characters in
#' the aligned sequences. Default is \code{getOption('GeneFamilies.gap.char',
#' '-')}.
#'
#' @return  An integer; either NA if the position is a non sequence character,
#' or the corresponding aligned position.
#' @export
alignedForUnalignedAAPos <- function(gene, char.pos, msa.fasta, gap.char = getOption("GeneFamilies.gap.char", 
    "-")) {
    aligned.seq <- msa.fasta[[gene]][[1]]
    n.non.gap.char <- 0
    i <- 1
    aligned.pos <- NA
    # do
    repeat {
        if (substr(aligned.seq, i, i) != gap.char) {
            n.non.gap.char <- n.non.gap.char + 1
        }
        # until
        if (n.non.gap.char == char.pos) {
            aligned.pos <- i
            break
        } else if (i == nchar(aligned.seq)) {
            break
        } else {
            i <- i + 1
        }
    }
    aligned.pos
}

#' Looks up the conserved protein domains (InterPro) that have been annotated
#' to gene and overlap with amino acid position 'aa.pos'.
#'
#' @param gene The gene accession or ID as used in iprscan.tbl
#' @param aa.pos The amino acid position (non aligned) to be overlapped by any
#' InterPros
#' @param gene.col The column index or name of iprscan.tbl in which to look up
#' the gene accessions
#' @param start.col The column index or name of iprscan.tbl in which to look up
#' the domain start positions
#' @param end.col The column index or name of iprscan.tbl in which to look up
#' the domain end positions
#' @param ipr.col The column index or name of iprscan.tbl in which to look up
#' the InterPro identifier
#'
#' @return  A character vector of matching InterPro domains.
#' @export
#' 
domainsForPos <- function(gene, aa.pos, iprscan.tbl, gene.col = "V1", 
    start.col = "V7", end.col = "V8", ipr.col = "V12") {
    x <- iprscan.tbl[which(iprscan.tbl[, gene.col] == gene), ]
    sort(unique(x[which(x[, start.col] <= aa.pos & x[, end.col] >= 
        aa.pos), ipr.col]), na.last = NA)
}


#' Replaces sanitized protein identifiers in XStringSet 'xstring.set' with the
#' original ones held in 'name.maps' table.
#'
#' @param xstring.set A vector or list of strings
#' @param name.maps A data.frame with at least two columns one holding the
#' sanitized and the other the original protein IDs
#' @param san.col The column index or name of 'name.maps' holding the sanitized
#' protein IDs
#' @param orig.col The column index or name of 'name.maps' holding the original
#' protein IDs
#'
#' @return  A copy of argument 'xstring.set' with the sanitized protein IDs
#' replaced with their originals.
#' @export
#' 
replaceSanitizedWithOriginalIDs <- function(xstring.set, name.maps, 
    san.col = "sanitized", orig.col = "original") {
    names(xstring.set) <- as.character(lapply(names(xstring.set), 
        function(x) {
            name.maps[which(name.maps[, san.col] == x), orig.col]
        }))
    xstring.set
}

#' Test for function \code{alignedForUnalignedAAPos}.
#'
#' @return TRUE if and only if all tests pass.
#' @export
testAlignedForUnalignedAAPos <- function() {
    msa.fasta <- list(A = "---Hello---World")
    gene.id <- "A"
    act.res.1 <- alignedForUnalignedAAPos(gene.id, 1, msa.fasta)
    t.1 <- 4 == act.res.1
    act.res.2 <- alignedForUnalignedAAPos(gene.id, 6, msa.fasta)
    t.2 <- 12 == act.res.2
    all(c(t.1, t.2))
}
asishallab/GeneFamilies documentation built on May 22, 2023, 11:30 a.m.