#' Identify homologous gene pairs in a table of significantly similar genes.
#'
#' @param seq.sim.tbl a data frame of two character columns and a third score
#' column. It is highly important that this table represents only gene pairs of
#' TWO species, not more or less.
#'
#' @return A subset of the argument data frame, holding only the two character
#' columns, containing the candidate homologs.
#' @export
homologs <- function(seq.sim.tbl) {
# Exclude self matches:
x <- seq.sim.tbl[which(seq.sim.tbl[, "V1"] != seq.sim.tbl[, "V2"]), ]
# Order by score, such that best scoring hits are listed first for each query:
x <- x[with(x, order(V1, -V3)), c("V1", "V2")]
# Retain only the best hit for each query:
x <- x[which(!duplicated(x$V1)), ]
# Identify reciprocal best hits:
y <- data.frame(V1 = x$V2, V2 = x$V1, stringsAsFactors = FALSE)
dbl.x <- rbind(x, y)
y <- dbl.x[duplicated(dbl.x[, 1:2]), ]
rownames(y) <- 1:nrow(y)
y
}
#' Test of function 'homologs(...)'
#'
#' @return TRUE if and only if the test succeeded.
#' @export
testHomologs <- function() {
df <- data.frame(V1 = c(1, 1, 1:5), V2 = c(1, 4, 4, 3, 2, 1, 8), V11 = c(0, 0,
1, rep(0, 4)), stringsAsFactors = FALSE)
expected.res <- data.frame(V1 = 4:1, V2 = 1:4, V11 = rep(0, 2), stringsAsFactors = FALSE)
res <- homologs(df)
# Ignore increased row-names in 'identical' comparison:
all(as.logical(lapply(1:ncol(res), function(i) identical(as.numeric(res[, i]),
as.numeric(expected.res[, i])))))
}
#' Identifies orthologous gene pairs as genes being significantly similar in
#' sequence and belonging to different species.
#'
#' @param homologs.tbl a data.frame of two character columns - possibly the
#' result obtained from invoking homologous(...)
#' @param gene.regex.1 a string encoding a PERL regular expression to match
#' gene identifiers to species A
#' @param gene.regex.2 a string encoding a PERL regular expression to match
#' gene identifiers to species B
#'
#' @return A subset of argument 'homologs.tbl' containg the orthologous gene
#' pairs. Note that if pair (A,B) is contained the reciprocal pair (B,A) is,
#' too. This should ease lookup.
#' @export
orthologs <- function(homologs.tbl, gene.regex.1, gene.regex.2) {
paralogs.1 <- grepl(gene.regex.1, homologs.tbl$V1, perl = TRUE) & grepl(gene.regex.1,
homologs.tbl$V2, perl = TRUE)
paralogs.2 <- grepl(gene.regex.2, homologs.tbl$V1, perl = TRUE) & grepl(gene.regex.2,
homologs.tbl$V2, perl = TRUE)
homologs.tbl[which(!(paralogs.1 | paralogs.2)), ]
}
#' Parses 'psl' default BLAT output formatted files to infer orthologs between
#' species. Uses PERL regular expressions to identify which gene identifiers
#' belongs to which species. Hence by using two such regular expressions
#' orthologs between two species can be extracted, even though the BLAT output
#' file contains similarity pairs from more than two species. Note that
#' orthology is inferred based on reciprocal best search results.
#'
#' @param blat.df a data frame holding the output of BLAT in psl format. Se
#' function readBlatPSLoutput(...) for more details
#' @param query.col the number of the column in which to lookup the query
#' sequence's name
#' @param target.col the number of the column in which to lookup the target
#' sequence's name
#' @param matching.bases.col the number of the column in which to lookup the
#' number of nucleotide bases matching between query and target
#' @param qSize.col the number of the column in which to lookup the query
#' sequence's size
#' @param tSize.col the number of the column in which to lookup the target
#' sequence's size
#' @param spec.regexs a character vector containg two regular expressions
#' matching gene identifiers from each of the two species orthologies are to be
#' found for
#'
#' @return A data frame of two columns holding the gene identifiers of the
#' orthologous gene pairs. Note, that for reasons of easing lookup inverse
#' pairs are also contained.
#' @export
orthologsFromBLATpslTable <- function(blat.df, query.col = 10, target.col = 14, matching.bases.col = 1,
qSize.col = 11, tSize.col = 15, spec.regexs = c("CAHR\\d+(\\.\\d)?", "AT[0-9MC]G\\d+(\\.\\d)?")) {
# Retain only those gene pairs, where each gene matches one of the species'
# regular expressions:
b.df <- blat.df[which((grepl(spec.regexs[[1]], blat.df[, query.col], perl = TRUE) |
grepl(spec.regexs[[1]], blat.df[, target.col], perl = TRUE)) & (grepl(spec.regexs[[2]],
blat.df[, query.col], perl = TRUE) | grepl(spec.regexs[[2]], blat.df[, target.col],
perl = TRUE))), ]
b.df <- b.df[which((b.df[, matching.bases.col] >= ((b.df[, qSize.col] + b.df[,
tSize.col])/4))), c(query.col, target.col, matching.bases.col)]
colnames(b.df) <- paste("V", 1:ncol(b.df), sep = "")
# Identify homologous candidates and retain orthologs as reciprocal best hits of
# genes belonging to different species:
orthologs(homologs(b.df), spec.regexs[[1]], spec.regexs[[2]])
}
#' Wraps function base::read.table with the proper arguments needed to
#' efficiently and correctly read in a BLAT output table in PSL format.
#'
#' @param blat.psl.path valid file path to the respective BLAT psl output file
#'
#' @return The read in table as data frame
#' @export
readBlatPSLoutput <- function(blat.psl.path) {
read.table(blat.psl.path, sep = "\t", skip = 5, stringsAsFactors = FALSE, colClasses = c(rep("numeric",
8), rep("character", 2), rep("numeric", 3), "character", rep("numeric", 4),
rep("character", 3)), comment.char = "", quote = "", na.strings = "")
}
#' Identifies those pairs of orthologous genes where one member appears in all
#' pairwise species comparisons.
#'
#' @param pairwise.orths a list of two column data frames holding orthologous
#' gene pairs identified in pairwise species comparisons. The names of this
#' list reflect the species the anchoring species was compared to iteratively
#' @param intersect.col the name or index of the column of each of the data
#' frames in 'pairwise.orths' to use as a source of gene identifiers
#' @param ortholog.col the name or index of the column to extract ortholog
#' genes from when they match anchor species orthologs conserved over all
#' species comparisons
#' @param anchor.species the name of the species which was used in all pairwise
#' comparisons.
#'
#' @return a two column data frame of orthologous gene groups where an ortholog
#' was found in each species.
#' @export
conservedOrthologs <- function(pairwise.orths, intersect.col = "V1", ortholog.col = "V2",
anchor.species = "chi") {
orths.genes <- lapply(pairwise.orths, function(x) unique(as.character(unlist(x[,
intersect.col]))))
conserved.genes <- base::Reduce(intersect, orths.genes)
conserved.orths <- data.frame(V1 = conserved.genes, stringsAsFactors = FALSE)
colnames(conserved.orths) <- anchor.species
cbind(conserved.orths, do.call("cbind", lapply(pairwise.orths, function(x) x[which(x[,
intersect.col] %in% conserved.genes), ortholog.col])), stringsAsFactors = FALSE)
}
#' Assuming 'gene.ids' holds gene identifiers that lack splice variant suffixes
#' like '.1' but refer to genes contained in this packages coding sequences
#' (data) this function aims to identify the matching splice variants.
#'
#' @param 'gene.ids' character vector of gene identifiers to be matched to
#' coding sequence identifiers held in this packages data.
#' @param 'genes' character vector of gene identifiers with splice variant
#' suffixes to be matched against the argument gene identifiers.
#'
#' @return Character vector of the original gene identifiers or the matching
#' splice variants if the original argument identifiers were not found in
#' 'genes'. NA wherever the argument in 'gene.ids' was not found in 'genes' and
#' could also not be matched to an entry of 'genes'.
#' @export
matchingSpliceVariant <- function(gene.ids, genes = as.character(unlist(spec.gene.ids))) {
i <- which(!gene.ids %in% genes)
gene.ids[i] <- as.character(unlist(mclapply(gene.ids[i], function(x) {
y <- which(grepl(paste("\\b", x, "\\.\\d+\\b", sep = ""), genes, perl = TRUE))
if (length(y) != 1) {
warning("Could not identify unique matching splicing variant for gene identifier '",
x, "'!")
NA
} else genes[[y]]
})))
gene.ids
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.