#' countHumanParalogs
#'
#' This function uses the \pkg{biomaRt} package to extract paralog information for supplied human genes
#'
#' @param con A \code{SQLiteConnection} object
#' @param genes A vector of ENSEMBL gene ids
#' @param alt_mart A Mart object created using biomaRt::useMart. This should specify \code{hsapiens_gene_ensembl}
#' as the dataset.
#' @return a data frame
#' @export
#' @examples
#' \dontrun{
#' ex_con <- setupSQLite()
#' all_genes <- getAllHumanGenes(ex_con)
#' set.seed(20120806)
#' ex_genes <- sample(all_genes$gene_id, 100)
#'
#' #count paralogs
#' ex_df <- countHumanParalogs(ex_con, ex_genes)
#' ex_df[1:10,1:2]
#'
#' #view the tables in the database
#' DBI::dbListTables(ex_con)
#'
#' #larger set of genes
#' countHumanParalogs(ex_con, sample(all_genes$gene_id, 2000))[1:10,1:2]
#'
#' #Specify which Mart to use
#' my_mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl", host="sep2015.archive.ensembl.org")
#' ex_df <- countHumanParalogs(ex_con, ex_genes, alt_mart=my_mart)
#' ex_df[1:10,1:2]
#' }
countHumanParalogs <- function(con, genes, alt_mart=NULL) {
#get the data from ensembl - use alt_mart if supplied
if(class(alt_mart) == 'Mart' && alt_mart@dataset == 'hsapiens_gene_ensembl') {
message("Using user provided mart details")
ensembl <- alt_mart
} else {
message("Using default mart details")
ensembl <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host="mar2015.archive.ensembl.org")
}
#display info on mart
message(sprintf("Biomart: %s\nHost: %s\nDataset: %s\n", ensembl@biomart, ensembl@host, ensembl@dataset))
# list of gene-chunks, each of chunk 500 genes
gene_chunks <- split(genes, ceiling(seq_along(genes)/500))
message(paste("splitting data into ",length(gene_chunks),"chunks of up to 500 genes"))
#internal function to get the data from biomart
queryMart <- function(query_genes) {
message('fetching from mart ...')
biomaRt::getBM ( attributes = c('ensembl_gene_id',
'hsapiens_paralog_ensembl_gene',
'hsapiens_paralog_orthology_type',
'hsapiens_paralog_perc_id',
'hsapiens_paralog_perc_id_r1'),
filters = 'ensembl_gene_id',
values = query_genes,
mart = ensembl,
bmHeader=F)
}
#get the data from ensembl
martdata <- lapply(gene_chunks, queryMart) %>% dplyr::bind_rows() %>% as.data.frame()
#add filter so that %id is >20 and only within species paralogs ar eused
martdata <- martdata %>% dplyr::mutate(filter = hsapiens_paralog_perc_id > 20 &
hsapiens_paralog_orthology_type == 'within_species_paralog')
#count paralogs per gene
martdata_grouped <- martdata %>%
dplyr::filter(filter) %>%
dplyr::group_by(ensembl_gene_id) %>%
dplyr::summarise(count_paralogs=n(),
paralog_ids=paste(hsapiens_paralog_ensembl_gene, sep='|', collapse='|')) %>%
dplyr::ungroup()
#ensure genes with no paralogs accounted for
martdata_notfound <- data.frame(ensembl_gene_id = setdiff(genes, martdata_grouped$ensembl_gene_id),
count_paralogs=0,
paralog_ids='',
stringsAsFactors = FALSE)
#combine output
count_paralogs_output <- dplyr::bind_rows(martdata_grouped, martdata_notfound) %>%
dplyr::arrange(ensembl_gene_id) %>%
as.data.frame()
DBI::dbWriteTable(con, "count_paralogs_martdata", martdata, overwrite=T)
DBI::dbWriteTable(con, 'count_paralogs_output', count_paralogs_output, overwrite=T)
return(count_paralogs_output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.