R/getOrthologs.R

#' getOrthologs
#'
#' This function uses the \pkg{biomaRt} package to get ortholog information for supplied human genes
#'
#' @param con A \code{SQLiteConnection} object
#' @param genes A vector of ENSEMBL gene ids
#' @param species The ENSEMBL species id eg mmusculus for mouse
#' @param alt_mart A Mart object created using biomaRt::useMart.  This should specify \code{hsapiens_gene_ensembl}
#'   as the dataset.
#' @return a data frame
#' @export
#' @examples
#' \dontrun{
#' ex_con <- setupSQLite()
#' ex_genes <- c('ENSG00000074800', 'ENSG00000127616')
#'
#' #get mouse orthologs
#' getOrthologs(ex_con, ex_genes, species='mmusculus')
#'
#' #get fly orthologs
#' getOrthologs(ex_con, ex_genes, species='dmelanogaster')
#'
#' #view the tables in the database
#' DBI::dbListTables(ex_con)
#'
#' #larger set of genes
#' all_genes <- getAllHumanGenes(ex_con)
#' set.seed(20120806)
#' ex_genes2 <- sample(all_genes$gene_id, 2000)
#' ortho_df <- getOrthologs(ex_con, ex_genes2, species='mmusculus')
#'
#' #Specify which Mart to use
#' my_mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl", host="sep2015.archive.ensembl.org")
#' getOrthologs(ex_con, ex_genes, species='mmusculus', alt_mart=my_mart)
#' getOrthologs(ex_con, ex_genes, species='dmelanogaster', alt_mart=my_mart)
#' }


getOrthologs <- function(con, genes, species='mmusculus', alt_mart=NULL) {

    message(sprintf("Get the %s orthologs for %s human genes from biomart", species, length(genes)))

    #get the data from ensembl - use alt_mart if supplied
    if(class(alt_mart) == 'Mart' && alt_mart@dataset == 'hsapiens_gene_ensembl') {
        message("Using user provided mart details")
        ensembl <- alt_mart
    } else {
        message("Using default mart details")
        ensembl <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host="mar2015.archive.ensembl.org")
    }

    #display info on mart
    message(sprintf("Biomart: %s\nHost: %s\nDataset: %s\n", ensembl@biomart, ensembl@host, ensembl@dataset))

    #check that the species is valid
    if (!(sprintf('%s_homolog_ensembl_gene', species) %in% biomaRt::listAttributes(ensembl)$name)) {
        stop(sprintf('%s not a valid ENSEMBL species id'))
    }

    # list of gene-chunks, each of chunk 500 genes
    gene_chunks <- split(genes, ceiling(seq_along(genes)/500))
    message(paste("splitting data into ",length(gene_chunks),"chunks of up to 500 genes"))

    #internal function to get the data from biomart
    queryMart <- function(query_genes) {
        message('fetching from mart ...')
        biomaRt::getBM ( attributes = c('ensembl_gene_id',
                                        sprintf('%s_homolog_ensembl_gene', species),
                                        sprintf('%s_homolog_orthology_type', species),
                                        sprintf('%s_homolog_perc_id', species),
                                        sprintf('%s_homolog_perc_id_r1', species)),
                         filters = 'ensembl_gene_id',
                         values = query_genes,
                         mart = ensembl,
                         bmHeader=F)
    }

    #get the data from ensembl
    martdata <- lapply(gene_chunks, queryMart) %>% dplyr::bind_rows() %>% as.data.frame()

    message('Done, now writing to database')
    DBI::dbWriteTable(con, sprintf("%s_martdata", species), martdata, overwrite=T)

    message('Finished')

    return(martdata)
}
chapmandu2/CollateralVulnerability2016 documentation built on May 13, 2019, 3:27 p.m.