R/countHumanParalogs.R

#' countHumanParalogs
#'
#' This function uses the \pkg{biomaRt} package to extract paralog information for supplied human genes
#'
#' @param con A \code{SQLiteConnection} object
#' @param genes A vector of ENSEMBL gene ids
#' @param alt_mart A Mart object created using biomaRt::useMart.  This should specify \code{hsapiens_gene_ensembl}
#'   as the dataset.
#' @return a data frame
#' @export
#' @examples
#' \dontrun{
#' ex_con <- setupSQLite()
#' all_genes <- getAllHumanGenes(ex_con)
#' set.seed(20120806)
#' ex_genes <- sample(all_genes$gene_id, 100)
#'
#' #count paralogs
#' ex_df <- countHumanParalogs(ex_con, ex_genes)
#' ex_df[1:10,1:2]
#'
#' #view the tables in the database
#' DBI::dbListTables(ex_con)
#'
#' #larger set of genes
#' countHumanParalogs(ex_con, sample(all_genes$gene_id, 2000))[1:10,1:2]
#'
#' #Specify which Mart to use
#' my_mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl", host="sep2015.archive.ensembl.org")
#' ex_df <- countHumanParalogs(ex_con, ex_genes, alt_mart=my_mart)
#' ex_df[1:10,1:2]
#' }

countHumanParalogs <- function(con, genes, alt_mart=NULL) {

    #get the data from ensembl - use alt_mart if supplied
    if(class(alt_mart) == 'Mart' && alt_mart@dataset == 'hsapiens_gene_ensembl') {
        message("Using user provided mart details")
        ensembl <- alt_mart
    } else {
        message("Using default mart details")
        ensembl <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host="mar2015.archive.ensembl.org")
    }

    #display info on mart
    message(sprintf("Biomart: %s\nHost: %s\nDataset: %s\n", ensembl@biomart, ensembl@host, ensembl@dataset))

    # list of gene-chunks, each of chunk 500 genes
    gene_chunks <- split(genes, ceiling(seq_along(genes)/500))
    message(paste("splitting data into ",length(gene_chunks),"chunks of up to 500 genes"))

    #internal function to get the data from biomart
    queryMart <- function(query_genes) {
        message('fetching from mart ...')
        biomaRt::getBM ( attributes = c('ensembl_gene_id',
                                        'hsapiens_paralog_ensembl_gene',
                                        'hsapiens_paralog_orthology_type',
                                        'hsapiens_paralog_perc_id',
                                        'hsapiens_paralog_perc_id_r1'),
                         filters = 'ensembl_gene_id',
                         values = query_genes,
                         mart = ensembl,
                         bmHeader=F)
    }

    #get the data from ensembl
    martdata <- lapply(gene_chunks, queryMart) %>% dplyr::bind_rows() %>% as.data.frame()

    #add filter so that %id is >20 and only within species paralogs ar eused
    martdata <- martdata %>% dplyr::mutate(filter = hsapiens_paralog_perc_id > 20 &
                                               hsapiens_paralog_orthology_type == 'within_species_paralog')

    #count paralogs per gene
    martdata_grouped <- martdata %>%
        dplyr::filter(filter) %>%
        dplyr::group_by(ensembl_gene_id) %>%
        dplyr::summarise(count_paralogs=n(),
                         paralog_ids=paste(hsapiens_paralog_ensembl_gene, sep='|', collapse='|')) %>%
        dplyr::ungroup()

    #ensure genes with no paralogs accounted for
    martdata_notfound <- data.frame(ensembl_gene_id = setdiff(genes, martdata_grouped$ensembl_gene_id),
                                    count_paralogs=0,
                                    paralog_ids='',
                                    stringsAsFactors = FALSE)

    #combine output
    count_paralogs_output <- dplyr::bind_rows(martdata_grouped, martdata_notfound) %>%
        dplyr::arrange(ensembl_gene_id) %>%
        as.data.frame()

    DBI::dbWriteTable(con, "count_paralogs_martdata", martdata, overwrite=T)
    DBI::dbWriteTable(con, 'count_paralogs_output', count_paralogs_output, overwrite=T)

    return(count_paralogs_output)
}
chapmandu2/CollateralVulnerability2016 documentation built on May 13, 2019, 3:27 p.m.