R/doWormMineAnalysis.R

#' doWormMineAnalysis
#'
#' This function determines the lethality status in C elegans of a given human gene.
#'
#' Note: queries the WormMine API using Ruby see http://im-dev.wormbase.org/tools/wormmine/api.do?subtab=ruby
#'
#' @param con A \code{SQLiteConnection} object
#' @param genes A vector of ENSEMBL gene ids
#' @param alt_mart A Mart object created using biomaRt::useMart.  This should specify \code{hsapiens_gene_ensembl}
#'   as the dataset.  See \code{\link{getOrthologs}} for more details.
#' @return a data frame
#' @export

doWormMineAnalysis <- function(con, genes, alt_mart=NULL) {

    #get the worm orthologs
    worm_orthologs <- getOrthologs(con, genes, 'celegans', alt_mart)

    #group the worm orthologs for those genes with orthologs
    worm_orthologs_grouped <- worm_orthologs %>%
        dplyr::filter(nchar(celegans_homolog_orthology_type) > 4) %>%
        dplyr::group_by(ensembl_gene_id, celegans_homolog_orthology_type) %>%
        dplyr::summarise(celegans_count_orthologs=n(),
                         celegans_ortholog_ids=paste(celegans_homolog_ensembl_gene, collapse='|')) %>%
        dplyr::ungroup()

    #sort out entries with no ortholog
    no_worm_orthologs <-  worm_orthologs %>%
        dplyr::filter(nchar(celegans_homolog_orthology_type) < 4) %>%
        dplyr::transmute(ensembl_gene_id,
                         celegans_homolog_orthology_type='no_worm_ortholog',
                         celegans_count_orthologs=0,
                         celegans_ortholog_ids='')

    #combine and mark up the human genes with a single worm ortholog
    worm_orthologs_combined <- dplyr::bind_rows(worm_orthologs_grouped, no_worm_orthologs) %>%
        dplyr::mutate(single_ortholog_filter=(celegans_count_orthologs==1)) %>%
        as.data.frame()

    #now format gene text for ruby script
    worm_gene_ids <- worm_orthologs_combined %>%
        dplyr::filter(single_ortholog_filter) %>%
        dplyr::transmute(id=celegans_ortholog_ids) %>%
        dplyr::distinct()

    # list of gene-chunks, each of chunk 500 genes
    gene_chunks <- split(worm_gene_ids$id, ceiling(seq_along(worm_gene_ids$id)/500))
    message(paste("splitting data into ",length(gene_chunks),"chunks of up to 500 genes"))

    #internal function to get the data from Intermine
    queryMine <- function(query_genes) {

        #run the ruby script
        message(sprintf("Get data for %s worm genes from WormMine", length(query_genes)))
        worm_genes_txt <- paste(query_genes, collapse=',')
        wormmine_cmd <- sprintf('ruby \'%s/ruby/doWormMineAnalysis.rb\' "%s"',system.file(package='CollateralVulnerability2016'), worm_genes_txt)
        wormmine_data_txt <- system(wormmine_cmd, intern=TRUE)

        #import the data into a data frame
        readr::read_tsv(paste(wormmine_data_txt, collapse='\n'),
                        skip=0,
                        col_types='cccccccccc',
                        col_names=c("primaryIdentifier", "symbol", "secondaryIdentifier", "briefDescription", "alleles_symbol",
                                    "alleles_primaryIdentifier", "alleles_phenotypes_identifier", "alleles_phenotypes_name",
                                    "alleles_phenotypes_description", "query"))

    }

    #get the data from intermine
    wormmine_data <- lapply(gene_chunks, queryMine) %>% dplyr::bind_rows() %>% as.data.frame()

    message(sprintf('%s records retrieved, now analysing', nrow(wormmine_data)))

    #filter the wormmine data
    wormmine_data_filtered <- wormmine_data %>%
        dplyr::mutate(is_lethal_worm = !grepl('NotObs', query)) %>%
        as.data.frame()  #already filtered for lethal phenotypes in intermine query

    ##generate a summary table with one row per allele, highlights whether any phenotype annotations are lethal
    wormmine_data_byallele <- wormmine_data_filtered %>%
        dplyr::group_by(primaryIdentifier, symbol, alleles_primaryIdentifier, alleles_symbol) %>%
        dplyr::summarise(is_lethal_worm=as.logical(max(is_lethal_worm))) %>%
        dplyr::ungroup() %>%
        as.data.frame()

    #now count the number of lethal vs non-lethal alleles
    wormmine_data_bygene <- wormmine_data_byallele %>%
        dplyr::group_by(primaryIdentifier, symbol) %>%
        dplyr::summarise(allele_count_worm=n(),
                         lethal_count_worm=sum(is_lethal_worm),
                         lethal_pct_worm=round(lethal_count_worm*100/allele_count_worm,1)) %>%
        dplyr::ungroup() %>%
        as.data.frame()

    #combine output
    output_df <- worm_orthologs_combined %>%
        dplyr::left_join(wormmine_data_bygene, by=c('celegans_ortholog_ids'='primaryIdentifier')) %>%
        as.data.frame()

    #put data into sqlite database
    DBI::dbWriteTable(con, "wormmine_analysis_martdata_grouped", worm_orthologs_combined, overwrite=TRUE)
    DBI::dbWriteTable(con, "wormmine_analysis_wormmine_filtered", wormmine_data_filtered, overwrite=TRUE)
    DBI::dbWriteTable(con, "wormmine_analysis_wormmine_byallele", wormmine_data_byallele, overwrite=TRUE)
    DBI::dbWriteTable(con, "wormmine_analysis_wormmine_bygene", wormmine_data_bygene, overwrite=TRUE)
    DBI::dbWriteTable(con, "wormmine_analysis_output", output_df, overwrite=TRUE)
    message('Finished')


    return(output_df)
}
chapmandu2/CollateralVulnerability2016 documentation built on May 13, 2019, 3:27 p.m.