R/doFlyMineAnalysis.R

#' doFlyMineAnalysis
#'
#' This function determines the lethality status in D melanogaster of a given human gene
#'
#' Note: queries the WormMine API using Ruby see http://im-dev.wormbase.org/tools/wormmine/api.do?subtab=ruby
#'
#' @param con A \code{SQLiteConnection} object
#' @param genes A vector of human ENSEMBL gene ids
#' @param alt_mart A Mart object created using biomaRt::useMart.  This should specify \code{hsapiens_gene_ensembl}
#'   as the dataset.  See \code{\link{getOrthologs}} for more details.
#' @return a data frame with the results of the analysis
#' @export

doFlyMineAnalysis <- function(con, genes, alt_mart=NULL) {

    #get the fly orthologs
    fly_orthologs <- getOrthologs(con, genes, 'dmelanogaster', alt_mart)

    #group the fly orthologs for those genes with orthologs
    fly_orthologs_grouped <- fly_orthologs %>%
        dplyr::filter(nchar(dmelanogaster_homolog_orthology_type) > 4) %>%
        dplyr::group_by(ensembl_gene_id, dmelanogaster_homolog_orthology_type) %>%
        dplyr::summarise(dmelanogaster_count_orthologs=n(),
                  dmelanogaster_ortholog_ids=paste(dmelanogaster_homolog_ensembl_gene, collapse='|')) %>%
        dplyr::ungroup()

    #sort out entries with no ortholog
    no_fly_orthologs <-  fly_orthologs %>%
        dplyr::filter(nchar(dmelanogaster_homolog_orthology_type) < 4) %>%
        dplyr::transmute(ensembl_gene_id,
                         dmelanogaster_homolog_orthology_type='no_fly_ortholog',
                         dmelanogaster_count_orthologs=0,
                         dmelanogaster_ortholog_ids='')

    #combine and mark up the human genes with a single fly ortholog
    fly_orthologs_combined <- dplyr::bind_rows(fly_orthologs_grouped, no_fly_orthologs) %>%
        dplyr::mutate(single_ortholog_filter=(dmelanogaster_count_orthologs==1)) %>%
        as.data.frame()

    #now format gene text for ruby script
    fly_gene_ids <- fly_orthologs_combined %>%
        dplyr::filter(single_ortholog_filter) %>%
        dplyr::transmute(id=dmelanogaster_ortholog_ids) %>%
        dplyr::distinct()

    # list of gene-chunks, each of chunk 500 genes
    gene_chunks <- split(fly_gene_ids$id, ceiling(seq_along(fly_gene_ids$id)/500))
    message(paste("splitting data into ",length(gene_chunks),"chunks of up to 500 genes"))

    #internal function to get the data from Intermine
    queryMine <- function(query_genes) {

        #run the ruby script
        message(sprintf("Get data for %s fly genes from FlyMine", length(query_genes)))
        fly_genes_txt <- paste(query_genes, collapse=',')
        flymine_cmd <- sprintf('ruby \'%s/ruby/doFlyMineAnalysis.rb\' "%s"',system.file(package='CollateralVulnerability2016'), fly_genes_txt)
        flymine_data_txt <- system(flymine_cmd, intern=TRUE)

        #import the data into a data frame
        readr::read_tsv(paste(flymine_data_txt, collapse='\n'),
                            skip=1,
                            col_types='ccccccc',
                            col_names=c("primaryIdentifier", "symbol", "alleles_alleleClass", "alleles_primaryIdentifier",
                                        "alleles_symbol", "alleles_phenotypeAnnotations_annotationType",
                                        "alleles_phenotypeAnnotations_description"))

    }

    #get the data from intermine
    flymine_data <- lapply(gene_chunks, queryMine) %>% dplyr::bind_rows() %>% as.data.frame()

    message(sprintf('%s records retrieved, now analysing', nrow(flymine_data)))

    #filter the flymine data
    flymine_data_filtered <- flymine_data %>%
        dplyr::filter(alleles_phenotypeAnnotations_annotationType == 'phenotype class') %>%
        dplyr::mutate(is_lethal_fly = grepl('lethal',alleles_phenotypeAnnotations_description)) %>%
        as.data.frame()

    ##generate a summary table with one row per allele, highlgiths whether any phenotype annotations are lethal
    flymine_data_byallele <- flymine_data_filtered %>%
        dplyr::group_by(primaryIdentifier, symbol, alleles_alleleClass, alleles_primaryIdentifier, alleles_symbol) %>%
        dplyr::summarise(is_lethal_fly=as.logical(max(is_lethal_fly))) %>%
        dplyr::ungroup() %>%
        as.data.frame()

    #now count the number of lethal vs non-lethal alleles
    flymine_data_bygene <- flymine_data_byallele %>%
        dplyr::group_by(primaryIdentifier, symbol) %>%
        dplyr::summarise(allele_count_fly=n(),
                         lethal_count_fly=sum(is_lethal_fly),
                         lethal_pct_fly=round(lethal_count_fly*100/allele_count_fly,1)) %>%
        dplyr::ungroup() %>%
        as.data.frame()

    #combine output
    output_df <- fly_orthologs_combined %>%
        dplyr::left_join(flymine_data_bygene, by=c('dmelanogaster_ortholog_ids'='primaryIdentifier')) %>%
        as.data.frame()

    #put data into sqlite database
    DBI::dbWriteTable(con, "flymine_analysis_martdata_grouped", fly_orthologs_combined, overwrite=TRUE)
    DBI::dbWriteTable(con, "flymine_analysis_flymine_filtered", flymine_data_filtered, overwrite=TRUE)
    DBI::dbWriteTable(con, "flymine_analysis_flymine_byallele", flymine_data_byallele, overwrite=TRUE)
    DBI::dbWriteTable(con, "flymine_analysis_flymine_bygene", flymine_data_bygene, overwrite=TRUE)
    DBI::dbWriteTable(con, "flymine_analysis_output", output_df, overwrite=TRUE)
    message('Finished')

    return(output_df)

}
chapmandu2/CollateralVulnerability2016 documentation built on May 13, 2019, 3:27 p.m.