R/getMatch.R

Defines functions getMatch

Documented in getMatch

#' Get Gene Match
#'
#' Match the genes of a single input data set to their counterparts in
#' another species by querying the Ensembl BioMart with
#' \code{getLDS()} from \pkg{biomaRt}.
#'
#' @param genes A vector of gene identifiers: gene symbols or Ensembl gene
#'   IDs, depending on \code{inType}.
#' @param inSpecies Common name of the input species. Must be one of the
#'   names in the internal lookup table, e.g. \code{'human'}, \code{'mouse'},
#'   \code{'rat'}, \code{'zebrafish'}.
#' @param inType Kind of identifier supplied in \code{genes}: either
#'   \code{'symbol'} or \code{'ensembl'}.
#' @param newSpecies Common name of the output species (same choices as
#'   \code{inSpecies}).
#' @param moreAttrIn Optional character vector of additional BioMart
#'   attributes to retrieve for the input species. \code{NA} (default) or
#'   \code{NULL} requests none.
#' @param moreAttrNew Optional character vector of additional BioMart
#'   attributes to retrieve for the output species. \code{NA} (default) or
#'   \code{NULL} requests none.
#'
#' @import biomaRt
#'
#' @return A data.frame with one row per element of \code{genes}, in input
#'   order. The first column, \code{genes}, holds the input identifiers; it
#'   is followed by the attributes retrieved for the input species (column
#'   names suffixed with \code{.<inSpecies>}) and those retrieved for the
#'   output species (suffixed with \code{.<newSpecies>}). Genes without a
#'   match have \code{NA} in the attribute columns; when a gene has several
#'   matches only the first one is kept.
#'
#' @keywords gene, match
#'
#' @export

getMatch <- function(genes, inSpecies, inType, newSpecies, moreAttrIn = NA, moreAttrNew = NA){
    # Lookup table keyed (via row names) by the common species name.
    species <- data.frame(
        species.nm = c('human','mouse','roundworm','fruitfly','zebrafish','chicken',
                       'rat','guinea pig','golden hamster','rabbit','pig','sheep',
                       'cow','dog','cat','macaque','bonobo','chimpanzee'),
        species.latin = c('homo sapiens','mus musculus','caenorhabditis elegans','drosophila melanogaster',
                          'danio rerio','gallus gallus','rattus norvegicus','cavia porcellus',
                          'mesocricetus auratus','oryctolagus cuniculus','sus scrofa domesticus',
                          'ovis aries','bos taurus','canis lupus familiaris','felis catus',
                          'macaca mulatta','pan paniscus','pan troglodytes'),
        ensembl.nms = c('hsapiens_gene_ensembl','mmusculus_gene_ensembl','celegans_gene_ensembl','dmelanogaster_gene_ensembl',
                        'drerio_gene_ensembl','ggallus_gene_ensembl','rnorvegicus_gene_ensembl',
                        'cporcellus_gene_ensembl','mauratus_gene_ensembl','ocuniculus_gene_ensembl',
                        'sscrofa_gene_ensembl','oaries_gene_ensembl','btaurus_gene_ensembl',
                        'clfamiliaris_gene_ensembl','fcatus_gene_ensembl','mmulatta_gene_ensembl',
                        'ppaniscus_gene_ensembl','ptroglodytes_gene_ensembl'),
        genesymbol.attr = c('hgnc_symbol','mgi_symbol','external_gene_name','external_gene_name','hgnc_symbol',
                            'hgnc_symbol','external_gene_name','hgnc_symbol','mgi_symbol','hgnc_symbol','hgnc_symbol',
                            'hgnc_symbol','hgnc_symbol','hgnc_symbol','hgnc_symbol','hgnc_symbol','hgnc_symbol',
                            'hgnc_symbol'),
        row.names = 1,
        stringsAsFactors = FALSE
    )

    # Validate the arguments up front so that a typo fails with a clear
    # message instead of an obscure error after contacting the BioMart server.
    valid.types <- c('symbol', 'ensembl')
    if (!is.character(inType) || length(inType) != 1L || !(inType %in% valid.types)) {
        stop("'inType' must be one of: ", paste(valid.types, collapse = ', '),
             call. = FALSE)
    }
    checkSpecies <- function(value, arg) {
        if (!is.character(value) || length(value) != 1L || !(value %in% rownames(species))) {
            stop("'", arg, "' must be one of: ",
                 paste(rownames(species), collapse = ', '),
                 "; got '", paste(format(value), collapse = ', '), "'",
                 call. = FALSE)
        }
    }
    checkSpecies(inSpecies, 'inSpecies')
    checkSpecies(newSpecies, 'newSpecies')

    # Normalise the optional attribute arguments: NULL / NA mean "none", and
    # a vector of several attribute names is accepted.
    extraAttrs <- function(x) {
        if (is.null(x)) {
            return(character(0))
        }
        x <- as.character(x)
        x[!is.na(x)]
    }

    mart.in <- as.character(species[inSpecies, 'ensembl.nms'])
    mart.new <- as.character(species[newSpecies, 'ensembl.nms'])
    df.in <- useMart(biomart='ensembl', dataset=mart.in)
    df.new <- useMart(biomart='ensembl', dataset=mart.new)
    symbol.in <- as.character(species[inSpecies, 'genesymbol.attr'])
    symbol.new <- as.character(species[newSpecies, 'genesymbol.attr'])

    # The filter is the attribute that `genes` is expressed in.
    filter <- if (inType == 'symbol') symbol.in else 'ensembl_gene_id'

    cat('You have input ',length(genes),' genes\n')

    atb.in <- c(symbol.in, 'ensembl_gene_id', 'description', extraAttrs(moreAttrIn))
    atb.new <- c(symbol.new, 'ensembl_gene_id', 'description', extraAttrs(moreAttrNew))

    tbl.match <- getLDS(
        mart=df.in, attributes=atb.in, # input genes
        filters=filter, values=genes,
        martL=df.new, attributesL=atb.new
    )

    # Strip the trailing " [Source:...]" annotation from the descriptions.
    for (desc.col in intersect(c('Gene.description', 'Gene.description.1'), colnames(tbl.match))) {
        tbl.match[[desc.col]] <- gsub(' \\[.*', '', tbl.match[[desc.col]])
    }

    # Tag each column with the species it belongs to: the first
    # length(atb.in) columns are the input species, the rest the new one.
    idx.in <- seq_along(atb.in)
    idx.new <- length(atb.in) + seq_along(atb.new)
    colnames(tbl.match)[idx.in] <- paste0(colnames(tbl.match)[idx.in], '.', inSpecies)
    colnames(tbl.match)[idx.new] <- paste0(colnames(tbl.match)[idx.new], '.', newSpecies)

    # Column holding the identifier type the caller supplied: the symbol is
    # the first requested attribute, the Ensembl gene ID the second.
    key.col <- if (inType == 'symbol') 1L else 2L

    cat('We found ',nrow(tbl.match),' matches\n')
    cat(sum(duplicated(tbl.match[, key.col])),' of those are duplicates and only keeping the 1st of each\n')

    # match() keeps the first hit per gene and yields NA rows for genes
    # without a match, so the result lines up with `genes`.
    tbl.match <- tbl.match[match(genes, tbl.match[, key.col]), , drop = FALSE]
    rownames(tbl.match) <- NULL
    cbind(genes, tbl.match, stringsAsFactors=FALSE)
}
CHuanSite/PJD documentation built on Oct. 26, 2021, 1 p.m.