R/TransGeneID.R
In MAGeCKFlute: Integrative Analysis Pipeline for Pooled CRISPR Functional Genetic Screens

Documented in getGeneAnn getOrg getOrtAnn TransGeneID

#' Gene ID conversion between ENTREZID and SYMBOL
#'
#' @docType methods
#' @name TransGeneID
#' @rdname TransGeneID
#' @aliases transGeneID
#'
#' @param genes A character vector, input genes to be converted.
#' @param fromType The input ID type, one of "entrez", "symbol"(default), "hgnc",
#' "ensembl", "fullname" and "uniprotswissprot";
#' you can also input other valid attribute names for biomaRt.
#' Look at the code in examples to check valid attributes.
#' @param toType The output ID type, similar to `fromType`.
#' @param organism "hsa"(default), "mmu", "bta", "cfa", "ptr", "rno", and "ssc" are optional.
#' @param fromOrg "hsa", "mmu", "bta", "cfa", "ptr", "rno", and "ssc" are optional
#' (Only used when transform gene ids between organisms).
#' @param toOrg "hsa"(default), "mmu", "bta", "cfa", "ptr", "rno", and "ssc" are optional
#' (Only used when transform gene ids between organisms).
#' @param ensemblHost String, specifying ensembl host, you can use `listEnsemblArchives()`
#' to show all available Ensembl archives hosts.
#' @param unique Boolean, specifying whether do one-to-one mapping.
#' @param update Boolean, specifying whether update built-in gene annotation (needs network and takes time).
#'
#' @return A character vector, named by unique input gene ids.
#'
#' @author Wubing Zhang
#'
#' @examples
#' TransGeneID("HLA-A", organism="hsa")
#' TransGeneID("HLA-A", toType = "uniprot", organism="hsa")
#' TransGeneID("H2-K1", toType="Symbol", fromOrg = "mmu", toOrg = "hsa")
#'
#' @export

TransGeneID <- function(genes, fromType="Symbol", toType="Entrez",
                        organism = "hsa", fromOrg = organism, toOrg = organism,
                        ensemblHost = "www.ensembl.org",
                        unique = TRUE, update = FALSE){

  #### Verify  parameters ####
  genes = as.character(genes)
  fromType = tolower(fromType)
  toType = tolower(toType)
  if(length(genes)<1) return(c())
  keggcode = rep(c("hsa", "mmu", "rno", "bta", "cfa", "ptr", "ssc"), 2)
  names(keggcode) = c(tolower(c("Human", "Mouse", "Rat", "Bovine", "Canine", "Chimp", "Pig")),
                      c("hsa", "mmu", "rno", "bta", "cfa", "ptr", "ssc"))
  if(!tolower(organism)%in%names(keggcode)) stop("Organism error ...")
  if(!tolower(fromOrg)%in%names(keggcode)) stop("fromOrg error ...")
  if(!tolower(toOrg)%in%names(keggcode)) stop("toOrg error ...")

  organism = keggcode[tolower(organism)]
  fromOrg = keggcode[tolower(fromOrg)]
  toOrg = keggcode[tolower(toOrg)]

  #### Read annotation file ####
  datasets = paste0(c("hsapiens", "mmusculus", "btaurus", "cfamiliaris",
                      "ptroglodytes", "rnorvegicus", "sscrofa"), "_gene_ensembl")
  if(fromOrg==toOrg){#### GeneID Transformation within organisms ####
    ## ID mapping.
    if(all(c(fromType, toType) %in% c("entrez", "symbol", "ensembl"))){
      ann <- getGeneAnn(organism, update=update)$Gene
      if("symbol" %in% c(fromType, toType) & any(!genes%in%ann$symbol)){
        ann = rbind(ann, ann)
        idx = (nrow(ann)/2+1):nrow(ann)
        ann$symbol[idx] = ann$synonyms[idx]
      }
      ann = ann[, c(fromType, toType)]
    }else if(all(c(fromType, toType) %in% c("uniprot", "ensembl", "refseq", "symbol"))){
      ann <- getGeneAnn(organism, update=update)$Protein
      ann = ann[, c(fromType, toType)]
    }else{
      if (!requireNamespace("biomaRt", quietly = TRUE)) {
        stop("Package \"biomaRt\" is required. Please install it.", call. = FALSE)
      }
      ds = datasets[grepl(organism, datasets)]
      ensembl <- biomaRt::useMart(biomart = 'ENSEMBL_MART_ENSEMBL',
                                  dataset = ds, host = ensemblHost)
      ## decide the attributes automatically
      attrs = biomaRt::listAttributes(ensembl)$name
      if(sum(attrs==fromType)==0){
        idx1 = grepl(tolower(fromType), attrs)
        idx = idx1
        if(sum(idx1)>2) idx = idx1&grepl("_id", attrs)
        fromType = ifelse(sum(idx)>0, attrs[idx][1], attrs[idx1][1])
        if(fromType=="hgnc_symbol" & fromOrg=="mmu") fromType = "mgi_symbol"
      }
      if(sum(attrs==toType)==0){
        idx1 = grepl(tolower(toType), attrs)
        idx = idx1
        if(sum(idx1)>2) idx = idx1&grepl("_id", attrs)
        toType = ifelse(sum(idx)>0, attrs[idx][1], attrs[idx1])
        if(toType=="hgnc_symbol" & toOrg=="mmu") toType = "mgi_symbol"
      }
      ## retrieve the data
      ann = biomaRt::getBM(attributes=c(fromType, toType), mart = ensembl,
                           filters = fromType, values = genes)
    }

    ## Retain unique conversion
    idx = ann[, toType]=="" | is.na(ann[, toType])
    ann = ann[!idx, ]
    idx = ann[, fromType]=="" | is.na(ann[, fromType])
    ann = ann[!idx, ]
    ##
    tmp = ann
    tmp[, fromType] = gsub("\\..*|-.*", "", tmp[, fromType])
    ann = rbind.data.frame(ann, tmp)
    ann = ann[ann[,fromType]%in%genes, ]
    # idx = duplicated(ann[, fromType])
    # convert = ann[!idx, toType]
    # names(convert) = ann[!idx, fromType]
    # gene_after = as.character(convert[genes])
    # names(gene_after) = genes
    # genes = gsub("\\..*|-.*", "", genes)
    # gene_after[is.na(gene_after)] = as.character(convert[genes[is.na(gene_after)]])
  }else{#### GeneID Transformation between organisms ####
    if(all(c(fromType, toType) %in% c("symbol", "entrez"))){
      ## read built-in annotation
      ann = getOrtAnn(fromOrg, toOrg, update)
      ann = ann[, c(paste0(fromOrg, "_", fromType), paste0(toOrg, "_", toType))]
      colnames(ann) = c(fromOrg, toOrg)
    }else{
      if (!requireNamespace("biomaRt", quietly = TRUE)) {
        stop("Package \"biomaRt\" is required. Please install it.",
             call. = FALSE)
      }
      ## Ortholog ID mapping.
      from = biomaRt::useMart("ensembl", dataset = datasets[grepl(fromOrg, datasets)])
      to = biomaRt::useMart("ensembl", dataset = datasets[grepl(toOrg, datasets)])
      ## decide the attributes automatically
      attrs_1 = biomaRt::listAttributes(from)$name
      attrs_2 = biomaRt::listAttributes(to)$name
      if(sum(attrs_1==fromType)==0){
        idx1 = grepl(tolower(fromType), attrs_1)
        idx = idx1
        if(sum(idx1)>2) idx = idx1&grepl("_id", attrs_1)
        fromType = ifelse(sum(idx)>0, attrs_1[idx][1], attrs_1[idx1][1])
        if(fromType=="hgnc_symbol" & fromOrg=="mmu") fromType = "mgi_symbol"
      }
      if(sum(attrs_2==toType)==0){
        idx1 = grepl(tolower(toType), attrs_2)
        idx = idx1
        if(sum(idx1)>2) idx = idx1&grepl("_id", attrs_2)
        toType = ifelse(sum(idx)>0, attrs_2[idx][1], attrs_2[idx1])
        if(toType=="hgnc_symbol" & toOrg=="mmu") toType = "mgi_symbol"
      }
      ## retrieve the data
      ann = biomaRt::getLDS(attributes = fromType, mart = from,
                            filters = fromType, values = genes,
                            attributesL = toType, martL = to)
      colnames(ann) = c(fromOrg, toOrg)
    }

    ## Retain unique conversion
    idx = ann[, toOrg]=="" | is.na(ann[, toOrg])
    ann = ann[!idx, ]
    ann = ann[ann[,fromOrg]%in%genes, ]
    # idx = duplicated(ann[, fromOrg])
    # convert = ann[!idx, toOrg]
    # names(convert) = ann[!idx, fromOrg]
    # gene_after = as.character(convert[genes])
    # names(gene_after) = genes
  }
  ann = ann[!duplicated(paste0(ann[,1], ann[,2])), ]
  if(unique){
    ann = ann[!duplicated(ann[,1]), ]
    rownames(ann) = ann[,1]
    tmp = ann[genes,2]
    names(tmp) = genes
    ann = tmp
  }
  return(ann)
}

#' Retrieve gene annotations from the NCBI, HNSC, and Uniprot databases.
#'
#' @docType methods
#' @name getGeneAnn
#' @rdname getGeneAnn
#'
#' @param org Character, hsa (default), bta, cfa, mmu, ptr, rno, ssc are optional.
#' @param update Boolean, indicating whether download current annotation.
#' @return A data frame.
#'
#' @author Wubing Zhang
#'
#' @examples
#' \dontrun{
#'   ann = getGeneAnn("hsa")
#'   head(ann)
#' }
#'
#' @export
#'
getGeneAnn <- function(org = "hsa", update = FALSE){
  options(stringsAsFactors = FALSE)
  #### Read rds file directly ####
  rdsann = file.path(system.file("extdata", package = "MAGeCKFlute"),
                     paste0("GeneID_Annotation_", org, ".rds"))
  if(file.exists(rdsann) & !update)
    return(list(Gene = readRDS(rdsann), Protein = readRDS(gsub("Gene", "Protein", rdsann))))

  #### NCBI gene annotation ####
  gzfile = paste0(c("Homo_sapiens", "Bos_taurus", "Canis_familiaris", "Mus_musculus",
                    "Pan_troglodytes", "Rattus_norvegicus", "Sus_scrofa"), ".gene_info.gz")
  names(gzfile) = c("hsa", "bta", "cfa", "mmu", "ptr", "rno", "ssc")
  locfname <- file.path(system.file("extdata", package = "MAGeCKFlute"), gzfile[org])
  if((!file.exists(locfname)) | update){
    ## Download gene information from NCBI ftp server
    refname <- paste0("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/", gzfile[org])
    download.file(refname, locfname, quiet = TRUE)
  }

  ## Reorder the mapping file
  ncbi_ann = read.csv(gzfile(locfname), sep = "\t", header = TRUE,
                      quote = "", stringsAsFactors = FALSE, comment.char = "")
  ncbi_ann = ncbi_ann[, c("GeneID", "Symbol", "Synonyms", "dbXrefs", "type_of_gene", "description")]
  colnames(ncbi_ann)[c(1,2,6)] = c("entrez", "symbol", "fullname")
  ncbi_ann$hgnc = gsub("\\|.*", "", gsub(".*HGNC:", "", ncbi_ann$dbXrefs))
  ncbi_ann$ensembl = gsub("\\|.*", "", gsub(".*Ensembl:", "", ncbi_ann$dbXrefs))
  ncbi_ann$hgnc[!grepl("HGNC", ncbi_ann$dbXrefs)] = ""
  ncbi_ann$ensembl[!grepl("Ensembl", ncbi_ann$dbXrefs)] = ""

  ncbi_ann = matrix(unlist(apply(ncbi_ann, 1, function(x){
    tmp = unlist(strsplit(x[3], "[|]"))
    return(as.vector(rbind(x[1], x[2], tmp, x[7], x[8], x[6])))
  })) , ncol=6, byrow = TRUE)
  colnames(ncbi_ann) = c("entrez", "symbol", "synonyms", "hgnc", "ensembl", "fullname")
  ncbi_ann[,1] = gsub(" ", "", ncbi_ann[,1])

  #### HGNC gene annotation ####
  # if(org=="hsa"){
  #   locfname2 = file.path(system.file("extdata", package = "MAGeCKFlute"), "HGNC_GeneID_annotation.txt.gz")
  #   if((!file.exists(locfname2)) | update){
  #     ## Download gene information from HGNC
  #     refname <- "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt"
  #     download.file(refname, locfname2, quiet = TRUE)
  #   }
  #   ## Reorder the mapping file
  #   hgnc_ann = read.csv(gzfile(locfname2), sep = "\t", header = TRUE,
  #                       stringsAsFactors = FALSE, comment.char = "")
  #   hgnc_ann = hgnc_ann[, c("entrez_id", "ensembl_gene_id", "symbol", "hgnc_id", "name",
  #                           "alias_symbol", "prev_symbol", "refseq_accession")]
  #   hgnc_ann$alias_symbol = paste0(hgnc_ann$alias_symbol, '|', hgnc_ann$prev_symbol)
  #
  #   synonyms_row = matrix(unlist(apply(hgnc_ann, 1, function(x){
  #     tmp = unlist(strsplit(x[6], "\\|"))
  #     if(length(tmp)>0) return(as.vector(rbind(x[1], tmp, x[2], x[8])))
  #     return(NULL)
  #   })) , ncol=4, byrow = TRUE)
  #   colnames(synonyms_row) = c("entrez", "symbol", "ensembl", "refseq")
  #   synonyms_row = synonyms_row[synonyms_row[,2]!="", ]
  #   names(hgnc_ann)[1:3] = c("entrez", "ensembl", "symbol")
  #   names(hgnc_ann)[8] = "refseq"
  #   hgnc_ann = rbind(hgnc_ann[,c(1:3,8)], synonyms_row[,c(1,3,2,4)])
  #   ncbi_ann = merge(ncbi_ann, hgnc_ann, by = names(hgnc_ann)[1:3], all = TRUE)
  # }

  #### Ensembl gene annotation ####
  tmpfile = file.path(system.file("extdata", package = "MAGeCKFlute"), "filelist")
  download.file("ftp://ftp.ensembl.org/pub/", tmpfile, quiet = TRUE)
  tmp = read.table(tmpfile, fill = TRUE, quote = "", stringsAsFactors = FALSE)
  tmp = gsub("release-", "", tmp[grepl("release", tmp[, ncol(tmp)]), ncol(tmp)])
  version = tmp[length(tmp)]
  gzfile = c("Homo_sapiens.GRCh38.", "Bos_taurus.ARS-UCD1.2.", "Canis_familiaris.CanFam3.1.",
             "Mus_musculus.GRCm38.", "Pan_troglodytes.Pan_tro_3.0.",
             "Rattus_norvegicus.Rnor_6.0.", "Sus_scrofa.Sscrofa11.1.")
  names(gzfile) = c("hsa", "bta", "cfa", "mmu", "ptr", "rno", "ssc")
  entrezfile <- paste0("ftp://ftp.ensembl.org/pub/release-", version, "/tsv/",
                       tolower(gsub("\\..*", "", gzfile[org])), "/", gzfile[org],
                       version, ".entrez.tsv.gz")
  # uniprotfile <- paste0("ftp://ftp.ensembl.org/pub/release-", version, "/tsv/",
  #                       tolower(gsub("\\..*", "", gzfile[org])), "/", gzfile[org],
  #                       version, ".uniprot.tsv.gz")
  refseqfile <- paste0("ftp://ftp.ensembl.org/pub/release-", version, "/tsv/",
                       tolower(gsub("\\..*", "", gzfile[org])), "/", gzfile[org],
                       version, ".refseq.tsv.gz")
  download.file(entrezfile, tmpfile, quiet = TRUE)
  ensg_entrez = read.table(tmpfile, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  ensg_entrez = ensg_entrez[, c(1,4)]
  ensg_entrez = ensg_entrez[!duplicated(paste0(ensg_entrez[,1], ensg_entrez[,2])), ]
  # download.file(uniprotfile, tmpfile, quiet = TRUE)
  # ensg_uniprot = read.table(tmpfile, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  # ensg_uniprot = ensg_uniprot[, c(1,4)]
  # ensg_uniprot = ensg_uniprot[!duplicated(paste0(ensg_uniprot[,1], ensg_uniprot[,2])), ]
  download.file(refseqfile, tmpfile, quiet = TRUE)
  ensg_refseq = read.table(tmpfile, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  ensg_refseq = ensg_refseq[, c(1,4)]
  ensg_refseq = ensg_refseq[!duplicated(paste0(ensg_refseq[,1], ensg_refseq[,2])), ]
  ensembl_ann = merge(ensg_entrez, ensg_refseq, by = names(ensg_entrez)[1], all = TRUE)
  colnames(ensembl_ann) = c("ensembl", "entrez", "refseq")
  ## Remove redundant refseq ids.
  idx = duplicated(paste(ensembl_ann$ensembl, ensembl_ann$entrez))
  ensembl_ann = ensembl_ann[!idx, ]
  idx = is.na(ensembl_ann$entrez) & is.na(ensembl_ann$refseq)
  ensembl_ann = ensembl_ann[!idx, ]
  ensembl_ann = ensembl_ann[grepl("ENSG", ensembl_ann$ensembl), ]
  # datasets = paste0(c("hsapiens", "mmusculus", "btaurus", "cfamiliaris",
  #                     "ptroglodytes", "rnorvegicus", "sscrofa"), "_gene_ensembl")
  # ds = datasets[grepl(org, datasets)]
  # ensembl <- useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = ds)
  # symbol <- ifelse(org=="mmu", "mgi_symbol", "hgnc_symbol")
  # ensembl_ann = getBM(attributes=c("entrezgene_id", symbol, "hgnc_id", "ensembl_gene_id"), mart = ensembl)
  # colnames(ensembl_ann) = c("entrez", "symbol", "hgnc", "ensembl")
  # ensembl_ann$hgnc = gsub("HGNC:", "", ensembl_ann$hgnc)

  #### Merge all annotations ####
  geneann = merge(ncbi_ann[, c("ensembl","entrez","symbol","synonyms")],
                  ensembl_ann[,c("ensembl","entrez")], by = c("ensembl","entrez"), all = TRUE)
  geneann$entrez = gsub(" ", "", geneann$entrez)
  # ids = gsub("-.*$", "", uniprot_ann$Canonical[!grepl("-1", uniprot_ann$Canonical)])
  # data$uniprot[data$uniprot%in%ids] = uniprot_ann[data$uniprot[data$uniprot%in%ids], "Canonical"]
  # idx = !(grepl("-", data$uniprot)|is.na(data$uniprot))
  # data$uniprot[idx] = paste0(data$uniprot[idx], "-1")
  saveRDS(geneann, rdsann)

  #### Uniprot gene annotation ####
  proteome_code = c("up000005640", "UP000009136", "UP000002254", "up000000589", "UP000002277", "UP000002494")
  names(proteome_code) = c("hsa", "bta", "cfa", "mmu", "ptr", "rno")
  locfname <- file.path(system.file("extdata", package = "MAGeCKFlute"),
                        paste0("uniprot_proteome_", proteome_code[org], ".tab"))
  uniprot_link <- paste0("https://www.uniprot.org/uniprot/?query=proteome:", proteome_code[org],
                         "&format=tab&force=true&columns=id,entry%20name,reviewed,protein%20names,genes,organism,database(Ensembl),comment(SUBCELLULAR%20LOCATION),database(RefSeq),comment(ALTERNATIVE%20PRODUCTS)&sort=score")

  if((!file.exists(locfname)) | update){
    ## Download protein annotation information from uniprot
    download.file(uniprot_link, locfname, quiet = TRUE)
  }
  ## Reorder the mapping file
  uniprot_ann = read.csv(gzfile(locfname), sep = "\t", header = TRUE,
                         quote = "", stringsAsFactors = FALSE, comment.char = "")
  suppressWarnings(try(file.remove(locfname), silent = TRUE))
  colnames(uniprot_ann) = c("Entry", "EntryName", "Status", "Name", "Gene",
                            "Organism", "Ensembl", "Subcellular", "RefSeq", "Isoforms")
  uniprot_ann$Canonical = gsub(".*IsoId=", "", gsub("; Sequence=Displayed.*", "", uniprot_ann$Isoforms))
  uniprot_ann$Canonical[!grepl("Sequence=Displayed", uniprot_ann$Isoforms)] =
    paste0(uniprot_ann$Entry[!grepl("Sequence=Displayed", uniprot_ann$Isoforms)], "-1")
  rownames(uniprot_ann) = uniprot_ann$Entry
  Symbols = unlist(apply(uniprot_ann, 1, function(x){
    Uniprot = x[1]
    Symbols = unlist(strsplit(gsub(";$", "", x[5]), " "))
    Symbols = Symbols[Symbols!=""|Symbols==""]
    if(length(Symbols)==0) return(NULL)
    if(length(Symbols)==1) Symbols = c(Symbols, "")
    rbind(rep(Uniprot, length(Symbols)-1), Symbols[1], Symbols[-1])
  }))
  Symbols = matrix(Symbols, ncol = 3, byrow = TRUE)
  colnames(Symbols) = c('Entry', "symbol", "synonyms")
  Symbols = as.data.frame(Symbols, stringsAsFactors = FALSE)

  ENSTs = unlist(apply(uniprot_ann, 1, function(x){
    Uniprot = x[1]
    ENSTs = unlist(strsplit(gsub(";$", "", x[7]), ";"))
    rbind(rep(Uniprot, length(ENSTs)), ENSTs)
  }))
  ENSTs = matrix(ENSTs, ncol = 2, byrow = TRUE)
  ENSTs = as.data.frame(ENSTs, stringsAsFactors = FALSE)
  colnames(ENSTs) = c('Entry', "ensembl")
  ENSTs$uniprot = gsub(".*\\[|\\]", "", ENSTs$ensembl)
  ENSTs$uniprot[!grepl("\\[", ENSTs$ensembl)] = ENSTs$Entry[!grepl("\\[", ENSTs$ensembl)]
  ENSTs$ensembl = gsub(" .*", "", ENSTs$ensembl)
  ENSTs$uniprot[!grepl("-", ENSTs$uniprot)] =
    uniprot_ann[ENSTs$uniprot[!grepl("-", ENSTs$uniprot)], "Canonical"]

  RefSeq = unlist(apply(uniprot_ann, 1, function(x){
    Uniprot = x[1]
    RefSeq = unlist(strsplit(gsub(";$", "", x[9]), ";"))
    rbind(rep(Uniprot, length(RefSeq)), RefSeq)
  }))
  RefSeq = matrix(RefSeq, ncol = 2, byrow = TRUE)
  RefSeq = as.data.frame(RefSeq, stringsAsFactors = FALSE)
  colnames(RefSeq) = c('Entry', "refseq")
  RefSeq$uniprot = gsub(".*\\[|\\]", "", RefSeq$refseq)
  RefSeq$uniprot[!grepl("\\[", RefSeq$refseq)] = RefSeq$Entry[!grepl("\\[", RefSeq$refseq)]
  RefSeq$refseq = gsub(" .*", "", RefSeq$refseq)
  RefSeq$uniprot[!grepl("-", RefSeq$uniprot)] =
    uniprot_ann[RefSeq$uniprot[!grepl("-", RefSeq$uniprot)], "Canonical"]
  ## Merge all annotations from each column
  uniprot_ann = merge(ENSTs[,-1], RefSeq[,-1], by = "uniprot", all = TRUE)
  uniprot_ann$Entry = gsub("-.*","",uniprot_ann$uniprot)
  # Symbols = Symbols[!duplicated(Symbols$Entry), ]
  uniprot_ann = merge(uniprot_ann, Symbols, by = "Entry", all = TRUE)
  saveRDS(uniprot_ann, gsub("Gene", "Protein", rdsann))

  return(list(Gene = geneann, Protein = uniprot_ann))
}


#' Retreive reference orthologs annotation.
#'
#' @docType methods
#' @name getOrtAnn
#' @rdname getOrtAnn
#'
#' @param fromOrg Character, hsa (default), bta, cfa, mmu, ptr, rno, ssc are optional.
#' @param toOrg Character, hsa (default), bta, cfa, mmu, ptr, rno, ssc are optional.
#' @param update Boolean, indicating whether download recent annotation from NCBI.
#' @return A data frame.
#'
#' @author Wubing Zhang
#'
#' @examples
#' \dontrun{
#'   ann = getOrtAnn("mmu", "hsa")
#'   head(ann)
#' }
#'
#' @export
#'
getOrtAnn <- function(fromOrg = "mmu", toOrg = "hsa", update = FALSE){
  #### Read rds file directly ####
  rdsann = file.path(system.file("extdata", package = "MAGeCKFlute"),
                     paste0("HOM_GeneID_Annotation_", fromOrg, "_", toOrg, ".rds"))
  if(file.exists(rdsann) & !update) return(readRDS(rdsann))

  keggcode = c("hsa", "mmu", "rno", "bta", "cfa", "ptr", "ssc")
  names(keggcode) = c("human", "mouse", "rat", "bovine", "canine", "chimp", "pig")
  #### Download data from MGI ####
  locfname <- file.path(system.file("extdata", package = "MAGeCKFlute"),
                        "HOM_MouseHumanSequence.rpt.gz")
  if((!file.exists(locfname)) | update){
    ## Download gene information
    refname <- "http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt"
    download.file(refname, locfname, quiet = TRUE)
  }
  ## Reorder the mapping file
  read.table(locfname, sep = "\t", header = TRUE, stringsAsFactors = FALSE) -> mgi_ann
  mgi_ann = mgi_ann[, c(1,2,4,5)]
  colnames(mgi_ann) = c("homoloid", "org", "symbol", "entrez")
  mgi_ann$org = gsub(", laboratory", "", mgi_ann$org)
  mgi_ann$org = keggcode[mgi_ann$org]

  #### Download data from NCBI ####
  locfname <- file.path(system.file("extdata", package = "MAGeCKFlute"),
                        "homologene.data.gz")
  if((!file.exists(locfname)) | update){
    ## Download gene information
    refname <- "ftp://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data"
    download.file(refname, locfname, quiet = TRUE)
  }
  ## Reorder the mapping file
  read.table(locfname, sep = "\t", stringsAsFactors = FALSE, quote = "") -> ncbi_ann
  ncbi_ann = ncbi_ann[, c(1,2,4,3)]
  colnames(ncbi_ann) = c("homoloid", "org", "symbol", "entrez")
  names(keggcode) = c(9606, 10090, 10116, 9913, 9615, 9598, 9823)
  ncbi_ann$org = keggcode[as.character(ncbi_ann$org)]
  ncbi_ann = ncbi_ann[!is.na(ncbi_ann$org), ]

  ## Merge and arrange the mapping file
  ann = rbind.data.frame(mgi_ann, ncbi_ann)
  genes = unique(ann$entrez)
  idx1 = ann$entrez %in% genes
  idx2 = ann$org == toOrg
  idx3 = ann$homoloid %in% ann$homoloid[idx1]
  tmp1 = ann[idx1, c("homoloid", "symbol", "entrez")]
  tmp2 = ann[(idx2&idx3), c("homoloid", "symbol", "entrez")]
  colnames(tmp1)[2:3] = paste0(fromOrg, c("_symbol", "_entrez"))
  colnames(tmp2)[2:3] = paste0(toOrg, c("_symbol", "_entrez"))
  ann = merge(tmp1, tmp2, by = "homoloid")[,-1]

  #### Retrieve annotation from Ensembl ####
  datasets = paste0(c("hsapiens", "mmusculus", "btaurus", "cfamiliaris",
                      "ptroglodytes", "rnorvegicus", "sscrofa"), "_gene_ensembl")
  ## Ortholog ID mapping.
  if (!requireNamespace("biomaRt", quietly = TRUE)) {
    stop("Package \"biomaRt\" is required. Please install it.", call. = FALSE)
  }
  from = biomaRt::useMart("ensembl", dataset = datasets[grepl(fromOrg, datasets)])
  to = biomaRt::useMart("ensembl", dataset = datasets[grepl(toOrg, datasets)])
  ## decide the attributes automatically
  from_symbol <- ifelse(fromOrg=="mmu", "mgi_symbol", "hgnc_symbol")
  to_symbol <- ifelse(toOrg=="mmu", "mgi_symbol", "hgnc_symbol")
  ## retrieve the data
  ensembl_ann = biomaRt::getLDS(attributes = c(from_symbol, "entrezgene_id"), mart = from,
                                attributesL = c(to_symbol, "entrezgene_id"), martL = to)
  colnames(ensembl_ann) = c(paste0(fromOrg, c("_symbol", "_entrez")),
                            paste0(toOrg, c("_symbol", "_entrez")))

  ## Merge all the annotations
  ann = rbind.data.frame(ann, ensembl_ann)
  idx = duplicated(paste(ann[,1], ann[,2], ann[,3], ann[,4], sep = "_"))
  ann = ann[!idx, ]
  saveRDS(ann, rdsann)
  return(ann)
}

#' Get the kegg code of specific mammalia organism.
#'
#' @docType methods
#' @name getOrg
#' @rdname getOrg
#'
#' @param organism Character, KEGG species code, or the common species name.
#' For all potential values check: data(bods); bods. Default org="hsa",
#' and can also be "human" (case insensitive).
#' @return A list containing three elements:
#' \item{org}{species}
#' \code{pkg}{annotation package name}
#'
#' @author Wubing Zhang
#'
#' @examples
#' ann = getOrg("human")
#' print(ann$pkg)
#'
#' @export

getOrg <- function(organism){
  bods <- data.frame(package = paste0("org.", c("Hs", "Mm", "Rn", "Bt", "Cf", "Pt", "Ss"), ".eg.db"),
                     species = c("Human", "Mouse", "Rat", "Bovine", "Canine", "Chimp", "Pig"),
                     "kegg code" = c("hsa", "mmu", "rno", "bta", "cfa", "ptr", "ssc"),
                     check.names = FALSE, stringsAsFactors = FALSE)
  res=list()
  ##======
  # Get the mapping from organism to package
  ridx = c(which(tolower(bods[,2])==tolower(organism)),
           which(tolower(bods[,3])==tolower(organism)))
  stopifnot(length(ridx)==1)
  res$org = bods[ridx,3]
  res$pkg = bods[ridx,1]
  return(res)
}
Any scripts or data that you put into this service are public.
MAGeCKFlute documentation built on Nov. 8, 2020, 5:40 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
MAGeCKFlute
Integrative Analysis Pipeline for Pooled CRISPR Functional Genetic Screens

R/TransGeneID.R
In MAGeCKFlute: Integrative Analysis Pipeline for Pooled CRISPR Functional Genetic Screens

Defines functions getOrg getOrtAnn getGeneAnn TransGeneID

Documented in getGeneAnn getOrg getOrtAnn TransGeneID

Try the MAGeCKFlute package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MAGeCKFlute Integrative Analysis Pipeline for Pooled CRISPR Functional Genetic Screens

R/TransGeneID.R In MAGeCKFlute: Integrative Analysis Pipeline for Pooled CRISPR Functional Genetic Screens

Defines functions getOrg getOrtAnn getGeneAnn TransGeneID

Documented in getGeneAnn getOrg getOrtAnn TransGeneID

Try the MAGeCKFlute package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MAGeCKFlute
Integrative Analysis Pipeline for Pooled CRISPR Functional Genetic Screens

R/TransGeneID.R
In MAGeCKFlute: Integrative Analysis Pipeline for Pooled CRISPR Functional Genetic Screens