R/gplMapDetectGeneID.R

Defines functions .detect_entrez_ids_name .detect_genbank_cols_name .detect_int_cols .find_entrez_w_in_block .detect_refseq_cols .detect_ensembl_cols .detect_unigene_cols .detect_hgnc_cols .detect_genbank_cols .find_col_loc .parse_multi_col .check_entrez_overlap

Documented in .check_entrez_overlap .detect_ensembl_cols .detect_entrez_ids_name .detect_genbank_cols .detect_genbank_cols_name .detect_hgnc_cols .detect_int_cols .detect_refseq_cols .detect_unigene_cols .find_col_loc .find_entrez_w_in_block .parse_multi_col

#' Looks for entrez ids by trying to find a column name named "Entrez"
#'
#' @param gpl_cols the column data from a GPL
#' @return a list of columns containing these locations, an empty list if no columns
.detect_entrez_ids_name <- function(gpl_cols){
  entrez.col1 <- which(grepl("entrez",
                             gpl_cols$Column, ignore.case = TRUE))
  entrez.col2 <- which(grepl("entrez",
                             gpl_cols$Description, ignore.case = TRUE))
  entrez.col <- unique(c(entrez.col1, entrez.col2))
  return(entrez.col)
}

#' Looks for genbank cols by trying to find a column name named "GB_ACC"
#' or genbank
#'
#' @param gpl_cols the column data from a GPL
#' @return a list of columns containing these locations, an empty list if no columns
.detect_genbank_cols_name <- function(gpl_cols){
  genbank.col1 <- which(grepl("GB[_|\\.][ACC|LIST]",
                             gpl_cols$Column, ignore.case = TRUE))
  genbank.col2 <- which(grepl("Genbank",
                             gpl_cols$Column, ignore.case = TRUE))
  genbank.col <- unique(genbank.col1, genbank.col2)
  return(genbank.col)
}

#' Identify integer only only columns, hopefully these are Entrez.
#' (that will be tested later)
#' @param gpl.df the gpl data frame
#' @param MIN.OVERLAP the minimum number of mapped genes allowed, default=8000
#' @return a list of the indices of the columns containing these ids
.detect_int_cols <- function(gpl.df, MIN.OVERLAP=8000){
  int_only_cols <- sapply(2:ncol(gpl.df), function(i){
    sum(stringr::str_detect(gpl.df[,i], "^[0-9]+$"), na.rm=TRUE)
  })
  col_idx <- which(int_only_cols > MIN.OVERLAP)+1
  return(col_idx)
}

#' Identify columns that might have entrez in them. This would be
#' a block column where the entrez column is separated by "///" or ","
#'
#' @param gpl.df the gpl data frame
#' @param MIN.OVERLAP the minimum number of mapped genes allowed, default=8000
#' @return a list of the indices of the columns containing these ids
.find_entrez_w_in_block <- function(gpl.df, MIN.OVERLAP=8000){
  INTERNAL.SEP <-"///|//|,|\\|"
  SUFFIX.SEP <- "\\."
  int_cols <- sapply(2:ncol(gpl.df), function(i){
    ids <- stringr::str_replace_all(unlist(stringr::str_split(gpl.df[,i],
                                                             INTERNAL.SEP)), " ", "")
    sum(stringr::str_detect(ids, "^[0-9]+$"), na.rm=TRUE)
  })
  return(which(int_cols > MIN.OVERLAP)+1)
}

#' Identify the refseq columns and return the one with the most overlap.
#' (Note - this looks for "NM_" ids, which is not comprehensive.)
#'
#' @param gpl.df the gpl data frame
#' @param MIN.OVERLAP the minimum number of mapped genes allowed, default=8000
#' @return the index of the column containing these ids
.detect_refseq_cols <- function(gpl.df, MIN.OVERLAP=8000 ){
  refseq_cols <- sapply(2:ncol(gpl.df), function(i){
    sum(stringr::str_detect(gpl.df[,i], "NM_[0-9]+"), na.rm=TRUE)
  })
  if (max(refseq_cols) > MIN.OVERLAP){
    refseq_col <- which.max(refseq_cols)[[1]]
    return(refseq_col+1)
  }
  return(c())
}

#' Identify the ensembl columns and return the one with the max overlap.
#'
#' @param gpl.df the gpl data frame
#' @param organism the organism we are looking at
#' @param MIN.OVERLAP the minimum number of mapped genes allowed, default=8000
#' @return the index of the column containing these ids
.detect_ensembl_cols <- function(gpl.df, organism, MIN.OVERLAP=8000){
  ORG.ENSEMBL.SUFF <- list("human"="G", "mouse"="MUSG", "rat"="RNOG")
  org.ensembl.id <- sprintf("ENS%s", ORG.ENSEMBL.SUFF[[organism]])
  ensembl_cols <- sapply(2:ncol(gpl.df), function(i){
    sum(stringr::str_detect(gpl.df[,i], sprintf("%s[0-9]+",org.ensembl.id)), na.rm=TRUE)
  })
  if (max(ensembl_cols) > MIN.OVERLAP){
    ensembl_col <- which.max(ensembl_cols)[[1]]
    return(ensembl_col+1)
  }
  return(c())
}

#' Identify the unigene columns and return the one with the most overalp
#'
#' @param gpl.df the gpl data frame
#' @param MIN.OVERLAP the minimum number of mapped genes allowed, default=8000
#' @return the index of the column containing these ids
.detect_unigene_cols <- function(gpl.df, MIN.OVERLAP=8000){
  unigene_cols <- sapply(2:ncol(gpl.df), function(i){
    sum(stringr::str_detect(gpl.df[,i], "[Hs|Rn|Mm]\\.[0-9]+"), na.rm=TRUE)
  })
  if (max(unigene_cols) > MIN.OVERLAP){
    unigene_col <- which.max(unigene_cols)[[1]]
    return(unigene_col+1)
  }
  return(c())
}

#' Identify the HGNC columns and return the first
#' (Note - this is by looking for Gapdh which may not be the best metric...)
#'
#' @param gpl.df the gpl data frame
#' @return the index of the column containing these ids
.detect_hgnc_cols <- function(gpl.df){
  hgnc_cols <- sapply(2:ncol(gpl.df), function(i){
    sum(grepl("GAPDH", gpl.df[,i], ignore.case=TRUE), na.rm=TRUE)
  })
  hgnc_idces <- which(hgnc_cols > 0)
  if (length(hgnc_idces) > 0){
    return(which.max(hgnc_cols)[[1]]+1)
  }
  return(c())
}


#' Identify the genbank columns and return the first
#' (Note - this is by looking for a handful of ids...)
#'
#' @param gpl.df the gpl data frame
#' @param organism the organism we are looking at
#' @return the index of the column containing these ids
.detect_genbank_cols <- function(gpl.df, organism){
  #rat_genbank <- paste((genbank %>% filter(entrezgene_id=="24383"))[1:5,"genbank"], collapse="|")
  #mouse_genbank <- paste((genbank %>% filter(entrezgene_id=="14433"))[1:5,"genbank"], collapse="|")
  #human_genbank <- paste((genbank %>% filter(entrezgene_id=="2597"))[1:5,"genbank"], collapse="|")
  LIST.GENBANK.STR <-
    list("rat"="AAA40814|AAA41193|AAA41795|AAB19105|AAH59110",
         "mouse" ="AAA37659|AAH82592|AAH83065|AAH83079|AAH83080",
         "human"="AAA52496|AAA52518|AAA52519|AAA53191|AAA86283")
  genbank_str <- LIST.GENBANK.STR[[organism]]
  genbank_cols <- sapply(2:ncol(gpl.df), function(i){
    sum(grepl(genbank_str, gpl.df[,i]), na.rm=TRUE)
  })
  genbank_idces <- which(genbank_cols > 0)
  if (length(genbank_idces) > 0){
    return(which.max(genbank_cols)[[1]]+1)
  }
  return(c())
}



#' Heper function to find the location within a column of an ID.
#' Used for columns with many fields separated by double and triple slashes
#'
#' @param df the data.frame with feature data
#' @param my.col the column to examine
#' @param ex.str an example ID of that type
#' @param check.overlap defaults to false
#' @param id.list defaults to null
#' @return an expanded data frame with probe/gene data
.find_col_loc <- function(df, my.col, ex.str,
                          check.overlap=FALSE,
                          id.list=NULL){

  # find example row
  ex_rows <- which(stringr::str_detect(df[,my.col], ex.str))
  if (length(ex_rows) == 0){
    print("No mentions of this string in the column")
    return(data.frame("gene"=character(), "probe"=character()))
  }
  my.row <- ex_rows[[1]]

  # // TODO - what if id.list is NULL?
  idces <- c()
  if (check.overlap & !is.null(id.list)){
    i <- 1
    while(i < length(ex_rows)){
      my.row <- ex_rows[[i]]
      tokens <- stringr::str_extract_all(df[my.row, my.col], ex.str)[[1]]
      tokens2 <- tokens[stringr::str_length(tokens) >4]
      tokens3 <- intersect(tokens2, id.list)
      if (length(tokens3) > 0){
        ex.str <- tokens3[[1]] # // if multiple, try to find one that makes most sense
        my_str <- df[my.row, my.col]
        mult_fields <- stringr::str_trim(stringr::str_split(my_str, "///")[[1]])
        lst_fields <- stringr::str_split(mult_fields, "//")
        idces <- lapply(lst_fields, function(x)
          which(sapply(x, function(y)
            stringr::str_detect(stringr::str_trim(y), sprintf("^%s", ex.str, ex.str)))))
        if (length(tokens3) == 1){
          # split into fields
          if (length(unlist(idces))!=0){
            i <- length(ex_rows) # break out of the loop because we have found an example!
          }
        }
      }
      i <- i+1
    }
  }
  my_str <- df[my.row, my.col]
  mult_fields <- stringr::str_trim(stringr::str_split(my_str, "///")[[1]])
  lst_fields <- stringr::str_split(mult_fields, "//")
  idces <- lapply(lst_fields, function(x)
    which(sapply(x, function(y)
      stringr::str_detect(stringr::str_trim(y), sprintf("^%s", ex.str, ex.str)))))


  # problem -- getting "Human GAPDH"
  if (length(unlist(idces))==0){
    # should we try different IDs?
    print("error in identifying within column location")
    return(data.frame("gene"=character(), "probe"=character()))
  }

  # find the location
  idx <- as.numeric(unique(unlist(idces))[[1]])

  # grab IDs from that location
  gene_vals <- lapply(df[,my.col], function(x){
    mult_fields <- stringr::str_trim(stringr::str_split(x, "///")[[1]])
    lst_fields <- stringr::str_split(mult_fields, "//")
    genes <- unique(lapply(lst_fields[sapply(lst_fields, length) >= idx],
                           function(x) stringr::str_trim(x[[idx]])))
    genes <- genes[genes!="---"]

    paste(genes, collapse="///")
  })

  # format into a data frame
  df2 <- data.frame(cbind("probe"=df[,1], "gene_col"=gene_vals))
  df3 <- tidyr::separate_rows(df2, gene_col, sep="///")
  return(dplyr::filter(df3, gene_col != ""))
}

#' Helper function to parse a pattern out of a multi-column.
#' Used for columns with many fields contained within and a consistent
#' defined pattern. This is an alternate to `.find_col_loc()` for cases
#' where all strings match a pattern (e.g. "ENSG") and there may be
#' any type of delimiter within the column.
#'
#' @param df the data.frame with feature data
#' @param my.col the column to examine
#' @param pattern the pattern to extract
#' @return an expanded data frame with probe/gene data
.parse_multi_col <- function(df, my.col, pattern){
  INTERNAL.SEP <-"///|//|,|\\|"
  SUFFIX.SEP <- "\\."
  # check whether it matches the whole pattern
  if (any(stringr::str_detect(df[,my.col], sprintf("^%s$", pattern)))){
    df2 <- df[,c(1, my.col)]
    colnames(df2) <- c("probe", "gene_col")
    df3 <- tidyr::separate_rows(df2, gene_col, sep=INTERNAL.SEP)
    df3 <- dplyr::mutate(df3, gene_col=stringr::str_trim(gene_col))
    return(df3)
  }

  # extract all of the data from that column
  gene_vals <- stringr::str_extract_all(df[,my.col], pattern)
  gene_vals_col <- lapply(gene_vals, function(x)
    paste(x, collapse="///"))
  probe.ids <- df[,1]
  df2 <- data.frame(cbind("probe"=probe.ids, "gene_col"=gene_vals_col))
  df3 <- tidyr::separate_rows(df2, gene_col, sep="///")
  return(df3)
}

#' Count the length of overlap with entrez ids for a sequence of cols.
#'
#' @param gpl.df the feature data from the GPL data frame
#' @param cols the columns you think contain entrez ids to check for overlap with
#' @param organism the organism: rat, mouse, or human
#' @param ref_dir option to set reference directory  (defaults to tempdir)
#' @return the length of overlap for all the columns, where names are the column indices
.check_entrez_overlap <- function(gpl.df, cols, organism, ref_dir=NULL){
  # grab the entrezids associated with the organism
  entrezids <- .load_ref(organism, "entrezids", ref_dir)

  # check the length of overlap
  overlap.lengths <- sapply(1:length(cols), function(i){
    length(intersect(entrezids,
                     unlist(stringr::str_extract_all(gpl.df[,cols[i]], "[0-9]+"))))
  })
  names(overlap.lengths) <- cols
  return(overlap.lengths)
}
erflynn/exprsex documentation built on Feb. 23, 2020, 2:34 a.m.