R/annotate_limma_output.R

#' Get paths for limma output files
#' @param path2fitfiles gives the path to the Results directory that stores the limma output
#' @export

get_fitfiles <- function(path_to_fitfiles_dir){
  result_files <- unlist(
    lapply(dir(path_to_fitfiles_dir), function(x) paste0(path_to_fitfiles_dir, x))
    )
  return(result_files)
}

#' dataset annotation
datasetsAnnotation <- function(con, GSELIST) {
  rs <- lapply(GSELIST, function(x)
  { dbGetQuery(con, paste0("select gse,type,title,contributor,pubmed_id,submission_date,last_update_date from gse where gse='", x, "'")) } )
  rs <- rbindlist(rs)
  gpl <- lapply(GSELIST, function(x) { dbGetQuery(con, paste0("select gse,gpl from gse_gpl where gse='", x, "'")) } )
  gpl <- rbindlist(gpl)
  species <- lapply(gpl$gpl, function(x) { dbGetQuery(con, paste0("select gpl,organism from gpl where gpl='", x, "'")) } )
  species <- rbindlist(species)
  gpl[, species := species$organism]
  rs <- merge(rs, gpl, by = "gse")
  names(rs) <- c("GSE", "DatasetType", "Title", "DatasetContrib", "PMID", "SubmissionDate", "LastUpdateDate", "GPL", "Species")
  # Query for BioC packages linked to given GPLs
  packages <- sapply(gpl$gpl,
                     function(gpl) {
                       package <- dbGetQuery(con, paste0("select gpl,bioc_package from gpl where gpl='", gpl, "'"))$bioc_package
                       if(!is.na(package)) package <- paste0(package, ".db") else package <- NA
                     })
  packages <- unlist(packages)
  rs$Reannotation <- packages
  return(rs)
}



#' use GEOmetadb to get metadata for dataset
#' @param GSELIST a valid gse
#' @param path2GEOmetadb path to most current GEOmetadb.sqlite file
#' @export
get_DSinfo <- function(GSELIST, path2GEOmetadb){
  con <- dbConnect(SQLite(), path2GEOmetadb)
  DSinfo <- datasetsAnnotation(con, GSELIST)
  write_tsv(DSinfo, "dsinfo.tsv")

  return(DSinfo)

}


geneAnnotateBioC <- function(probes, package) {
  #genes <- select(get(package), keys = probes, columns = "SYMBOL", keytype = "PROBEID")
  # interactive package retrieval disabled by AM
  genes <- AnnotationDbi::select(package, keys = probes, columns = "SYMBOL", keytype = "PROBEID")
  genes <- split(genes$SYMBOL, factor(genes$PROBEID))
  genes <- sapply(genes, function(x) x[1])
  return(genes)
}

#' add gene annotation from a reannotation package to limma output
#' @param gse_id a valid GSE ID e.g.: "GSE65309"
#' @param fitfile path to a file retrieved from get_fitfiles
#' @param anno_package for example: mogene10sttranscriptcluster.db
#' @examples
#' process_fitoutput("GSE65309", fitfile, mogene10sttranscriptcluster.db)
#' lapply(result_files, function(x) process_fitoutput("GSE65309", x, mogene10sttranscriptcluster.db))
#' @export
process_fitoutput <- function(gse_id, fitfile, anno_package){
  data <- read.table(fitfile, sep = "\t", header = T, check.names = FALSE)
  probes <- row.names(data)
  genes <- geneAnnotateBioC(probes, anno_package)
  data$genes <- genes
  data$probes <- probes
  out_name <- gsub("\\.txt", "_annotated\\.txt", fitfile)
  #out_name <- paste(gse_id, out_name, sep = "_")
  write.table(data, file = paste0(out_name), sep = "\t", row.names = FALSE)
  #return(data)
}

#' get genes from gpl soft file
#' Annotation with GPL requires more supervision
#' @param probes rownames from limma output
#' @param gpl a valid GPL id, e.g.: "GPL13912"
#' @examples
#' genes <- geneAnnotateGPL(probes, "GPL13912")
#' @export
geneAnnotateGPL <- function(probes, gpl) {
  gpl <- getGEO(filename = paste0("GEOtemp/", gpl, ".soft"))
  cols <- colnames(Table(gpl))
  geneCol <- grep("(GENE)?_?SYMBOL", cols, ignore.case = T, val = T)
  if(!length(geneCol)) geneCol <- grep("^GENE$", ignore.case = T, cols, val = T)
  gpl <- Table(gpl)[, c("ID", geneCol)]
  genes <- gpl[[geneCol]][match(probes, gpl$ID)]
  return(genes)
}


#' add gene annotation from a gpl-soft-file to limma output
#' @param gse_id a valid GSE id, e.g.: "GSE41496"
#' @param fitfile ath to a file retrieved from get_fitfiles
#' @param gpl a valid GPL id, e.g.: "GPL13912"
#' @examples
#' process_fitoutput_with_gpl("GSE41496", fitfiles[1], "GPL13912")
#' lapply(fitfiles, function(x) process_fitoutput_with_gpl("GSE41496", x, "GPL13912"))
#' @export
process_fitoutput_with_gpl <- function(gse_id, fitfile, gpl){
  data <- read.table(fitfile, sep = "\t", header = T, check.names = FALSE)
  probes <- row.names(data)
  genes <- geneAnnotateGPL(probes, gpl)
  data$genes <- genes
  data$probes <- probes
  out_name <- gsub("\\.txt", "_annotated\\.txt", fitfile)
  write.table(data, file = paste0(out_name), sep = "\t", row.names = FALSE)
  #return(data)
}

#' Read gpl file into data.table
#' @param gse_id  a valid GSE id, e.g.: "GSE41496"
#' @param path2GEOmetadb ideally defined before as path2GEOmetadb which is the default value
#' @param path2downloaded_data if the data was acquired using this package it should be stored in GEOtemp/ which is the default setting, please use a / at the end of any custom path, it's required for the system command.
#' @examples
#' read_gpl()
#' @export

read_gpl <- function(gse_id=gse, path2GEOmetadb=path2GEOmetadb,path2downloaded_data="GEOtemp/"){
  DSinfo <- get_DSinfo(gse, path2GEOmetadb)
  gpl_id <- unlist(DSinfo$GPL)
  gpl_soft <- paste0(path2downloaded_data, gpl_id, ".soft")
  gpl_tsv <- paste0(path2downloaded_data, gpl_id, ".tsv")

  print(gpl_id)
  print(gpl_soft)
  system(sprintf("grep -Ev '#|\\!|\\^' %s > %s", gpl_soft, gpl_tsv))

  gpl <- fread(gpl_tsv)

  return(gpl)
}




#' Use gpl file to annotate fit output, requires a "gene_assignment" column in the gpl file
#' @param gse_id  a valid GSE id, e.g.: "GSE41496"
#' @param path2GEOmetadb ideally defined before as path2GEOmetadb which is the default value
#' @param path2downloaded_data if the data was acquired using this package it should be stored in GEOtemp/ which is the default setting, please use a / at the end of any custom path, it's required for the system command.
#' @examples
#' fit_annotate_with_gpl_gene_assignment("GSE12345")
#' @export
fit_annotate_with_gpl_gene_assignment <- function(gse_id=gse, path2GEOmetadb=path2GEOmetadb,path2downloaded_data="GEOtemp/"){
  DSinfo <- fread("dsinfo.tsv")
  gpl_id <- unlist(DSinfo$GPL)
  print(gpl_id)
  gpl_file <- paste0(path2downloaded_data, gpl_id, ".tsv")
  print(gpl_file)
  gpl <- fread(gpl_file)
  DSinfo$Species

  ifelse("gene_assignment" %in% names(gpl), lookup <- gpl[, c("ID", "gene_assignment"), with=FALSE], print("no column called gene_assignment"))


  #lookup <- gpl[, c("ID", "gene_assignment"), with=FALSE]
  lookup[, c("locus", "gene_symbol", "other_gene_assignments") := tstrsplit(gene_assignment, " // ", fixed = TRUE)]
  lookup <- lookup[, c("ID", "locus"), with = FALSE]
  lookup <- lookup[, ID:=as.integer(ID)]

  if (DSinfo$Species == "Homo sapiens"){
    GB_SYMBOL <- select(Homo.sapiens, keys = lookup$locus,
                        columns = c("ACCNUM", "SYMBOL"), keytype = "ACCNUM")
  } else if (DSinfo$Species == "Mus musculus"){
    GB_SYMBOL <- select(Mus.musculus, keys = lookup$locus,
                        columns = c("ACCNUM", "SYMBOL"), keytype = "ACCNUM")
  } else if (DSinfo$Species == "Rattus norvegicus"){
    GB_SYMBOL <- select(Rattus.norvegicus, keys = lookup$locus,
                        columns = c("ACCNUM", "SYMBOL"), keytype = "ACCNUM")
  } else {
    print("wrong species")
  }

  GB_SYMBOL <- na.omit(GB_SYMBOL)
  setDT(GB_SYMBOL)

  setkey(GB_SYMBOL, ACCNUM)
  setkey(lookup, "locus")

  lookup_anno <- lookup[GB_SYMBOL]
  lookup_anno <- lookup_anno[, c("ID", "SYMBOL"), with=FALSE]

  setkey(lookup_anno, "ID")

  fitfiles <- get_fitfiles("Results/")
  fitted <- fread(fitfiles[1])
  setkey(fitted, "V1")

  anno_fit <- fitted[lookup_anno]
  anno_fit <- unique(anno_fit)

  names(anno_fit)[1] <- "probes"

  fwrite(anno_fit, "Results/annotated_fit.txt", sep = "\t")
}


#' Use gpl file to annotate fit output, requires a "gene_assignment" or "gene_symbol" column in the gpl file
#' @param gse_id  a valid GSE id, e.g.: "GSE41496"
#' @param path2GEOmetadb ideally defined before as path2GEOmetadb which is the default value
#' @param path2downloaded_data if the data was acquired using this package it should be stored in GEOtemp/ which is the default setting, please use a / at the end of any custom path, it's required for the system command.
#' @examples
#' annotate_with_gpl("GSE12345")
#' @export

annotate_with_gpl <- function(gse_id=gse, path2GEOmetadb=path2GEOmetadb,path2downloaded_data="GEOtemp/"){
  gpl <- read_gpl(gse, path2GEOmetadb,"GEOtemp/")
  fitfiles <- get_fitfiles("Results/")
  ifelse("symbol" %in% names(gpl), process_fitoutput_with_gpl(gse, fitfiles, "GPL13912"),
         ifelse("gene_assignment" %in% names(gpl),  fit_annotate_with_gpl_gene_assignment(gse), print("Sorry, annotation can't be automated at this point")))
}
axelmuller/geometric2 documentation built on May 25, 2019, 6:26 p.m.