#' Get paths for limma output files
#' @param path2fitfiles gives the path to the Results directory that stores the limma output
#' @export
get_fitfiles <- function(path_to_fitfiles_dir){
result_files <- unlist(
lapply(dir(path_to_fitfiles_dir), function(x) paste0(path_to_fitfiles_dir, x))
)
return(result_files)
}
#' dataset annotation
datasetsAnnotation <- function(con, GSELIST) {
rs <- lapply(GSELIST, function(x)
{ dbGetQuery(con, paste0("select gse,type,title,contributor,pubmed_id,submission_date,last_update_date from gse where gse='", x, "'")) } )
rs <- rbindlist(rs)
gpl <- lapply(GSELIST, function(x) { dbGetQuery(con, paste0("select gse,gpl from gse_gpl where gse='", x, "'")) } )
gpl <- rbindlist(gpl)
species <- lapply(gpl$gpl, function(x) { dbGetQuery(con, paste0("select gpl,organism from gpl where gpl='", x, "'")) } )
species <- rbindlist(species)
gpl[, species := species$organism]
rs <- merge(rs, gpl, by = "gse")
names(rs) <- c("GSE", "DatasetType", "Title", "DatasetContrib", "PMID", "SubmissionDate", "LastUpdateDate", "GPL", "Species")
# Query for BioC packages linked to given GPLs
packages <- sapply(gpl$gpl,
function(gpl) {
package <- dbGetQuery(con, paste0("select gpl,bioc_package from gpl where gpl='", gpl, "'"))$bioc_package
if(!is.na(package)) package <- paste0(package, ".db") else package <- NA
})
packages <- unlist(packages)
rs$Reannotation <- packages
return(rs)
}
#' use GEOmetadb to get metadata for dataset
#' @param GSELIST a valid gse
#' @param path2GEOmetadb path to most current GEOmetadb.sqlite file
#' @export
get_DSinfo <- function(GSELIST, path2GEOmetadb){
con <- dbConnect(SQLite(), path2GEOmetadb)
DSinfo <- datasetsAnnotation(con, GSELIST)
write_tsv(DSinfo, "dsinfo.tsv")
return(DSinfo)
}
geneAnnotateBioC <- function(probes, package) {
#genes <- select(get(package), keys = probes, columns = "SYMBOL", keytype = "PROBEID")
# interactive package retrieval disabled by AM
genes <- AnnotationDbi::select(package, keys = probes, columns = "SYMBOL", keytype = "PROBEID")
genes <- split(genes$SYMBOL, factor(genes$PROBEID))
genes <- sapply(genes, function(x) x[1])
return(genes)
}
#' add gene annotation from a reannotation package to limma output
#' @param gse_id a valid GSE ID e.g.: "GSE65309"
#' @param fitfile path to a file retrieved from get_fitfiles
#' @param anno_package for example: mogene10sttranscriptcluster.db
#' @examples
#' process_fitoutput("GSE65309", fitfile, mogene10sttranscriptcluster.db)
#' lapply(result_files, function(x) process_fitoutput("GSE65309", x, mogene10sttranscriptcluster.db))
#' @export
process_fitoutput <- function(gse_id, fitfile, anno_package){
data <- read.table(fitfile, sep = "\t", header = T, check.names = FALSE)
probes <- row.names(data)
genes <- geneAnnotateBioC(probes, anno_package)
data$genes <- genes
data$probes <- probes
out_name <- gsub("\\.txt", "_annotated\\.txt", fitfile)
#out_name <- paste(gse_id, out_name, sep = "_")
write.table(data, file = paste0(out_name), sep = "\t", row.names = FALSE)
#return(data)
}
#' get genes from gpl soft file
#' Annotation with GPL requires more supervision
#' @param probes rownames from limma output
#' @param gpl a valid GPL id, e.g.: "GPL13912"
#' @examples
#' genes <- geneAnnotateGPL(probes, "GPL13912")
#' @export
geneAnnotateGPL <- function(probes, gpl) {
gpl <- getGEO(filename = paste0("GEOtemp/", gpl, ".soft"))
cols <- colnames(Table(gpl))
geneCol <- grep("(GENE)?_?SYMBOL", cols, ignore.case = T, val = T)
if(!length(geneCol)) geneCol <- grep("^GENE$", ignore.case = T, cols, val = T)
gpl <- Table(gpl)[, c("ID", geneCol)]
genes <- gpl[[geneCol]][match(probes, gpl$ID)]
return(genes)
}
#' add gene annotation from a gpl-soft-file to limma output
#' @param gse_id a valid GSE id, e.g.: "GSE41496"
#' @param fitfile ath to a file retrieved from get_fitfiles
#' @param gpl a valid GPL id, e.g.: "GPL13912"
#' @examples
#' process_fitoutput_with_gpl("GSE41496", fitfiles[1], "GPL13912")
#' lapply(fitfiles, function(x) process_fitoutput_with_gpl("GSE41496", x, "GPL13912"))
#' @export
process_fitoutput_with_gpl <- function(gse_id, fitfile, gpl){
data <- read.table(fitfile, sep = "\t", header = T, check.names = FALSE)
probes <- row.names(data)
genes <- geneAnnotateGPL(probes, gpl)
data$genes <- genes
data$probes <- probes
out_name <- gsub("\\.txt", "_annotated\\.txt", fitfile)
write.table(data, file = paste0(out_name), sep = "\t", row.names = FALSE)
#return(data)
}
#' Read gpl file into data.table
#' @param gse_id a valid GSE id, e.g.: "GSE41496"
#' @param path2GEOmetadb ideally defined before as path2GEOmetadb which is the default value
#' @param path2downloaded_data if the data was acquired using this package it should be stored in GEOtemp/ which is the default setting, please use a / at the end of any custom path, it's required for the system command.
#' @examples
#' read_gpl()
#' @export
read_gpl <- function(gse_id=gse, path2GEOmetadb=path2GEOmetadb,path2downloaded_data="GEOtemp/"){
DSinfo <- get_DSinfo(gse, path2GEOmetadb)
gpl_id <- unlist(DSinfo$GPL)
gpl_soft <- paste0(path2downloaded_data, gpl_id, ".soft")
gpl_tsv <- paste0(path2downloaded_data, gpl_id, ".tsv")
print(gpl_id)
print(gpl_soft)
system(sprintf("grep -Ev '#|\\!|\\^' %s > %s", gpl_soft, gpl_tsv))
gpl <- fread(gpl_tsv)
return(gpl)
}
#' Use gpl file to annotate fit output, requires a "gene_assignment" column in the gpl file
#' @param gse_id a valid GSE id, e.g.: "GSE41496"
#' @param path2GEOmetadb ideally defined before as path2GEOmetadb which is the default value
#' @param path2downloaded_data if the data was acquired using this package it should be stored in GEOtemp/ which is the default setting, please use a / at the end of any custom path, it's required for the system command.
#' @examples
#' fit_annotate_with_gpl_gene_assignment("GSE12345")
#' @export
fit_annotate_with_gpl_gene_assignment <- function(gse_id=gse, path2GEOmetadb=path2GEOmetadb,path2downloaded_data="GEOtemp/"){
DSinfo <- fread("dsinfo.tsv")
gpl_id <- unlist(DSinfo$GPL)
print(gpl_id)
gpl_file <- paste0(path2downloaded_data, gpl_id, ".tsv")
print(gpl_file)
gpl <- fread(gpl_file)
DSinfo$Species
ifelse("gene_assignment" %in% names(gpl), lookup <- gpl[, c("ID", "gene_assignment"), with=FALSE], print("no column called gene_assignment"))
#lookup <- gpl[, c("ID", "gene_assignment"), with=FALSE]
lookup[, c("locus", "gene_symbol", "other_gene_assignments") := tstrsplit(gene_assignment, " // ", fixed = TRUE)]
lookup <- lookup[, c("ID", "locus"), with = FALSE]
lookup <- lookup[, ID:=as.integer(ID)]
if (DSinfo$Species == "Homo sapiens"){
GB_SYMBOL <- select(Homo.sapiens, keys = lookup$locus,
columns = c("ACCNUM", "SYMBOL"), keytype = "ACCNUM")
} else if (DSinfo$Species == "Mus musculus"){
GB_SYMBOL <- select(Mus.musculus, keys = lookup$locus,
columns = c("ACCNUM", "SYMBOL"), keytype = "ACCNUM")
} else if (DSinfo$Species == "Rattus norvegicus"){
GB_SYMBOL <- select(Rattus.norvegicus, keys = lookup$locus,
columns = c("ACCNUM", "SYMBOL"), keytype = "ACCNUM")
} else {
print("wrong species")
}
GB_SYMBOL <- na.omit(GB_SYMBOL)
setDT(GB_SYMBOL)
setkey(GB_SYMBOL, ACCNUM)
setkey(lookup, "locus")
lookup_anno <- lookup[GB_SYMBOL]
lookup_anno <- lookup_anno[, c("ID", "SYMBOL"), with=FALSE]
setkey(lookup_anno, "ID")
fitfiles <- get_fitfiles("Results/")
fitted <- fread(fitfiles[1])
setkey(fitted, "V1")
anno_fit <- fitted[lookup_anno]
anno_fit <- unique(anno_fit)
names(anno_fit)[1] <- "probes"
fwrite(anno_fit, "Results/annotated_fit.txt", sep = "\t")
}
#' Use gpl file to annotate fit output, requires a "gene_assignment" or "gene_symbol" column in the gpl file
#' @param gse_id a valid GSE id, e.g.: "GSE41496"
#' @param path2GEOmetadb ideally defined before as path2GEOmetadb which is the default value
#' @param path2downloaded_data if the data was acquired using this package it should be stored in GEOtemp/ which is the default setting, please use a / at the end of any custom path, it's required for the system command.
#' @examples
#' annotate_with_gpl("GSE12345")
#' @export
annotate_with_gpl <- function(gse_id=gse, path2GEOmetadb=path2GEOmetadb,path2downloaded_data="GEOtemp/"){
gpl <- read_gpl(gse, path2GEOmetadb,"GEOtemp/")
fitfiles <- get_fitfiles("Results/")
ifelse("symbol" %in% names(gpl), process_fitoutput_with_gpl(gse, fitfiles, "GPL13912"),
ifelse("gene_assignment" %in% names(gpl), fit_annotate_with_gpl_gene_assignment(gse), print("Sorry, annotation can't be automated at this point")))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.