R/GeneAnno.R

library(httr)
library(XML)
library(methods)

source("R/GeneAnnoClasses.R")

#' Create gene specific objects containing data from online resources
#'
#' \code{getGeneSummary} populates and returns a vector of \code{gene} objects
#' with information sourced from  a series of html requests to the NIH and
#' Uniport publicly available databases.
#'
#' Information returned from a database requests is parsed into a \code{gene}
#' object, which are saved in a 'genes' subdirectory of the working directory.
#' Each \code{gene} object is added to a vector of objects, which is then
#' returned.
#' Where gene information has previously been downloaded and objects saved
#' (within the last seven days), \code{gene} objects are repopulated from the
#' saved files so as to minimise server traffic.
#' \emph{N.B.} the function includes a random wait (of up to 5s) between each
#' gene downloaded.
#'
#' \emph{N.B.} It is possible to define an alternative directory using
#' \code{slot(geneanno,"fileroot") <- "/path/to/directory"}
#'
#' @param x object of class \code{geneanno}.
#' @return vector of \code{gene} objects, each containing the collated data
#' from the public resources.
#' @importFrom methods setGeneric setMethod
#' @importFrom stats runif
#' @importFrom utils read.table write.table
#' @examples
# \dontrun{
#' query <- matrix(c("Axitinib","BRAF","Imatinib","BRAF"),ncol=2,byrow=TRUE)
#' g <- getUniqueGeneList(geneanno(),query)
#' gs <- getGeneSummary(g)
# }
#' @export
#'
setGeneric("getGeneSummary", function(x) {
    standardGeneric("getGeneSummary")
})

#' @describeIn getGeneSummary Produces a vector of \code{gene} objects, each
#' containing the collated data from the public resources.
setMethod("getGeneSummary","geneanno",
    function(x){
        output = list()
        count = 0
        archivedList = vector()
        archListOld <- vector()
        noret <- vector()
        fileroot <- ifelse(identical(x@fileroot,character(0)),getwd(),x@fileroot)
        if (!dir.exists(file.path(fileroot,x@genefilestem))) {
            dir.create(file.path(fileroot,x@genefilestem))
        }
        archFile <- file.path(fileroot,slot(x,"genefilestem"),"archived.txt")
        archList <- list()
        if (file.exists(archFile)) {
            archList <- read.table(archFile)
        }
        toMatch <- c('^ENSG', '^ENSP', '^ENST')
        match <- paste(toMatch,collapse="|")
        for (gene in slot(x,"genelist")) {
            filename = file.path(fileroot,x@genefilestem,
                                sprintf("%s.RData",gene))
            count = count + 1
            if (gene %in% unlist(archList)) {
                cat(count, gene," is archived in Ensembl\n")
                archListOld <- append(archListOld,gene)
            } else if (file.exists(filename) && Sys.time() < (
                file.info(filename)$mtime + 7*86400)) {
                load(filename)
                output[[gene]] <- g
                cat(count, gene, "\n" )
            } else{
                f <- query()
                if (grepl(match,gene)) {
                    r <- getEnsembleList(gene)
                    if (length(r$display_name) == 0) {
                        cat(count, gene," is archived in Ensembl\n")
                        archivedList <- append(archivedList,gene)
                        next()
                    } else {f@query <- r$display_name}
                } else {
                    f@query <- gene
                }
                query <- sprintf("Homo+sapiens[organism]+AND+(%s[Gene%%2FProtein+Name]+OR+%s[Nucleotide%%2FProtein+Accession])+AND+alive[prop]",f@query,f@query)
                f <- getNihQuery(f,"gene",query)
                cat(count, gene, " step 1..." )
                g <- getNihSummary(gene(),f)
                if (length(slot(g,"name")) == 0) {
                    cat("nothing returned\n")
                } else {
                    cat(" 2..." )
                    g <- getUniprotSummary(g,f)
                    cat("downloaded\n" )
                }
                output[[gene]] <- g
                save(g,file = filename)
                Sys.sleep(runif(1)*5)
            }
        }
        if (length(archivedList) >= 1 || length(archListOld) >= 1) {
            cat("\nThe following Ensembl Ids are archived:",archivedList, archListOld,"\n\n")
            if (length(archivedList) >= 1) {
                write.table(archivedList,archFile,append = TRUE,row.names=FALSE,col.names=FALSE)
            }
        }
        return(output)
    }
)

#' Create unique list of genes from input list
#'
#' \code{getUniqueGeneList} takes a 2-column matrix of group identifiers and
#' gene names, returning the seperated list of unique group identifiers and a
#' unique list of genes.
#'
#' @param x object of class geneanno.
#' @param inputlist vector of strings, being a mixed list of group numbers and
#' gene names
#' @return object of type geneanno; a copy of input object having the additional
#' list of group numbers and a list of genes from \code{uniquelist}
#' @importFrom methods setGeneric setMethod
#' @examples
#' data("genematrix")
#' ga <- getUniqueGeneList(geneanno(),mygenematrix)
#' @export
#'
setGeneric("getUniqueGeneList", function(x,inputlist) {
    standardGeneric("getUniqueGeneList")
})

#' @describeIn getUniqueGeneList object of type geneanno; a copy of input object
#' having the additional list of group numbers and a unique list of genes from
#' \code{s}
setMethod("getUniqueGeneList",signature(x = "geneanno",inputlist = "character"),
    function(x,inputlist){
        cat("inputlist is a List")
        q <- inputlist[grep("^[0-9]",inputlist)]
        x@groupnos <- unique(q)
        inputlist <- inputlist[grep("^[A-Za-z]",inputlist)]
        inputlist <- gsub("^([a-zA-Z0-9]+)\\..*","\\1",inputlist,perl = TRUE)
        x@genelist <- unique(inputlist)
        return(x)
    }
)

#' @describeIn getUniqueGeneList object of type geneanno; a copy of input object
#' having the additional list of group numbers and a unique list of genes from
#' \code{s}
setMethod("getUniqueGeneList",signature(x = "geneanno",inputlist = "matrix"),
    function(x,inputlist){
        #cat("inputlist is a Matrix")
        x@groupnos <- unique(inputlist[,1])
        s <- inputlist[,2]
        s <- gsub("^([a-zA-Z0-9]+)\\..*","\\1",s,perl = TRUE)
        x@genelist <- unique(s)
        return(x)
    }
)

#' Create unique list of group,gene combinations from input list
#'
#' \code{getGroupGeneList} takes a mixed list of group identifiers (numeric) and
#' gene names or a 2-column matrix of group identifiers and gene names,
#' returning the seperated list of group identifiers and the related lists of
#' genes.
#'
#' @param x object of class geneanno.
#' @param inputlist vector of strings, being a mixed list of group numbers and
#' gene names
#' @return object of type geneanno; a copy of input object having the additional
#' list of group numbers and a unique list of genes from \code{s}
#' @examples
#' data("genematrix")
#' ggl <- getGroupGeneList(geneanno(),mygenematrix)
#' @importFrom methods setGeneric setMethod
#' @export
#'
setGeneric("getGroupGeneList", function(x,inputlist) {
    standardGeneric("getGroupGeneList")
})

#' @describeIn getGroupGeneList uses vector of characters of group ids and Gene
#' names for inputlist.
setMethod("getGroupGeneList",signature(x = "geneanno", inputlist = "character"),
    function(x,inputlist){
        output <- list()
        group = 0
        z <- 1
        for (i in x@groupnos) {
            output[[i]] <- list()
        }
        for (i in inputlist) {
            if (grepl("^[A-Za-z]",i)) {
                output[[group]] <- append(output[[group]],i)
            } else if (grepl("^[0-9]+",i)) {
                group <- i
            }
        }
        for (i in x@groupnos) {
            t <- output[[i]][!is.null(output[[i]])]
            output[[i]] <- unique(t)
        }
        return(output)
    }
)

#' @describeIn getGroupGeneList uses matrix of group ids and Gene names for
#' inputlist.
setMethod("getGroupGeneList",signature(x = "geneanno",inputlist = "matrix"),
        function(x,inputlist){
            output <- list()
            group = 0
            for (i in x@groupnos) {
                output[[i]] <- list()
            }
            for (i in 1:nrow(inputlist)) {
                group <- inputlist[i,1]
                output[[group]] <- append(output[[group]],inputlist[i,2])
            }
            for (i in x@groupnos) {
                t <- output[[i]][!is.null(output[[i]])]
                output[[i]] <- unique(t)
            }
            return(output)
        }
)

#' Parse files of gene names and group identifiers to create unique lists of
#' each
#'
#' \code{parseInputFile} takes a mixed file containing group identifiers
#' (numeric) and gene names, returning the list of group identifiers and genes
#' with the remaining columns removed.
#' The package was originally written to work from a file laid out thus:
#' group_id1
#' gene_name1
#' gene_name2
#' group_id2
#' gene_name1
#' gene_name3
#' The methods assume that both group identifiers and gene names are
#' alphanumeric; the group identifiers, where present, begining with a number
#' and gene names starting with a character.
#' please note, this populates the vector with only the alphanumric strings
#' beginning each line of the input file. Also, RNA genes (begining ENSG000)
#' are currently excluded.
#'
#' @param x object of class geneanno.
#' @param file character string providing the name of the input file
#' @return vector of character strings, as exemplified by the inputlist data object.
#' @importFrom methods setGeneric setMethod
#' @importFrom utils read.table
#' @examples
#' data("inputlist")
#' write(myinputlist,file="inputlist.txt")
#' x <- geneanno()
#' x <- parseInputFile(x,"inputlist.txt")
#' @export
#'
setGeneric("parseInputFile", function(x,file) {
    standardGeneric("parseInputFile")
})

#' @describeIn parseInputFile vector of character strings
setMethod("parseInputFile","geneanno",
    function(x,file){
        fileroot <- ifelse(identical(x@fileroot,character(0)),getwd(),
            x@fileroot)
        file <- file.path(fileroot,file)
        s <- read.table(file,header = FALSE)[1]
        t <- gsub("^([a-zA-Z0-9]+)\\..*","\\1",s[,1],perl = TRUE)
        t <- t[grep("^ENSG000",t,invert = TRUE)]
        return(t)
    }
)


#' Save Gene Information to group Specific Output Files
#'
#' \code{produceOutputFiles} saves the downloaded gene summary information into
#' group specific files.
#'
#' @param x object of class geneanno.
#' @param ggl a Vector of group specific vectors relating the list of genes to
#' the groups, as in the input file
#' @param gs Vector of Gene Summary information downloaded from NIH and Uniprot
#' databases, typically output by getGeneSummary
#' @param pub Matrix containing journal article information from the PubMed
#' resource, as retruned by the \code{searchPublications} function
#' @return This function has no return value as outputs directly to file.
#' @importFrom methods setGeneric setMethod
#' @importFrom utils read.table
#' @examples
#' data("genematrix")
#' data("genesummary")
#' ggl <- getGroupGeneList(mygeneanno,mygenematrix)
#' data("publications")
#' produceOutputFiles(mygeneanno,ggl,mygenesummaries,mypublications)
#' @export
#'

setGeneric("produceOutputFiles", function(x,ggl,gs,pub) {
    standardGeneric("produceOutputFiles")
})

#' @describeIn produceOutputFiles Save object data to text files.
setMethod("produceOutputFiles",signature(x = "geneanno",ggl = "vector", gs = "vector",pub = "missing"),
    function(x,ggl,gs){
        fileroot <- ifelse(identical(x@fileroot,character(0)),getwd(),x@fileroot)
            if (!dir.exists(file.path(fileroot,x@outputstem))) {
                dir.create(file.path(fileroot,x@outputstem))
            }
            for (d in x@groupnos) {
                filename = file.path(fileroot,x@outputstem,sprintf("%s.txt",d))
                if (file.exists(filename)) {file.remove(filename)}
                for (i in ggl[[d]]) {
                    if (!is.null(i)) {
                        write(i,file = filename, append = TRUE)
                        for (j in slotNames(gs[[i]])) {
                            for (f in slot(gs[[i]],j)) {
                                if (!is.na(f) && !is.null(f) && f != "") {
                                    write(sprintf("\t%s: %s",toupper(j),f),
                                          file = filename,append = TRUE)
                                }
                            }
                        }
                        write('\n',file = filename,append = TRUE)
                    }
                }
            }
        }
)

#' @describeIn produceOutputFiles Save object data, including journal articles,
#' to text files.
setMethod("produceOutputFiles",signature(x = "geneanno",ggl = "vector", gs = "vector",pub = "vector"),
    function(x,ggl,gs,pub) {
        fileroot <- ifelse(identical(x@fileroot,character(0)),getwd(),x@fileroot)
        if (!dir.exists(file.path(fileroot,x@outputstem))) {
            dir.create(file.path(fileroot,x@outputstem))
        }
        for (d in slot(x,"groupnos")) {
            filename = file.path(fileroot,x@outputstem,sprintf("%s.txt",d))
            if (file.exists(filename)) {file.remove(filename)}
            for (i in ggl[[d]]) {
                if (!is.null(i)) {
                  write(i,file = filename,append = TRUE)
                    for (j in slotNames(gs[[i]])) {
                        for (f in slot(gs[[i]],j)) {
                            if (!is.na(f) && !is.null(f) && f != "") {
                                write(sprintf("\t%s: %s",toupper(j),f), file = filename,append = TRUE)
                            }
                        }
                    }
                    for (k in pub[which(pub == paste(c(d,i),collapse = " ") ),2] ) {
                        if (length(k) > 0) {
                            write("\n\tJOURNAL ARTICLES:", file = filename,append = TRUE)
                            for (l in k) {
                                vol <- ifelse((!is.na(l@Issue) && !is.null(l@Issue) && l@Issue != ""),sprintf("%s(%s)",l@Volume,l@Issue),l@Volume)
                                write(sprintf("\t%s\n\t%s (%s). %s. %s; %s.\n\tPMID:%s. DOI:%s\n",l@Title,l@Authors,strsplit(l@Date," ")[[1]][1],l@Journal,vol,l@Pages,l@Id,l@DOI), file = filename,append = TRUE)
                            }
                        }
                    }
                write('\n',file = filename,append = TRUE)
            }
        }
    }
}
)
biscuit13161/UNGeneAnno documentation built on May 26, 2019, 2:33 a.m.