R/curateExpData.R

Defines functions curateExpdata

Documented in curateExpdata

#' curateExpdata
#'
#' This function will take the name lists generated by nameChange and apply them
#' to an expression data set. The function will process locally downloaded GEO
#' expression files, or even just accept a GEO accession number and do the work
#' for you.
#'
#' @param expname This is the name of your data set; if local is T, it is a GEO
#'   file (any type containing exp data). If local is F, this can be a GEO
#'   accession number.
#' @param namelist This is a name list file as generated by nameChange.
#' @param local This is a logical variable; if T, the function looks for a local
#'   file. If F, the function uses the expname as a GEO accession number and
#'   attempts to download the data.
#' @param pct This is the cutoff for gene data; if a gene has expression values
#'   for less than this percentage of samples, it is excluded.
#' @param pullmeta This allows you to get the metadata for the samples from the
#'   GSE file; default is false.
#'
#' @return Returns an expression frame containing genes as rows and samples as
#'   columns. If pullmeta = T, will be a list with the first entry being the
#'   expression frame and the second being the metadata from a GSE file.
#' @export
#' @importFrom magrittr %>%
#' @examples
#'
curateExpdata <- function(expname, namelist, local = F, pct = 0.8, pullmeta = F){

  #read and curate platform

  if(local == T){alldata <- GEOquery::getGEO(file = expname)
  }else{alldata <- GEOquery::getGEO(GEO = expname, destdir = getwd())}

  #pull expression matrix
  expdata <- Biobase::exprs(alldata[[1]])

  #make name matrix
  names <- as.data.frame(rownames(expdata), stringsAsFactors = F)

  #make lookup table for namelist
  namedict <- as.vector(namelist[,2])
  names(namedict) <- namelist[,1]

  #find new GeneIDs for probes
  names[,2] <- sapply(names[,1], function(x) {unname(namedict[x])})

  #remove non-mapping probes
  names <- names[which(!is.na(names[,2])),]
  rownames(names) <- names[,1]
  names[,1] <- NULL

  #merge names and data

  expdata <- merge(names, expdata, by = "row.names")

  #make everything numeric
  expdata[,3:ncol(expdata)] <- sapply(expdata[,3:ncol(expdata)], function(x) as.numeric(x))

  #average value for all probes per gene
  suppressWarnings(output <- expdata %>% dplyr::group_by(expdata$V2) %>%
                     dplyr::summarize_all(mean, na.rm=TRUE))
  output <- as.data.frame(output)

  #set row names, remove blank rows, export
  rownames(output) <- output[,1]
  output <- output[,-(1:3)]

  #remove rows with insufficient gene coverage
  output$coverage <- apply(output, 1, function(x) {sum(!is.na(x))/ncol(output)})
  output <- output[which(output$coverage >= pct),]
  output$coverage <- NULL

  #If metadata is wanted, pull metadata and make list
  if(pullmeta == T){
    metadata <- Biobase::pData(Biobase::phenoData(alldata[[1]]))
    newout <- list()
    newout[[1]] <- output
    newout[[2]] <- metadata
    output <- newout
  }

 output

}
foster-gabe/PFExpTools documentation built on May 25, 2020, 7:22 a.m.