R/qc.R

#' boxplots of GSMs
#' @param gse_id valid GSE id, e.g.: "GSE41496"
#' @param REVIEWED output file from Step1, e.g.: "sample_review.txt"
#' @param datadir path to GEOtemp, e.g.: "GEOtemp"
#' @export
qc_plots <- function(gse_id, REVIEWED, datadir){
  file <- fread(REVIEWED, colClasses = "character")

  path <- paste0(datadir, "/", gse_id, "_series_matrix.txt.gz")
  #if(!file.exists(path)) {
  #  showNotification("Dataset not found in local cache; downloading now. Please wait.", duration = 10, type = "default")
  #  getGEO(GEO = gse_id, destdir = datadir)

  gse <- setClass("gse", contains = "eSet")
  print("break1")
  gse$eset <- exprs(getGEO(filename = path, getGPL = FALSE, parseCharacteristics = FALSE))
  print("break2")
  gse$groups <- file[GSE == gse_id, paste(BioSampName, Node, NodeFunction, BSM, BSMDCD, sep = "-")]
  gse$gsms <- file[GSE == gse_id, geo_accession]
  gse$biosamples <- length(file[GSE == gse_id, unique(BioSampName)])



  p1 <- boxplot(gse$eset, outline = FALSE)
  validate(need(!is.null(gse$eset), ""))
  eset <- gse$eset
  missing <- sum(is.na(eset) == T)
  if(missing) showNotification(paste("There were", missing, "missing data point(s)."), type = "warning", duration = NULL)
  eset <- na.omit(eset)
  pca <- prcomp(t(eset))
  pca2 <- data.frame(pca$x[, 1:2], Group = factor(gse$groups), GSM = gse$gsms)
  shapes <- rep_len(15:19, nlevels(factor(gse$groups)))
  p <- ggplot(data = pca2, aes(x = PC1, y = PC2, shape = Group, color = Group, label = GSM)) +
    scale_shape_manual(values = shapes) + geom_point(size = 3)
  # Attempt to cluster points to see whether "batch effects" was just due to having more than one biosample type.
  # if(gse$biosamples > 1) {
  #   k <- kmeans(pca$x, centers = 2)
  #   pca2$Cluster <- k$cluster
  #   poly <- lapply(split(pca2, pca2$Cluster), function(df) { df[chull(df), ] })
  #   poly <- do.call(rbind, poly)
  #   p <- p + geom_polygon(data = poly, aes(x = PC1, y = PC2, group = Cluster, fill = Cluster), alpha = 0.3, linetype = 0)
  # }
  p2 <- ggplotly(p, tooltip = c("Group", "GSM"))
  plot_grid(p1, p2)
}
axelmuller/geometric2 documentation built on May 25, 2019, 6:26 p.m.