geometric2: NURSA annotation aid

# Specify a directory where GSE downloaded files are stored
#GEOtemp <- "/home/axel/my_code/nursa/geometric2/required_files/GEOtemp"

# Input file
#GSELIST <- "test2.txt"
# Location/name for output file
#REVIEW1 <- "sample_review.txt"

#' Annotation Step 1
#' @param GEOtemp directory where downloaded GEO files are stored, e.g.: "/home/axel/my_code/nursa/geometric2/data/GEOtemp"
#' @param GSELIST a file with one GSE id per row
#' @param REVIEW1 filename for output
step_1 <- function(GEOtemp="/home/axel/my_code/nursa/geometric2/required_files/GEOtemp", GSELIST= "test2.txt", REVIEW1="sample_review.txt"){

  ## Download GSEs
  #Feed in the ids of GSEs that passed Filter #1 (based on description). The following steps pull the metadata create a standardized annotation table for the samples.

  gses <- readLines(GSELIST)

  # Download the GSE
  getSamples <- function(gse) {
    eset <- getGEO(GEO = gse, destdir = GEOtemp)[[1]]
    pData(eset)
  }

  # if processing many, limit messages to avoid cluttering display
  results <- suppressMessages(lapply(gses, function(x) try(getSamples(x))))
  names(results) <- gses

####
  # the next session is temporarily commented out
####
  #Sometimes downloading a GEO will fail (maybe 1-5% of GSEs?) because there was a parsing error or some other problem. GEOquery verbose output but not always helpful, so better to check with:

  #ok <- allOK(results)

  #table(ok)

  #cat("GSEs that failed:", names(results)[!ok])

  #results <- results[ok] # Go back to failures later, proceed with OK

  ## Create and populate annotation table
  #Infer some annotation information for each GSE using some simple rules.
  #This is only the starting point as data is imperfectly extracted; manual annotation follows for the exported table.

  # Experiment Type (1-Channel or 2-Channel; more specific assignments, i.e. whether there was dye swap, happens in human review)


  inferXPType <- function(DT) {
    default <- rep("1C", nrow(DT))
    ch2 <- grep("source_name_ch2", names(DT))
    if(length(ch2)) return(rep("2C", nrow(DT)))
    return(default)
  }

  ixpType <- sapply(results, inferXPType)

  # BSM -- Molecule -- note: perhaps have a list of all known tlr ligands to match against
  inferBSM <- function(DT) {
    x1 <- rep("", nrow(DT))
    bsmcol <- grep("agent|stimulation", names(DT))
    if(length(bsmcol) == 1) {
      info <- DT[bsmcol]
      return(info)
    }
    return(x1)
  }

  iBSM <- sapply(results, inferBSM)

  # BSMDCD -- Time
  inferDCD <- function(DT) {
    x1 <- rep("", nrow(DT))
    # First find a column that specifically contains treatment info
    tcol <- grep("^treatment:ch1|time", names(DT))
    if(length(tcol)) {
      info <- do.call(paste, DT[tcol])
    } else { # Use information in sample title
      info <- DT$title
    }
    m <- regexpr("[0-9]+ ?(d|h|D|H)|(D|d)(ay) ?[0-9]+", info)
    x1[m != -1] <- regmatches(info, m)
    # Sometimes time is stated as "overnight", which usually means "12 h"
    x1[grepl("overnight", info)] <- "12h"
    x1 <- tolower(gsub(" ", "", x1))
    return(x1)
  }

  iDCD <- sapply(results, inferDCD)

  # BSMDCD2 -- Dose/concentration
  inferDCD2 <- function(DT) {
    x1 <- rep("", nrow(DT))
    dosecol <- grep("dose|concentration|amount|treatment", names(DT))
    dosecol <- c(dosecol, which(sapply(DT, function(x) grepl("treatment_protocol", x[1])) == T))
    if(length(dosecol)) {
      info <- do.call(paste, DT[dosecol])
      m <- gregexpr("[0-9.]+ ?(u|n|m|[?])(g|M)?(\\/[A-Za-z]{2})?", info)
      dose <- sapply(regmatches(info, m), paste, collapse = ";")
      dose <- gsub("?", "u", dose, fixed = T)
      x1 <- dose
    }
    return(x1)
  }

  iDCD2 <- sapply(results, inferDCD2)

  # BiosampName -- to do: a more sophisticated implementation could also match to biosample ID
  #biosamps <- readLines("biosamps.txt") ## This is a list derived from "allbiosamples"

  inferBiosamp <- function(DT) {
    # First find the obvious columns containing cell line/type
    cellcol <- grep("^cell.*ch1", colnames(DT))
    if(length(cellcol)) {
      cell <- do.call(paste, DT[cellcol])
      return(cell)
    } else {
      tissuecol <- grep("^tissue.*ch1", colnames(DT))
      if(length(tissuecol)) {
        tissue <- do.call(paste, DT[tissuecol])
        return(tissue)
      } else { # Use the source_name_ch1,treatment_protocol_ch1 columns otherwise
        bs <- lapply(biosamps, function(b) grepl(b, paste(DT$source_name_ch1, DT$treatment_protocol_ch1)))
        names(bs) <- biosamps
        bs <- bs[sapply(bs, sum) >= 1 ]
        if(!length(bs)) return(rep("?", nrow(DT)))
        bs <- bs[order(sapply(bs, sum), decreasing = T)]
        nterm <- ifelse(length(bs) > 1, 2, 1)
        x1 <- lapply(1:nterm, function(i) ifelse(bs[[i]], names(bs)[[i]], ""))
        x1 <- do.call("paste", x1)
        x1 <- trimws(x1)
        return(x1)
      }
    }
  }

  iBiosamp <- sapply(results, inferBiosamp)

  # Control?
  inferCTRL <- function(DT) {
    s <- DT$title
    ctrl <- as.numeric(grepl("wt|wildtype|control|ctrl|healthy|media|medium|veh|unstim|untreated|mock", s, ignore.case = T))
  }

  iCTRL <- sapply(results, inferCTRL)



  # Subset table for export with selected columns. Join data in one table and fill in additional columns.

  export <- rbindlist(results, use.names = T, fill = T)
  keepcols <- grep("title|geo_accession|source_name|description|platform|characteristics|treatment", names(export))
  export <- export[, keepcols, with = F]
  export[, GSE := unlist(mapply(rep, names(results), sapply(results, nrow)))]
  newcols1 <- c("RefDye", "Comment", "Group", "Batch", "Node", "NodeFunction", "BSM")
  export[, (newcols1) := ""]
  export[, c("Ignore") := 0]
  export[, c("XP") := 1]

  # Inferred variables
  export[, xpType := unlist(ixpType)]
  export[, isCTRL := unlist(iCTRL)]
  export[, BioSampName := unlist(iBiosamp)]
  export[, BSMDCD := unlist(iDCD)]
  export[, BSMDCD2 := unlist(iDCD2)]

  myorder <- c("GSE", "xpType", "RefDye", "Comment", "Group", "Ignore", "XP", "isCTRL", "Batch", "Node", "NodeFunction", "BSM", "BSMDCD", "BSMDCD2", "BioSampName",
               "title", "geo_accession")
  neworder <- c(myorder, names(export)[!names(export) %in% myorder])
  setcolorder(export, neworder)

  head(export)
  write.table(export, REVIEW1, sep = "\t", row.names = F, quote = F)
}