R/tSNE.R

Defines functions plotTsne extractCellFromStates generateStatesFromFolder

Documented in extractCellFromStates generateStatesFromFolder plotTsne

#' generate states list for tSNE from a fasta folder
#' it is expected that the length of each fasta file is equal
#'
#' @param fasta.folder path to fasta folder
#' @param model.path path to model file
#' @param maxlen length of semi-redundant chunks
#' @param batch.size how many chunks are predicted in parallel
#' @param save if TRUE, output will be written to file
#' @param output.folder path to output folder with tailing "/"
#' @param verbose TRUE/FALSE
#' @param exact keep exact values and dont round
#' @export
generateStatesFromFolder <-
  function(fasta.folder = "example_files/fasta",
           model.path = "example_files/dummy_model_cpu.hdf5",
           maxlen = 80,
           batch.size = 500,
           save = T,
           output.folder = "example_files/states/",
           verbose = T,
           exact = F) {
    
    files <- list.files(fasta.folder, full.names = T)
    states.list <- list()
    model <- keras::load_model_hdf5(model.path)
    # Remove the last 2 layers
    keras::pop_layer(model)
    keras::pop_layer(model)
    for (file in files) {
      message(paste("processing: ", file))
      states <- deepG::getStatesFromFasta(
        model = model,
        fasta.path = file,
        batch.size = batch.size,
        maxlen = maxlen,
        verbose = T
      )
      states <- round(states, digits = 3)
      if (save) {
        write.table(
          states,
          file = paste0(
            output.folder,
            basename(tools::file_path_sans_ext(file)),
            ".csv"
          ),
          quote = F,
          col.names = F
        )
        message(paste("saved states to", paste0(
          output.folder,
          basename(tools::file_path_sans_ext(file)),
          ".csv"
        )))
        states.list <- NULL
      } else {
        # keep in memory
        states.list[[tools::file_path_sans_ext(basename(file))]] <-
          states
      }
    }
    return(states.list)
  }

#' Preparing states for tSNE
#'
#' @param states.list list of states generated by generateStatesromFolder
#' @param states.folder generat states list from folder containing tate files
#' @param cell.number number of cell/neuron that should go into tSNE
#' @param flatten if true, all cells will be used
#' @export
extractCellFromStates <- function(states.list = NULL,
                                  states.folder = "example_files/states",
                                  cell.number = 2,
                                  flatten = FALSE) {
  # if states.folder is set, no flatten option is implemented currently
  if (!is.null(states.folder)) {
    message(paste("extracting cell number from", length(list.files(states.folder, full.names = TRUE)), "state files"))
    # read in the individual files and join them together, keep track of file names for later reference
    states.list <- list()
    files <- list.files(states.folder, full.names = TRUE)
    if (!flatten) {
      for (file in files) {
        # reads in file, extracts one column form that (neuron) and saves this to the list
        dat <- read.table(file, sep = " ", header = F)
        dat$V1 <- NULL # first column is the position
        states.list[[tools::file_path_sans_ext(basename(file))]] <- dat[, cell.number]
      }
      dim <- c(dim(data.frame(states.list[[1]]))[1],
               length(states.list)) # should be 920 6 on example dataset
      states.array <- array(as.numeric(unlist(states.list)),
                            dim = dim)
      states <- t(states.array)
    } else {
      for (file in files) {
        # flatten, so use all cells
        dat <- read.table(file, sep = " ", header = F)
        dat$V1 <- NULL # first column is the position
        states.list[[tools::file_path_sans_ext(basename(file))]] <- as.vector(t(dat)) # bring 2dim array to 1D vector
      }
      dim <- c(dim(data.frame(states.list[[1]]))[1],
               length(states.list)) # should be 920 6 on example dataset
      states.array <- array(as.numeric(unlist(states.list)),
                            dim = dim)
      states <- t(states.array)
    }
  } else { 
    dim <- c(dim(data.frame(states.list[[1]]))[1],
             dim(data.frame(states.list[[1]]))[2],
             length(states.list)) # should be 920 512 6 on example dataset
    message(paste("dimension:", dim))
    states.array <- array(as.numeric(unlist(states.list)),
                          dim = dim)
    if (flatten) {
      states <- t(do.call('rbind', lapply(1:dim(states.array)[3],
                                          function(x)
                                            cbind(states.array[x, , ], t = x)))) # bring 3dim array to 2D
    } else {
      states <- t(states.array[, cell.number, ])
    }
  }
  return(list("ids" =  names(states.list), "states" = states))
}

#' generate and plot tSNE
#'
#' @param states states for tSNE (generated by \code{(extractCellFromStates()}) 
#' @param perplexity perplexity parameter (should not be bigger than 3 * perplexity < nrow(X) - 1, see details for interpretation)
#' @param pca whether an initial PCA step should be performed (default: TRUE)
#' @param max_iter number of iterations (default: 1000)
#' @param path.metadata file used for coloring
#' @param sample.name.column column in metadata file that holds the sample name
#' @param label.name.column column that holds the label name in path.metadata
#' @param color.column column that holds the color information in path.metadata
#' @param out.path path to output pdf
#' @export
plotTsne <- function(states,
                     perplexity = c(0.5, 1),
                     pca = TRUE,
                     max_iter = 1000,
                     path.metadata = NULL,
                     sample.name.column = 1,
                     label.name.column = 2,
                     color.column = 3,
                     out.path = "tsne.pdf") {
  pdf(out.path)
  for (i in seq_along(perplexity)) {
    message(paste("perplexity:", perplexity[i]))
    # run tSNE
    tsne <- Rtsne::Rtsne(
      states$states,
      dims = 2,
      perplexity = perplexity[i],
      check_duplicates = FALSE,
      pca = pca,
      max_iter = max_iter
    )
    # plot
    if (!is.null(path.metadata)) {
      meta <- read.csv2(path.metadata,
                        header = F,
                        stringsAsFactors = F)
      colors <-
        meta[match(states$ids, meta[, sample.name.column]), ][, color.column]
      labels <-
        meta[match(states$ids, meta[, sample.name.column]), ][, label.name.column]
      tsne.df <- data.frame(
        x = tsne$Y[, 1],
        y = tsne$Y[, 2],
        labels = labels,
        col = colors,
        stringsAsFactors = F
      )
      p <-
        ggplot2::ggplot(tsne.df, ggplot2::aes(x = x, y = y, color = labels))
    } else {
      tsne.df <- data.frame(
        x = tsne$Y[, 1],
        y = tsne$Y[, 2],
        stringsAsFactors = F
      )
      p <- ggplot2::ggplot(tsne.df, ggplot2::aes(x = x, y = y))
    }
    p <- p + ggplot2::geom_point()
    p <- p + ggplot2::theme_bw()
    p <- p + ggplot2::ggtitle(paste("perplexity:", perplexity[i]))
    print(p)
  }
  dev.off()
}
hiddengenome/altum documentation built on April 22, 2020, 9:33 p.m.