R/read_tome.R

Defines functions read_tome_mapping_desc read_tome_mapping read_tome_dend read_tome_projection_desc read_tome_projection read_tome_stats_desc read_tome_stats read_tome_dend_desc read_tome_gene_meta_desc read_tome_gene_meta read_tome_anno_desc read_tome_anno read_tome_data_dims read_tome_total_counts read_tome_intron_lengths read_tome_exon_lengths read_tome_sample_names read_tome_gene_names read_tome_sample_data read_tome_gene_data

Documented in read_tome_anno read_tome_anno_desc read_tome_data_dims read_tome_dend read_tome_dend_desc read_tome_exon_lengths read_tome_gene_data read_tome_gene_meta read_tome_gene_meta_desc read_tome_gene_names read_tome_intron_lengths read_tome_mapping read_tome_mapping_desc read_tome_projection read_tome_projection_desc read_tome_sample_data read_tome_sample_names read_tome_stats read_tome_stats_desc read_tome_total_counts

#' Read Gene Expression Data from a tome file
#'
#' @param tome tome file to read. Required.
#' @param genes A vector of gene names to read. If NULL, will read all genes.
#' @param regions The gene regions to use. Can be "exon", "intron", or "both". Default = "exon".
#' @param units The type of values to return. Can be "counts" or "cpm". Default = "counts".
#' @param transform Transformation to apply to values. Can be "none", "log", "log2", "log10". Log transforms will add 1 to values before transformation. Default = "none".
#' @param format The format of the output. Can be "data.frame", "matrix", or "dgcMatrix" (sparse matrix). Default is "data.frame".
#'
#' @return A data.frame with sample_name as the first column and each subsequent column
#' containing gene expression values and named for the genes; Or a matrix with columns as genes and rows as samples.
#'
read_tome_gene_data <- function(tome,
                                genes = NULL,
                                regions = "exon",
                                units = "counts",
                                transform = "none",
                                format = "data.frame") {

  jagged <- read_tome_genes_jagged(tome,
                                   genes,
                                   regions)

  if(format == "data.frame") {
    out <- jagged_to_data.frame(jagged,
                                rows = "sample_names",
                                cols = "gene_names")
  } else if(format == "matrix") {
    out <- jagged_to_matrix(jagged,
                            rows = "sample_names",
                            cols = "gene_names")
  } else if(format == "dgCMatrix") {
    out <- jagged_to_dgCMatrix(jagged,
                               rows = "sample_names",
                               cols = "gene_names")
  }

  if(units == "cpm") {

    if(regions == "exon") {
      total_counts <- c(h5read(tome, "/data/total_exon_counts"))
    } else if(regions == "intron") {
      total_counts <- c(h5read(tome, "/data/total_intron_counts"))
    } else if(regions == "both") {
      total_counts <- c(h5read(tome, "/data/total_exon_counts") + h5read(tome, "/data/total_intron_counts"))
    }
    out[,genes] <- out[,genes]/(total_counts/1e6)
  }

  if(transform == "log") {
    out[,genes] <- log(out[,genes] + 1)
  } else if(transform == "log2") {
    out[,genes] <- log2(out[,genes] + 1)
  } else if(transform == "log10") {
    out[,genes] <- log10(out[,genes] + 1)
  }

  h5closeAll()

  out

}

#' Read Sample Expression Data from a tome file
#'
#' @param tome tome file to read. Required.
#' @param samples A vector of sample names to read. Required
#' @param regions The gene regions to use. Can be "exon", "intron", or "both". Default = "exon".
#' @param units The type of values to return. Can be "counts" or "cpm". Default = "counts".
#' @param transform Transformation to apply to values. Can be "none", "log", "log2", "log10". Log transforms will add 1 to values before transformation. Default = "none".
#' @param format The format of the output. Can be "data.frame", "matrix", or "dgcMatrix" (sparse matrix). Default is "data.frame".
#'
#' @return A data.frame with gene_name as the first column and each subsequent column
#' containing gene expression values and named for the samples; Or a matrix with columns as samples and rows as genes.
#'
read_tome_sample_data <- function(tome,
                                  samples,
                                  regions = "exon",
                                  units = "counts",
                                  transform = "none",
                                  format = "data.frame") {
  #library(purrr)
  #library(dplyr)

  jagged <- read_tome_samples_jagged(tome,
                                     samples,
                                     regions)

  if(format == "data.frame") {
    out <- jagged_to_data.frame(jagged,
                                rows = "gene_names",
                                cols = "sample_names")
  } else if(format == "matrix") {
    out <- jagged_to_matrix(jagged,
                            rows = "gene_names",
                            cols = "sample_names")
  } else if(format == "dgCMatrix") {
    out <- jagged_to_dgCMatrix(jagged,
                               rows = "gene_names",
                               cols = "sample_names")
  }

  if(units == "cpm") {
    root <- rhdf5::H5Fopen(tome)
    sample_names <- rhdf5::h5read(root,"/sample_names")
    sample_index <- match(samples, sample_names)

    if(regions == "exon") {
      total_counts <- c(rhdf5::h5read(root, "/data/total_exon_counts")[sample_index])
    } else if(regions == "intron") {
      total_counts <- c(rhdf5::h5read(root, "/data/total_intron_counts")[sample_index])
    } else if(regions == "both") {
      total_counts <- c(rhdf5::h5read(root, "/data/total_exon_counts")[sample_index] + rhdf5::h5read(root, "/data/total_intron_counts")[sample_index])
    }
    out[,samples] <- sweep(out[,samples], 2, (total_counts/1e6), "/")
  }

  if(transform == "log") {
    out[,samples] <- log(out[,samples] + 1)
  } else if(transform == "log2") {
    out[,samples] <- log2(out[,samples] + 1)
  } else if(transform == "log10") {
    out[,samples] <- log10(out[,samples] + 1)
  }

  out

}

#' Get all gene names in a tome file
#'
#' @param tome tome file to read.
#'
read_tome_gene_names <- function(tome) {
  read_tome_vector(tome,"/gene_names")
}

#' Get all sample names in a tome file
#'
#' @param tome tome file to read.
#'
read_tome_sample_names <- function(tome) {
  read_tome_vector(tome,"/sample_names")
}

#' Get exon lengths from a tome file
#'
#' @param tome tome file to read.
#' @param genes specific genes to retrieve
#' @param return_as If "vector", will return only the length values as a vector.
#' If "data.frame", will return a data.frame with two columns: "gene_name" and "exon_length".
#'
read_tome_exon_lengths <- function(tome,
                                   genes = NULL,
                                   return_as = "vector") {
  exon_lengths <- read_tome_vector(tome, "/data/exon_lengths")

  if(return_as == "data.frame" | !is.null(genes)) {
    tome_genes <- read_tome_gene_names(tome)
    exon_lengths <- data.frame(gene_name = tome_genes,
                               exon_length = exon_lengths)
  }

  if(!is.null(genes)) {
    exon_lengths <- exon_lengths[match(genes,exon_lengths$gene_name),]
  }

  if(return_as == "vector") {
    unlist(exon_lengths$exon_length)
  } else if(return_as == "data.frame") {
    exon_lengths
  }

}

#' Get intron lengths from a tome file
#'
#' @param tome tome file to read.
#' @param genes specific genes to retrieve
#' @param return_as If "vector", will return only the length values as a vector.
#' If "data.frame", will return a data.frame with two columns: "gene_name" and "intron_length".
#'
read_tome_intron_lengths <- function(tome,
                                   genes = NULL,
                                   return_as = "vector") {

  intron_lengths <- read_tome_vector(tome,"/data/intron_lengths")

  if(return_as == "data.frame" | !is.null(genes)) {
    tome_genes <- read_tome_gene_names(tome)
    intron_lengths <- data.frame(gene_name = tome_genes,
                               intron_length = intron_lengths)
  }

  if(!is.null(genes)) {
    intron_lengths <- intron_lengths[match(genes,intron_lengths$gene_name),]
  }

  if(return_as == "vector") {
    unlist(intron_lengths$intron_length)
  } else if(return_as == "data.frame") {
    intron_lengths
  }

}

#' Get total per-gene counts from a tome-file.
#'
#' Useful for computing CPM or FPKM
#'
#' @param tome tome file to read.
#' @param region the gene regions to use. Can be "exon", "intron", or "both". Default = "exon"
#'
read_tome_total_counts <- function(tome,
                                   region = "exon") {
  if(region == "exon") {

    total_counts <- read_tome_vector(tome,"data/total_exon_counts")

  } else if(region == "intron") {

    total_counts <- read_tome_vector(tome,"data/total_intron_counts")

  } else if(region == "both") {

    total_exon_counts <- read_tome_vector(tome,"data/total_exon_counts")
    total_intron_counts <- read_tome_vector(tome,"data/total_intron_counts")
    total_counts <- total_exon_counts + total_intron_counts

  }

  total_counts
}

#' Get dims for data stored in a tome file.
#'
#' @param tome tome file to read.
#' @param transpose logical indicating which orientation to get dims. Default = FALSE.
#' If FALSE, gives dimensions with samples as rows and genes as columns.
#' If TRUE,  gives dimensions for genes as rows and samples as columns.
#'
read_tome_data_dims <- function(tome,
                                transpose = F) {

  if(transpose == F) {
    dims <- read_tome_vector(tome,"data/exon/dims")
  } else if(transpose == T) {
    dims <- read_tome_vector(tome,"data/t_exon/dims")
  }

  dims
}


#' Read annotations table from a tome file
#'
#' @param tome The location of the tome file to read.
#' @param groups The groups to read - matches column names using grep. Can provide multiple with c(). If NULL, will get all columns. Default is NULL.
#'
read_tome_anno <- function(tome,
                           groups = NULL) {

  if(!is.null(groups)) {
    anno <- read_tome_data.frame(tome,
                                 "/sample_meta/anno",
                                 stored_as = "vectors",
                                 columns = c("sample_name", groups),
                                 match_type = "grep",
                                 get_all = FALSE)
  } else {
    anno <- read_tome_data.frame(tome,
                                 "/sample_meta/anno",
                                 stored_as = "vectors",
                                 columns = "sample_name",
                                 get_all = TRUE)
  }

  anno <- anno %>%
    dplyr::select(sample_name, everything())

}

#' Read annotation descriptions table from a tome file
#'
#' @param tome The location of the tome file to read.
#'
read_tome_anno_desc <- function(tome) {

  read_tome_data.frame(tome,
                       "/sample_meta/desc",
                       stored_as = "data.frame")

}

#' Read gene metadata table from a tome file
#'
#' @param tome The location of the tome file to read.
#' @param columns Specific columns to read If NULL, will get all columns. Default is NULL.
#'
read_tome_gene_meta <- function(tome,
                                columns = NULL) {

  if(!is.null(columns)) {
    read_tome_data.frame(tome,
                         "/gene_meta/genes",
                         stored_as = "vectors",
                         columns = c("gene_name", columns),
                         get_all = FALSE)
  } else {
    read_tome_data.frame(tome,
                         "/gene_meta/genes",
                         stored_as = "vectors",
                         columns = "gene_name",
                         get_all = TRUE)
  }

}

#' Read gene metadata descriptions table from a tome file
#'
#' @param tome The location of the tome file to read.
#'
read_tome_gene_meta_desc <- function(tome) {

  read_tome_data.frame(tome,
                       "/gene_meta/desc",
                       stored_as = "data.frame")

}

#' Read dendrogram descriptions table from a tome file
#'
#' @param tome The location of the tome file to read.
#'
read_tome_dend_desc <- function(tome) {

  read_tome_data.frame(tome,
                       "/dend/desc",
                       stored_as = "data.frame")

}

#' Read stats table from a tome file
#'
#' @param tome the location of the tome file to read.
#' @param stats_name the name of the stats table to read
#' @param columns selected columns to read. If NULL, reads all columns. Default = NULL.
#' @param get_all logical, whether or not to append all other columns after the specified columns. Default = FALSE.
#'
read_tome_stats <- function(tome,
                            stats_name = NULL,
                            columns = NULL,
                            get_all = FALSE) {

  ls <- rhdf5::h5ls(tome)
  stats_names <- ls$name[ls$group == "/stats"]
  stats_names <- stats_names[stats_names != "desc"]

  if(is.null(stats_name)) { stats_name <- ".namenotfound" }

  if(stats_name %in% stats_names) {

    stats_target <- paste0("/stats/", stats_name)

    if(!is.null(columns)) {
      stats <- read_tome_data.frame(tome,
                                    stats_target,
                                    stored_as = "vectors",
                                    columns = c("sample_name", columns),
                                    get_all = get_all)
    } else {
      stats <- read_tome_data.frame(tome,
                                    stats_target,
                                    stored_as = "vectors",
                                    columns = columns,
                                    get_all = TRUE)
    }
  } else {

    if(length(stats_names) > 0) {
      stats_message <- paste0("A stats table name (stats_name) is required. Available in this dataset are: ", paste(stats_names,collapse = ", "))
    } else {
      stats_message <- "No stats tables are found in this dataset."
    }

    stop(stats_message)

  }

  stats
}

#' Read stats descriptions table from a tome file
#'
#' @param tome The location of the tome file to read.
#'
read_tome_stats_desc <- function(tome) {

  read_tome_data.frame(tome,
                       "/stats/desc",
                       stored_as = "data.frame")

}

#' Read projection coordinates table from a tome file
#'
#' @param tome the location of the tome file to read.
#' @param proj_name the projection to read
#'
read_tome_projection <- function(tome,
                                 proj_name = NULL) {

  ls <- rhdf5::h5ls(tome)
  proj_names <- ls$name[ls$group == "/projection"]
  proj_names <- proj_names[proj_names != "desc"]

  if(is.null(proj_name)) { proj_name <- ".namenotfound" }

  if(proj_name %in% proj_names) {
    proj_target <- paste0("projection/",proj_name)

    proj <- read_tome_data.frame(tome,
                                 proj_target,
                                 stored_as = "data.frame")

    proj
  } else {

    if(length(proj_names) > 0) {
      proj_message <- paste0("A projection name (proj_name) is required. Available in this dataset are: ", paste(proj_names,collapse = ", "))
    } else {
      proj_message <- "No projections are found in this dataset."
    }
    stop(proj_message)
  }

}

#' Read projection descriptions table from a tome file
#'
#' @param tome The location of the tome file to read.
#'
read_tome_projection_desc <- function(tome) {

  read_tome_data.frame(tome,
                       "/projection/desc",
                       stored_as = "data.frame")

}

#' Read dendrogram object from a tome file
#'
#' @param tome the location of the tome file to read.
#' @param dend_name the dendrogram to read.
#'
read_tome_dend <- function(tome,
                           dend_name = NULL) {

  ls <- rhdf5::h5ls(tome)
  dend_names <- ls$name[ls$group == "/dend"]
  dend_names <- dend_names[dend_names != "desc"]

  if(is.null(dend_name)) { dend_name <- ".namenotfound" }

  if(dend_name %in% dend_names) {
    dend_target <- paste0("dend/",dend_name)

    dend <- read_tome_serialized(tome,
                                 dend_target)

    dend
  } else {

    if(length(dend_names) > 0) {
      dend_message <- paste0("A dendrogram name (dend_name) is required. Available in this dataset are: ", paste(dend_names,collapse = ", "))
    } else {
      dend_message <- "No dendrograms are found in this dataset."
    }
    stop(dend_message)
  }

}


#' Read mapping table from a tome file
#'
#' @param tome the location of the tome file to read.
#' @param mapping_name the name of the mapping table to read
#' @param columns selected columns to read. If NULL, reads all columns. Default = NULL.
#' @param get_all logical, whether or not to append all other columns after the specified columns. Default = FALSE.
#'
read_tome_mapping <- function(tome,
                              mapping_name = NULL,
                              columns = NULL,
                              get_all = FALSE) {

  ls <- rhdf5::h5ls(tome)
  mapping_names <- ls$name[ls$group == "/mapping"]
  mapping_names <- mapping_names[mapping_names != "desc"]

  if(is.null(mapping_name)) { mapping_name <- ".namenotfound"}

  if(mapping_name %in% mapping_names) {

    mapping_target <- paste0("mapping/", mapping_name)

    if(!is.null(columns)) {
      mapping <- read_tome_data.frame(tome,
                                      mapping_target,
                                      stored_as = "vectors",
                                      columns = c("sample_name", columns),
                                      get_all = get_all)
    } else {
      mapping <- read_tome_data.frame(tome,
                                      mapping_target,
                                      stored_as = "vectors",
                                      columns = "sample_name",
                                      get_all = get_all)
    }
  } else {

    if(length(mapping_names) > 0) {
      mapping_message <- paste0("A mapping table name (mapping_name) is required. Available in this dataset are: ", paste(mapping_names,collapse = ", "))
    } else {
      mapping_message <- "No mapping tables are found in this dataset."
    }

    stop(mapping_message)

  }

  mapping
}

#' Read mapping descriptions table from a tome file
#'
#' @param tome The location of the tome file to read.
#'
read_tome_mapping_desc <- function(tome) {

  read_tome_data.frame(tome,
                       "/mapping/desc",
                       stored_as = "data.frame")

}
AllenInstitute/scrattch.io documentation built on Nov. 17, 2021, 10:06 a.m.