ASSAY_NAME_VARIANT = 'dna'
#' Read loom file created by Tapestri pipeline
#'
#' @param filename loom file path
#' @param min_mutation_rate only read variants with mutations rate higher then threshold. Primary done to reduce size of data in memory
#' @return Tapestri object
#' @export
#' @import rhdf5
#'
read_loom <- function(filename, min_mutation_rate=0.05) {
# filename <- "data/PE11.cells.loom"
# min_mutation_rate = 0.1
# experiment_name = basename(filename)
assay_name = ASSAY_NAME_VARIANT
h5f = rhdf5::H5Fopen(filename, flags = "H5F_ACC_RDONLY")
object_str = rhdf5::h5ls(h5f)
object_str$object_names = paste0(object_str$group,'/',object_str$name)
dims = stringr::str_match(string = object_str$dim, pattern = '(.*) x (.*)')
object_str$rows = as.numeric(dims[,2])
object_str$columns = as.numeric(dims[,3])
ngt_path = 'matrix'
nvariants = object_str$columns[object_str$object_names == ngt_path | object_str$name == ngt_path][1]
mutation_rate = c()
chunks = c(seq(from=1, to=nvariants,by = 10000), nvariants)
for(i in 2:length(chunks)){
ngt = rhdf5::h5read(h5f,ngt_path, index = list(NULL,chunks[i-1]:chunks[i]))
mutated <- (ngt == 1 | ngt == 2)
mutation_rate <- c(mutation_rate,
base::colMeans(mutated, na.rm = T))
if(!all.equal(c(0,1,2,3), sort(unique(c(ngt))))) stop('NGT layer can only contain 0, 1, 2, 3 values.')
}
mutated_variants = which(mutation_rate > min_mutation_rate)
cell_annotations <- as_tibble(rhdf5::h5read(h5f,sprintf("col_attrs")))
if(nrow(cell_annotations) ==0) {
cell_annotations = tibble(
sample=rep(basename(filename),nrow(ngt)),
barcode=as.character(1:nrow(ngt))
)
}
cell_annotations <- validate_cell_annotations(cell_annotations = cell_annotations)
all_features =as_tibble(rhdf5::h5read(h5f,sprintf("row_attrs")))
filtered_features = all_features[mutated_variants,]
filtered_features = validate_variant_annotations(filtered_features)
assay = create_assay(assay_name = assay_name,
cell_annotations = cell_annotations,
feature_annotations = filtered_features)
filtered_ngt = data = rhdf5::h5read(h5f,ngt_path, index = list(NULL,mutated_variants))
colnames(filtered_ngt) <- filtered_features$id
assay = add_data_layer(assay = assay,layer_name = 'NGT', data = filtered_ngt)
layer_names = rhdf5::h5ls(h5f&sprintf("layers"))
for(layer in layer_names$name) {
data = rhdf5::h5read(h5f,sprintf("layers/%s", layer), index = list(NULL,mutated_variants))
#data = layers[[layer]][,mutated_variants]
colnames(data) <- filtered_features$id
assay = add_data_layer(assay = assay, layer_name = layer, data = data)
}
rhdf5::h5closeAll()
return(assay)
}
#' Read Tapestri multi-omics h5 file
#'
#' @param filename h5 file path
#' @param assay_name name of assay to load from h5
#' @param min_mutation_rate only read variants with mutations rate higher then treshold. Primarly done to reduce size of data in memory
#'
#' @return Tapestri multi-omics object
#' @export
#' @import rhdf5
#' @import stringr
#' @examples
#' \dontrun{
#' tapestri_raw = h5_reader(filename,min_mutation_rate = 0.1)
#' }
read_assay_h5 <- function(filename, assay_name, min_mutation_rate = 0.005) {
#filename <- "~/Google Drive/launches/r_package/data/merged_all.h5"
#assay_name='dna'
# layer = 'NGT'
# min_mutation_rate = 0.1
h5f = rhdf5::H5Fopen(filename, flags = "H5F_ACC_RDONLY")
object_str = rhdf5::h5ls(h5f)
object_str$object_names = paste0(object_str$group,'/',object_str$name)
dims = stringr::str_match(string = object_str$dim, pattern = '(.*) x (.*)')
object_str$rows = as.numeric(dims[,2])
object_str$columns = as.numeric(dims[,3])
ca = sprintf("/assays/%s/ca",assay_name)
ra = sprintf("/assays/%s/ra",assay_name)
if(!ca %in% object_str$object_names) {
stop(sprintf('H5 file does not seem to be a standard structure. Missing %s.', ca))
}
if(!ra %in% object_str$object_names) {
stop(sprintf('H5 file does not seem to be a standard structure. Missing %s.', ra))
}
cell_annotations <- as_tibble(rhdf5::h5read(h5f,ra))
cell_annotations <- validate_cell_annotations(cell_annotations = cell_annotations)
features =as_tibble(rhdf5::h5read(h5f,ca))
if(assay_name == ASSAY_NAME_VARIANT) {
# filter data to make it manageable in R
features = validate_variant_annotations(features)
ngt_path = sprintf("/assays/%s/layers/NGT",assay_name)
if(!ngt_path %in% object_str$object_names) {
stop(sprintf('H5 file does not seem to be a standard structure. Missing %s.', ngt_path))
}
nvariants = object_str$rows[object_str$object_names == ngt_path]
mutation_rate = c()
chunks = c(seq(from=1, to=nvariants,by = 10000),nvariants)
for(i in 2:length(chunks)){
ngt = rhdf5::h5read(h5f,ngt_path, index = list(chunks[i-1]:chunks[i],NULL))
mutated <- (ngt == 1 | ngt == 2)
mutation_rate <- c(mutation_rate,
base::rowMeans(mutated, na.rm = T))
if(!all.equal(c(0,1,2,3), sort(unique(c(ngt))))) stop('NGT layer can only contain 0, 1, 2, 3 values.')
}
filters = which(mutation_rate > min_mutation_rate)
} else {
filters = 1:nrow(features)
}
filtered_features <- features[filters,]
assay = create_assay(assay_name = assay_name,
cell_annotations = cell_annotations,
feature_annotations = filtered_features)
layer_names = rhdf5::h5ls(h5f&sprintf("assays/%s/layers",assay_name))
for(layer in layer_names$name) {
data = rhdf5::h5read(h5f,sprintf("assays/%s/layers/%s",assay_name,layer), index = list(filters,NULL))
data = t(data)
colnames(data) <- filtered_features$id
assay = add_data_layer(assay = assay, layer_name = layer, data = data)
}
rhdf5::h5closeAll()
return(assay)
}
#' read a tap file generated by Tapestri pipeline
#'
#' @param filename path to tap file
#' @param experiment_name name of experiment, if none file name is used
#'
#' @return multiomics object
#' @export
#'
read_tap <- function(filename, experiment_name = NA) {
if(is.na(experiment_name)) experiment_name = basename(filename)
h5f = rhdf5::H5Fopen(filename, flags = "H5F_ACC_RDONLY")
object_str = rhdf5::h5ls(h5f)
object_str$object_names = paste0(object_str$group,'/',object_str$name)
dims = stringr::str_match(string = object_str$dim, pattern = '(.*) x (.*)')
object_str$rows = as.numeric(dims[,2])
object_str$columns = as.numeric(dims[,3])
assay_names = rhdf5::h5ls(h5f&sprintf("/assays"), recursive = FALSE)
a = read_assay_h5(filename = filename, assay_name = assay_names$name[1])
moo = create_moo(experiment_name = experiment_name, cell_annotations = a@cell_annotations)
moo = add_assay(moo,a)
for(assay in assay_names$name[-1]) {
a = read_assay_h5(filename = filename, assay_name = assay)
moo = add_assay(moo,a)
}
return(moo)
}
#' Read variant assay from data exported from Tapestri Insights
#'
#' Creates a Assay object from Tapestri Insights exported zip file. Inlcudes all the DNA variant variant layers, plus variant annotations. If subclone information exists creates any analysis layer with the subclone labels.
#'
#' @param export_dir directory path to exported data. Should contain "AF.csv", "DP.csv", "GQ.csv", "NGT.csv", "README.txt", "Variants.csv"
#'
#' @return
#' @export
#'
read_insights_export <- function(export_dir) {
layers = c('NGT', 'AF', 'DP', 'GQ')
annotations = 'Variants'
files = c(layers, annotations)
files_exist = file.exists(paste0(export_dir, '/', files, '.csv'))
if (any(!files_exist))
stop(sprintf('Some files missing: %s', paste(files[!files_exist])))
ngt = readr::read_csv(paste0(export_dir, '/', 'NGT.csv'))
# check if ngt file has only relavent values
#unique(c(as.matrix(ngt %>% select(-Sample, -Cell))))
if(!all.equal(c(0,1,2,3), sort(unique(c(as.matrix(ngt %>% select(-matches(c('Sample','Cell','Subclone'))))))))) stop('NGT layer can only contain 0, 1, 2, 3 values.')
cell_annotations = ngt %>% select(sample = Sample, barcode = Cell) %>%
mutate(barcode = as.character(barcode),
id = paste0(sample, '-', barcode))
cell_annotations <- validate_cell_annotations(cell_annotations = cell_annotations)
variant_annotations = readr::read_csv(paste0(export_dir, '/', annotations, '.csv'))
variant_annotations = variant_annotations %>% mutate(id=variant_annotations$Variant)
variant_annotations = validate_variant_annotations(variant_annotations)
variants_from_insights = create_assay(
assay_name = 'variants',
cell_annotations = cell_annotations,
feature_annotations = variant_annotations
)
suppressMessages(for (layer in layers) {
layer_data = readr::read_csv(paste0(export_dir, '/', layer, '.csv'))
layer_data = layer_data %>% select(-Sample,-Cell) %>% select(variant_annotations$id)
variants_from_insights = add_data_layer(assay = variants_from_insights,
layer_name = layer,
data = layer_data)
})
if ('Subclone' %in% colnames(ngt)) {
variants_from_insights = add_analysis_layer(assay=variants_from_insights,layer_name = 'Subclone',data = ngt$Subclone)
}
return(variants_from_insights)
}
validate_cell_annotations <- function(cell_annotations){
cell_annotations = as_tibble(cell_annotations) %>% mutate_all(as.character)
if(!'sample' %in% colnames(cell_annotations)){
cell_annotations = cell_annotations %>% mutate(sample=basename(filename))
}
if(!'barcode' %in% colnames(cell_annotations)){
cell_annotations = cell_annotations %>% mutate(barcode=row_number())
}
if(!'id' %in% colnames(cell_annotations)){
cell_annotations = cell_annotations %>% mutate(id=paste0(sample,'-',barcode))
} else if (any(duplicated(cell_annotations$id))) {
stop('Cell annotations id must be unique.')
}
return(cell_annotations)
}
#' Annotate Variants based on variant ID field
#'
#' @param variant_annotations table of annotations. unique id column must exist. if its in format 'SF3B1:chr2:198266943:C/T' try to parse into CHROM and POS
#'
#' @return
#' @export
#' @import stringr
#' @import tibble
validate_variant_annotations <- function(variant_annotations) {
if(!'id' %in% colnames(variant_annotations)) stop('id column must exist.')
if(any(duplicated(variant_annotations$id))) {
warning("Duplicate variant ids present. You're likely running an older experiment.
Don't worry. We'll fix the duplicate issue here, and new runs will not have this issue.")
variant_annotations = variant_annotations %>% distinct(id, .keep_all = TRUE)
}
if(any(!c('CHROM','POS') %in% colnames(variant_annotations))){
split_ids = stringr::str_match(variant_annotations$id,'(.*?):?chr(.*?):([[:digit:]]*):')
if(any(is.na(split_ids[,3])) | any(is.na(split_ids[,4]))) stop(sprintf('id column cannot be easily parsed into CHROM and POS.'))
variant_annotations = variant_annotations %>% mutate(CHROM=split_ids[,3], POS=as.numeric(split_ids[,4]), gene_name=split_ids[,2])
variant_annotations = variant_annotations %>% arrange(as.numeric(CHROM), as.numeric(POS))
}
return(variant_annotations)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.