#' Process counts into a Seurat object prepared for alignment.
#'
#' Reads a counts file from either the DropSeq or 10x pipeline, converts gene
#' identifiers to the requested naming convention (MGI or HGNC), and creates a
#' Seurat object from the counts. The output is the input for align_sers. If
#' hashing data are supplied, hash identities are added to the Seurat object
#' and the hash-derived sample name is prepended to each cell barcode.
#' @param directory Directory containing the output files of align_dropseq_2.sh
#' or Cell Ranger
#' @param from_gene "ENSG" or "ENSMUSG" or "HGNC" or "MGI"
#' @param to_gene "MGI" or "HGNC"
#' @param sample_id Optional experimental ID to prefix cell names
#' @param hash_dir Directory containing hash counts files
#' @param hash_meta CSV file mapping hash IDs to sample names
#' @param type "DropSeq" or "10x"
#' @param T_dir Optional directory containing T cell TCR output
#' @param B_dir Optional directory containing B cell BCR output
#' @param umi_thresh Minimum total UMI count required to retain a cell
#'
#'
#' @import Seurat
#' @importFrom Matrix readMM colSums
#' @importFrom utils read.table
#' @importFrom utils tail
#' @importFrom methods as
#' @return A Seurat object prepared for rPCA anchoring
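#' @examples
#' \dontrun{
#' # Minimal usage sketch; the paths, sample ID, and hash metadata file below
#' # are hypothetical placeholders, not part of the package.
#' ser <- process_counts_hash(directory = '/path/to/cellranger_out',
#'                            from_gene = 'ENSG', to_gene = 'HGNC',
#'                            hash_dir = '/path/to/hash_counts',
#'                            hash_meta = '/path/to/hash_to_sample.csv',
#'                            sample_id = 'expt1', type = '10x')
#' }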
#' @export
process_counts_hash <- function(directory, from_gene, to_gene, hash_dir = NULL,
                                hash_meta = NULL, sample_id = NULL,
                                type = 'DropSeq', T_dir = NULL, B_dir = NULL,
                                umi_thresh = 500){
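  # Read the raw counts matrix produced by the upstream pipeline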
  if(type == 'DropSeq'){
    # The DGE file is named after the final component of the output directory
    prefix = tail(strsplit(directory, '/', fixed = TRUE)[[1]], n = 1)
    file = paste0(directory, '/', prefix, '_DGE.txt.gz')
    counts = read.table(gzfile(file), header = TRUE, row.names = 1)
  }
  if(type == '10x'){
    counts = readMM(gzfile(paste0(directory,
                                  '/filtered_feature_bc_matrix/matrix.mtx.gz')))
    # Feature IDs: keep only the portion before the first '-'
    rows = read.table(gzfile(paste0(directory,
                                    '/filtered_feature_bc_matrix/features.tsv.gz')),
                      stringsAsFactors = FALSE)$V1
    rows = sapply(rows, function(x){
      strsplit(x, '-', fixed = TRUE)[[1]][1]
    })
    names(rows) = c()
    rownames(counts) = rows
    # Cell barcodes: strip the trailing '-1' suffix added by Cell Ranger
    cols = read.table(gzfile(paste0(directory,
                                    '/filtered_feature_bc_matrix/barcodes.tsv.gz')),
                      stringsAsFactors = FALSE)$V1
    cols = sapply(cols, function(x){
      strsplit(x, '-', fixed = TRUE)[[1]][1]
    })
    names(cols) = c()
    colnames(counts) = cols
  }
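  # Optionally prefix cell barcodes with the experiment ID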
  if(!is.null(sample_id)){
    colnames(counts) = paste(sample_id, colnames(counts), sep = '-')
  }
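  # Drop cells below the UMI threshold and store the counts as a sparse matrix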
  c_sum = colSums(counts)
  c_take = which(c_sum >= umi_thresh)
  counts = counts[,names(c_take)]
  counts = as(as.matrix(counts), "sparseMatrix")
  # Convert gene identifiers if the source and target conventions differ
  if(from_gene != to_gene){
    conv = convert_genes(rownames(counts), from = from_gene, to = to_gene)
    # Drop genes with no entry in the conversion table
    na_id = which(is.na(match(rownames(counts), conv[,1])))
    if(length(na_id) > 0){
      counts = counts[-na_id,]
    }
    conv = conv[match(rownames(counts), conv[,1]),]
    # Drop genes that convert to an empty symbol
    rm_id = which(conv[,2] == '')
    if(length(rm_id) > 0){
      counts = counts[-rm_id,]
      conv = conv[-rm_id,]
    }
    rownames(counts) = conv[,2]
    # Collapse genes mapping to the same converted symbol by summing their counts
    dups = unique(conv[which(duplicated(conv[,2])),2])
    for(dup in dups){
      rows = which(rownames(counts) == dup)
      duprows = counts[rows, ]
      counts = counts[-rows, ]
      combined = colSums(duprows)
      counts = rbind(combined, counts)
      rownames(counts)[1] = dup
      print(dup)
    }
  }
  ser = CreateSeuratObject(counts)
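  # If hashing data are provided, demultiplex hash tags with HTODemux, keep
  # singlets, and rename cells with their assigned sample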
  if(!is.null(hash_dir) && !is.null(hash_meta)){
    if(type == 'DropSeq'){
      # IMPLEMENT DROPSEQ HASHING PROTOCOL
    } else if(type == '10x'){
      # Read the hash counts matrix, stripping suffixes as for the expression data
      counts = readMM(gzfile(paste0(hash_dir,
                                    '/filtered_feature_bc_matrix/matrix.mtx.gz')))
      rows = read.table(gzfile(paste0(hash_dir,
                                      '/filtered_feature_bc_matrix/features.tsv.gz')),
                        stringsAsFactors = FALSE)$V1
      rows = sapply(rows, function(x){
        strsplit(x, '-', fixed = TRUE)[[1]][1]
      })
      names(rows) = c()
      rownames(counts) = rows
      cols = read.table(gzfile(paste0(hash_dir,
                                      '/filtered_feature_bc_matrix/barcodes.tsv.gz')),
                        stringsAsFactors = FALSE)$V1
      cols = sapply(cols, function(x){
        strsplit(x, '-', fixed = TRUE)[[1]][1]
      })
      names(cols) = c()
      colnames(counts) = cols
    }
    # Keep only cells present in both the expression and hash matrices
    cells = intersect(colnames(counts), colnames(ser))
    counts = counts[, cells]
    ser = subset(ser, cells = cells)
    ser[['hash']] = CreateAssayObject(counts)
    # CLR-normalize the hash counts and classify cells, keeping only singlets
    ser = NormalizeData(ser, assay = 'hash', normalization.method = 'CLR')
    ser = HTODemux(ser, assay = 'hash', positive.quantile = .99)
    singlets = names(ser$hash_classification.global)[
      which(ser$hash_classification.global == 'Singlet')]
    ser = subset(ser, cells = singlets)
    # Map hash IDs to sample names and drop the intermediate demux metadata
    hash_conv = read.table(hash_meta, sep = ',', row.names = 1)
    ser[['Sample']] = hash_conv[ser$hash_maxID,]
    meta_drop = match(c('hash_classification', 'hash_classification.global',
                        'hash.ID', 'hash_margin', 'hash_secondID', 'hash_maxID',
                        'nFeature_hash', 'nCount_hash'), colnames(ser@meta.data))
    ser@meta.data = ser@meta.data[,-meta_drop]
    if(type == '10x'){
      # Keep only the portion of each cell name before the first '-'
      nn = sapply(colnames(ser), function(x){
        strsplit(x, '-', fixed = TRUE)[[1]][1]
      })
      names(nn) = c()
      ser = RenameCells(ser, new.names = nn)
    }
    # Prefix each cell name with its assigned sample
    ser = RenameCells(ser, new.names = paste0(ser$Sample, '-', colnames(ser)))
  }
  # Attach TCR and BCR sequencing information, if provided
  if (!is.null(T_dir)){
    ser = process_T(ser, T_dir)
  }
  if (!is.null(B_dir)){
    ser = process_B(ser, B_dir)
  }
  return(ser)
}