# EpiTrace: package for computing cell age and phylogeny from ATAC-seq data
# Zhang Yi, <synapse@pku.edu.cn>
#' Negated Value Matching
#'
#' This function is the reciprocal of %in%. See the match funciton in base R.
#'
#' @param x The value to search for in `table`.
#' @param table The set of values to serve as the base for the match function.
#' @export
"%ni%" <- function(x, table) !(match(x, table, nomatch = 0) > 0)
#' Init_Peakset: function for initiating the peak set object for EpiTrace.
#' @title Init_Peakset
#'
#' @description function for converting an peakset to useful object in EpiTrace.
#'
#' @details Init_Peakset(peakset)
#'
#' @param peakset either a GRanges object, or a data frame (at least) with columns named 'chr','start',and 'end'.
#' @return a GRanges object with sequenced peak identifier (peakId).
#' @export
#' @examples Init_Peakset(peakset)
#'
Init_Peakset <- function(peakSet){
if(class(peakSet)[1] %in% 'GRanges'){
peakSet <- peakSet
}else{
peakSet <- peakSet %>% makeGRangesFromDataFrame()
}
peakSet$peakId <- 1:length(peakSet)
return(peakSet)
}
#' Init_Matrix: function for initiating the matrix for EpiTrace.
#' @title Init_Matrix
#'
#' @description function for converting and checking coherence of an input matrix in EpiTrace.
#'
#' @details Init_Matrix(cellname,peakname,matrix)
#'
#' @param peakname a vector
#' @param cecllname a vector
#' @param matrix the input matrix
#'
#' @return a matrix with fixed cell name/peak name
#' @export
#' @examples Init_Matrix(cellname,peakname,matrix)
#'
Init_Matrix <- function(cellname,peakname,matrix){
if(length(cellname)!=ncol(matrix)){
message('cell names number is not similar to cells in matrix ')
stop()
}
if(length(peakname)!=nrow(matrix)){
message('peak names number is not similar to peaks in matrix ')
stop()
}
colnames(matrix) <- cellname
rownames(matrix) <- peakname
return(matrix)
}
#' EpiTrace_prepare_object: wrapper function for preparing input data matrix for EpiTrace.
#' @title EpiTrace_prepare_object
#'
#' @description wrapper function for preparing input data matrix for EpiTrace.
#'
#' @details EpiTrace_prepare_object (peakSet,matrix,celltype=NULL,min.cutoff=50,lsi_dim=2:50,fn.k.param=21,ref_genome='hg38',sep_string= c(":", "-"),clock_gr_list=clock_gr_list,non_standard_clock=F)
#' @details Note: SC/Bulk could not be run at the same time as the normalization/filtering process kills single cells in the face of bulk data
#'
#' @param matrix input count matrix of ATAC data, rows=GRanges and cols=samples/single cells.
#' @param peakSet a GenomicRanges object corresponding to the rows of count matrix.
#' @param min.cutoff minimal cutoff for Signac variableFeature calling
#' @param lsi_dim the dimensionalities used in LSI for Signac
#' @param fn.k.param the k parameter used in FindNeighbours
#' @param ref_genome hg38 or hg19. Currently only support these two.
#' @param sep_string the separation string for row names in input matrix to generate ranges. for example, 'chr1:1-2' is c(':','-')
#' @param clock_gr_list the clockDML set, is a list of reference GRanges, each corresponds to a set of clock-like DML or DMR
#' @param non_standard_clock whether is not using non-standard reference clockDML sets.
#' @param run_reduction whether run reduction or not. Could be time saving to not run it if you only need age derivation.
#' @param remove_peaks_number cut-off of peaks that should be removed with less that this many samples. Default = 10. If your object is too small (like bulk ATAC), try reduce this.
#'
#' @return a seurat object with all input peaks, as well as assays named 'x' for each clock set.
#' @export
EpiTrace_prepare_object <- function(peakSet,matrix,celltype=NULL,min.cutoff=50,lsi_dim=2:50,fn.k.param=21,ref_genome='hg38',sep_string= c(":", "-"),clock_gr_list=clock_gr_list,non_standard_clock=F,qualnum=10,run_reduction=T,remove_peaks_number=10){
# 0. try error catch
if(!is.null(celltype)){
if(length(celltype) != ncol(matrix)){
message('Input cell type vector does not match the cell# in input data')
stop()
}
}
if(!is.null(peakSet)){
if(length(peakSet) != nrow(matrix)){
message('Input peak GRanges# does not match the ranges# in input data')
stop()
}
}
if(ref_genome != 'hg19' & ref_genome != 'hg38'){
message('ref genome is neither hg19 nor hg38')
# stop()
}
if(non_standard_clock %in% F){
if(sum(names(clock_gr_list) %in% c('Mitosis','Chronology','solo_WCGW'))!=3){
message('ref clock list is said to be standard but actually not')
stop()
}else{
message('ref clock list is set to be standard (Homo sapiens, hg19)')
standard_clock = T
}
}else{
message('ref clock list is not standard. Please make sure the input data, peak set and clock set are in similar reference genome.')
standard_clock = F
}
message('Input peakset is set to be ',ref_genome)
# 0.5. Filter the input matrix to a better one (try removing zero expression cells, and zero expression peaks)
which(colSums(matrix)==0) %>% names -> remove_cells
which(rowSums(matrix)<remove_peaks_number | rowSums(matrix>0)<remove_peaks_number) %>% names -> remove_peaks
logical_cell_vec <- colnames(matrix) %in% remove_cells
logical_peak_vec <- rownames(matrix) %in% remove_peaks
#peakSet,matrix,celltype
matrix -> original_matrix
if(length(remove_cells)>0){
if(!is.null(celltype)){
celltype -> original_celltype
celltype <- celltype[!logical_cell_vec]
}
matrix <- matrix[,!logical_cell_vec]
}
if(length(remove_peaks)>0){
peakSet -> original_peakSet
peakSet <- peakSet[!logical_peak_vec,]
matrix <- matrix[!logical_peak_vec,]
}
if(!is.null(celltype)){
names(celltype) <- colnames(matrix)
}
# 1. prepare Clock region intersection to input peak set.
Overlap_Input_with_Clock(peakSet_generanges=peakSet,clock_gr_list=clock_gr_list,ref=ref_genome) -> overlap_result
overlap_list_of_list <- overlap_result$overlap_list_of_list
if(standard_clock & (ref_genome %in% 'hg38')){
clock_gr_list[['Mitosis']] %>% easyLift::easyLiftOver('hg19_hg38') -> mitosis_gr
clock_gr_list[['Chronology']] %>% easyLift::easyLiftOver('hg19_hg38') -> chronology_gr
plyranges::bind_ranges(mitosis_gr,chronology_gr) %>% reduce() -> target_clock_gr
result_clock_gr_list <- list('MitosisClock'=mitosis_gr,'ChronologyClock'=chronology_gr,'AllClock'=target_clock_gr)
}
if(standard_clock & (ref_genome %in% 'hg19')){
clock_gr_list[['Mitosis']] -> mitosis_gr
clock_gr_list[['Chronology']] -> chronology_gr
plyranges::bind_ranges(mitosis_gr,chronology_gr) %>% reduce() -> target_clock_gr
result_clock_gr_list <- list('MitosisClock'=mitosis_gr,'ChronologyClock'=chronology_gr,'AllClock'=target_clock_gr)
}
if(non_standard_clock){
result_clock_gr_list <- clock_gr_list
}
# 2. prepare the chromatin assay with Signac
Signac::CreateChromatinAssay(matrix, sep = sep_string,
genome = ref_genome,ranges=peakSet) -> chrom_assay
# 3. prepare the Seurat object
tempdf <- Seurat::CreateSeuratObject(
counts = chrom_assay,
assay = "peaks"
)
# 4. add clock sets
for (x in names(result_clock_gr_list)){
message('add ',x)
matrix[findOverlaps(peakSet,result_clock_gr_list[[x]])@from %>% unique,] -> mtx2
message('good quality cells ',sum(colSums(mtx2,na.rm=T)>=qualnum),' and peaks ',sum(rowSums(mtx2,na.rm=T)>=qualnum))
dim(mtx2) -> dimcurr
p=0
count = 0
while(p==0){
mtx2 <- mtx2[rowSums(mtx2,na.rm=T)>=qualnum,colSums(mtx2,na.rm=T)>=qualnum]
dim(mtx2) -> dimnext
if((dimcurr[1]-dimnext[1])==0){
p=1
}
count = count + 1
# print(dimcurr[1]-dimnext[1]) # remove this debug tag
}
tempdf$cell <- rownames(tempdf@meta.data)
tempdf <- subset(tempdf,cell %in% colnames(mtx2))
tempdf[[x]] <- Seurat::CreateAssayObject(counts = mtx2[,tempdf$cell],min.cells = 0,min.features = 0,check.matrix = F)
}
# 5. perform dimensionality reduction, feature selection, etc, on this final object
if(run_reduction==T){
tempdf <- Signac::RunTFIDF(tempdf)
tempdf <- Signac::FindTopFeatures(tempdf, min.cutoff = min.cutoff)
tempdf <- Signac::RunSVD(tempdf)
tempdf <- Seurat::RunUMAP(object = tempdf, reduction = 'lsi', dims = lsi_dim)
tempdf <- Seurat::FindNeighbors(object = tempdf, reduction = 'lsi', dims = lsi_dim,k.param = fn.k.param)
tempdf <- Seurat::FindClusters(object = tempdf, verbose = FALSE, algorithm = 3)
# define identity
tempdf@meta.data %>% rownames -> final_cells
if(!is.null(celltype)){
tempdf@meta.data$celltype <- celltype[final_cells]
Idents(tempdf) <- celltype[final_cells]
}else{
tempdf@meta.data$celltype <- tempdf@meta.data$seurat_clusters
Idents(tempdf) <- tempdf$seurat_clusters
}
}else{
if(!is.null(celltype)){
tempdf@meta.data$celltype <- celltype[final_cells]
Idents(tempdf) <- celltype[final_cells]
}
}
return(tempdf)
}
#' RunEpiTraceAge: wrapper function for computing EpiTrace age for input Seurat object with scATAC/bulkATAC data.
#' @title RunEpiTraceAge
#'
#' @description wrapper function for computing EpiTrace age for input scATAC/bulkATAC data.
#'
#' @details RunEpiTraceAge(epitrace_object)
#' @details Note: SC/Bulk could not be run at the same time as the normalization/filtering process kills single cells in the face of bulk data
#'
#' @param epitrace_object a seurat object built by EpiTrace_prepare_object
#' @param select_minimal_percentage: selecting peaks covered in > this fraction of cells to compute dispersion. Default to 0.05. Suggest not to change unless necessary.
#' @param select_size_of_dispersion: numbers of peaks selected for computing the similarity matrix, ranked by dispersion. Default to 3000. Suggest not to change unless necessary.
#' @param ncores: cores used
#' @param subsamplesize: per subsampled batch cell number
#' @param normalization_method: choose from "randomized", "census", and "blank"
#' @param parallel: whether use parallel batch processing
#'
#' @return a Seurat_Object with added meta.data columns EpiTraceAge_Mitotic, EpiTraceAge_Chronological, EpiTraceAge_all, Accessibility_Mitotic, Accessibility_Chronological, Accessibility_all. Where the 'Mitotic' corresponds to EpiTraceAge computed with mitosis-associated DML and 'Chronological' corresponds to non-mitosis, clock-like DML. 'all' is the EpiTraceAge computed by both sets.
#' @export
#' @examples
RunEpiTraceAge <- function(epitrace_object,parallel=F,ncores=20,subsamplesize=2000,normalization_method='randomized',select_minimal_percentage = 0.05, select_size_of_dispersion = 3000){
norm_meth = normalization_method
availableAssays <- SeuratObject::Assays(epitrace_object)
availableAssays_non_peak <- availableAssays[availableAssays!='peaks']
epitrace_object$cell <- rownames(epitrace_object@meta.data)
mtx_list <- lapply(availableAssays_non_peak,function(x){
DefaultAssay(epitrace_object) <- x
Seurat::GetAssayData(epitrace_object,slot='data')
})
names(mtx_list) <- availableAssays_non_peak
lapply(names(mtx_list),function(x){
mtx_list[[x]] -> testmtx
if(sum(Matrix::rowSums(testmtx)>0,na.rm=T)==0 & sum(Matrix::colSums(testmtx)>0,na.rm=T)==0){
message(paste('no data overlapping for clock reference set',x))
stop()
}else{
message('Estimating Age by EpiTraceAge...')
if(parallel==F){
EpiTraceAge(mat = testmtx[Matrix::rowSums(testmtx)>0,Matrix::colSums(testmtx)>0],normalization_method=norm_meth,select_minimal_percentage=select_minimal_percentage,select_size_of_dispersion=select_size_of_dispersion)
}else{
EpiTraceAge(mat = testmtx[Matrix::rowSums(testmtx)>0,Matrix::colSums(testmtx)>0],ncores = ncores,parallel=T,subsamplesize=subsamplesize,normalization_method=norm_meth,select_minimal_percentage=select_minimal_percentage,select_size_of_dispersion=select_size_of_dispersion)
}
}
}) -> res_list
names(res_list) <- names(mtx_list)
count = 0
for (x in names(res_list)){
count = count + 1
res_list[[x]] -> df1
colnames(df1)[2:4] <- paste0(c('EpiTraceAge_','Accessibility_','AccessibilitySmooth_'),x)
if(count == 1){
returndf = df1
}else{
returndf <- left_join(returndf,df1)
}
}
epitrace_object@meta.data$cell <- rownames(epitrace_object@meta.data)
left_join(epitrace_object@meta.data %>% as.data.frame(),returndf) -> cell_res
rownames(cell_res) <- rownames(epitrace_object@meta.data)
epitrace_object@meta.data <- cell_res
return(epitrace_object)
}
#' RunEpiTracePhylogeny: wrapper function for computing EpiTrace phylogeny for input Seurat object with scATAC/bulkATAC data.
#' @title RunEpiTracePhylogeny
#'
#' @description wrapper function for computing EpiTrace phylogeny for input scATAC/bulkATAC data.
#'
#' @details RunEpiTracePhylogeny(epitrace_object)
#'
#' @param epitrace_object a seurat object prepared by EpiTrace_prepare_object
#' @param min.cutoff min.cutoff used in Signac::FindTopFeatures analysis, during re-selecting the clock variable features
#' @return a list, in which each element corresponds to a clock dataset in the original object. Each element is a list: clock = the name of clockDML set, tree = phylogenetic tree for 'clusters of cells' defined by the given 'idents' of the input seurat object, and tree_plot = a ggtree plot of the tree.
#' @export
#' @examples
#'
RunEpiTracePhylogeny <- function(epitrace_object,min.cutoff=50,run_reduction=T){
availableAssays <- SeuratObject::Assays(epitrace_object)
epitrace_object$cell <- rownames(epitrace_object@meta.data)
color_celltype <- grDevices::colorRampPalette(colors = RColorBrewer::brewer.pal(9,"Set1"))(length(unique(Idents(epitrace_object))))
names(color_celltype) <- unique(Idents(epitrace_object))
lapply(availableAssays,function(assayid){
tryCatch({
if(DefaultAssay(epitrace_object) != assayid){
DefaultAssay(epitrace_object) <- assayid
if(run_reduction==T){
epitrace_object <- Signac::RunTFIDF(epitrace_object)
epitrace_object <- Signac::FindTopFeatures(epitrace_object, min.cutoff = min.cutoff)
epitrace_object <- Signac::RunSVD(epitrace_object)
}
}
obj_clock <- BuildClusterTree(object = epitrace_object,verbose = T,assay = assayid)
data.tree_clock <- Tool(object = obj_clock, slot = "BuildClusterTree")
ggtree::ggtree(data.tree_clock,layout='rectangular',ladderize = FALSE) + geom_tiplab(aes(color=label),size=5,offset=10) + scale_color_manual(values=color_celltype) -> tree_plot_clock
xmax <- (tree_plot_clock$data$`branch.length` %>% max(na.rm=T)) * 1.4
tree_plot_clock <- tree_plot_clock + xlim(c(NA,xmax))
result <- list(assay=assayid,tree=data.tree_clock,tree_plot=tree_plot_clock)
return(result)
},error=function(e){message('failed for ',assayid)})
}) -> returnlist
names(returnlist) <- availableAssays
return(returnlist)
}
#' Overlap_Input_with_Clock: function for overlapping the input peak set object to clockDML for EpiTrace.
#' @title Overlap_Input_with_Clock
#'
#' @description function for converting an input peakset to a list of peaks that overlapping with ClockDML in EpiTrace.
#'
#' @details Overlap_Input_with_Clock(peakSet_generanges,clock_gr_list=clock_gr_list,ref=ref)
#'
#' @param peakSet_generanges the input peak set, a GRanges object
#' @param clock_gr_list a list of GRanges, each corresponds to one Clock-DML set. Standard reference is made accompanying the package with 'Mitosis','Chronological','solo_WCGW' and 'all' (Mitosis+Chronological). These are all in hg19.
#' @ref The reference genome of input GRanges. 'hg38' or 'hg19'.
#'
#' @return a list with: overlap_list_of_list corresponds to Ids of peaks overlapping with ClockDML, and a dataframe overlap_df which tells the stats of overlapping. Currently the function would NOT tell a problem when there is no overlapping peak is found.
#' @export
#' @examples Overlap_Input_with_Clock(peakSet_generanges=test_Peakset,clock_gr_list=clock_gr_list,ref='hg38')
#'
Overlap_Input_with_Clock <- function(peakSet_generanges,clock_gr_list=clock_gr_list,ref=ref){
if(ref %in% 'hg19'){
# do nothing
}
if(ref %in% 'hg38'){
lapply(names(clock_gr_list),function(x){
easyLift::easyLiftOver(clock_gr_list[[x]],'hg19_hg38') -> temp
return(temp)
}) -> clock_gr_list_new
names(clock_gr_list_new) <- names(clock_gr_list)
clock_gr_list_new -> clock_gr_list
}
ref_gr_list <- list('Input'=peakSet_generanges)
lapply(names(clock_gr_list),function(x){
lapply(names(ref_gr_list),function(y){
findOverlaps(ref_gr_list[[y]],clock_gr_list[[x]])
}) -> returnlist
names(returnlist) <- paste0(names(ref_gr_list),'_vs_',x)
returnlist
}) -> overlap_list_of_list
names(overlap_list_of_list) <- names(clock_gr_list)
lapply(overlap_list_of_list,function(x) lapply(x,function(y) y@to %>% unique %>% length()) %>% unlist) %>% unlist -> overlap_num
overlap_num <- data.frame(name=names(overlap_num),overlap_num=overlap_num)
overlap_num <- separate(overlap_num,col=1,into=c('Clock_panel','Comparison'),remove=F,sep='\\.')
overlap_num$target <- gsub('_vs.+','',overlap_num$Comparison)
data.frame(Clock_panel=names(clock_gr_list),Size=lapply(clock_gr_list,function(x) nrow(as.data.frame(x)))%>%unlist)-> clock_size
overlap_df <- left_join(overlap_num,clock_size)
overlap_df$overlap_fraction <- overlap_df$overlap_num/overlap_df$Size
return(list(overlap_list_of_list=overlap_list_of_list,overlap_df=overlap_df))
}
#' EpiTraceAge: function for computing cell age with EpiTrace for input scATAC/bulkATAC data.
#' @title EpiTraceAge
#'
#' @description function for computing cell age with EpiTrace for input scATAC/bulkATAC data.
#'
#' @details EpiTraceAge(mat)
#' @details Note: SC/Bulk could not be run at the same time as the normalization/filtering process kills single cells in the face of bulk data
#' @details This is adapted from CytoTrace method.
#'
#' @param mat a count matrix for scATAC/ATAC data, usually built with ArchR, SnapATAC or Signac etc. Row = peaks and col = cells. Specifically, each row (peak) is overlapping with ClockDML.
#' @param normalization_method normalization method used in EpiTraceAge computation, choose between 'randomized', 'census', and 'blank'.
#' @param ncores number of cores used in parallel processing
#' @param size: depreciated parameter
#' @param subsamplesize: subsample cell number, default to 2000.
#' @param select_minimal_percentage: selecting peaks covered in > this fraction of cells to compute dispersion. Default to 0.05. Suggest not to change unless necessary.
#' @param select_size_of_dispersion: numbers of peaks selected for computing the similarity matrix, ranked by dispersion. Default to 3000. Suggest not to change unless necessary.
#'
#' @return a data frame with EpiTrace = epitrace age, Accessible_Loci = total accessibility on clock regions,cells=used cells).
#' @export
#' @examples
EpiTraceAge <- function(mat,ncores=20,parallel=F,size=2000,batch=NULL,subsamplesize=2000,normalization_method='randomized',select_minimal_percentage = 0.05, select_size_of_dispersion = 3000){
#check normalization method
if(normalization_method %ni% c('randomized','census','blank')){
cat ('Please set normalization method to randomized, census, or blank\n')
stop()
}
#Subsample routine
if(parallel == FALSE){
size <- ncol(mat)
} else {
size <- min(ncol(mat),subsamplesize)
}
chunk <- round(ncol(mat)/size)
subsamples <- split(1:ncol(mat), sample(factor(1:ncol(mat) %% chunk)))
require(Matrix)
bad_peaks <- rowSums(mat) == 0
num_bad_peaks <- length(which(bad_peaks == TRUE))
mat <- mat[!bad_peaks,]
chunk <- round(ncol(mat)/size)
subsamples <- split(1:ncol(mat), sample(factor(1:ncol(mat) %% chunk)))
mat_origin <- mat
batch_origin <- batch
sub_batch_function <- function(subsample){
#Checkpoint: Sequencing depth normalization
mat <- mat_origin[,subsample]
batch <- batch_origin[subsample]
# set normalization
if(normalization_method == 'census'){
counts <- colSums(mat>0,na.rm=T)
reads <- colSums(mat,na.rm=T)
normalized_mat <- t(t(mat)*counts/reads)
normalized_mat <- log(normalized_mat+1,2)
normalized_mat[is.na(normalized_mat)] <- 0
}
if(normalization_method == 'randomized'){
counts <- rowSums(mat>0,na.rm=T)
reads <- rowSums(mat,na.rm=T)
normalized_mat <- t(t(mat)*counts/reads)
normalized_mat <- log(normalized_mat+1,2)
normalized_mat[is.na(normalized_mat)] <- 0
}
if(normalization_method == 'blank'){
normalized_mat <- log(mat+1,2)
normalized_mat[is.na(normalized_mat)] <- 0
}
accessible_loci <- rowSums(normalized_mat > 0)
per_cell_accessible_loci <- colSums(normalized_mat > 0)
peaks_expressed_in_n_cell <- rowSums(normalized_mat>0)
peaks_expressed_in_n_cell>select_minimal_percentage*ncol(mat) -> good_peaks_for_selecting_loci
mtx_with_better_peaks <- normalized_mat[good_peaks_for_selecting_loci,]
if(class(mtx_with_better_peaks)[[1]]=='dgeMatrix'){
dispersion <- matrixStats::rowVars(mtx_with_better_peaks %>% as.matrix())/rowMeans(mtx_with_better_peaks)
}else{
dispersion <- sparseMatrixStats::rowVars(mtx_with_better_peaks)/rowMeans(mtx_with_better_peaks)
}
size_variable_loci <- select_size_of_dispersion
cutoff <- sort(dispersion) %>% tail(min(length(dispersion),size_variable_loci)) %>% head(1) %>% as.numeric()
normalized_variable_mat <- normalized_mat[names(dispersion)[dispersion >= cutoff],]
normalized_variable_mat <- normalized_variable_mat[,colSums(normalized_variable_mat)>0]
WGCNA::cor(normalized_variable_mat) -> cor_normalized_variable_mat
mean(as.vector(cor_normalized_variable_mat)) -> mean_number
diag(cor_normalized_variable_mat) <- 0
cor_normalized_variable_mat[cor_normalized_variable_mat<0] <- 0
cor_normalized_variable_mat[cor_normalized_variable_mat<mean_number] <- 0
censored_cor_normalized_variable_mat <- cor_normalized_variable_mat/rowSums(cor_normalized_variable_mat)
censored_cor_normalized_variable_mat[rowSums(cor_normalized_variable_mat)==0,] <- 0
final_cells <- colnames(censored_cor_normalized_variable_mat)
mat2 <- normalized_mat[,final_cells]
per_cell_accessible_loci <- per_cell_accessible_loci[final_cells]
list(mat2 = mat2,counts = counts, censored_cor_normalized_variable_mat = censored_cor_normalized_variable_mat,per_cell_accessible_loci=per_cell_accessible_loci) -> returnlist
return(returnlist)
}
if(parallel==T){
batches <- parallel::mclapply(subsamples, mc.cores = min(chunk, ncores), function(subsample_x){
sub_batch_function(subsample=subsample_x)
})
}else{
batches <- lapply(subsamples, function(subsample_x){
sub_batch_function(subsample=subsample_x)
})
}
mat2 <- do.call(cbind, lapply(batches, function(x) x$mat2))
counts <- do.call(c, lapply(batches, function(x) x$counts))
per_cell_accessible_loci <- lapply(batches,function(x) x$per_cell_accessible_loci) %>% unlist()
correlation_of_each_peak_to_per_cell_accessible_loci_counts <- WGCNA::cor(x=t(mat2),y=per_cell_accessible_loci,drop = T)
correlation_of_each_peak_to_per_cell_accessible_loci_counts[is.na(correlation_of_each_peak_to_per_cell_accessible_loci_counts)] <- 0
names(sort(correlation_of_each_peak_to_per_cell_accessible_loci_counts)%>%tail(min(length(correlation_of_each_peak_to_per_cell_accessible_loci_counts),200))) -> best_loci
accessible_loci_counts_smoothened <- colMeans(mat2[best_loci,])
samplesize <- unlist(lapply(batches, function(x) x$per_cell_accessible_loci %>% length()))
accessible_loci_counts_smoothened_list <- split(accessible_loci_counts_smoothened, as.numeric(rep(names(samplesize), samplesize)))
censored_cor_normalized_variable_mat_list <- lapply(batches, function(x) x$censored_cor_normalized_variable_mat)
rank_function <- function(i){
censored_cor_normalized_variable_mat <- censored_cor_normalized_variable_mat_list[[i]]
accessible_loci_counts_smoothened <- accessible_loci_counts_smoothened_list[[i]]
nnls_res <- nnls::nnls(censored_cor_normalized_variable_mat,accessible_loci_counts_smoothened)
nx1_mat <- censored_cor_normalized_variable_mat %*% nnls_res$x
stepcount = 1
delta = 1
extension_parameter=0.99
original_nx1_mat <- nx1_mat
current_nx1_mat <- nx1_mat
next_nx1_mat <- nx1_mat
while(stepcount < 50000 & delta >= 1e-9){
stepcount = stepcount + 1
next_nx1_mat <- extension_parameter*(censored_cor_normalized_variable_mat%*% current_nx1_mat) + (1-extension_parameter) * original_nx1_mat
delta <- mean(abs(next_nx1_mat-current_nx1_mat))
current_nx1_mat <- next_nx1_mat
}
epitrace_ranked <- rank(current_nx1_mat)
epitrace_ranked
}
if(parallel==T){
epitrace_ranked <- parallel::mclapply(1:length(censored_cor_normalized_variable_mat_list), mc.cores = min(chunk,ncores),function(i) {
rank_function(i)
}) %>% unlist()
}else{
epitrace_ranked <- lapply(1:length(censored_cor_normalized_variable_mat_list), function(i) {
rank_function(i)
}) %>% unlist()
}
final_cells <- colnames(mat2)
epitrace_norm <- (epitrace_ranked-min(epitrace_ranked))/(max(epitrace_ranked)-min(epitrace_ranked))
data.frame(
cell=final_cells,
EpiTrace = epitrace_norm,
Accessible_Loci = per_cell_accessible_loci,
Accessible_Loci_Smooth = accessible_loci_counts_smoothened
) -> returndf
return(returndf)
}
#' AssociationOfPeaksToAge: function for computing each peaks' linear correlation to EpiTrace age.
#' @title AssociationOfPeaksToAge
#'
#' @description function for computing each peaks' linear correlation to EpiTrace age.
#'
#' @details AssociationOfPeaksToAge(epitrace_object,ref_ClockDML_name='AllClock',epitrace_age_name='EpiTraceAge_all',subset_of_cells=NULL,epitrace_age_vector=NULL)
#' @details This is not going to work if the ref_ClockDML_name does not corresponds to epitrace_age_name.
#'
#' @param epitrace_object a seurat object with peaks overlapping reference ClockDML, and computed epitrace age.
#' @param peakSetName a peakset name as in 'assays' of the object. Default to 'peaks'.
#' @param epitrace_age_name name of output epitrace age
#' @param subset_of_cells subset of cell types (Idents) that you want to use in the computation
#' @param epitrace_age_vector alternatively you can have a vector of ages that you would like to test
#' @param parallele switch for parallel
#'
#'
#' @return a data frame with locus (the peak), correlation_of_EpiTraceAge, scaled_correlation_of_EpiTraceAge.
#' @export
#' @examples
AssociationOfPeaksToAge <- function(epitrace_object,peakSetName='peaks',epitrace_age_name='EpiTraceAge_all',subset_of_cells=NULL,epitrace_age_vector=NULL,parallel=F){
if(!is.null(subset_of_cells)){
subset(epitrace_object,celltype %in% subset_of_cells) -> epitrace_object
}else{
epitrace_object -> epitrace_object
}
if(DefaultAssay(epitrace_object)!=peakSetName){
DefaultAssay(epitrace_object) <- peakSetName
}
Seurat::GetAssayData(epitrace_object,slot='data') -> peaks_PT_dat
if(is.null(epitrace_age_vector)){
epitrace_age_vector <- epitrace_object@meta.data[,epitrace_age_name] %>% as.numeric()
}
cell_res_single <- epitrace_object@meta.data %>% as.data.frame()
# add parallel and avoid explosion
if(is.null(names(epitrace_age_vector))){
names(epitrace_age_vector) <- rownames(epitrace_object@meta.data)
}
epitrace_age_vector -> age_vec
is.na(age_vec) -> remove_vec
peaks_PT_dat -> to_be_associated_mtx
if(sum(remove_vec)>0){
age_vec <- age_vec[!remove_vec]
to_be_associated_mtx <- to_be_associated_mtx[,!remove_vec]
}
if(parallel==F){
lapply(c(1:ceiling(dim(to_be_associated_mtx)[1]/1000)),function(x){
WGCNA::cor(x = t(to_be_associated_mtx[((1000*(x-1))+1):min(dim(to_be_associated_mtx)[1],1000*x),]), y = age_vec)
}) -> res_list_cor
}else{
parallel::mclapply(c(1:ceiling(dim(to_be_associated_mtx)[1]/1000)),mc.cores = 12,function(x){
WGCNA::cor(x = t(to_be_associated_mtx[((1000*(x-1))+1):min(dim(to_be_associated_mtx)[1],1000*x),]), y = age_vec)
}) -> res_list_cor
}
res_list_cor %>% unlist -> cor_res_PT
# cor_res_PT <- WGCNA::cor(x=t(peaks_PT_dat),y=epitrace_age_vector)
scale(cor_res_PT) -> scaled_cor_res_PT
names(cor_res_PT) <- rownames(peaks_PT_dat)
data.frame('locus'=names(cor_res_PT),correlation_of_EpiTraceAge=cor_res_PT,scaled_correlation_of_EpiTraceAge=scaled_cor_res_PT) -> returndf
return(returndf)
}
#' EpiTraceAge_Convergence: wrapper function for iterative optimization for EpiTrace age (on a specific system)
#' @title EpiTraceAge_Convergence
#'
#' @description wrapper function for iterative optimization for EpiTrace Age.
#' The function goes through building an EpiTrace object with given clock reference (the DML).
#' Initial derivation of sample age was performed with clock reference only.
#' Age-association of ChrAcc peaks were performed to extract informative peaks.
#' Iterative inference of cell age was performed with an updated clock reference containing
#' the top peaks associating with age.
#' The interation goes up to given times or when age derivation converge under given error limit.
#' Please use **all** peaks instead of **partial** peaks for this optimization.
#' Generally this is a maximal extending approach to include all possible peaks correlated with age.
#'
#' @details EpiTraceAge_Convergence (peakSet,matrix,celltype=NULL,min.cutoff=50,lsi_dim=2:50,fn.k.param=21,ref_genome='hg38',sep_string= c(":", "-"),clock_gr_list=clock_gr_list,non_standard_clock=F,qualnum = 10,Z_cutoff=3,mean_error_limit=1e-2)
#' @details Note: SC/Bulk could not be run at the same time as the normalization/filtering process kills single cells in the face of bulk data
#'
#' @param matrix input count matrix of ATAC data, rows=GRanges and cols=samples/single cells.
#' @param peakSet a GenomicRanges object corresponding to the rows of count matrix.
#' @param min.cutoff minimal cutoff for Signac variableFeature calling
#' @param lsi_dim the dimensionalities used in LSI for Signac
#' @param fn.k.param the k parameter used in FindNeighbours
#' @param ref_genome hg38 or hg19. Currently only support these two.
#' @param sep_string the separation string for row names in input matrix to generate ranges. for example, 'chr1:1-2' is c(':','-')
#' @param clock_gr_list the clockDML set, is a list of reference GRanges, each corresponds to a set of clock-like DML or DMR
#' @param non_standard_clock whether is not using non-standard reference clockDML sets.
#' @param qualnum minimal peak/cell number for classifying a 'qualified' cell/peak, default set to 10, can be 1 for including most cells/peaks.
#' @param Z_cutoff a cutoff for scaled (Z) correlation between peak and cell age. usually set to >= 2.5
#' @param mean_error_limit a limit for the difference between cell ages derived from previous and current iteration. usually set to 1e-2 - 1e-6.
#' @param ncore_lim limit for using parallel cores in age-peak association, default 12 (set to 1 if no parallel is wanted)
#' @param parallel whether use parallel multicore. In Rstudio, switch off to avoid problems
#' @param iterative_time number of iterations. Usually >10 should be fine.
#' @param remove_peaks_number cut-off of peaks that should be removed with less that this many samples. Default = 10. If your object is too small (like bulk ATAC), try reduce this.
#' @param normalization_method normalization method used in EpiTraceAge computation, choose between 'randomized', 'census', and 'blank'.
#' @param select_minimal_percentage: selecting peaks covered in > this fraction of cells to compute dispersion. Default to 0.05. Suggest not to change unless necessary.
#' @param select_size_of_dispersion: numbers of peaks selected for computing the similarity matrix, ranked by dispersion. Default to 3000. Suggest not to change unless necessary.
#'
#' @return a seurat object with EpiTraceAge, Accessibility, and AccessibilitySmoothed. '_iterative' are iteration scores, and '_Clock_initial' are initial clock scores.
#' @export
#' @examples
EpiTraceAge_Convergence <- function (peakSet, matrix, celltype = NULL, min.cutoff = 50,lsi_dim = 2:50, fn.k.param = 21, ref_genome = "hg38", sep_string = c(":","-"), clock_gr = plyranges::reduce_ranges(c(clock_gr_list[[1]],clock_gr_list[[2]])), non_standard_clock = F, qualnum = 10,Z_cutoff = 3, mean_error_limit = 0.01, ncore_lim = 12, parallel = T,iterative_time = 2,remove_peaks_number=10,normalization_method='randomized',select_minimal_percentage = 0.05, select_size_of_dispersion = 3000){
norm_meth = normalization_method
original_clk_peakset <- clock_gr
if (ref_genome %in% "hg38") {
original_clk_peakset <- easyLift::easyLiftOver(original_clk_peakset,
"hg19_hg38")
}
if (ref_genome != "hg19" & ref_genome != "hg38") {
message("please make double sure your ref genome, peak set and cells are similar.")
}
iterative_count = 1
na_vector_current <- ncol(matrix)
na_vector_previous <- ncol(matrix)
error <- rep(1, ncol(matrix))
mean_error <- mean(error)
iterative_GR_list <- list(iterative = original_clk_peakset)
overlap_with_clk <- findOverlaps(peakSet, original_clk_peakset)@from %>%
unique()
initial_matrix_clk <- matrix[overlap_with_clk, ]
initial_peakSet_clk <- peakSet[overlap_with_clk, ]
message("Preparing obj...")
epitrace_obj <- EpiTrace_prepare_object(initial_peakSet_clk,
initial_matrix_clk, celltype, ref_genome = "hg19", non_standard_clock = T,
clock_gr_list = iterative_GR_list, sep_string = sep_string,
fn.k.param = fn.k.param, lsi_dim = lsi_dim, qualnum = qualnum,
min.cutoff = min.cutoff, run_reduction = F,remove_peaks_number=remove_peaks_number)
message("Finished prepare obj")
epitrace_obj_original_metadata <- epitrace_obj@meta.data
message("Estimating age for initialization...")
if (parallel == F) {
epitrace_obj_age_estimated <- RunEpiTraceAge(epitrace_obj,normalization_method=norm_meth,select_minimal_percentage=select_minimal_percentage,select_size_of_dispersion=select_size_of_dispersion) %>%
suppressMessages() %>% suppressWarnings()
} else {
epitrace_obj_age_estimated <- RunEpiTraceAge(epitrace_obj,
ncores = ncore_lim, parallel = T,normalization_method=norm_meth,select_minimal_percentage=select_minimal_percentage,select_size_of_dispersion=select_size_of_dispersion) %>% suppressMessages() %>%
suppressWarnings()
}
message("Finished age estimation")
age_current <- epitrace_obj_age_estimated@meta.data$EpiTraceAge_iterative
names(age_current) <- epitrace_obj_age_estimated@meta.data$cell
na_vector_current <- is.na(age_current)
next_peakset <- iterative_GR_list$iterative
cell_current <- epitrace_obj_age_estimated@meta.data$cell[!is.na(epitrace_obj_age_estimated@meta.data$EpiTraceAge_iterative)]
age_current <- age_current[cell_current]
to_be_associated_mtx <- matrix[,cell_current]
while ((sum(na_vector_current) <= sum(na_vector_previous)) &
(mean_error >= mean_error_limit) & (iterative_count <=
iterative_time)) {
message("Iterating ", iterative_count)
age_previous <- age_current
na_vector_previous <- na_vector_current
iterative_count = iterative_count + 1
cell_current <- cell_current[!na_vector_current] %>% na.omit()
age_current <- age_current[cell_current]
to_be_associated_mtx <- matrix[,cell_current]
message("Performing Corr ")
if (parallel == T) {
res_list_cor <- parallel::mclapply(c(1:ceiling(dim(to_be_associated_mtx)[1]/1000)),
mc.cores = ncore_lim, function(x) {
target_mtx <- t(to_be_associated_mtx[((1000 *
(x - 1)) + 1):min(dim(to_be_associated_mtx)[1],
1000 * x), ]) %>% as.matrix()
WGCNA::cor(x = target_mtx, y = age_current)
})
cor_res_PT <- res_list_cor %>% unlist
}
else {
res_list_cor <- lapply(c(1:ceiling(dim(to_be_associated_mtx)[1]/1000)),
function(x) {
target_mtx <- t(to_be_associated_mtx[((1000 *
(x - 1)) + 1):min(dim(to_be_associated_mtx)[1],
1000 * x), ]) %>% as.matrix()
WGCNA::cor(x = target_mtx, y = age_current)
})
cor_res_PT <- res_list_cor %>% unlist
}
cor_res_PT[is.na(cor_res_PT)] <- 0
scaled_cor_res_PT <- scale(cor_res_PT)
message("Finished Corr ")
updated_peakset <- peakSet
updated_peakset$correlation_of_EpiTraceAge <- cor_res_PT
updated_peakset$scaled_correlation_of_EpiTraceAge <- scaled_cor_res_PT
peaks_overlap_with_clock <- findOverlaps(updated_peakset,
original_clk_peakset)@from %>% unique
updated_peakset$locus_type <- "Others"
updated_peakset$locus_type[peaks_overlap_with_clock] <- "Chronology"
iterative_clock_gr_list <- list(iterative = updated_peakset[(updated_peakset$scaled_correlation_of_EpiTraceAge >
Z_cutoff) | (updated_peakset$locus_type %ni% "Others") %>%
as.vector() | (updated_peakset$peakId %in% next_peakset$peakId),
])
overlap_with_clk <- findOverlaps(peakSet, iterative_clock_gr_list[[1]])@from %>%
unique()
initial_matrix_clk <- matrix[overlap_with_clk, ]
initial_peakSet_clk <- peakSet[overlap_with_clk, ]
message("Calculate iterative age")
if (parallel == T) {
message("Parallel run")
tryCatch({
nrow(initial_matrix_clk) %>% as.numeric() -> row_num
ncol(initial_matrix_clk) %>% as.numeric() -> col_num
if ((row_num*col_num) >
(50000 * 2000)) {
batch_size = 400
}
else {
batch_size = 1000
}
},error=function(e){batch_size=400})
res1 <- EpiTraceAge(initial_matrix_clk, parallel = parallel,
ncores = ncore_lim, size = batch_size, subsamplesize = batch_size,normalization_method=normalization_method,select_minimal_percentage=select_minimal_percentage,select_size_of_dispersion=select_size_of_dispersion) %>%
suppressMessages() %>% suppressWarnings()
}
else {
message("Single thread run")
res1 <- EpiTraceAge(initial_matrix_clk, parallel = F,
ncores = 1,normalization_method=normalization_method,select_minimal_percentage=select_minimal_percentage,select_size_of_dispersion=select_size_of_dispersion) %>% suppressMessages() %>% suppressWarnings()
}
colnames(res1)[2:4] <- c("EpiTraceAge_iterative", "Accessibility_iterative",
"AccessibilitySmooth_iterative")
message("Update dataframe")
epitrace_obj_original_metadata_update <- left_join(epitrace_obj_original_metadata,
res1)
epitrace_obj_iterative_age_estimated <- epitrace_obj_age_estimated
rownames(epitrace_obj_original_metadata_update) <- rownames(epitrace_obj_original_metadata)
epitrace_obj_iterative_age_estimated@meta.data <- epitrace_obj_original_metadata_update
age_current <- epitrace_obj_iterative_age_estimated$EpiTraceAge_iterative
cell_current <- epitrace_obj_iterative_age_estimated$cell[!is.na(epitrace_obj_iterative_age_estimated$EpiTraceAge_iterative)]
names(age_current) <- cell_current
error <- (age_current - age_previous[cell_current])
tryCatch({
error[is.infinite(error)] <- 0
}, error = function(e) {
message("")
})
tryCatch({
error[is.na(error)] <- 1
}, error = function(e) {
message("")
})
mean_error = mean(abs(error), na.rm = T)
na_vector_current <- is.na(age_current)
age_previous <- age_current
epitrace_obj_iterative_age_estimated@misc$iterative_count <- iterative_count
epitrace_obj_iterative_age_estimated@misc$mean_error <- mean_error
epitrace_obj_iterative_age_estimated@misc$next_ref_peak_size <- length(iterative_clock_gr_list$iterative)
message("mean_error = ", mean_error)
next_peakset <- iterative_clock_gr_list$iterative
}
epitrace_obj_iterative_age_estimated$EpiTraceAge_Clock_initial <- epitrace_obj_age_estimated@meta.data$EpiTraceAge_iterative
epitrace_obj_iterative_age_estimated$Accessibility_initial <- epitrace_obj_age_estimated@meta.data$Accessibility_iterative
epitrace_obj_iterative_age_estimated$AccessibilitySmooth_initial <- epitrace_obj_age_estimated@meta.data$AccessibilitySmooth_iterative
return(epitrace_obj_iterative_age_estimated)
}
#' age_associated_peak_test: function for calculating age-peak correlation
#' @title age_associated_peak_test
#'
#' @description function for calculating age-peak correlation.
#'
#'
#' @details age_associated_peak_test (matrix, age, parallel = F)
#' @param matrix input count matrix of ATAC data, rows=GRanges and cols=samples/single cells.
#' @param age a vector of cell ages.
#' @param parallel whether use parallel, logical.
#' @return a dataframe with locus, correlation of peak height to age, and scaled correlation (Z value)
#' @export
#' @examples
age_associated_peak_test <- function (matrix, age, parallel = F)
{
peaks_PT_dat <- matrix
age_vec <- age
remove_vec <- is.na(age_vec)
to_be_associated_mtx <- peaks_PT_dat
if (sum(remove_vec) > 0) {
age_vec <- age_vec[!remove_vec]
to_be_associated_mtx <- to_be_associated_mtx[, !remove_vec]
}
if (parallel == F) {
res_list_cor <- lapply(c(1:ceiling(dim(to_be_associated_mtx)[1]/1000)),
function(x) {
WGCNA::cor(x = t(to_be_associated_mtx[((1000 *
(x - 1)) + 1):min(dim(to_be_associated_mtx)[1],
1000 * x), ]), y = age_vec)
})
}
else {
res_list_cor <- parallel::mclapply(c(1:ceiling(dim(to_be_associated_mtx)[1]/1000)),
mc.cores = 12, function(x) {
WGCNA::cor(x = t(to_be_associated_mtx[((1000 *
(x - 1)) + 1):min(dim(to_be_associated_mtx)[1],
1000 * x), ]), y = age_vec)
})
}
cor_res_PT <- res_list_cor %>% unlist
scaled_cor_res_PT <- scale(cor_res_PT)
if(!is.null(rownames(peaks_PT_dat))){
names(cor_res_PT) <- rownames(peaks_PT_dat)
}else{
rownames(peaks_PT_dat) <- 1:nrow(peaks_PT_dat)
names(cor_res_PT) <- rownames(peaks_PT_dat)
}
returndf <- data.frame(locus = names(cor_res_PT), correlation_of_EpiTraceAge = cor_res_PT,
scaled_correlation_of_EpiTraceAge = scaled_cor_res_PT)
return(returndf)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.