R/runMACS.R

Defines functions runMACSForAll runMACS

Documented in runMACS runMACSForAll

#' Call Peaks Using MACS2
#'
#' Identify peaks using selected cells. Fragments belonging to a subset of cells 
#' are extracted and used to identify peaks using MACS2. This function requires
#' "MACS2" and "snaptools" preinstalled and excutable. 
#' 
#' @param obj A snap object.
#' @param output.prefix Prefix of output file which will be used to generate output file names.
#' @param path.to.snaptools Path to snaptools excutable file.
#' @param path.to.macs Path to macs2 excutable file.
#' @param gsize effective genome size. 'hs' for human, 'mm' for mouse, 'ce' for C. elegans, 'dm' for fruitfly (default: None)
#' @param buffer.size Buffer size for incrementally increasing internal array size to store reads alignment information. In
#' most cases, you don't have to change this parameter. However, if there are very high coverage dataset that each barcode has
#' more than 10000 fragments, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will 
#' take longer time to read snap files) [1000].
#' @param macs.options String indicate options you would like passed to macs2. strongly do not recommand to change unless you 
#' know what you are doing. the default is '--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR --call-summits'.
#' @param tmp.folder Directory to store temporary files generated by runMACS.
#' @param num.cores Number of cores to use for omputing [1].
#' @param keep.minimal Keep minimal version of output [TRUE].
#' @return Return a data.frame object that contains the peak information
#'
#' @export
runMACS <- function(
	obj, 
	output.prefix,
	path.to.snaptools,
	path.to.macs,
	gsize,
	tmp.folder,
	buffer.size=500,
	macs.options="--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR --call-summits",
	num.cores=1,
	keep.minimal=TRUE
){
	cat("Epoch: checking input parameters ... \n", file = stderr())
	if(missing(obj)){
		stop("obj is missing")
	}else{
		if(!is.snap(obj)){
			stop("obj is not a snap object");
		}		
		if((x=nrow(obj))==0L){
			stop("obj is empty");
		}
		if((x=length(obj@barcode))==0L){
			stop("obj@barcode is empty");			
		}
		if((x=length(obj@file))==0L){
			stop("obj@file is empty");			
		}
	}
	
	fileList = as.list(unique(obj@file));
	
	# check if snap files exist
	if(any(do.call(c, lapply(fileList, function(x){file.exists(x)})) == FALSE)){
		idx = which(do.call(c, lapply(fileList, function(x){file.exists(x)})) == FALSE)
		print("error: these files does not exist")
		print(fileList[idx])
		stop()
	}
	
	# check if files are all snap files
	if(any(do.call(c, lapply(fileList, function(x){isSnapFile(x)})) == FALSE)){
		idx = which(do.call(c, lapply(fileList, function(x){isSnapFile(x)})) == FALSE)
		print("error: these files are not snap file")
		print(fileList[idx])
		stop()
	}
	
	# check if FM session exist
	if(any(do.call(c, lapply(fileList, function(x){ "FM" %in% h5ls(x, recursive=1)$name  })) == FALSE)){
		idx = which(do.call(c, lapply(fileList, function(x){ "FM" %in% h5ls(x, recursive=1)$name  })) == FALSE)
		print("error: the following nsap files do not contain FM session")
		print(fileList[idx])
		stop()
	}
	
	if(missing(output.prefix)){
		stop("output.prefix is missing");
	}
	
	if(missing(path.to.snaptools)){
		stop("path.to.snaptools is missing");
	}else{
		if(!file.exists(path.to.snaptools)){
			stop("path.to.snaptools does not exist");
		}
		
		flag = tryCatch({
			file_test('-x', path.to.snaptools);	
		},
		error=function(cond){
			return(FALSE)
		})
		if(flag == FALSE){
			stop("path.to.snaptools is not an excutable file");
		}
	}

	if(missing(path.to.macs)){
		stop("path.to.macs is missing");
	}else{
		if(!file.exists(path.to.macs)){
			stop("path.to.macs does not exist");
		}
		
		flag = tryCatch({
			file_test('-x', path.to.macs);	
		},
		error=function(cond){
			return(FALSE)
		})
		if(flag == FALSE){
			stop("path.to.macs is not an excutable file");
		}
	}
	
	if(missing(gsize)){
		stop("gsize is missing");
	}
	
	if(missing(tmp.folder)){
		stop("tmp.folder is missing")
	}else{
		if(!dir.exists(tmp.folder)){
			stop("tmp.folder does not exist");			
		}
	}
		
	# write the following barcodes down
	barcode.files = lapply(fileList, function(file){
		tempfile(tmpdir = tmp.folder, fileext = ".barcode.txt");
	})

	bed.files = lapply(fileList, function(file){
		tempfile(tmpdir = tmp.folder, fileext = ".bed.gz");
	})
	
	# write down the barcodes
	cat("Epoch: extracting fragments from each snap files ...\n", file = stderr())
	flag.list = lapply(seq(fileList), function(i){
		file.name = fileList[[i]];
		idx = which(obj@file == file.name);
		barcode.use = obj@barcode[idx]
		write.table(barcode.use, file = barcode.files[[i]], append = FALSE, quote = FALSE, sep = "\t",
		                 eol = "\n", na = "NA", dec = ".", row.names = FALSE,
		                 col.names = FALSE, qmethod = c("escape", "double"),
		                 fileEncoding = "")
		
	})
	
	# extract the fragments belong to the barcodes	
	flag.list = mclapply(seq(fileList), function(i){
		flag = system2(command=path.to.snaptools, 
			args=c("dump-fragment", 
				   "--snap-file", fileList[[i]], 
				   "--output-file", bed.files[[i]], 
				   "--barcode-file", barcode.files[[i]],
				   "--buffer-size", buffer.size
				   )		
			)				
	}, mc.cores=num.cores);
	
	# combine these two bed files
	combined.bed = tempfile(tmpdir = tmp.folder, fileext = ".bed.gz");
	flag = system2(command="cat", 
		args=c(paste(bed.files, collapse = ' '),
			   ">", combined.bed
			   )		
		)				
	
	# call peaks using MACS2	
	flag = system2(command=path.to.macs, 
		args=c("callpeak", 
			   "-t", combined.bed, 
			   "-f", "BED",
			   "-g", gsize,
			   macs.options,
			   "-n", output.prefix
			   )		
		)				
	if (flag != 0) {
	   	stop("'MACS' call failed");
	}	

	if(keep.minimal){
		system(paste("rm ", output.prefix, "_control_lambda.bdg", sep=""));
		system(paste("rm ", output.prefix, "_peaks.xls", sep=""));
		system(paste("rm ", output.prefix, "_summits.bed", sep=""));
	}

	return(read.table(paste(output.prefix, "_peaks.narrowPeak", sep="")));
}


#' Call Peaks Using MACS2 For All Clusters
#'
#' Identify peaks for all clusters. Fragments belonging to each subset or cluster of cells 
#' are extracted and used to identify peaks using MACS2. This function requires
#' "MACS2" and "snaptools" preinstalled and excutable. 
#' 
#' @param obj A snap object.
#' @param output.prefix Prefix of output file which will be used to generate output file names.
#' @param path.to.snaptools Path to snaptools excutable file.
#' @param path.to.macs Path to macs2 excutable file.
#' @param min.cells min number of cells to perform peak calling [100]. Clusters with cells less 
#' than num.cells will be excluded.
#' @param num.cores number of cpus to use [1].
#' @param gsize effective genome size. 'hs' for human, 'mm' for mouse, 'ce' for C. elegans, 'dm' for fruitfly (default: None)
#' @param buffer.size Buffer size for incrementally increasing internal array size to store reads alignment information. In
#' most cases, you don't have to change this parameter. However, if there are very high coverage dataset that each barcode has
#' more than 10000 fragments, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read snap files) [1000].
#' @param macs.options String indicate options you would like passed to macs2. strongly do not recommand to change unless you know what you are doing. the default is '--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR --call-summits'.
#' @param tmp.folder Directory to store temporary files generated by runMACSForAll.
#' @param keep.minimal Keep minimal version of output [TRUE].
#' @return Return a GRanges object that contains the non-overlapping combined peaks
#' @importFrom parallel mclapply 
#' @importFrom GenomicRanges GRanges reduce
#' @export
runMACSForAll <- function(
	obj, 
	output.prefix,
	path.to.snaptools,
	path.to.macs,
	gsize,
	tmp.folder,
	num.cores=1,
	min.cells=100,
	buffer.size=500,
	macs.options="--nomodel --shift 37 --ext 73 --qval 1e-2 -B --SPMR --call-summits",
	keep.minimal=TRUE
){
	cat("Epoch: checking input parameters ... \n", file = stderr())
	if(missing(obj)){
		stop("obj is missing")
	}else{
		if(!is.snap(obj)){
			stop("obj is not a snap object");
		}		
		if((x=nrow(obj))==0L){
			stop("obj is empty");
		}
		if((x=length(obj@barcode))==0L){
			stop("obj@barcode is empty");			
		}
		if((x=length(obj@file))==0L){
			stop("obj@file is empty");			
		}
		nclusters = length(levels(obj@cluster));
	}

	fileList = as.list(unique(obj@file));
	
	# check if snap files exist
	if(any(do.call(c, lapply(fileList, function(x){file.exists(x)})) == FALSE)){
		idx = which(do.call(c, lapply(fileList, function(x){file.exists(x)})) == FALSE)
		print("error: these files does not exist")
		print(fileList[idx])
		stop()
	}
	
	# check if files are all snap files
	if(any(do.call(c, lapply(fileList, function(x){isSnapFile(x)})) == FALSE)){
		idx = which(do.call(c, lapply(fileList, function(x){isSnapFile(x)})) == FALSE)
		print("error: these files are not snap file")
		print(fileList[idx])
		stop()
	}
	
	# check if FM session exist
	if(any(do.call(c, lapply(fileList, function(x){ "FM" %in% h5ls(x, recursive=1)$name  })) == FALSE)){
		idx = which(do.call(c, lapply(fileList, function(x){ "FM" %in% h5ls(x, recursive=1)$name  })) == FALSE)
		print("error: the following nsap files do not contain FM session")
		print(fileList[idx])
		stop()
	}
	
	if(nclusters == 0L){
		stop("obj does not have cluster, runCluster first")
	}

	if(missing(output.prefix)){
		stop("output.prefix is missing");
	}
	
	if(missing(path.to.snaptools)){
		stop("path.to.snaptools is missing");
	}else{
		if(!file.exists(path.to.snaptools)){
			stop("path.to.snaptools does not exist");
		}
		
		flag = tryCatch({
			file_test('-x', path.to.snaptools);	
		},
		error=function(cond){
			return(FALSE)
		})
		if(flag == FALSE){
			stop("path.to.snaptools is not an excutable file");
		}
		
	}

	if(missing(path.to.macs)){
		stop("path.to.macs is missing");
	}else{
		if(!file.exists(path.to.macs)){
			stop("path.to.macs does not exist");
		}
		
		flag = tryCatch({
			file_test('-x', path.to.macs);	
		},
		error=function(cond){
			return(FALSE)
		})
		if(flag == FALSE){
			stop("path.to.macs is not an excutable file");
		}
		
	}
	
	if(missing(gsize)){
		stop("gsize is missing");
	}
	
	if(missing(tmp.folder)){
		stop("tmp.folder is missing")
	}else{
		if(!dir.exists(tmp.folder)){
			stop("tmp.folder does not exist");			
		}
	}
		
    peak.ls <-  parallel::mclapply(as.list(levels(obj@cluster)), function(x){
       num.cells = length(which(obj@cluster == x));
       if(num.cells < min.cells){
           return(GenomicRanges::GRanges())
       }
	   peaks.df <- runMACS(
		   obj=obj[which(obj@cluster==x),], 
		   output.prefix=paste(output.prefix, x, sep="."),
		   path.to.snaptools=path.to.snaptools,
		   path.to.macs=path.to.macs,
		   gsize=gsize,
		   buffer.size=buffer.size, 
		   macs.options=macs.options,
		   tmp.folder=tmp.folder,
		   keep.minimal=keep.minimal,
		   num.cores=1
		   );
	   if((x=nrow(peaks.df)) > 0L){
	       peaks.gr = GenomicRanges::GRanges(peaks.df[,1], IRanges(peaks.df[,2], peaks.df[,3]));
	   }else{
	   	   peaks.gr = GenomicRanges::GRanges();
	   }
     }, mc.cores=num.cores)
	
	peaks.gr = GenomicRanges::reduce(do.call(c, peak.ls));
	return(peaks.gr);
}
r3fang/SnapATAC documentation built on March 29, 2022, 4:33 p.m.