#' Combine all footprints of motifs
#' @description combine all footprints related motifs which generated by fimo.
#' Before run this function you should check whether the 'Dir' only contain consequence
#' of fimo
#' @param Dir character, indicating the path of all footprints related motif
#' which generated by fimo. Folder in this path cannot contain any other irrelevant
#' files
#'
#' @return return combined footprints
#' @export
#'
#' @examples
#' #combied<-combine_footprints(Dir2)
combine_footprints <- function(Dir) {
validInput(Dir,'Dir','direxists')
DirFile1 <- list.files(Dir)
con3 <- read.table(paste0(Dir, DirFile1[1]), header = TRUE, sep = "\t")
col1 <- con3[, c(3, 4, 5, 6, 8, 10, 1)]
for (i in 2:length(DirFile1)) {
info <- file.info(paste0(Dir, DirFile1[i]))
if (info$size > 0) {
con12 <- read.table(paste0(Dir, DirFile1[i]), header = TRUE, sep = "\t")
col2 <- con12[, c(3, 4, 5, 6, 8, 10, 1)]
col1 <- rbind(col1, col2)
}
}
return(col1)
}
#' Overlap differential peaks and motif footprints
#' @description Because the running time of this function is too long, it's highly
#' recommanded to use bedtools to implement this function
#' @param footprints footprints file, generated by combine_footprints()
#' @param peak_bed bed format of peak, first column should be 'chr', second
#' column should be 'start', third column should be 'end'
#' @importFrom GenomicRanges GRanges
#' @importFrom GenomicRanges countOverlaps
#' @return return data.frame contain footprints that overlap peaks
#' @export
#'
#' @examples load(system.file("extdata", "test_peak.rda", package = "IReNA"))
#' load(system.file("extdata", "combined.rda", package = "IReNA"))
#' peak_bed <- get_bed(test_peak)
#' overlapped <- overlap_footprints_peaks(combined, peak_bed)
overlap_footprints_peaks <- function(footprints, peak_bed) {
validInput(footprints,'footprints','df')
validInput(peak_bed,'peak_bed','df')
peak <- GenomicRanges::GRanges(paste0(peak_bed[,1],':',peak_bed[,2],'-',peak_bed[,3]))
footprints1 <- GenomicRanges::GRanges(paste0(footprints[,1],':',footprints[,2],'-',footprints[,3]))
overlap_idx <- GenomicRanges::findOverlaps(footprints1,peak)
final_footprints <- cbind(footprints[overlap_idx@from,],peak_bed[overlap_idx@to,])
return(final_footprints)
}
#' Find motifs
#' @description Generate shell commands to find motifs in the footprints through
#' fimo software.
#' @param motif motif file, you can choose our bulit-in motif database of
#' 'mus musculus', 'homo sapiens', 'zebrafish' and 'chicken' by 'motif = Tranfac201803_Mm_MotifTFsF',
#' 'motif = Tranfac201803_Hs_MotifTFsF', 'motif = Tranfac201803_Zf_MotifTFsF',
#' 'motif = Tranfac201803_Ch_MotifTFsF' respectively, or you can upload your own
#' motif data base, but the formata use be the same as our built-in motif database.
#' @param step numeric, indicating the numbers of motif in each group. Because
#' there are so many motifs need to be calculated, so we divided every 20 motifs
#' into groups and then calculated each group using FIMO at the same time by nohup
#' @param fimodir character, indicating path of fimo software,
#' if you have added fimo to the environment variable, just set this argument as 'fimo'
#' @param outputdir1 character, indicating the output path of shell script
#' @param outputdir character, indicating output path of fimo result
#' @param Motifdir character, indicating the path of meme motif file
#' @param sequencedir sequence file directory
#' @return return scripts to run Fimo in command line
#' @export
#'
#' @examples
#' motif1 <- Tranfac201803_Mm_MotifTFsF
#' fimodir <- 'fimo'
#' outputdir <- 'D:/GIBH/IReNA2 R package/IReNA2/ATAC/outputdir/'
#' motifdir <- 'Mememotif/'
#' sequencedir <- 'merged_footprints.fasta'
#' #find_motifs(motif1,step=20,fimodir, outputdir, motifdir, sequencedir)
find_motifs <- function(motif, step = 20, fimodir,outputdir1,outputdir, Motifdir
, sequencedir) {
validInput(motif,'motif','df')
validInput(step,'step','numeric')
validInput(outputdir1,'outputdir1','direxists')
validInput(outputdir,'outputdir','direxists')
validInput(Motifdir,'Motifdir','direxists')
con1 <- motif
no1 <- 1
out2 <- c()
for (i in seq(1, nrow(con1), by = step)) {
out1 <- c()
if (nrow(con1) - i > step) {
for (j in i:(i + step - 1)) {
col1 <- paste0(
fimodir," --parse-genomic-coord --max-stored-scores 2000000 --text >",
outputdir, con1[j, 1],".txt ", ' ', Motifdir, con1[j, 1], '.txt ',
sequencedir,';'
)
out1 <- c(out1, col1)
}
out1 <- as.data.frame(out1)
write.table(out1, paste0(outputdir1, "/Fimo", no1, ".sh"), row.names = FALSE,
col.names = FALSE, quote = FALSE)
} else {
for (k in i:nrow(con1)) {
col1 <- paste0(
fimodir," --parse-genomic-coord --max-stored-scores 2000000 --text >",
outputdir, con1[k, 1],".txt ", ' ', Motifdir, con1[k, 1], '.txt ',
sequencedir,';'
)
out1 <- c(out1, col1)
}
out1 <- as.data.frame(out1)
write.table(out1, paste0(outputdir1, "/Fimo", no1, ".sh"), row.names = FALSE,
col.names = FALSE, quote = FALSE)
}
# write.table(out1,paste0(Dir1,'/Programs/Fimo',no1,'.txt'),row.names=F,col.names = F,quote = F)
col2 <- paste0("nohup sh ", outputdir, "/Fimo", no1, ".sh &")
no1 <- no1 + 1
out2 <- c(out2, col2)
}
out2 <- as.data.frame(out2)
write.table(out2, paste0(outputdir1, "/Fimo", "_All", ".sh"), row.names = FALSE,
col.names = FALSE, quote = FALSE)
}
#' get bed
#' @description Extract 'chr', 'start', 'end' columns of peak file
#' @param peak peak file which should contain 'chr', 'start', 'end' columns
#'
#' @return return bed format data frame
#' @export
#'
#' @examples load(system.file("extdata", "test_peak.rda", package = "IReNA"))
#' peak_bed <- get_bed(test_peak)
get_bed <- function(peak) {
col1 <- peak[, c("chr", "start", "end")]
return(col1)
}
#' Get candidate genes/TFs-related peaks
#' @description refine peaks through genes in expression profile of scRNA-seq data
#' @param list1 list, generated by get_related_genes(), the first element should
#' be bed data.frame(contain 3 columns), the second element should contain 9 columns
#' @param expression_profile dataframe, indicating expression profile of bulk
#' RNA-seq or single-cell RNA-seq.
#'
#' @return return a list contain two dataframe, first one is candidate genes/TFs-related
#' peaks dataframe, second one is bed dataframe
#' @export
#'
#' @examples load(system.file("extdata", "list1.rda", package = "IReNA"))
#' load(system.file("extdata", "test_clustering.rda", package = "IReNA"))
#' Kmeans_clustering_ENS <- add_ENSID(test_clustering, Spec1='Hs')
#' list1 <- get_related_peaks(list1,Kmeans_clustering_ENS)
get_related_peaks <- function(list1, expression_profile) {
validInput(list1,'list1','list')
validInput(expression_profile,'expression_profile','df')
Candid <- list1[[2]]
bed <- list1[[1]]
FilterIdx <- Candid[,2] %in% rownames(expression_profile)
Candid_filter <- Candid[FilterIdx,]
NewColumn <- paste(Candid_filter[,2],Candid_filter[,1],sep = '|')
Candid_filter$PeakGene <- NewColumn
Candid_filter <- Candid_filter[,c(3,4,5,10,7,8,9)]
bed <- bed[FilterIdx,]
footprintlist <- list(bed,Candid_filter)
return(footprintlist)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.