R/ATAC-seq.R

Defines functions get_related_peaks get_bed find_motifs overlap_footprints_peaks combine_footprints

Documented in combine_footprints find_motifs get_bed get_related_peaks overlap_footprints_peaks

#' Combine all footprints of motifs
#' @description combine all footprints related motifs which generated by fimo.
#' Before run this function you should check whether the 'Dir' only contain consequence
#' of fimo
#' @param Dir character, indicating the path of all footprints related motif
#' which generated by fimo. Folder in this path cannot contain any other irrelevant
#' files
#'
#' @return return combined footprints
#' @export
#'
#' @examples
#' #combied<-combine_footprints(Dir2)
combine_footprints <- function(Dir) {
  validInput(Dir,'Dir','direxists')
  DirFile1 <- list.files(Dir)
  con3 <- read.table(paste0(Dir, DirFile1[1]), header = TRUE, sep = "\t")
  col1 <- con3[, c(3, 4, 5, 6, 8, 10, 1)]
  for (i in 2:length(DirFile1)) {
    info <- file.info(paste0(Dir, DirFile1[i]))
    if (info$size > 0) {
      con12 <- read.table(paste0(Dir, DirFile1[i]), header = TRUE, sep = "\t")
      col2 <- con12[, c(3, 4, 5, 6, 8, 10, 1)]
      col1 <- rbind(col1, col2)
    }
  }
  return(col1)
}

#' Overlap differential peaks and motif footprints
#' @description Because the running time of this function is too long, it's highly
#' recommanded to use bedtools to implement this function
#' @param footprints footprints file, generated by combine_footprints()
#' @param peak_bed bed format of peak, first column should be 'chr', second
#' column should be 'start', third column should be 'end'
#' @importFrom GenomicRanges GRanges
#' @importFrom GenomicRanges countOverlaps
#' @return return data.frame contain footprints that overlap peaks
#' @export
#'
#' @examples load(system.file("extdata", "test_peak.rda", package = "IReNA"))
#' load(system.file("extdata", "combined.rda", package = "IReNA"))
#' peak_bed <- get_bed(test_peak)
#' overlapped <- overlap_footprints_peaks(combined, peak_bed)
overlap_footprints_peaks <- function(footprints, peak_bed) {
  validInput(footprints,'footprints','df')
  validInput(peak_bed,'peak_bed','df')
  peak <- GenomicRanges::GRanges(paste0(peak_bed[,1],':',peak_bed[,2],'-',peak_bed[,3]))
  footprints1 <- GenomicRanges::GRanges(paste0(footprints[,1],':',footprints[,2],'-',footprints[,3]))
  overlap_idx <- GenomicRanges::findOverlaps(footprints1,peak)
  final_footprints <- cbind(footprints[overlap_idx@from,],peak_bed[overlap_idx@to,])
  return(final_footprints)
}

#' Find motifs
#' @description Generate shell commands to find motifs in the footprints through
#' fimo software.
#' @param motif motif file, you can choose our bulit-in motif database of
#' 'mus musculus', 'homo sapiens', 'zebrafish' and 'chicken' by 'motif = Tranfac201803_Mm_MotifTFsF',
#' 'motif = Tranfac201803_Hs_MotifTFsF', 'motif = Tranfac201803_Zf_MotifTFsF',
#' 'motif = Tranfac201803_Ch_MotifTFsF' respectively, or you can upload your own
#' motif data base, but the formata use be the same as our built-in motif database.
#' @param step numeric, indicating the numbers of motif in each group. Because
#' there are so many motifs need to be calculated, so we divided every 20 motifs
#' into groups and then calculated each group using FIMO at the same time by nohup
#' @param fimodir character, indicating path of fimo software,
#' if you have added fimo to the environment variable, just set this argument as 'fimo'
#' @param outputdir1 character, indicating the output path of shell script
#' @param outputdir character, indicating output path of fimo result
#' @param Motifdir character, indicating the path of meme motif file
#' @param sequencedir sequence file directory
#' @return return scripts to run Fimo in command line
#' @export
#'
#' @examples
#' motif1 <- Tranfac201803_Mm_MotifTFsF
#' fimodir <- 'fimo'
#' outputdir <- 'D:/GIBH/IReNA2 R package/IReNA2/ATAC/outputdir/'
#' motifdir <- 'Mememotif/'
#' sequencedir <- 'merged_footprints.fasta'
#' #find_motifs(motif1,step=20,fimodir, outputdir, motifdir, sequencedir)
find_motifs <- function(motif, step = 20, fimodir,outputdir1,outputdir, Motifdir
                        , sequencedir) {
  validInput(motif,'motif','df')
  validInput(step,'step','numeric')
  validInput(outputdir1,'outputdir1','direxists')
  validInput(outputdir,'outputdir','direxists')
  validInput(Motifdir,'Motifdir','direxists')

  con1 <- motif
  no1 <- 1
  out2 <- c()
  for (i in seq(1, nrow(con1), by = step)) {
    out1 <- c()
    if (nrow(con1) - i > step) {
      for (j in i:(i + step - 1)) {
        col1 <- paste0(
          fimodir," --parse-genomic-coord --max-stored-scores 2000000  --text >",
          outputdir, con1[j, 1],".txt ", ' ', Motifdir, con1[j, 1], '.txt ',
          sequencedir,';'
        )
        out1 <- c(out1, col1)
      }
      out1 <- as.data.frame(out1)
      write.table(out1, paste0(outputdir1, "/Fimo", no1, ".sh"), row.names = FALSE,
                  col.names = FALSE, quote = FALSE)
    } else {
      for (k in i:nrow(con1)) {
        col1 <- paste0(
          fimodir," --parse-genomic-coord --max-stored-scores 2000000  --text >",
          outputdir, con1[k, 1],".txt ", ' ', Motifdir, con1[k, 1], '.txt ',
          sequencedir,';'
        )
        out1 <- c(out1, col1)
      }
      out1 <- as.data.frame(out1)
      write.table(out1, paste0(outputdir1, "/Fimo", no1, ".sh"), row.names = FALSE,
                  col.names = FALSE, quote = FALSE)
    }
    # write.table(out1,paste0(Dir1,'/Programs/Fimo',no1,'.txt'),row.names=F,col.names = F,quote = F)
    col2 <- paste0("nohup sh ", outputdir, "/Fimo", no1, ".sh &")
    no1 <- no1 + 1
    out2 <- c(out2, col2)
  }
  out2 <- as.data.frame(out2)
  write.table(out2, paste0(outputdir1, "/Fimo", "_All", ".sh"), row.names = FALSE,
              col.names = FALSE, quote = FALSE)
}

#' get bed
#' @description Extract 'chr', 'start', 'end' columns of peak file
#' @param peak peak file which should contain 'chr', 'start', 'end' columns
#'
#' @return return bed format data frame
#' @export
#'
#' @examples load(system.file("extdata", "test_peak.rda", package = "IReNA"))
#' peak_bed <- get_bed(test_peak)
get_bed <- function(peak) {
  col1 <- peak[, c("chr", "start", "end")]
  return(col1)
}

#' Get candidate genes/TFs-related peaks
#' @description refine peaks through genes in expression profile of scRNA-seq data
#' @param list1 list, generated by get_related_genes(), the first element should
#' be bed data.frame(contain 3 columns), the second element should contain 9 columns
#' @param expression_profile dataframe, indicating expression profile of bulk
#' RNA-seq or single-cell RNA-seq.
#'
#' @return return a list contain two dataframe, first one is candidate genes/TFs-related
#' peaks dataframe, second one is bed dataframe
#' @export
#'
#' @examples load(system.file("extdata", "list1.rda", package = "IReNA"))
#' load(system.file("extdata", "test_clustering.rda", package = "IReNA"))
#' Kmeans_clustering_ENS <- add_ENSID(test_clustering, Spec1='Hs')
#' list1 <- get_related_peaks(list1,Kmeans_clustering_ENS)
get_related_peaks <- function(list1, expression_profile) {
  validInput(list1,'list1','list')
  validInput(expression_profile,'expression_profile','df')
  Candid <- list1[[2]]
  bed <- list1[[1]]
  FilterIdx <- Candid[,2] %in% rownames(expression_profile)
  Candid_filter <- Candid[FilterIdx,]
  NewColumn <- paste(Candid_filter[,2],Candid_filter[,1],sep = '|')
  Candid_filter$PeakGene <- NewColumn
  Candid_filter <- Candid_filter[,c(3,4,5,10,7,8,9)]
  bed <- bed[FilterIdx,]
  footprintlist <- list(bed,Candid_filter)
  return(footprintlist)
}
jiang-junyao/IReNA documentation built on May 17, 2024, 12:29 a.m.