R/filter_fimo.R

Defines functions split_motif filter_regulation_fimo generate_fimo_regulation find_motifs_targetgenes extract_genes get_tss_region

Documented in filter_regulation_fimo find_motifs_targetgenes generate_fimo_regulation get_tss_region

#' Get tss region sequence of target genes in regulaotry relationships
#'
#' @param gtf Gene transfer format, you can download it from http://www.ensembl.org/info/data/ftp/index.html
#' @param gene.use character, indicating target genes
#' @param upstream_length numeric, indicating upstream region length of TSS
#' @param downstream_length numeric, indicating downstream region length of TSS
#'
#' @return data.frame, contains four columns are target gene, chr, start and end.
#' @export
#'
get_tss_region <- function(gtf,gene.use,upstream_length=1000,downstream_length=500){
  validInput(gtf,'gtf','df')
  validInput(gene.use,'gene.use','character')
  validInput(upstream_length,'upstream_length','numeric')
  validInput(downstream_length,'downstream_length','numeric')
  gtfgene=gtf[gtf$V3=='gene',]
  genes <- apply(gtfgene, 1, extract_genes)
  gtfgene$genes <- genes
  final <- gtfgene[gtfgene$genes%in%gene.use,]
  chr1 <- gtfgene[gtfgene$genes%in%gene.use,]$V1
  start1 <- c()
  end1 <- c()
  for (i in 1:nrow(final)) {
    if (final[i,7]=='-') {
      start1 <- c(start1,as.numeric(final[i,5])-upstream_length)
      end1 <- c(end1,as.numeric(final[i,5])+downstream_length)
    }else{
      start1 <- c(start1,as.numeric(final[i,4])-upstream_length)
      end1 <- c(end1,as.numeric(final[i,4])+downstream_length)
    }
  }
  final$chr <- chr1
  final$start <- start1
  final$end <- end1
  final <- final[,(ncol(final)-3):ncol(final)]
  return(final)
}


extract_genes <- function(gtf){
  id1 <- strsplit(gtf[9],';')[[1]][1]
  id2 <- strsplit(id1,' ')[[1]][2]
  return(id2)
}


#' Use fimo to scan the TSS regions of candidate genes to find binding motifs
#'
#' @param gene_tss TSS region of genes. first column should be gene, second column
#' should be chr, third column should be start, fourth column should be end.
#' generated by \code{\link{get_tss_region}}
#' @param motif motif file, you can choose our bulit-in motif database of
#' 'mus musculus', 'homo sapiens', 'zebrafish' and 'chicken' by 'motif = Tranfac201803_Mm_MotifTFsF',
#' 'motif = Tranfac201803_Hs_MotifTFsF', 'motif = Tranfac201803_Zf_MotifTFsF',
#' 'motif = Tranfac201803_Ch_MotifTFsF' respectively, or you can upload your own
#' motif data base, but the formata use be the same as our built-in motif database.
#' @param refdir character, indicating the path of reference genome. Reference
#' genome can be download from https://hgdownload.soe.ucsc.edu/downloads.html
#' @param fimodir character, indicating path of fimo software,
#' if you have added fimo to the environment variable, just set this argument as 'fimo'
#' @param outputdir1 character, indicating the output path of the shell scripts
#' and sequence of target genes tss regions.(function 'find_motifs_targetgenes'
#' will automatically generate two folders ('fasta' and 'fimo') in the path
#' 'outputdir1', and store sequence of target genes tss regions in the 'fasta'
#' and shell scripts in the 'fimo')
#' @param Motifdir character, indicating the path of meme motif file
#' @param sequencedir character, indicating the path of sequence of target genes
#' tss regions. If it's NULL, this parameter will be paste0(outputdir1,'fasta/')
#' @param select_motif logic, indicating whether to select motifs whose related
#' transcription factors are in genes of gene_tss
#' @param use_nohup logic, indicating whether use nohup to run all fimo scripts simultaneously
#' @importFrom stringr str_ends
#' @return
#' @export
#'
#' @examples
find_motifs_targetgenes <- function(gene_tss,motif,refdir,fimodir,outputdir1, Motifdir
                                    , sequencedir = NULL,select_motif = T, use_nohup = F){
  validInput(refdir,'refdir','fileexists')
  validInput(Motifdir,'Motifdir','direxists')
  validInput(outputdir1,'outputdir','direxists')
  if (str_ends(outputdir1,'/')==FALSE) {
    warning('the last character of outputdir1 is not "/"')
  }
  fasta <- Biostrings::readBStringSet(refdir, format = "fasta", nrec = -1L,
                                      skip = 0L, seek.first.rec = FALSE,
                                      use.names = TRUE)
  gene_tss[,3] <- as.integer(gene_tss[,3])
  gene_tss[,4] <- as.integer(gene_tss[,4])
  if (select_motif==T) {
    motif1 <- motifs_select(motif,gene_tss[,1])
  }else{motif1 = motif}

  outputdir <- paste0(outputdir1,'fimo/')
  if (is.null(sequencedir)) {
    sequencedir <- paste0(outputdir1,'fasta/')
  }
  fimoall <- c()
  if (!'fasta' %in% dir(outputdir1)) {
    dir.create(paste0(outputdir1,'fasta'))
  }
  if (!'fimo' %in% dir(outputdir1)) {
    dir.create(paste0(outputdir1,'fimo'))
  }
  for (i in 1:nrow(gene_tss)) {
    if (use_nohup==TRUE) {
      fimo1 <- paste0('nohup sh ',outputdir1,'fimo/',gene_tss[i,1],'/','Fimo1.sh &')
    }else if(use_nohup==FALSE){
      fimo1 <- paste0('sh ',outputdir1,'fimo/',gene_tss[i,1],'/','Fimo1.sh ;')
    }else{stop('parameter use_nohup should be TRUE or FALSE')}

    fimoall <- c(fimoall,fimo1)
    dir.create(paste0(outputdir1,'fimo/',gene_tss[i,1]))
    fasta1 <- getfasta2(gene_tss[i,2:4],fasta)
    write.table(fasta1,paste0(outputdir1,'fasta/',gene_tss[i,1],'.fa')
                ,col.names=F,row.names=F,quote=F)
    fimodir1 <- fimodir
    outputdir12 <- paste0(outputdir1,'fimo/',gene_tss[i,1],'/')
    outputdir11 <- paste0(outputdir,gene_tss[i,1],'/')
    sequencedir1 <- paste0(sequencedir,gene_tss[i,1],'.fa')
    find_motifs(motif1,step=500,fimodir1, outputdir12, outputdir11, Motifdir,
                sequencedir1)
  }
  fimoall <- as.data.frame(fimoall)
  write.table(fimoall,paste0(outputdir1,'fimo/','fimoall.sh'),
              quote = F,row.names = F,col.names = F)
}

#' Generate regulation database accoding to the fimo result
#'
#' @param motif_dir character, indicating the path of fimo result
#' @param motif motif file, you can choose our bulit-in motif database of
#' 'mus musculus', 'homo sapiens', 'zebrafish' and 'chicken' by 'motif = Tranfac201803_Mm_MotifTFsF',
#' 'motif = Tranfac201803_Hs_MotifTFsF', 'motif = Tranfac201803_Zf_MotifTFsF',
#' 'motif = Tranfac201803_Ch_MotifTFsF' respectively, or you can upload your own
#' motif data base, but the formata use be the same as our built-in motif database.
#' @importFrom rlang is_empty
#' @return
#' @export
#'
#' @examples
generate_fimo_regulation <- function(motif_dir,motif){
  validInput(motif_dir,'motif_dir','direxists')
  validInput(motif,'motif','df')
  targetgenes <- dir(motif_dir)
  for (i in 1:length(targetgenes)) {
    DirFile1 <- list.files(paste0(motif_dir,targetgenes[i]))
    if (!rlang::is_empty(DirFile1)) {
      info <- file.info(paste0(motif_dir,targetgenes[i],'/',DirFile1))
      rownames(info) <- DirFile1
      info <- info[info$size>0,]
      info <- info[!grepl(c('Fimo_All.sh'),rownames(info)),]
      info <- info[!grepl(c('Fimo..sh'),rownames(info)),]
      info <- info[!grepl(c('Fimo...sh'),rownames(info)),]
      if (nrow(info)>0) {
        motif1 <- rownames(info)
        motif1 <- gsub('.txt','',motif1)
        motif2 <- motif[motif$Accession %in% motif1,]
        motifgene <- apply(motif2, 1, split_motif)
        motifgene <- unlist(motifgene)
        motifgene1 <- paste(motifgene,collapse = ';')
        if (exists('regulation_final') == FALSE) {
          regulation_final <- data.frame(motifgene1,targetgenes[i])
          colnames(regulation_final) <- c('TF','Target')
        }else{
          regulation1 <- data.frame(motifgene1,targetgenes[i])
          rownames(regulation1) <- i
          colnames(regulation1) <- c('TF','Target')
          regulation_final <- rbind(regulation_final,regulation1)
        }
      }
    }
  }
  return(regulation_final)
}

#' Filter regulatory relationships according to binding motifs generated by fimo
#' @description If ATAC-seq data is not available, binding motifs of target genes
#' can be used to filter regulatory relationships.
#' @param fimo_regulation regulation database, you can download it from xxx, or
#' generate it by \code{\link{generate_fimo_regulation}}
#' @param regulatory_relationships  unfiltered regulatory_relationships, generated by
#' \code{\link{get_cor}}
#'
#' @return
#' @export
#'
#' @examples
filter_regulation_fimo <- function(fimo_regulation,regulatory_relationships){
  if (!'TF' %in% colnames(regulatory_relationships)) {
    stop('regulatory_relationships should contain "TF" column')
  }
  if (!'Target' %in% colnames(regulatory_relationships)) {
    stop('regulatory_relationships should contain "Target" column')
  }
  fimo_pair <- apply(fimo_regulation, 1, function(x1){
    TFs <- strsplit(x1[1],';')[[1]]
    Target <- x1[2]
    regulation <- paste(TFs,Target)
    return(regulation)
  })
  fimo_pair <- unlist(fimo_pair)
  regulation_pair <- paste(regulatory_relationships[,1],regulatory_relationships[,4])
  regulation1 <- regulatory_relationships[regulation_pair %in% fimo_pair,]
  return(regulation1)
}

split_motif <- function(motif){
  gene1 <- strsplit(motif[5],';')[[1]]
}
jiang-junyao/IReNA documentation built on May 17, 2024, 12:29 a.m.