R/preprocess.R

Defines functions split_dataframe generate_peak_gtf diff_peaks merge_sort_count motifs_select

Documented in diff_peaks generate_peak_gtf merge_sort_count motifs_select

#' Select the motif related to the input genes
#' @description Select the motif related to the input genes. Genes used here
#' should be ENSEMBEL ID.
#' @param motif motif file, you can choose our bulit-in motif database of
#' 'mus musculus', 'homo sapiens', 'zebrafish' and 'chicken' by
#' 'motif = Tranfac201803_Mm_MotifTFsF', 'motif = Tranfac201803_Hs_MotifTFsF',
#' 'motif = Tranfac201803_Zf_MotifTFsF', 'motif = Tranfac201803_Ch_MotifTFsF'
#' respectively, or you can upload your own motif data base, but the formata
#' use be the same as our built-in motif database.
#' @param gene character, indicating genes used to filter motif. You can use
#' differentially expressed genes of corresponding scRNA-seq or bulk RNA-seq data,
#' or just use expressed genes.
#'
#' @return return filtered motif
#' @export
#'
#' @examples load(system.file("extdata", "test_clustering.rda", package = "IReNA"))
#' motif <- Tranfac201803_Hs_MotifTFsF
#' Kmeans_clustering_ENS <- add_ENSID(test_clustering, Spec1='Hs')
#' gene <- rownames(Kmeans_clustering_ENS)
#' filtered_motif <- motifs_select(motif,gene)
motifs_select <- function(motif,gene){
  validInput(motif,'motif','df')
  validInput(gene,'gene','character')
  index <- c()
  for (i in 1:nrow(motif)) {
    judge <- c()
    gene1 <- strsplit(motif[i,5],';')[[1]]
    for (j in gene1) {
      if (j %in% gene) {
        judge <- c(judge,'YSE')
      }
    }
    if ('YSE' %in% judge) {
      index <- c(index,i)
    }
  }
  motif1 <- motif[index,]
  return(motif1)
}


#' Merge and sort counts
#' @description Sort the peak files by end of peak, start of peak, and chr of peak
#' @param count_all data.frame, indicating counts of peaks, generated by htseq-count.
#' If you calculate counts for each sample separately, you need to combine these counts firstly through cbind().
#' @param gtf data.frame, indicating gtf file of all peaks, generated by
#' \link{generate_gtf} function
#'
#' @return return sorted counts
#' @export
#'
#' @examples \dontrun{
#' SSC_patient1_Counts <- read.delim("SSC_patient1_Counts.txt", header=FALSE)
#' SSC_patient2_Counts <- read.delim("SSC_patient2_Counts.txt", header=FALSE)
#' esc_Counts_Counts <- read.delim("esc_Counts.txt", header=FALSE)
#' peaks_gtf <- read.delim("peak_merged.gtf", header=FALSE)
#' count_all <- cbind(SSC_patient1_Counts, SSC_patient2_Counts[,2],
#'              esc_Counts_Counts[,2])
#' merged_count <- merge_sort_count(count_all, peaks_gtf)
#' }
#'
merge_sort_count <- function(count_all,gtf){
  validInput(count,'motif','df')
  validInput(gtf,'gtf','df')
  count_all <- count_all[1:(nrow(count_all)-5),]
  gtf$num <- 1:nrow(gtf)
  gtf <- gtf[order(gtf[,10]), ]
  count_all <- cbind(count_all,gtf[,c(1,4,5,11)])
  count_all <- count_all[order(count_all[,ncol(count_all)]), ]
  rownames(count_all) <- count_all[,1]
  count_all <- count_all[,-1]
  count_all <- count_all[,c((ncol(count_all)-3):(ncol(count_all)-1),
                            1:(ncol(count_all)-4))]
  return(count_all)
}


#' Identify differential peaks
#' @description Identify differential peaks based on edgeR, samples of peaks
#' need to be divided into at least two groups.
#' @param Count data.frame, indicating counts of peaks, generated by htseq-count.
#' If you calculate counts for each sample separately, you need to run
#' merge_sort_count() function firstly.
#' @param Group vector or factor giving the experimental group/condition for
#' each sample/library.
#' @importFrom edgeR DGEList
#' @importFrom edgeR calcNormFactors
#' @importFrom edgeR estimateDisp
#' @importFrom edgeR glmFit
#' @importFrom VGAM model.matrix
#' @return return FDR q-value and foldchange of each peak, which can be used to
#' identify differential peaks.
#' @export
#'
#' @examples load(system.file("extdata", "test_peak.rda", package = "IReNA"))
#' group <- c(1,1,2,2)
#' peaks <- diff_peaks(test_peak[,4:7],group)
diff_peaks <- function(Count,Group){
  validInput(Count,'Count','df')
  validInput(Group,'Group','vector')
  Group1 <- factor(Group)
  Dge1 <- edgeR::DGEList(counts=Count, group=Group1)
  Dge2 <- edgeR::calcNormFactors(Dge1);
  Des1 <- VGAM::model.matrix(~0+Group1);
  Dge2 <- edgeR::estimateDisp(Dge2, Des1);
  Fit1 <- edgeR::glmFit(Dge2, Des1)
  if(length(levels(Group1))==2){ colnames(Des1)<-c('Neg', 'Pos')
  Contr1 <- limma::makeContrasts(PosVsNeg=Pos-Neg, levels=Des1)
  Lrt1 <- edgeR::glmLRT(Fit1, contrast=Contr1);
  }else if(length(levels(Group1))>2){
    Lrt1 <- edgeR::glmLRT(Fit1, coef=2:length(levels(Group1)));
  }else{ print('There are less than two groups') }
  tLrt1 <- Lrt1$table;
  Fdr1 <- p.adjust(tLrt1[,ncol(tLrt1)], method='fdr');
  Diff01 <- cbind(tLrt1$logFC, tLrt1$PValue, Fdr1);
  rownames(Diff01) <- rownames(Count)
  colnames(Diff01) <- c('logFC', 'Pval', 'Fdr')
  return(Diff01)
}


#' Merge peak and generate gft
#' @description Merge the peaks that are close to each other and modify the length of the peak to 2*(peak_half_width). Finally, return peaks related gtf file
#' @param peak data.frame, indicating peaks file. If you call peak for each sample separately, you need to combine these peaks firstly through cbind().
#' @param peak_half_width numeric, indicating half of peak width
#'
#' @return return peaks related gtf file
#' @export
#'
#' @examples load(system.file("extdata", "test_peak.rda", package = "IReNA"))
#' peaks <- test_peak[,c(1,2,3)]
#' gtf <- generate_gtf(peaks)
#'
#' \dontrun{
#'  sample1_peaks <- read.delim("sample1_peaks.narrowPeak", header=FALSE)
#'  sample2_peaks <- read.delim("sample2_peaks.narrowPeak", header=FALSE)
#'  sample3_peaks <- read.delim("sample3_peaks.narrowPeak", header=FALSE)
#'  sample4_peaks <- read.delim("sample4_peaks.narrowPeak", header=FALSE)
#'  sample5_peaks <- read.delim("sample5_peaks.narrowPeak", header=FALSE)
#'  sample6_peaks <- read.delim("sample6_peaks.narrowPeak", header=FALSE)
#'  all_peak <- rbind(sample1_peaks, sample2_peaks, sample3_peaks, sample4_peaks,
#'  sample5_peaks, sample6_peaks)
#'  peaks_merged_gtf <- generate_peak_gtf(all_peak)
#' }
generate_peak_gtf <- function(peak,peak_half_width = 250){
  validInput(peak,'peak','df')
  validInput(peak_half_width,'peak_half_width','vector')
  Exp1 <- peak
  SortInd1 <- c(3,2,1)
  for(i in 1:length(SortInd1)){
    Exp1 <- Exp1[order(Exp1[, as.numeric(SortInd1[i])]), ]
  }

  str1 <- Exp1[1,1]
  start1 <- Exp1[1,2]
  end1 <- Exp1[1,3]
  col1 <- c()
  no1 <- 1
  width1 <- 250
  for (i in 1:nrow(Exp1)) {
    if (str1 == Exp1[i,1] & (start1 >= Exp1[i,2]&start1<=Exp1[i,3] |
                             end1>=Exp1[i,2]&end1<=Exp1[i,3] | start1<=Exp1[i,2]&end1>=
                             Exp1[i,3] | Exp1[i,2]>end1&(Exp1[i,3]-Exp1[i,2]+
                             end1-start1+Exp1[i,2]-end1)<width1*2 | start1>
                             Exp1[i,3]&(Exp1[i,3]-Exp1[i,2]+end1-start1+start1
                                        -Exp1[i,3])<width1*2)) {
      acc22 <- sort(c(start1,end1,Exp1[i,2],Exp1[i,3]))
      start1 <- acc22[1]
      end1 <- acc22[4]
    }else {
      start12 <- start1 + as.integer((end1-start1)/2) - width1
      end12 <- start1 + as.integer((end1-start1)/2) + width1
      if (start12 < 1) {
        start12 = 1
        end12 = start12 + width1*2
      }
      out1 <- paste0(str1,'\t','.','\t','exon','\t',start12,'\t',end12,
                     '\t','.','\t','.','\t','.','\t','gene_id','\t',paste0('Peak',no1))
      col1 <- c(col1,out1)
      no1 = no1 + 1
      str1 = Exp1[i,1]
      start1 = Exp1[i,2]
      end1 = Exp1[i,3]
    }
  }
  start12 <- start1 + as.integer((end1-start1)/2) - width1
  end12 <- start1 + as.integer((end1-start1)/2) + width1
  if (start12 < 1) {
    start12 = 1
    end12 = start12 + width1*2
  }
  out1 <- paste0(str1,'\t','.','\t','exon','\t',start12,'\t',end12,
                 '\t','.','\t','.','\t','.','\t','gene_id','\t',paste0('Peak',no1))
  col1 <- c(col1,out1)
  PeakGtf <- strsplit(col1,'\t')
  PeakGtf <- as.data.frame(t(as.data.frame(PeakGtf)))
  rownames(PeakGtf) <- 1:nrow(PeakGtf)
  return(PeakGtf)
}

split_dataframe <- function(dataframe1, sep = "\t") {
  out1 <- strsplit(dataframe1[1, ], sep)[[1]]
  out1 <- matrix(out1, ncol = length(out1))
  for (i in 2:nrow(dataframe1)) {
    out2 <- strsplit(dataframe1[i, ], sep)[[1]]
    out1 <- rbind(out1, out2)
  }
  out1 <- as.data.frame(out1)
  return(out1)
}
jiang-junyao/IReNA documentation built on May 17, 2024, 12:29 a.m.