R/GloModAn.R

Defines functions ExtractListModPosByModMotif .rescale letterU letterT letterC letterG letterA addLetter DrawLogoPosNegAxes DrawModLogo GetModRatioByContig .IncludeModPosInMot .AddModMotifPctToDf GetGRangesWindowSeqandParam GetModReportDeepSignal GetModReportPacBio DrawDistriHistBox DrawBarplotBothStrands GetMeanParamByContig

Documented in addLetter .AddModMotifPctToDf DrawBarplotBothStrands DrawDistriHistBox DrawLogoPosNegAxes DrawModLogo ExtractListModPosByModMotif GetGRangesWindowSeqandParam GetMeanParamByContig GetModRatioByContig GetModReportDeepSignal GetModReportPacBio .IncludeModPosInMot letterA letterC letterG letterT letterU .rescale

### =========================================================================
### Functions from ModAn and ModAnPlot categories
### -------------------------------------------------------------------------
###
### DNAModAnnot v.0.0.0.9018 - 2021/02/03
### Licence GPL-3
###
### Contributor:
### Alexis Hardy
### - email: "alexis.hardy1994@outlook.fr"
### - Sandra Duharcourt Laboratory
### - Matthieu Defrance Laboratory
###
### -------------------------------------------------------------------------
###
### Functions:
###
### GetMeanParamByContig
### DrawBarplotBothStrands
### DrawDistriHistBox
### GetModReportPacBio
### GetModReportDeepSignal
### GetGRangesWindowSeqandParam
### .AddModMotifPctToDf
### .IncludeModPosInMot
### GetModRatioByContig
### DrawModLogo
### DrawLogoPosNegAxes
### ExtractListModPosByModMotif
###
### -------------------------------------------------------------------------

#' GetMeanParamByContig Function (GloModAn)
#'
#' Return a list with the mean by strand of the parameter provided for all scaffolds of genome assembly provided.
#' @param grangesData A GRanges-like object containing, in the extra columns, the parameter to be analysed.
#' @param dnastringsetGenome A DNAStringSet object containing the sequence for each contig.
#' @param cParamName The name of the column containing the parameter to be analysed.
#' @return A list composed of 3 dataframes: 1 dataframe for each strand and 1 dataframe with both strands. In each dataframe:
#' \itemize{
#'   \item refName: The names of each contig.
#'   \item strand: The strand of each contig.
#'   \item width: The width of each contig.
#'   \item nb_sequenced: The number of bases sequenced by strand for each contig.
#'   \item seqPct: The percentage of bases sequenced by strand for each contig (percentage of sequencing).
#' }
#' @keywords GetMeanParamByContig
#' @importFrom Biobase isUnique
#' @export
#' @examples
#' myGenome <- Biostrings::readDNAStringSet(system.file(
#'   package = "DNAModAnnot", "extdata",
#'   "ptetraurelia_mac_51_sca171819.fa"
#' ))
#'
#' # Preparing a gposPacBioCSV dataset
#' myGposPacBioCSV <-
#'   ImportPacBioCSV(
#'     cPacBioCSVPath = system.file(
#'       package = "DNAModAnnot", "extdata",
#'       "ptetraurelia.bases.sca171819.csv"
#'     ),
#'     cSelectColumnsToExtract = c(
#'       "refName", "tpl", "strand", "base",
#'       "score", "ipdRatio", "coverage"
#'     ),
#'     lKeepExtraColumnsInGPos = TRUE, lSortGPos = TRUE,
#'     cContigToBeAnalyzed = names(myGenome)
#'   )
#'
#' myMean_cov_list <- GetMeanParamByContig(
#'   grangesData = myGposPacBioCSV,
#'   dnastringsetGenome = myGenome,
#'   cParamName = "coverage"
#' )
#' myMean_cov_list
GetMeanParamByContig <- function(grangesData,
                                 dnastringsetGenome,
                                 cParamName) {
  count_table <- data.frame(
    refName = as.factor(seqnames(grangesData)),
    strand = as.factor(strand(grangesData)),
    parameter = mcols(grangesData)[[cParamName]]
  )
  mean_table <- aggregate(
    count_table$parameter,
    list(count_table$refName, count_table$strand),
    mean
  )
  colnames(mean_table) <- c("refName", "strand", paste("mean_", cParamName, sep = ""))

  mean_table <- merge(mean_table,
    data.frame(
      refName = names(dnastringsetGenome),
      width = width(dnastringsetGenome)
    ),
    by = "refName",
    all.y = TRUE
  )

  # those who have NA for one or both strands : add missing strands with 0 mean_param
  if (length(which(is.na(mean_table$strand))) > 0) {
    mean_table[is.na(mean_table$strand), paste("mean_", cParamName, sep = "")] <- 0
    mean_table[is.na(mean_table$strand), "strand"] <- "+"
  }
  if (length(which(isUnique(mean_table$refName))) > 0) {
    mean_table2 <- mean_table[isUnique(mean_table$refName), ]
    mean_table2$strand <- ifelse(mean_table2$strand == "+", "-", "+")
    mean_table2[, paste("mean_", cParamName, sep = "")] <- 0
    mean_table <- rbind(mean_table, mean_table2)
  }
  mean_table <- mean_table[with(mean_table, order(width, refName, strand, decreasing = TRUE)), ]

  f_strand <- subset(mean_table, strand == "+")
  r_strand <- subset(mean_table, strand == "-")

  return(list(
    mean_table = mean_table,
    f_strand = f_strand,
    r_strand = r_strand
  ))
}

#' DrawBarplotBothStrands Function (GloModAn)
#'
#' Return a barplot describing some parameter values provided by strand for each contig.
#' @param nParamByContigForward A numeric vector containing the parameter values of the forward strand to be plotted.
#' Must have the same order and same length as cContigNames and nParamByContigReverse.
#' @param nParamByContigReverse A numeric vector containing the parameter values of the reverse strand to be plotted.
#' Must have the same order and same length as nParamByContigForward and cContigNames.
#' @param cContigNames A character vector containing the names of the contigs.
#' Must have the same order and same length as nParamByContigForward and nParamByContigReverse.
#' @param cGraphName The graph name to be displayed on top of the plot.
#' @param lIsOrderedLargestToSmallest If contigs are ordered from largest to smallest contig,
#' add TRUE to display "(from largest to smallest)" below the plot. Defaults to FALSE.
#' @keywords DrawBarplotBothStrands
#' @export
#' @examples
#' DrawBarplotBothStrands(
#'   nParamByContigForward = c(100, 86, 75, 56),
#'   nParamByContigReverse = c(96, 88, 80, 83),
#'   cContigNames = c("chrI", "chrII", "chrIII", "chrIV"),
#'   cGraphName = "Mean Coverage per contig",
#'   lIsOrderedLargestToSmallest = TRUE
#' )
DrawBarplotBothStrands <- function(nParamByContigForward,
                                   nParamByContigReverse,
                                   cContigNames,
                                   cGraphName,
                                   lIsOrderedLargestToSmallest = TRUE) {
  layout(matrix(c(
    4, 4,
    3, 1,
    3, 2
  ),
  nrow = 3, ncol = 2, byrow = TRUE
  ), widths = c(1, 29), heights = c(1, 2, 2))

  nMarLeft <- max(nchar(c(pretty(nParamByContigForward), pretty(nParamByContigReverse)))) * 1.1 + 1
  opar <- par()
  par(mar = c(0, nMarLeft, 4, 2))

  ylim_max <- max(max(nParamByContigForward), max(nParamByContigReverse))

  bp <- barplot(nParamByContigForward,
    ylab = "Forward strand", ylim = c(0, ylim_max),
    col = c("grey30", "grey70"),
    las = 2, xaxs = "i", yaxs = "i", cex.axis = 1.25, cex.lab = 1.5, mgp = c(nMarLeft * 3.5 / 5, 1, 0)
  )

  axis(side = 3, pos = NA, at = bp, labels = cContigNames, las = 2, cex.axis = 1.25)

  par(mar = c(4, nMarLeft, 0, 2))

  barplot(nParamByContigReverse,
    ylab = "Reverse strand", ylim = par("usr")[c(4, 3)],
    col = c("grey30", "grey70"),
    las = 1, xaxs = "i", yaxs = "i", cex.axis = 1.25, cex.lab = 1.5, mgp = c(nMarLeft * 3.5 / 5, 1, 0)
  )
  par(xpd = NA)
  if (lIsOrderedLargestToSmallest) {
    mtext("Contigs (from largest to smallest)", cex = 1.25, side = 1, line = 1.5)
  } else {
    mtext("Contigs", cex = 1.25, side = 1, line = 1.5)
  }
  par(xpd = FALSE)

  par(mar = c(0, 0, 0, 0.5))
  barplot(height = 1, col = "white", border = "white", axes = FALSE)

  par(xpd = NA)
  mtext(cGraphName, cex = 1.25, side = 4, srt = 90)
  par(xpd = FALSE)

  layout(matrix(c(1, 1, 1, 1), nrow = 2, ncol = 2))
  par(mar = opar$mar)
}

#' DrawDistriHistBox Function (GloModAn)
#'
#' Return a line-plot describing the cumulative length of contigs (ordered from largest to smallest contig).
#' @param nParam A numeric vector containing the parameter values to be plotted.
#' @param cGraphName The graph name to be displayed on top of the plot.
#' @param cParamName The name of the parameter to be displayed below the x-axis.
#' @param lTrimOutliers If TRUE, remove the outliers from the boxplot and trim the histogram to the borders
#' of the boxplot. Defaults to FALSE.
#' @param nXLimits A numeric vector giving the limits of the plot on the x-axis. If NULL, the limits will be set
#' to the minimum and the maximum of the nParam data (or to the inner fences (1.5*Interquartile Range) of the
#' boxplot if lTrimmingOutliers is TRUE). Defaults to NULL.
#' @keywords DrawDistriHistBox
#' @export
#' @examples
#' # loading genome
#' myGenome <- Biostrings::readDNAStringSet(system.file(
#'   package = "DNAModAnnot", "extdata",
#'   "ptetraurelia_mac_51_sca171819.fa"
#' ))
#'
#' # Preparing a gposPacBioCSV dataset
#' myGposPacBioCSV <-
#'   ImportPacBioCSV(
#'     cPacBioCSVPath = system.file(
#'       package = "DNAModAnnot", "extdata",
#'       "ptetraurelia.bases.sca171819.csv"
#'     ),
#'     cSelectColumnsToExtract = c(
#'       "refName", "tpl", "strand", "base",
#'       "score", "ipdRatio", "coverage"
#'     ),
#'     lKeepExtraColumnsInGPos = TRUE, lSortGPos = TRUE,
#'     cContigToBeAnalyzed = names(myGenome)
#'   )
#'
#' DrawDistriHistBox(head(myGposPacBioCSV$coverage, 100000),
#'   cGraphName = "Coverage distribution of some bases sequenced",
#'   cParamName = "Coverage",
#'   lTrimOutliers = TRUE
#' )
DrawDistriHistBox <- function(nParam,
                              cGraphName,
                              cParamName,
                              lTrimOutliers = FALSE,
                              nXLimits = NULL) {
  d <- density(nParam)
  h <- hist(nParam, plot = FALSE)
  if (length(seq(from = min(h$breaks), to = max(h$breaks), by = 1)) < 10) {
    breaks_v <- seq(from = min(h$breaks), to = max(h$breaks), length.out = 200)
  } else {
    breaks_v <- seq(from = min(h$breaks), to = max(h$breaks), by = 1)
  }
  h <- hist(nParam, breaks = breaks_v, plot = FALSE)

  if (lTrimOutliers) {
    b <- boxplot(nParam, plot = FALSE)
  }

  if (is.null(nXLimits)) {
    nXLimits <- c()
    nXLimits[1] <- ifelse(!lTrimOutliers, min(h$breaks), b$stats[1])
    nXLimits[2] <- ifelse(!lTrimOutliers, max(h$breaks), b$stats[5])
  }

  opar <- par()
  layout(matrix(c(1, 2), 2, 1), heights = c(1, 9))
  par(mar = c(0.1, 5.1, 2.1, 2.1))
  b <- boxplot(nParam,
    horizontal = TRUE, xaxt = "n", frame = FALSE, outline = !lTrimOutliers,
    main = ifelse(!lTrimOutliers,
      cGraphName,
      paste(cGraphName, " (no outliers)", sep = "")
    ),
    outcol = rgb(0, 0, 0, 0.01),
    ylim = nXLimits
  )

  par(mar = c(5, 5.1, 0.1, 2.1))
  hist(nParam,
    freq = FALSE, probability = TRUE,
    xlab = cParamName, ylab = "Density (%)", main = NULL,
    col = "gray",
    xlim = nXLimits,
    ylim = c(0, ifelse(max(h$density) < max(d$y), max(d$y), max(h$density))),
    breaks = breaks_v
  )
  lines(d, col = "red")
  layout(matrix(c(1), 1, 1))
  par(mar = opar$mar)
}

#' GetModReportPacBio Function (GloModAn)
#'
#' Return a report with global characteristics of DNA modifications (Mod) distribution in the genome assembly provided.
#' (adapted to PacBio data)
#' @param dnastringsetGenome A DNAStringSet object containing the sequence for each contig.
#' @param grangesGenome A GRanges object containing the width of each contig.
#' @param grangesPacBioGFF A GRanges object containing PacBio GFF data.
#' @param gposPacBioCSVBase A GPos object containing PacBio CSV data for sites that can be targeted by the modification only.
#' @param cOrgAssemblyName The name of the genome assembly provided.
#' @param cBaseLetterForMod The name of the base letter of the modified base.
#' @param cModNameInOutput Name for the modification in the output.
#' @keywords GetModReportPacBio
#' @export
#' @examples
#' # loading genome
#' myGenome <- Biostrings::readDNAStringSet(system.file(
#'   package = "DNAModAnnot", "extdata",
#'   "ptetraurelia_mac_51_sca171819.fa"
#' ))
#' myGrangesGenome <- GetGenomeGRanges(myGenome)
#'
#' # Preparing a grangesPacBioGFF datasets
#' myGrangesPacBioGFF <-
#'   ImportPacBioGFF(
#'     cPacBioGFFPath = system.file(
#'       package = "DNAModAnnot", "extdata",
#'       "ptetraurelia.modifications.sca171819.gff"
#'     ),
#'     cNameModToExtract = "m6A",
#'     cModNameInOutput = "6mA",
#'     cContigToBeAnalyzed = names(myGenome)
#'   )
#'
#' myGposPacBioCSV <-
#'   ImportPacBioCSV(
#'     cPacBioCSVPath = system.file(
#'       package = "DNAModAnnot", "extdata",
#'       "ptetraurelia.bases.sca171819.csv"
#'     ),
#'     cSelectColumnsToExtract = c(
#'       "refName", "tpl", "strand", "base",
#'       "score", "ipdRatio", "coverage"
#'     ),
#'     lKeepExtraColumnsInGPos = TRUE, lSortGPos = TRUE,
#'     cContigToBeAnalyzed = names(myGenome)
#'   )
#' myGposPacBioCSV <- myGposPacBioCSV[myGposPacBioCSV$base == "A"]
#'
#' # Mod report
#' myReport_Mod <- GetModReportPacBio(
#'   grangesGenome = myGrangesGenome,
#'   dnastringsetGenome = myGenome,
#'   grangesPacBioGFF = myGrangesPacBioGFF,
#'   gposPacBioCSVBase = myGposPacBioCSV,
#'   cOrgAssemblyName = "ptetraurelia_mac_51",
#'   cBaseLetterForMod = "A",
#'   cModNameInOutput = "6mA"
#' )
#' myReport_Mod
GetModReportPacBio <- function(dnastringsetGenome,
                               grangesGenome,
                               grangesPacBioGFF,
                               gposPacBioCSVBase,
                               cOrgAssemblyName,
                               cBaseLetterForMod,
                               cModNameInOutput) {
  lIsFracAvailable <- "frac" %in% names(mcols(grangesPacBioGFF))
  
  dReportTable <- data.frame(
    row.names = cOrgAssemblyName,
    nbMod = length(grangesPacBioGFF),
    nbBaseSequenced = length(gposPacBioCSVBase),
    nbBaseAssembly = letterFrequency(dnastringsetGenome, letters = cBaseLetterForMod, collapse = TRUE) +
      letterFrequency(reverseComplement(dnastringsetGenome), letters = cBaseLetterForMod, collapse = TRUE)
  )

  dReportTable$ratioMod <- dReportTable$nbMod / dReportTable$nbBaseSequenced
  if(lIsFracAvailable){ dReportTable$ratioMod_corrected <- dReportTable$ratioMod * mean(grangesPacBioGFF$frac) }
  dReportTable$ratioModAllAssemblyBases <- dReportTable$nbMod / dReportTable$nbBaseAssembly
  if(lIsFracAvailable){ dReportTable$ratioModAllAssemblyBases_corrected <- dReportTable$ratioModAllAssemblyBases * mean(grangesPacBioGFF$frac) }
  if(lIsFracAvailable){ dReportTable$mean_frac_Mod <- mean(grangesPacBioGFF$frac) }
  dReportTable$mean_coverage_Mod <- mean(grangesPacBioGFF$coverage)
  dReportTable$mean_score_Mod <- mean(grangesPacBioGFF$score)
  dReportTable$mean_ipdRatio_Mod <- mean(grangesPacBioGFF$ipdRatio)
  dReportTable$mean_identificationQv_Mod <- mean(grangesPacBioGFF$identificationQv)

  colnames(dReportTable) <- gsub(
    pattern = "Base", replacement = cBaseLetterForMod,
    x = colnames(dReportTable)
  )
  colnames(dReportTable) <- gsub(
    pattern = "Mod", replacement = cModNameInOutput,
    x = colnames(dReportTable)
  )

  dReportTable <- .AddModMotifPctToDf(dReportTable,
    grangesModPos = grangesPacBioGFF, grangesGenome = grangesGenome,
    dnastringsetGenome = dnastringsetGenome,
    nModPositionInMotif = 1, nUpstreamBpToAdd = 0, nDownstreamBpToAdd = 1,
    cBaseLetterForMod = cBaseLetterForMod, cModNameInOutput = cModNameInOutput
  )


  dReportTable <- .AddModMotifPctToDf(dReportTable,
    grangesModPos = grangesPacBioGFF, grangesGenome = grangesGenome,
    dnastringsetGenome = dnastringsetGenome,
    nModPositionInMotif = 2, nUpstreamBpToAdd = 1, nDownstreamBpToAdd = 0,
    cBaseLetterForMod = cBaseLetterForMod, cModNameInOutput = cModNameInOutput
  )


  dReportTable <- .AddModMotifPctToDf(dReportTable,
    grangesModPos = grangesPacBioGFF, grangesGenome = grangesGenome,
    dnastringsetGenome = dnastringsetGenome,
    nModPositionInMotif = 2, nUpstreamBpToAdd = 1, nDownstreamBpToAdd = 1,
    cBaseLetterForMod = cBaseLetterForMod, cModNameInOutput = cModNameInOutput
  )

  return(t(dReportTable))
}

#' GetModReportDeepSignal Function (GloModAn)
#'
#' Return a report with global characteristics of DNA modifications (Mod) distribution in the genome assembly provided.
#' (adapted to data from DeepSignal software)
#' @param dnastringsetGenome A DNAStringSet object containing the sequence for each contig.
#' @param grangesGenome A GRanges object containing the width of each contig.
#' @param gposDeepSignalMod An UnStitched GPos object containing DeepSignal modified sites data.
#' @param gposDeepSignalModBase An UnStitched GPos object containing DeepSignal modification target sites data.
#' @param cOrgAssemblyName The name of the genome assembly provided.
#' @param cBaseLetterForMod The name of the base letter of the modified base.
#' @param cModNameInOutput Name for the modification in the output.
#' @keywords GetModReportDeepSignal
#' @export
#' @examples
#' # preparing genome (simulated)
#' myGenome <- Biostrings::DNAStringSet(paste0(rep("ATCG", 100000), collapse = ""))
#' names(myGenome) <- "NC_000001.11"
#' myGrangesGenome <- GetGenomeGRanges(myGenome)
#'
#' # Loading Nanopore data
#' myDeepSignalModPath <- system.file(
#'   package = "DNAModAnnot", "extdata",
#'   "FAB39088-288418386-Chr1.CpG.call_mods.frequency.tsv"
#' )
#' mygposDeepSignalModBase <- ImportDeepSignalModFrequency(
#'   cDeepSignalModPath = myDeepSignalModPath,
#'   lSortGPos = TRUE,
#'   cContigToBeAnalyzed = "all"
#' )
#'
#' # Filtering
#' mygposDeepSignalMod <- FiltDeepSignal(
#'   gposDeepSignalModBase = mygposDeepSignalModBase,
#'   cParamNameForFilter = "frac",
#'   nFiltParamLoBoundaries = 0,
#'   nFiltParamUpBoundaries = 1,
#'   cFiltParamBoundariesToInclude = "upperOnly"
#' )$Mod
#'
#' # Mod report
#' myReport_Mod <- GetModReportDeepSignal(
#'   dnastringsetGenome = myGenome,
#'   grangesGenome = myGrangesGenome,
#'   gposDeepSignalMod = as(mygposDeepSignalMod, "GRanges"),
#'   gposDeepSignalModBase = as(mygposDeepSignalModBase, "GRanges"),
#'   cOrgAssemblyName = "Test_function",
#'   cBaseLetterForMod = "C", cModNameInOutput = "5mC"
#' )
#' myReport_Mod
GetModReportDeepSignal <- function(dnastringsetGenome,
                                   grangesGenome,
                                   gposDeepSignalMod,
                                   gposDeepSignalModBase,
                                   cOrgAssemblyName,
                                   cBaseLetterForMod,
                                   cModNameInOutput) {
  dReportTable <- data.frame(
    row.names = cOrgAssemblyName,
    nbMod = length(gposDeepSignalMod),
    nbSitesSequenced = length(gposDeepSignalModBase),
    nbBaseAssembly = letterFrequency(dnastringsetGenome, letters = cBaseLetterForMod, collapse = TRUE) +
      letterFrequency(reverseComplement(dnastringsetGenome), letters = cBaseLetterForMod, collapse = TRUE)
  )

  dReportTable$ratioMod <- dReportTable$nbMod / dReportTable$nbSitesSequenced
  dReportTable$ratioMod_corrected <- dReportTable$ratioMod * mean(gposDeepSignalMod$frac)
  dReportTable$ratioModAllAssemblyBases <- dReportTable$nbMod / dReportTable$nbBaseAssembly
  dReportTable$ratioModAllAssemblyBases_corrected <- dReportTable$ratioModAllAssemblyBases * mean(gposDeepSignalMod$frac)

  dReportTable$mean_prob1sum_Mod <- mean(gposDeepSignalMod$prob_1_sum)
  dReportTable$mean_countModified_Mod <- mean(gposDeepSignalMod$count_modified)
  dReportTable$mean_coverage_Mod <- mean(gposDeepSignalMod$coverage)
  dReportTable$mean_frac_Mod <- mean(gposDeepSignalMod$frac)

  colnames(dReportTable) <- gsub(
    pattern = "Base", replacement = cBaseLetterForMod,
    x = colnames(dReportTable)
  )
  colnames(dReportTable) <- gsub(
    pattern = "Mod", replacement = cModNameInOutput,
    x = colnames(dReportTable)
  )

  dReportTable <- .AddModMotifPctToDf(dReportTable,
    grangesModPos = gposDeepSignalMod, grangesGenome = grangesGenome,
    dnastringsetGenome = dnastringsetGenome,
    nModPositionInMotif = 1, nUpstreamBpToAdd = 0, nDownstreamBpToAdd = 1,
    cBaseLetterForMod = cBaseLetterForMod, cModNameInOutput = cModNameInOutput
  )


  dReportTable <- .AddModMotifPctToDf(dReportTable,
    grangesModPos = gposDeepSignalMod, grangesGenome = grangesGenome,
    dnastringsetGenome = dnastringsetGenome,
    nModPositionInMotif = 2, nUpstreamBpToAdd = 1, nDownstreamBpToAdd = 0,
    cBaseLetterForMod = cBaseLetterForMod, cModNameInOutput = cModNameInOutput
  )


  dReportTable <- .AddModMotifPctToDf(dReportTable,
    grangesModPos = gposDeepSignalMod, grangesGenome = grangesGenome,
    dnastringsetGenome = dnastringsetGenome,
    nModPositionInMotif = 2, nUpstreamBpToAdd = 1, nDownstreamBpToAdd = 1,
    cBaseLetterForMod = cBaseLetterForMod, cModNameInOutput = cModNameInOutput
  )

  return(t(dReportTable))
}

#' GetGRangesWindowSeqandParam Function (GloModAn)
#'
#' Return the GRanges object provided with the sequence associated to each position (and can also retrieve the sequence around each position).
#' @param dnastringsetGenome A DNAStringSet object containing the sequence for each contig.
#' @param grangesGenome A GRanges object containing the width of each contig.
#' @param grangesData A GRanges object containing Modifications Positions data to be extracted with the sequence.
#' @param nUpstreamBpToAdd Number of base pairs to add upstream of the range from the GRanges object provided
#' to obtain some sequence upstream of range. If some new ranges do not fit in the ranges of the contigs (provided with grangesGenome),
#' those new ranges will be removed. New windows with gaps are also removed. Defaults to 0.
#' @param nDownstreamBpToAdd Number of base pairs to add downstream of the range from the GRanges object provided
#' to obtain some sequence downstream of range. If some new ranges do not fit in the ranges of the contigs (provided with grangesGenome),
#' those new ranges will be removed. New windows with gaps are also removed. Defaults to 0.
#' @keywords GetGRangesWindowSeqandParam
#' @importFrom GenomicRanges resize width
#' @importFrom IRanges subsetByOverlaps
#' @importFrom BSgenome getSeq
#' @export
#' @examples
#' # loading genome
#' myGenome <- Biostrings::readDNAStringSet(system.file(
#'   package = "DNAModAnnot", "extdata",
#'   "ptetraurelia_mac_51_sca171819.fa"
#' ))
#' myGrangesGenome <- GetGenomeGRanges(myGenome)
#'
#' # Preparing a grangesPacBioGFF datasets
#' myGrangesPacBioGFF <-
#'   ImportPacBioGFF(
#'     cPacBioGFFPath = system.file(
#'       package = "DNAModAnnot", "extdata",
#'       "ptetraurelia.modifications.sca171819.gff"
#'     ),
#'     cNameModToExtract = "m6A",
#'     cModNameInOutput = "6mA",
#'     cContigToBeAnalyzed = names(myGenome)
#'   )
#'
#' # Retrieve GRanges with sequence
#' myPositions_Mod_Granges_wSeq <- GetGRangesWindowSeqandParam(
#'   grangesData = myGrangesPacBioGFF,
#'   grangesGenome = myGrangesGenome,
#'   dnastringsetGenome = myGenome,
#'   nUpstreamBpToAdd = 5,
#'   nDownstreamBpToAdd = 5
#' )
#' myPositions_Mod_Granges_wSeq
GetGRangesWindowSeqandParam <- function(grangesData,
                                        grangesGenome,
                                        dnastringsetGenome,
                                        nUpstreamBpToAdd = 0, nDownstreamBpToAdd = 0) {
  ans <- grangesData

  print("Window adjustment...")
  ans <- resize(ans, width = width(ans) + nUpstreamBpToAdd, fix = "end")
  ans <- resize(ans, width = width(ans) + nDownstreamBpToAdd, fix = "start")

  # remove windows not included completely in contig windows
  gr_a <- grangesGenome
  print("Removing Windows not included completely in contig windows...")
  ans <- subsetByOverlaps(ans, gr_a, ignore.strand = FALSE, type = "within")

  print("Getting sequence...")
  ans$sequence <- as.factor(as.character(getSeq(dnastringsetGenome, ans)))

  print("Window adjustment...")
  ans <- resize(ans, width = width(ans) - nUpstreamBpToAdd, fix = "end")
  ans <- resize(ans, width = width(ans) - nDownstreamBpToAdd, fix = "start")

  return(ans)
}

#' AddModMotifPctToDf Function (GloModAn)
#'
#' Return the dataframe provided with the motif percentages associated to modifications in the GRanges object.
#' @param dReportTable The dataframe in which the motif percentages must be included.
#' @param grangesModPos A GRanges object containing Modifications Positions data.
#' @param grangesGenome A GRanges object containing the width of each contig.
#' @param dnastringsetGenome A DNAStringSet object containing the sequence for each contig.
#' @param cBaseLetterForMod The name of the base letter of the modified base.
#' @param cModNameInOutput Name for the modification in the output.
#' @param nModPositionInMotif Number representing the position of the modification in the ranges
#' (after resizing if some base pairs were added). Defaults to 1.
#' @param nUpstreamBpToAdd Number of base pairs to add upstream of the range from the GRanges object provided
#' to obtain some sequence upstream of range. If some new ranges do not fit in the ranges of the contigs (provided with grangesGenome),
#' those new ranges will be removed. New windows with gaps are also removed. Defaults to 0.
#' @param nDownstreamBpToAdd Number of base pairs to add downstream of the range from the GRanges object provided
#' to obtain some sequence downstream of range. If some new ranges do not fit in the ranges of the contigs (provided with grangesGenome),
#' those new ranges will be removed. New windows with gaps are also removed. Defaults to 0.
#' @keywords internal
#' @importFrom BiocGenerics table
.AddModMotifPctToDf <- function(dReportTable,
                                grangesModPos,
                                grangesGenome,
                                dnastringsetGenome,
                                cBaseLetterForMod,
                                cModNameInOutput,
                                nModPositionInMotif = 1,
                                nUpstreamBpToAdd = 0,
                                nDownstreamBpToAdd = 0) {
  positions_Mod_Granges_wSeq <- GetGRangesWindowSeqandParam(
    grangesModPos, grangesGenome, dnastringsetGenome,
    nUpstreamBpToAdd, nDownstreamBpToAdd
  )

  motif_pct <- as.data.frame.matrix(
    t(table(positions_Mod_Granges_wSeq$sequence)) / sum(t(table(positions_Mod_Granges_wSeq$sequence)))
  )

  motif_pct <- .IncludeModPosInMot(motif_pct, cBaseLetterForMod, cModNameInOutput, nModPositionInMotif)

  names(motif_pct) <- paste("pct_", names(motif_pct), sep = "")

  dReportTable <- cbind(dReportTable, motif_pct)

  return(dReportTable)
}

#' IncludeModPosInMot Function (GloModAn)
#'
#' Return the numeric vector provided with new names to include the position of the modification.
#' @param nParamByMotif A numeric vector named with motifs.
#' @param cBaseLetterForMod The name of the base letter of the modified base.
#' @param cModNameInOutput Name for the modification in the output.
#' @param nModPositionInMotif Number representing the position of the modification in the ranges
#' (after resizing if some base pairs were added). Defaults to 1.
#' @param lExtractOnlyMotifWithMod If TRUE, the numeric vector will be returned with only the
#' motifs that include DNA modifications. Defaults to TRUE.
#' @keywords internal
.IncludeModPosInMot <- function(nParamByMotif,
                                cBaseLetterForMod,
                                cModNameInOutput,
                                nModPositionInMotif = 1,
                                lExtractOnlyMotifWithMod = TRUE) {
  pattern <- paste("^(.{", (nModPositionInMotif - 1), "})", cBaseLetterForMod, "(.*)$", sep = "")
  replacement <- paste("\\1", cModNameInOutput, "\\2", sep = "")
  names(nParamByMotif) <- gsub(
    pattern = pattern, replacement = replacement,
    x = names(nParamByMotif)
  )
  if (lExtractOnlyMotifWithMod) {
    nParamByMotif <- nParamByMotif[grep(cModNameInOutput, names(nParamByMotif))]
  }
  return(nParamByMotif)
}

#' GetModRatioByContig Function (GloModAn)
#'
#' Return a list with the Modification ratio (Mod ratio) by strand for all scaffolds of genome assembly provided.
#' For "b" as the base that can be modified, Mod ratio = Number of modified "b" / Total number of "b".
#' @param grangesModPos A GRanges object containing Modifications Positions data.
#' @param gposModTargetBasePos A GPos object containing Base Positions that can be targeted by the modification.
#' @param dnastringsetGenome A DNAStringSet object containing the sequence for each contig.
#' @param cBaseLetterForMod The name of the base letter of the modified base.
#' @keywords GetModRatioByContig
#' @export
#' @examples
#' # loading genome
#' myGenome <- Biostrings::readDNAStringSet(system.file(
#'   package = "DNAModAnnot", "extdata",
#'   "ptetraurelia_mac_51_sca171819.fa"
#' ))
#'
#' # Preparing a grangesPacBioGFF and gposPacBioCSV datasets
#' myGrangesPacBioGFF <-
#'   ImportPacBioGFF(
#'     cPacBioGFFPath = system.file(
#'       package = "DNAModAnnot", "extdata",
#'       "ptetraurelia.modifications.sca171819.gff"
#'     ),
#'     cNameModToExtract = "m6A",
#'     cModNameInOutput = "6mA",
#'     cContigToBeAnalyzed = names(myGenome)
#'   )
#'
#' myGposPacBioCSV <-
#'   ImportPacBioCSV(
#'     cPacBioCSVPath = system.file(
#'       package = "DNAModAnnot", "extdata",
#'       "ptetraurelia.bases.sca171819.csv"
#'     ),
#'     cSelectColumnsToExtract = c(
#'       "refName", "tpl", "strand", "base",
#'       "score", "ipdRatio", "coverage"
#'     ),
#'     lKeepExtraColumnsInGPos = TRUE, lSortGPos = TRUE,
#'     cContigToBeAnalyzed = names(myGenome)
#'   )
#' myGposPacBioCSV <- myGposPacBioCSV[myGposPacBioCSV$base == "A"]
#'
#' # Mod report
#' myMod_ratio_list <- GetModRatioByContig(
#'   grangesModPos = myGrangesPacBioGFF,
#'   gposModTargetBasePos = myGposPacBioCSV,
#'   dnastringsetGenome = myGenome,
#'   cBaseLetterForMod = "A"
#' )
#' myMod_ratio_list
GetModRatioByContig <- function(grangesModPos,
                                gposModTargetBasePos,
                                dnastringsetGenome,
                                cBaseLetterForMod) {
  mod_count_table <- data.frame(
    refName = as.factor(seqnames(grangesModPos)),
    strand = as.factor(strand(grangesModPos))
  )
  mod_count_table <- as.data.frame(table(mod_count_table))
  colnames(mod_count_table) <- c("refName", "strand", "Mod_count")

  baseS_count_table <- data.frame(
    refName = as.factor(seqnames(gposModTargetBasePos)),
    strand = as.factor(strand(gposModTargetBasePos))
  )
  baseS_count_table <- as.data.frame(table(baseS_count_table))
  colnames(baseS_count_table) <- c("refName", "strand", "BaseSequenced_count")

  dnastringsetGenome_order <- dnastringsetGenome[order(names(dnastringsetGenome)), ]
  baseA_count_table <- data.frame(
    names(dnastringsetGenome_order), "+",
    letterFrequency(dnastringsetGenome_order, letters = cBaseLetterForMod),
    width(dnastringsetGenome_order)
  )
  tmp_table <- data.frame(
    names(dnastringsetGenome_order), "-",
    letterFrequency(reverseComplement(dnastringsetGenome_order), letters = cBaseLetterForMod),
    width(dnastringsetGenome_order)
  )
  baseA_count_table <- rbind(baseA_count_table, tmp_table)
  colnames(baseA_count_table) <- c("refName", "strand", "BaseAssembly_count", "width")

  base_count_table <- merge(baseS_count_table,
    baseA_count_table,
    by = c("refName", "strand"),
    all.y = TRUE
  )

  mod_count_table <- merge(mod_count_table,
    base_count_table,
    by = c("refName", "strand"),
    all.y = TRUE
  )

  # those who have NA for one or both strands : replace NA by 0
  mod_count_table[is.na(mod_count_table$Mod_count), "Mod_count"] <- 0
  mod_count_table[is.na(mod_count_table$BaseSequenced_count), "BaseSequenced_count"] <- 0
  mod_count_table[is.na(mod_count_table$BaseAssembly_count), "BaseAssembly_count"] <- 0

  mod_count_table <- mod_count_table[with(mod_count_table, order(width, refName, strand, decreasing = TRUE)), ]

  mod_count_table$Mod_ratio <- mod_count_table$Mod_count / mod_count_table$BaseSequenced_count
  mod_count_table$Mod_ratio_AllAssemblyBases <- mod_count_table$Mod_count / mod_count_table$BaseAssembly_count

  f_strand <- subset(mod_count_table, strand == "+")
  r_strand <- subset(mod_count_table, strand == "-")

  return(list(
    mod_count_table = mod_count_table,
    f_strand = f_strand,
    r_strand = r_strand
  ))
}

#' DrawModLogo Function (GloModAn)
#'
#' Return a plot describing the sequence motif associated to the sequences provided.
#' Sequences that do not have full width and sequences that have some
#' N or some gaps "-" are automatically removed before drawing the sequence plot.
#'
#' This function reduce the background signal using the genomic composition and also computes
#' the signal of depleted bases among the sequence provided.
#' Positions that are fixed and displaying only one base are taken in account and avoid these two corrections.
#' It is also possible to tag some positions (+/- labels) in the logo.
#'
#' @param dnastringsetSeqAroundMod A DNAStringSet object containing the sequence around each DNA modification.
#' All these sequences must have the same size and the modification must have the same position in each sequence (e.g. at the center).
#' @param cYunit Units to be used for the y axis. Can be "ic" (information content),
#' "ic_hide_bg" (information content + hide low signal) or "prob" (probability).
#' "ic_hide_bg" will hide the letters on the upper panel (OR the lower panel) if these letters
#' have probabilities above (OR below) the background signal based on the genomic composition.
#' Defaults to "ic_hide_bg".
#' @param nGenomicBgACGT A numeric vector giving the background to be corrected with the genomic composition
#' in Adenine (A) then Cytosine (C) then Guanine (G) then Thymine (T). Defaults to c(A=0.25, C=0.25, G=0.25, T=0.25).
#' @param cColorsACGT A character vector giving the color for the Adenine (A) then Cytosine (C) then Guanine (G)
#' then Thymine (T) letters on the plot. Defaults to c(A="red2", C="blue2", G="orange2", T="green3").
#' @param lPlotNegYAxis If TRUE, allow plotting the lower panel to show depletion among the sequences provided. Defaults to TRUE.
#' @param nPositionsToAnnotate A numeric vector giving the positions to highlight on the logo using small triangular tags
#' (e.g. DNA modifications on a fixed position). Defaults to NULL.
#' @param cAnnotationText A character vector. If nPositionsToAnnotate is not NULL, this option can provide labels to be associated to
#' each annotation tag. If the vector's length is 1, it will be used for all tags to be displayed. Defaults to NULL.
#' @param nTagTextFontSize Font size for the text associated to the annotation tags. Defaults to 20.
#' @param nXFontSize Font size for the text associated to the x axis. Defaults to 15.
#' @param nYFontSize Font size for the text associated to the y axis. Defaults to 15.
#' @param lXAxis If TRUE, the x-axis is plotted. Defaults to TRUE.
#' @param lYAxis If TRUE, the y-axis is plotted. Defaults to TRUE.
#' @keywords DrawModLogo
#' @importFrom Biostrings consensusMatrix
#' @importFrom seqLogo makePWM
#' @importFrom stats setNames
#' @export
#' @examples
#' # loading genome
#' myGenome <- Biostrings::readDNAStringSet(system.file(
#'   package = "DNAModAnnot", "extdata",
#'   "ptetraurelia_mac_51_sca171819.fa"
#' ))
#' myGrangesGenome <- GetGenomeGRanges(myGenome)
#'
#' # Preparing a grangesPacBioGFF datasets
#' myGrangesPacBioGFF <-
#'   ImportPacBioGFF(
#'     cPacBioGFFPath = system.file(
#'       package = "DNAModAnnot", "extdata",
#'       "ptetraurelia.modifications.sca171819.gff"
#'     ),
#'     cNameModToExtract = "m6A",
#'     cModNameInOutput = "6mA",
#'     cContigToBeAnalyzed = names(myGenome)
#'   )
#'
#' # Retrieve GRanges with sequence
#' myPositions_Mod_Granges_wSeq <- GetGRangesWindowSeqandParam(
#'   grangesData = myGrangesPacBioGFF,
#'   grangesGenome = myGrangesGenome,
#'   dnastringsetGenome = myGenome,
#'   nUpstreamBpToAdd = 5,
#'   nDownstreamBpToAdd = 5
#' )
#'
#' DrawModLogo(
#'   dnastringsetSeqAroundMod = as(myPositions_Mod_Granges_wSeq$sequence, "DNAStringSet"),
#'   nGenomicBgACGT = c(0.35, 0.15, 0.15, 0.35),
#'   nPositionsToAnnotate = c(6), cAnnotationText = c("6mA")
#' )
 <- function(dnastringsetSeqAroundMod,
                        nGenomicBgACGT = c(A = 0.25, C = 0.25, G = 0.25, T = 0.25),
                        cYunit = "ic_hide_bg",
                        lPlotNegYAxis = TRUE,
                        cColorsACGT = c(A = "#4daf4a", C = "#377eb8", G = "#ffd92f", T = "#e41a1c"),
                        nPositionsToAnnotate = NULL,
                        cAnnotationText = NULL,
                        nTagTextFontSize = 20,
                        lXAxis = TRUE, lYAxis = TRUE,
                        nXFontSize = 15, nYFontSize = 15) {
  # remove all sequences that do not have full width
  dnastringsetSeqAroundMod <- dnastringsetSeqAroundMod[which(width(dnastringsetSeqAroundMod) == max(width(dnastringsetSeqAroundMod)))]
  # remove all sequences that have some N or some gaps "-"
  dnastringsetSeqAroundMod <- dnastringsetSeqAroundMod[which(letterFrequency(dnastringsetSeqAroundMod, letters = "N-") == 0)]


  mConsensus <- consensusMatrix(dnastringsetSeqAroundMod, as.prob = TRUE)
  mConsensus <- mConsensus[c("A", "C", "G", "T"), ]
  nFixedPositions <- which(apply(mConsensus, 2, function(x) any(x == 1)))

  # Background correction
  if (cYunit %in% c("ic", "ic_hide_bg")) {
    mConsensus <- t(t(mConsensus / nGenomicBgACGT) / colSums(mConsensus / nGenomicBgACGT))
  }

  pConsensus <- makePWM(mConsensus)

  nConsensus <- round(1 / (mConsensus + 0.00001), 4)
  nConsensus <- t(t(nConsensus) / colSums(nConsensus))

  if (cYunit %in% c("ic", "ic_hide_bg")) {
    nConsensus[, nFixedPositions] <- c(0.25, 0.25, 0.25, 0.25)
  } else {
    # if prob_scale: correct values for fixed positions
    nConsensus <- lapply(1:ncol(nConsensus), function(x) {
      if (any(mConsensus[, x] == 1)) {
        n <- setNames(rep(0, 4), nm = rownames(nConsensus))
        switch(names(mConsensus[mConsensus[, x] == 1, x]),
          "A" = n["T"] <- 1,
          "T" = n["A"] <- 1,
          "G" = n["C"] <- 1,
          "C" = n["G"] <- 1
        )
        return(n)
      }
      return(nConsensus[, x])
    })
    nConsensus <- do.call(cbind, nConsensus)
  }
  nConsensus <- makePWM(nConsensus)

  DrawLogoPosNegAxes(
    pwmUp = pConsensus, pwmDown = nConsensus, cColorsACGT = cColorsACGT,
    nPositionsToAnnotate = nPositionsToAnnotate,
    cAnnotationText = cAnnotationText, cYunit = cYunit,
    lPlotNegYAxis = lPlotNegYAxis,
    nGenomicBgACGT = nGenomicBgACGT,
    nTagTextFontSize = nTagTextFontSize,
    lXAxis = lXAxis, lYAxis = lYAxis, nXFontSize = nXFontSize, nYFontSize = nYFontSize
  )
}

#' DrawLogoPosNegAxes Function (GloModAn)
#'
#' Return a plot describing the sequence motif associated to the sequences provided.
#' Two panels are plotted: if cYunit option is used with "ic_hide_bg", data
#' from the lower panel should correspond to the depleted signal by comparison with upper panel
#' that should described the enriched signal.
#'
#' It is also possible to tag some positions (+/- labels) in the logo.
#'
#' @param pwmUp A seqLogo PWM (position weight matrix) object to be used for the upper panel.
#' @param pwmDown A seqLogo PWM (position weight matrix) object to be used for the lower panel. Should correspond to the depleted signal
#' (by comparison to the data provided with the pwmUp option).
#' @param cYunit Units to be used for the y axis. Can be "ic" (information content),
#' "ic_hide_bg" (information content + hide low signal) or "prob" (probability).
#' "ic_hide_bg" will hide the letters on the upper panel (OR the lower panel) if these letters
#' have probabilities above (OR below) the background signal based on the genomic composition.
#' Defaults to "ic_hide_bg".
#' @param nGenomicBgACGT A numeric vector giving the background to be corrected with the genomic composition
#' in Adenine (A) then Cytosine (C) then Guanine (G) then Thymine (T). Defaults to c(A=0.25, C=0.25, G=0.25, T=0.25).
#' @param cColorsACGT A character vector giving the color for the Adenine (A) then Cytosine (C) then Guanine (G)
#' then Thymine (T) letters on the plot. Defaults to c(A="red2", C="blue2", G="orange2", T="green3").
#' @param lPlotNegYAxis If TRUE, allow plotting the lower panel to show depletion among the sequences provided. Defaults to TRUE.
#' @param nPositionsToAnnotate A numeric vector giving the positions to highlight on the logo using small triangular tags
#' (e.g. DNA modifications on a fixed position). Defaults to NULL.
#' @param cAnnotationText A character vector. If nPositionsToAnnotate is not NULL, this option can provide labels to be associated to
#' each annotation tag. If the vector's length is 1, it will be used for all tags to be displayed. Defaults to NULL.
#' @param nTagTextFontSize Font size for the text associated to the annotation tags. Defaults to 20.
#' @param nXFontSize Font size for the text associated to the x axis. Defaults to 15.
#' @param nYFontSize Font size for the text associated to the y axis. Defaults to 15.
#' @param lXAxis If TRUE, the x-axis is plotted. Defaults to TRUE.
#' @param lYAxis If TRUE, the y-axis is plotted. Defaults to TRUE.
#' @keywords DrawLogoPosNegAxes
#' @importFrom seqLogo pwm ic
#' @importFrom grid grid.newpage plotViewport pushViewport dataViewport grid.abline grid.polygon unit grid.xaxis grid.yaxis grid.text gpar popViewport
#' @export
#' @examples
#' pwm1 <- matrix(c(
#'   0.25, 0.25, 0.25, 0.25,
#'   0.8, 0.05, 0.05, 0.1,
#'   0.33, 0.33, 0.33, 0.01
#' ), nrow = 4, byrow = FALSE)
#' row.names(pwm1) <- c("A", "C", "G", "T")
#' pwm2 <- t(t(1 / pwm1) / colSums((1 / pwm1)))
#'
#' pwm1 <- seqLogo::makePWM(pwm1)
#' pwm2 <- seqLogo::makePWM(pwm2)
#'
#' DrawLogoPosNegAxes(
#'   pwmUp = pwm1, pwmDown = pwm2,
#'   nPositionsToAnnotate = c(1, 3), cAnnotationText = c("Text?", "Depletion of T")
#' )
DrawLogoPosNegAxes <- function(pwmUp, pwmDown,
                               nGenomicBgACGT = c(A = 0.25, C = 0.25, G = 0.25, T = 0.25),
                               cYunit = "ic_hide_bg",
                               lPlotNegYAxis = TRUE,
                               cColorsACGT = c(A = "#4daf4a", C = "#377eb8", G = "#ffd92f", T = "#e41a1c"),
                               nPositionsToAnnotate = NULL,
                               cAnnotationText = NULL,
                               nTagTextFontSize = 20,
                               lXAxis = TRUE, lYAxis = TRUE,
                               nXFontSize = 15, nYFontSize = 15) {
  pUp <- pwm(pwmUp)
  pDo <- pwm(pwmDown)

  if (ncol(pUp) != ncol(pDo)) {
    stop("Error: pwmUp and pwmDown must have the same number of columns!")
  }
  npos <- ncol(pUp)

  if (cYunit == "ic_hide_bg") {
    ylim <- 2
    ylab <- "Information content"
    facs_up <- ic(pwmUp)
    facs_down <- ic(pwmDown)
    hideLetterUp <- pUp - nGenomicBgACGT <= 0
    hideLetterDo <- !hideLetterUp
  } else if (cYunit == "ic") {
    ylim <- 2
    ylab <- "Information content"
    facs_up <- ic(pwmUp)
    facs_down <- ic(pwmDown)
    hideLetterUp <- pUp == 0
    hideLetterDo <- pDo == 0
  } else if (cYunit == "prob") {
    ylim <- 1
    ylab <- "Probability"
    facs_up <- rep(1, npos)
    facs_down <- rep(1, npos)
    hideLetterUp <- pUp == 0
    hideLetterDo <- pDo == 0
  }

  charsUp <- rownames(pUp)
  charsDo <- rownames(pDo)
  lettersUp <- list(x = NULL, y = NULL, id = NULL, fill = NULL)
  lettersDo <- list(x = NULL, y = NULL, id = NULL, fill = NULL)

  wt <- 1
  x.pos <- 0

  for (j in 1:npos) {
    column <- pUp[, j]
    hts <- 0.95 * column * facs_up[j]
    letterOrder <- order(hts)

    y.pos <- 0
    for (i in 1:4) {
      letter <- charsUp[letterOrder[i]]
      ht <- hts[letterOrder[i]]
      if (hideLetterUp[letter, j]) {
        ht <- 0
      }

      lettersUp <- addLetter(
        lettersUp, letter, x.pos,
        y.pos, ht, wt, cColorsACGT
      )
      y.pos <- y.pos + ht + 0.01
    }

    column <- pDo[, j]
    hts <- -0.95 * column * facs_down[j]
    letterOrder <- order(-hts)

    y.pos <- 0
    for (i in 1:4) {
      letter <- charsDo[letterOrder[i]]
      ht <- hts[letterOrder[i]]
      if (hideLetterDo[letter, j]) {
        ht <- 0
      }

      oldYLen <- length(lettersDo$y)
      lettersDo <- addLetter(
        lettersDo, letter, x.pos,
        y.pos, ht, wt, cColorsACGT
      )
      newYLen <- length(lettersDo$y)

      # retrieve index of new values
      newValues <- seq(from = oldYLen + 1, to = newYLen, by = 1)

      # invert y-values of new letter
      old.y <- lettersDo$y[newValues]
      new.y <- (max(old.y) - old.y) + min(old.y)
      lettersDo$y[newValues] <- new.y

      y.pos <- y.pos + ht - 0.01
    }
    x.pos <- x.pos + wt
  }


  grid.newpage()

  bottomMargin <- ifelse(lXAxis, 2 + nXFontSize / 3.5, 2)
  leftMargin <- ifelse(lYAxis, 2 + nYFontSize / 3.5, 2)
  pushViewport(plotViewport(c(bottomMargin, leftMargin, 2, 2)))


  if (lPlotNegYAxis) {
    ylim1 <- min(lettersDo$y)
    ylim2 <- max(lettersUp$y) * 1.25
    if (tail(min(lettersDo$y):ylim2, 1) <= ylim) {
      ylim2 <- ylim * 1.25
      if (tail(min(lettersDo$y):ylim2, 1) <= ylim) {
        ylim2 <- ylim * 1.5
      }
    }
  } else {
    ylim1 <- 0
    ylim2 <- ylim
  }

  pushViewport(dataViewport(0:npos, ylim1:ylim2, name = "vp1"))

  grid.polygon(
    x = unit(lettersUp$x, "native"), y = unit(lettersUp$y, "native"),
    id = lettersUp$id, gp = gpar(fill = lettersUp$fill, col = "transparent")
  )
  if (lPlotNegYAxis) {
    grid.polygon(
      x = unit(lettersDo$x, "native"), y = unit(lettersDo$y, "native"),
      id = lettersDo$id, gp = gpar(fill = lettersDo$fill, col = "transparent")
    )
  }

  grid.abline(intercept = 0, slope = 0)

  if (!is.null(nPositionsToAnnotate)) {
    tag_x <- rep(nPositionsToAnnotate, each = 3) - rep(c(1, 0.5, 0), length(nPositionsToAnnotate))
    tag_y <- rep(ylim * 1.1, 3 * length(nPositionsToAnnotate)) - rep(c(0, 0.1, 0), length(nPositionsToAnnotate))

    # tag
    grid.polygon(
      x = unit(tag_x, "native"), y = unit(tag_y, "native"),
      id = rep(nPositionsToAnnotate, each = 3), gp = gpar(fill = c("orange"), col = "black")
    )

    if (!is.null(cAnnotationText)) {
      # text
      grid.text(
        label = cAnnotationText, x = unit(nPositionsToAnnotate - 0.5, "native"),
        y = unit(rep(ylim * 1.2, each = length(nPositionsToAnnotate)), "native"),
        gp = gpar(fontsize = nTagTextFontSize, col = "black")
      )
    }
  }

  if (lXAxis) {
    grid.xaxis(
      at = seq(0.5, npos - 0.5), label = seq_len(npos),
      gp = gpar(fontsize = nXFontSize)
    )
    grid.text("Position",
      y = unit(-3, "lines"),
      gp = gpar(fontsize = nXFontSize)
    )
  }
  if (lYAxis) {
    grid.yaxis(gp = gpar(fontsize = nYFontSize))
    grid.text(ylab,
      x = unit(-3, "lines"), rot = 90,
      gp = gpar(fontsize = nYFontSize)
    )
  }
  popViewport()
  popViewport()
}

#' addLetter Function from seqLogo package (GloModAn)
#'
#' Internal function from seqLogo package. See seqLogo documentation for more details.
#' User should use the addLetter function from seqLogo and should not use directly this version.
#' @keywords internal
addLetter <- function(letters, which, x.pos, y.pos, ht, wt, fill) {
  if (which == "A") {
    letter <- letterA(x.pos, y.pos, ht, wt, fill = fill["A"])
  }
  else if (which == "C") {
    letter <- letterC(x.pos, y.pos, ht, wt, fill = fill["C"])
  }
  else if (which == "G") {
    letter <- letterG(x.pos, y.pos, ht, wt, fill = fill["G"])
  }
  else if (which == "T") {
    letter <- letterT(x.pos, y.pos, ht, wt, fill = fill["T"])
  }
  else if (which == "U") {
    letter <- letterU(x.pos, y.pos, ht, wt, fill = fill["U"])
  }
  else {
    stop(sprintf("\"which\" must be one of %s", paste(names(fill),
      collapse = ", "
    )))
  }
  letters$x <- c(letters$x, letter$x)
  letters$y <- c(letters$y, letter$y)
  lastID <- ifelse(is.null(letters$id), 0, max(letters$id))
  letters$id <- c(letters$id, lastID + letter$id)
  letters$fill <- c(letters$fill, as.character(letter$fill))
  letters
}

#' letterA Function from seqLogo package (GloModAn)
#'
#' Internal function from seqLogo package. See seqLogo documentation for more details.
#' User should use the letterA function from seqLogo and should not use directly this version.
#' @keywords internal
letterA <- function(x.pos, y.pos, ht, wt, fill = "#61D04F", id = NULL) {
  x <- c(
    0, 4, 6, 2, 0, 4, 6, 10, 8, 4, 3.2, 6.8, 6.4, 3.6,
    3.2
  )
  y <- c(0, 10, 10, 0, 0, 10, 10, 0, 0, 10, 3, 3, 4, 4, 3)
  x <- 0.1 * x
  y <- 0.1 * y
  x <- .rescale(x)
  x <- x.pos + wt * x
  y <- y.pos + ht * y
  if (is.null(id)) {
    id <- c(rep(1L, 5), rep(2L, 5), rep(3L, 5))
  }
  else {
    id <- c(rep(id, 5), rep(id + 1L, 5), rep(id + 2L, 5))
  }
  fill <- rep(fill, 3)
  list(x = x, y = y, id = id, fill = fill)
}

#' letterG Function from seqLogo package (GloModAn)
#'
#' Internal function from seqLogo package. See seqLogo documentation for more details.
#' User should use the letterG function from seqLogo and should not use directly this version.
#' @keywords internal
letterG <- function(x.pos, y.pos, ht, wt, fill = "#F5C710", id = NULL) {
  angle1 <- seq(0.3 + pi / 2, pi, length = 100)
  angle2 <- seq(pi, 1.5 * pi, length = 100)
  x.l1 <- 0.5 + 0.5 * sin(angle1)
  y.l1 <- 0.5 + 0.5 * cos(angle1)
  x.l2 <- 0.5 + 0.5 * sin(angle2)
  y.l2 <- 0.5 + 0.5 * cos(angle2)
  x.l <- c(x.l1, x.l2)
  y.l <- c(y.l1, y.l2)
  x <- c(x.l, rev(x.l))
  y <- c(y.l, 1 - rev(y.l))
  x.i1 <- 0.5 + 0.35 * sin(angle1)
  y.i1 <- 0.5 + 0.35 * cos(angle1)
  x.i1 <- x.i1[y.i1 <= max(y.l1)]
  y.i1 <- y.i1[y.i1 <= max(y.l1)]
  y.i1[1] <- max(y.l1)
  x.i2 <- 0.5 + 0.35 * sin(angle2)
  y.i2 <- 0.5 + 0.35 * cos(angle2)
  x.i <- c(x.i1, x.i2)
  y.i <- c(y.i1, y.i2)
  x1 <- c(x.i, rev(x.i))
  y1 <- c(y.i, 1 - rev(y.i))
  x <- c(x, rev(x1))
  y <- c(y, rev(y1))
  h1 <- max(y.l1)
  r1 <- max(x.l1)
  h1 <- 0.4
  x.add <- c(r1, 0.5, 0.5, r1 - 0.2, r1 - 0.2, r1, r1)
  y.add <- c(h1, h1, h1 - 0.1, h1 - 0.1, 0, 0, h1)
  if (is.null(id)) {
    id <- c(rep(1, length(x)), rep(2, length(x.add)))
  }
  else {
    id <- c(rep(id, length(x)), rep(id + 1, length(x.add)))
  }
  x <- c(rev(x), x.add)
  y <- c(rev(y), y.add)
  x <- .rescale(x)
  x <- x.pos + wt * x
  y <- y.pos + ht * y
  fill <- rep(fill, 2)
  list(x = x, y = y, id = id, fill = fill)
}

#' letterC Function from seqLogo package (GloModAn)
#'
#' Internal function from seqLogo package. See seqLogo documentation for more details.
#' User should use the letterC function from seqLogo and should not use directly this version.
#' @keywords internal
letterC <- function(x.pos, y.pos, ht, wt, fill = "#2297E6", id = NULL) {
  angle1 <- seq(0.3 + pi / 2, pi, length = 100)
  angle2 <- seq(pi, 1.5 * pi, length = 100)
  x.l1 <- 0.5 + 0.5 * sin(angle1)
  y.l1 <- 0.5 + 0.5 * cos(angle1)
  x.l2 <- 0.5 + 0.5 * sin(angle2)
  y.l2 <- 0.5 + 0.5 * cos(angle2)
  x.l <- c(x.l1, x.l2)
  y.l <- c(y.l1, y.l2)
  x <- c(x.l, rev(x.l))
  y <- c(y.l, 1 - rev(y.l))
  x.i1 <- 0.5 + 0.35 * sin(angle1)
  y.i1 <- 0.5 + 0.35 * cos(angle1)
  x.i1 <- x.i1[y.i1 <= max(y.l1)]
  y.i1 <- y.i1[y.i1 <= max(y.l1)]
  y.i1[1] <- max(y.l1)
  x.i2 <- 0.5 + 0.35 * sin(angle2)
  y.i2 <- 0.5 + 0.35 * cos(angle2)
  x.i <- c(x.i1, x.i2)
  y.i <- c(y.i1, y.i2)
  x1 <- c(x.i, rev(x.i))
  y1 <- c(y.i, 1 - rev(y.i))
  x <- c(x, rev(x1))
  y <- c(y, rev(y1))
  x <- .rescale(x)
  x <- x.pos + wt * x
  y <- y.pos + ht * y
  if (is.null(id)) {
    id <- rep(1, length(x))
  }
  else {
    id <- rep(id, length(x))
  }
  fill <- rep(fill, 1)
  list(x = x, y = y, id = id, fill = fill)
}

#' letterT Function from seqLogo package (GloModAn)
#'
#' Internal function from seqLogo package. See seqLogo documentation for more details.
#' User should use the letterT function from seqLogo and should not use directly this version.
#' @keywords internal
letterT <- function(x.pos, y.pos, ht, wt, fill = "#DF536B", id = NULL) {
  x <- c(0, 10, 10, 6, 6, 4, 4, 0)
  y <- c(10, 10, 9, 9, 0, 0, 9, 9)
  x <- 0.1 * x
  y <- 0.1 * y
  x <- .rescale(x)
  x <- x.pos + wt * x
  y <- y.pos + ht * y
  if (is.null(id)) {
    id <- rep(1, 8)
  }
  else {
    id <- rep(id, 8)
  }
  fill <- rep(fill, 1)
  list(x = x, y = y, id = id, fill = fill)
}

#' letterU Function from seqLogo package (GloModAn)
#'
#' Internal function from seqLogo package. See seqLogo documentation for more details.
#' User should use the letterU function from seqLogo and should not use directly this version.
#' @keywords internal
letterU <- function(x.pos, y.pos, ht, wt, fill = "#DF536B", id = NULL) {
  angle1 <- seq(pi / 2, pi, length = 100)
  angle2 <- seq(pi, 1.5 * pi, length = 100)
  x.l1 <- 0.5 + 0.5 * sin(angle1)
  y.l1 <- 0.5 + 0.5 * cos(angle1)
  x.l2 <- 0.5 + 0.5 * sin(angle2)
  y.l2 <- 0.5 + 0.5 * cos(angle2)
  x.l <- c(x.l1, x.l2)
  y.l <- c(y.l1, y.l2)
  x.i1 <- 0.5 + 0.3 * sin(angle1)
  y.i1 <- 0.5 + 0.35 * cos(angle1)
  x.i1 <- x.i1[y.i1 <= max(y.l1)]
  y.i1 <- y.i1[y.i1 <= max(y.l1)]
  y.i1[1] <- max(y.l1)
  x.i2 <- 0.5 + 0.3 * sin(angle2)
  y.i2 <- 0.5 + 0.35 * cos(angle2)
  x.i <- c(x.i1, x.i2)
  y.i <- c(y.i1, y.i2)
  x <- c(x.l, 0, 0.2, 0.2, rev(x.i), 0.8, 0.8, 1, 1)
  y <- c(y.l, 1, 1, 0.5, rev(y.i), 0.5, 1, 1, 0.85)
  x <- .rescale(x)
  x <- x.pos + wt * x
  y <- y.pos + ht * y
  if (is.null(id)) {
    id <- rep(1, length(x))
  }
  else {
    id <- rep(id, length(x))
  }
  fill <- rep(fill, 1)
  list(x = x, y = y, id = id, fill = fill)
}

#' .rescale Function from seqLogo package (GloModAn)
#'
#' Internal function from seqLogo package. See seqLogo documentation for more details.
#' User should use the .rescale function from seqLogo and should not use directly this version.
#' @keywords internal
.rescale <- function(x, to = c(0.025, 0.975)) {
  from <- range(x, na.rm = TRUE, finite = TRUE)
  (x - from[1]) / diff(from) * diff(to) + to[1]
}

#' ExtractListModPosByModMotif Function (GloModAn)
#'
#' Return the GRanges object provided with the sequence associated to each position (and can also retrieve the sequence around each position).
#' @param grangesModPos A GRanges object containing Modifications Positions data to be extracted with the sequence.
#' @param grangesGenome A GRanges object containing the width of each contig.
#' @param dnastringsetGenome A DNAStringSet object containing the sequence for each contig.
#' @param nUpstreamBpToAdd Number of base pairs to add upstream of the range from the GRanges object provided
#' to obtain some sequence upstream of range. If some new ranges do not fit in the ranges of the contigs (provided with grangesGenome),
#' those new ranges will be removed. New windows with gaps are also removed. Defaults to 0.
#' @param nDownstreamBpToAdd Number of base pairs to add downstream of the range from the GRanges object provided
#' to obtain some sequence downstream of range. If some new ranges do not fit in the ranges of the contigs (provided with grangesGenome),
#' those new ranges will be removed. New windows with gaps are also removed. Defaults to 0.
#' @param nModMotifMinProp A number indicating the false discovery rate to be used for filtering: this will allow to choose the closest threshold
#' below this number. Defaults to 0.05 (so fdr of 5\%).
#' @param nModPositionInMotif The position of the modification in the window after resizing with nUpstreamBpToAdd and nDownstreamBpToAdd.
#' If GRanges are 1-bp positions, then 1+nUpstreamBpToAdd will return the right position of the modification.
#' Defaults to 1+nUpstreamBpToAdd.
#' @param cBaseLetterForMod The name of the base letter of the modified base.
#' @param cModNameInOutput Name for the modification in the output.
#' @return A list of 4 objects:
#' \describe{
#'   \item{motifs_to_analyse}{A character vector containing the sequence of motifs associated to the modification.}
#'   \item{mod_motif}{A character vector containing the sequence of motifs associated to the modification with the
#'   modification represented inside those motifs.}
#'   \item{motif_pct}{A table containing the percentage of modifications in each motif tested.}
#'   \item{GRangesbyMotif}{A list of GRanges objects with the sequence: one GRanges object by motif associated to the modification.}
#' }
#' @keywords ExtractListModPosByModMotif
#' @import GenomicRanges
#' @importFrom S4Vectors isEmpty
#' @export
#' @examples
#' # loading genome
#' myGenome <- Biostrings::readDNAStringSet(system.file(
#'   package = "DNAModAnnot", "extdata",
#'   "ptetraurelia_mac_51_sca171819.fa"
#' ))
#' myGrangesGenome <- GetGenomeGRanges(myGenome)
#'
#' # Preparing a grangesPacBioGFF dataset
#' myGrangesPacBioGFF <-
#'   ImportPacBioGFF(
#'     cPacBioGFFPath = system.file(
#'       package = "DNAModAnnot", "extdata",
#'       "ptetraurelia.modifications.sca171819.gff"
#'     ),
#'     cNameModToExtract = "m6A",
#'     cModNameInOutput = "6mA",
#'     cContigToBeAnalyzed = names(myGenome)
#'   )
#'
#' # Retrieve GRanges with sequence
#' myMotif_pct_and_GRangesList <- ExtractListModPosByModMotif(
#'   grangesModPos = myGrangesPacBioGFF,
#'   grangesGenome = myGrangesGenome,
#'   dnastringsetGenome = myGenome,
#'   nUpstreamBpToAdd = 0,
#'   nDownstreamBpToAdd = 1,
#'   nModMotifMinProp = 0.05,
#'   cBaseLetterForMod = "A",
#'   cModNameInOutput = "6mA"
#' )
#'
#' myMotif_pct_and_GRangesList$motifs_to_analyse
#' myMotif_pct_and_GRangesList$mod_motif
#' myMotif_pct_and_GRangesList$motif_pct
#' myMotif_pct_and_GRangesList$GRangesbyMotif
ExtractListModPosByModMotif <- function(grangesModPos,
                                        grangesGenome,
                                        dnastringsetGenome,
                                        nUpstreamBpToAdd = 0, nDownstreamBpToAdd = 1,
                                        nModMotifMinProp,
                                        nModPositionInMotif = 1 + nUpstreamBpToAdd,
                                        cBaseLetterForMod,
                                        cModNameInOutput) {
  ans <- GetGRangesWindowSeqandParam(
    grangesData = grangesModPos,
    grangesGenome = grangesGenome,
    dnastringsetGenome = dnastringsetGenome,
    nUpstreamBpToAdd = nUpstreamBpToAdd, nDownstreamBpToAdd = nDownstreamBpToAdd
  )

  motif_pct <- table(ans$sequence) / sum(table(ans$sequence))
  if (isEmpty(which(motif_pct >= nModMotifMinProp))) {
    print(paste("No ", unique(width(ans$sequence)), "nt motif is represented at least ", nModMotifMinProp * 100, "% of 6mA detected.", sep = ""))
    print("Motif width or minimum proportion might be too high.")
  } else {
    motif_to_analyze <- names(motif_pct[motif_pct >= nModMotifMinProp])
    ans <- ans[which(as.character(ans$sequence) %in% motif_to_analyze), ]
    ans <- split(ans, as.factor(ans$sequence))
    ans <- ans[lengths(ans) > 0]
  }
  mod_motif <- .IncludeModPosInMot(motif_pct, cBaseLetterForMod, cModNameInOutput, nModPositionInMotif)

  return(list(
    motifs_to_analyse = names(motif_pct[motif_pct >= nModMotifMinProp]),
    mod_motif = names(mod_motif[motif_pct >= nModMotifMinProp]),
    motif_pct = motif_pct,
    GRangesbyMotif = ans
  ))
}
AlexisHardy/DNAModAnnot documentation built on Feb. 27, 2023, 12:03 a.m.