R/fold.R

Defines functions fold

Documented in fold

#' Get the methylation pattern from a hairpin sequence.
#'
#' @param sq sequence
#' @param linker linker sequene
#' @param barlen length of barcode sequence
#' @param side which side the hairpin is in relation to the main double stranded sequence
#' @param sqlen sequence length (optional)
#' @export
fold <- function(sq, linker, barlen, side = "right", sqlen = NA) {
  ret <- list(
    sq = sq,
    match = FALSE,
    flip = FALSE,
    error = FALSE,
    error_msg = "",
    CpG_sum = 0,
    summary_top = "",
    summary_bot = ""
  )

  # find linker sequence in input sequence
  while (TRUE) {
    sq_m_linker <- sq_match(ret$sq, linker, soft = TRUE)
    if (any(sq_m_linker)) {
      ret$sq_m_linker <- sq_m_linker
      ret$match <- TRUE
      break
    } else if (ret$flip == FALSE) {
      ret$sq <- rev(sq_comp(ret$sq))
      ret$flip <- TRUE
    } else {
      ret$error <- TRUE
      ret$error_msg <- paste0(
        ret$error_msg,
        "ERROR 1: linker sequence not found in input sequence"
      )
      break
    }
  }
  if (ret$error) {
    ret$summary <- paste(c(
      ret$error_msg, "\n"
    ), collapse = "")
    return(ret)
  }

  # check for duplicate linkers
  if (sum(ret$sq_m_linker) / length(linker) != 1) {
    ret$error <- TRUE
    ret$error_msg <- paste0(
      ret$error_msg,
      "ERROR 2: duplicate linker sequences found in input sequence"
    )
  }
  if (ret$error) {
    ret$summary <- paste(c(
      ret$error_msg, "\n"
    ), collapse = "")
    return(ret)
  }

  # define top, bottom and bar sequences
  min <- min(which(ret$sq_m_linker))
  max <- max(which(ret$sq_m_linker))
  ret$top_unxt <- ret$sq[1:(min - 1)]
  ret$bar <- ret$sq[min:max][linker == "N"]
  ret$bot_unxt <- rev(ret$sq[(max + 1):length(ret$sq)])

  # make top and bottom sequences the same length
  if (length(ret$top_unxt) != length(ret$bot_unxt)) {
    ret$large <- max(length(ret$top_unxt), length(ret$bot_unxt))
    ret$small <- min(length(ret$top_unxt), length(ret$bot_unxt))
    ret$dif <- ret$large - ret$small
    ret$filler <- rep("-", ret$dif)
    if (length(ret$top_unxt) < length(ret$bot_unxt)) {
      ret$top_uncut <- c(ret$filler, ret$top_unxt)
      ret$bot_uncut <- ret$bot_unxt
    } else {
      ret$bot_uncut <- c(ret$filler, ret$bot_unxt)
      ret$top_uncut <- ret$top_unxt
    }
  } else {
    ret$top_uncut <- ret$top_unxt
    ret$bot_uncut <- ret$bot_unxt
  }

  ret$top_unfixed <- ret$top_uncut
  ret$bot_unfixed <- ret$bot_uncut

  # fix base errors

  fixed <- compare_fix(ret$top_unfixed, ret$bot_unfixed)
  ret$top <- c(fixed$top)
  ret$bot <- c(fixed$bot)
  ret$shift_count <- fixed$shift_count
  ret$excise_count <- fixed$excise_count

  # trim sequence (optional)
  if (!is.na(sqlen)) {
    ret$top <- ret$top[(length(ret$top) - sqlen):length(ret$top)]
    ret$bot <- ret$bot[(length(ret$bot) - sqlen):length(ret$bot)]
  }

  # flip
  if (side == "left") {
    temp <- ret$top_unxt
    ret$top_unxt <- rev(ret$bot_unxt)
    ret$bot_unxt <- rev(temp)

    temp <- ret$top_uncut
    ret$top_uncut <- rev(ret$bot_uncut)
    ret$bot_uncut <- rev(temp)

    temp <- ret$top_unfixed
    ret$top_unfixed <- rev(ret$bot_unfixed)
    ret$bot_unfixed <- rev(temp)

    temp <- ret$top
    ret$top <- rev(ret$bot)
    ret$bot <- rev(temp)
  }

  # make logical lists corresponding to cpg site methylation state
  topMC <- sq_match(ret$top, "C")
  botMC <- sq_match(ret$bot, "C")
  topMCG <- sq_match(ret$top, c("C", "G"))
  botMGC <- sq_match(ret$bot, c("G", "C"))
  topMTG <- sq_match(ret$top, c("T", "G"))
  botMGT <- sq_match(ret$bot, c("G", "T"))

  ret$fullmeth <- sq_match((topMCG & botMGC), c("TRUE", "TRUE"))
  ret$unmeth <- sq_match((topMTG & botMGT), c("TRUE", "TRUE"))
  ret$top_hemimeth <- sq_match((topMCG & botMGT), c("TRUE", "TRUE"))
  ret$top_meth <- sq_match((ret$top_hemimeth | ret$fullmeth), c("TRUE", "TRUE"))
  ret$bot_hemimeth <- sq_match((topMTG & botMGC), c("TRUE", "TRUE"))
  ret$bot_meth <- ret$bot_hemimeth | ret$fullmeth
  ret$CpG <- ret$top_meth | ret$bot_meth | ret$unmeth

  # identify non-cpg site methylation
  ret$noncpgmeth <- (topMC | botMC) & !ret$CpG
  ret$noncpgmeth_sum <- sum(ret$noncpgmeth)

  # make logical lists of cpg states only
  tmp <- seq(1, sum(ret$CpG), 2)
  ret$CpG_fullmeth <- ret$fullmeth[ret$CpG][tmp]
  ret$CpG_unmeth <- ret$unmeth[ret$CpG][tmp]
  ret$CpG_top_hemimeth <- ret$top_hemimeth[ret$CpG][tmp]
  ret$CpG_top_meth <- ret$top_meth[ret$CpG][tmp]
  ret$CpG_bot_hemimeth <- ret$bot_hemimeth[ret$CpG][tmp]
  ret$CpG_bot_meth <- ret$bot_meth[ret$CpG][tmp]

  # reverse convert sequence
  ret$top_revconv <- ret$top
  ret$top_revconv[topMC] <- "C"
  ret$top_revconv[ret$top == "T" & !ret$bot == "A"] <- "C"
  ret$bot_revconv <- ret$bot
  ret$bot_revconv[botMC] <- "C"
  ret$bot_revconv[ret$bot == "T" & !ret$top == "A"] <- "C"

  # calculate statistics
  ret$CpG_sum <- sum(ret$CpG / 2)
  ret$U <- sum(ret$CpG_unmeth) / ret$CpG_sum
  ret$m <- (sum(ret$CpG_top_meth) + sum(ret$CpG_bot_meth)) / (ret$CpG_sum * 2)
  ret$rcp <- (sqrt(ret$U * (ret$U + 2 * ret$m - 1))) / (1 - ret$U - ret$m)

  # make summary for text output
  temp_CpG <- rep("|", length(ret$bot))
  temp_CpG[!(ret$top %in% c("G", "A", "T", "C")) |
    !(ret$bot %in% c("G", "A", "T", "C"))] <- " "
  temp_CpG[ret$CpG] <- "*"
  temp_topMC <- rep(" ", length(ret$top))
  temp_topMC[topMC] <- "m"
  temp_botMC <- rep(" ", length(ret$bot))
  temp_botMC[botMC] <- "m"
  temp_top_CpG <- rep("\U25A1", ret$CpG_sum)
  temp_top_CpG[ret$CpG_top_meth] <- "\U25A0"
  temp_bot_CpG <- rep("\U25A1", ret$CpG_sum)
  temp_bot_CpG[ret$CpG_bot_meth] <- "\U25A0"

  ret$summary_top <- temp_top_CpG
  ret$summary_bot <- temp_bot_CpG

  ret$summary <- paste(c(
    temp_topMC, "\n",
    # ret$top_unxt, "\n",
    # ret$top_uncut, "\n",
    # ret$top_unfixed, "\n",
    ret$top, "\n",
    # ret$top_revconv, "\n",
    temp_CpG, "\n",
    # ret$bot_revconv, "\n",
    ret$bot, "\n",
    # ret$bot_unfixed, "\n",
    # ret$bot_uncut, "\n",
    # ret$bot_unxt, "\n",
    temp_botMC, "\n",
    "No. CpG Sites = ", ret$CpG_sum, "\n",
    "(Fully-methylated) = ", sum(ret$CpG_fullmeth), "\n",
    "(Hemi-methylated) = ", sum(ret$CpG_top_hemimeth) + sum(ret$CpG_bot_hemimeth), "\n",
    "(Un-methylated) = ", sum(ret$CpG_unmeth), "\n\n",
    "U: ", round(ret$U, 2), " m: ", round(ret$m, 2), "\n\n",
    "Non-CpG methylation = ", ret$noncpgmeth_sum, "\n",
    "Alignment shifts = ", ret$shift_count, "\n",
    "Unfixable erroneous bases = ", ret$excise_count
  ), collapse = "")

  return(ret)
}
g-r-eg/fold2 documentation built on Nov. 4, 2019, 1 p.m.