R/Modstrings-ModStringCodec.R

Defines functions .new_ModStringCodec .convert_one_byte_codes_to_originating_base .convert_letters_to_originating_base .str_replace_all_regex_custom .str_replace_all_fixed_custom .convert_one_byte_codes_to_letters .check_for_invalid_letters .convert_letters_to_one_byte_codes .escape_special_charactes .load_mod_dictionary

#' @include Modstrings.R
#' @include Modstrings-external-C-calls.R
#' @include Modstrings-external-functions.R
NULL

additional_base_codes <- c(N = 15L, `-` = 16L, `+` = 32L, `.` = 64L)

.load_mod_dictionary <- function(file){
  on.exit({close(con)})
  con <- file(file)
  file <- stringr::str_split(readLines(con,
                                       encoding = "UTF-8"),
                             "\t")
  # remove commented lines
  file <- file[!substr(lapply(file,"[",1),1,1) %in% c("#","")]
  #
  m <- t(unname(data.frame(file,
                           stringsAsFactors = FALSE)))
  colnames(m) <- m[1,]
  m <- m[-1,]
  df <- DataFrame(m)
  df$name <- as.character(df$name)
  df$short_name <- as.character(df$short_name)
  df$abbrev <- as.character(df$abbrev)
  df$orig_base <- as.character(df$orig_base)
  df$nc <- as.character(df$nc)
  df$value <- as.integer(m[,"value"])
  df$oneByteInteger <- as.integer(m[,"oneByteInteger"])
  df$oneByteLetter <- unlist(lapply(lapply(as.integer(df$oneByteInteger),
                                           as.raw),
                                    rawToChar))
  df[,!(colnames(df) %in% c("oneByteInteger"))]
}

#load the dictionaries from file

MOD_DNA_BASE_CODES <- .load_mod_dictionary("inst/extdata/Mod_DNA_codes.txt")
MOD_RNA_BASE_CODES <- .load_mod_dictionary("inst/extdata/Mod_RNA_codes.txt")

################################################################################
# ModStringCodec is used to convert a strubg with multi byte letters into string
# with one byte letters. The resulting object can then be used like a BString.
# 
# However, this requires the conversion during printing or export. --> see below
# for conversion functions. These function are used by the Mod*String class
################################################################################

setClass("ModStringCodec",
         slots = c(letters = "character",
                   oneByteCodes = "character",
                   conversion = "logical",
                   originatingBase = "character",
                   values = "integer",
                   lettersEscaped = "character",
                   oneByteCodesEscaped = "character",
                   lettersNeedEscape = "logical",
                   oneByteCodesNeedEscape = "logical",
                   additionalInfo = "DataFrame"))

#' @importFrom stringr str_detect
.escape_special_charactes <- function(x){
  special_chars <- paste0("\\~\\!\\#\\$\\%\\^\\&\\*\\{\\}\\+\\:",
                          "\\\\\"\\?\\,\\.\\/\\'\\[\\]\\-\\|")
  f <- which(stringr::str_detect(x,paste0("[:punct:]|[",special_chars,"]")))
  x[f] <- paste0("\\",x[f])
  x
}

################################################################################
# additional sequence conversion functions, since some characters are longer
# than one byte. They need to encoded first using the information in the 
# modification base code DataFrame.
####
# The incoming string does need to be UTF-8 encoded and the outgoing must be
# ASCII one byte letters
################################################################################

#' @importFrom stringi stri_enc_toutf8 stri_enc_toascii stri_enc_get 
#' stri_enc_tonative
.convert_letters_to_one_byte_codes <- function(string, codec){
  string <- stringi::stri_enc_toutf8(string)
  .check_for_invalid_letters(string, codec)
  obc_string <- .str_replace_all_regex_custom(
    string,
    lettersEscaped(codec)[conversion(codec)],
    oneByteCodes(codec)[conversion(codec)])
  if(stringi::stri_enc_get() == "UTF-8"){
    return(obc_string)
  }
  return(stringi::stri_enc_tonative(obc_string))
}

.check_for_invalid_letters <- function(string, codec){
  letters_in_string <- unique(strsplit(string,"")[[1]])
  if(any(!(letters_in_string %in% letters(codec)))){
    # print has to be used because message changes the encoding of the output
    cat(paste(letters_in_string[!(letters_in_string %in% letters(codec))],
              collapse = ""))
    stop("Invalid character(s) - see above",
         call. = FALSE)
  }
}

.convert_one_byte_codes_to_letters <- function(obc_string, codec){
  obc_string <- stringi::stri_enc_toutf8(obc_string)
  if(stringi::stri_enc_get() == "UTF-8"){
    string <- .str_replace_all_fixed_custom(
      obc_string,
      oneByteCodes(codec)[conversion(codec)],
      letters(codec)[conversion(codec)])
  } else {
    string <- .str_replace_all_regex_custom(
      obc_string,
      oneByteCodesEscaped(codec)[conversion(codec)],
      letters(codec)[conversion(codec)])
  }
  return(string) 
}

#' @importFrom stringi stri_locate_all_fixed stri_sub
.str_replace_all_fixed_custom <- function(string, pattern, replacement){
  locations <- stringi::stri_locate_all_fixed(string, pattern,
                                              omit_no_match = TRUE)
  f <- which(!vapply(locations,function(l){nrow(l) == 0},logical(1)))
  # Currently now idea how to avoid the loops
  for(i in f){
    loc <- locations[[i]]
    for(j in seq_len(nrow(loc))){
      stringi::stri_sub(string,loc[j,"start"],loc[j,"end"]) <- 
        replacement[i]
    }
  }
  string
}
#' @importFrom stringi stri_locate_all_regex stri_sub
.str_replace_all_regex_custom <- function(string, pattern, replacement){
  locations <- stringi::stri_locate_all_regex(string, pattern,
                                              omit_no_match = TRUE)
  f <- which(!vapply(locations,function(l){nrow(l) == 0},logical(1)))
  # Currently now idea how to avoid the loops
  for(i in f){
    loc <- locations[[i]]
    for(j in seq_len(nrow(loc))){
      stringi::stri_sub(string,loc[j,"start"],loc[j,"end"]) <- 
        replacement[i]
    }
  }
  string
}

.convert_letters_to_originating_base <- function(string, codec){
  string <- stringi::stri_enc_toutf8(string)
  orig_string <- .str_replace_all_regex_custom(
    string,
    lettersEscaped(codec),
    originatingBase(codec))
  if(stringi::stri_enc_get() == "UTF-8"){
    return(orig_string)
  }
  return(stringi::stri_enc_tonative(orig_string))
}

.convert_one_byte_codes_to_originating_base <- function(obc_string, codec){
  obc_string <- stringi::stri_enc_toutf8(obc_string)
  if(stringi::stri_enc_get() == "UTF-8"){
    orig_string <- .str_replace_all_fixed_custom(
      obc_string,
      oneByteCodes(codec),
      originatingBase(codec))
  } else {
    orig_string <- .str_replace_all_regex_custom(
      obc_string,
      oneByteCodesEscaped(codec),
      originatingBase(codec))
  }
  return(orig_string) 
}

################################################################################
# get Biostring like base codes, alphabet and Codec object
# the codec object is not inherited from Biostrings package, but is
# used for one byte conversion only

.new_ModStringCodec <- function(base_codes, biostrings_base_codes){
  letters <- base_codes$abbrev
  oneByteCodes <- base_codes$oneByteLetter
  orig_base <- base_codes$orig_base
  values <- base_codes$value
  extra_letters <- biostrings_base_codes
  additionalInfo <- base_codes[,c("name","short_name","nc","orig_base","abbrev")]
    lengths <- unique(c(length(letters),
                        length(oneByteCodes),
                        length(values),
                        length(orig_base)))
  if(length(lengths) != 1){
    stop("ModStringCodec: Input do not have the same length.")
  }
  # remove empty letters. this is four neutrality against currently 
  # unsupported modifications. However they can be part of the
  # additionalInfo, which is used for the construction of the 
  # sanitization dictionaries
  f <- letters == ""
  letters <- letters[!f]
  oneByteCodes <- oneByteCodes[!f]
  values <- values[!f]
  orig_base <- orig_base[!f]
  # 
  letters <- c(letters,names(extra_letters))
  oneByteCodes <- c(oneByteCodes,names(extra_letters))
  originatingBase <- c(orig_base,names(extra_letters))
  values <- c(values,unname(extra_letters))
  # originating base must be in the extra_letter or empty
  if(!all(originatingBase %in% c(names(extra_letters),""))){
    stop("Not all originating bases are in the extra letters.")
  }
  # order based on the values in ascending order
  f <- order(values)
  letters <- letters[f]
  oneByteCodes <- oneByteCodes[f]
  originatingBase <- originatingBase[f]
  values <- values[f]
  # escape necessary values:
  lettersEscaped <- .escape_special_charactes(letters)
  lettersNeedEscape <- letters != lettersEscaped
  oneByteCodesEscaped <- .escape_special_charactes(oneByteCodes)
  oneByteCodesNeedEscape <- oneByteCodes != oneByteCodesEscaped
  # check which letters need conversion
  # and control input
  conversion <- vapply(letters,
                       function(l){
                         length(charToRaw(l)) > 1
                       },
                       logical(1))
  checkConversionValid <- vapply(c(letters[!which(conversion)],
                                   oneByteCodes[which(conversion)]),
                                 function(l){
                                   length(charToRaw(l)) > 1
                                 },
                                 logical(1))
  if(any(checkConversionValid)){
    stop("Not all letters have a proper one byte conversion.")
  }
  new("ModStringCodec", letters = letters, oneByteCodes = oneByteCodes, 
      conversion = conversion, originatingBase = originatingBase,
      values = values, lettersEscaped = lettersEscaped,
      oneByteCodesEscaped = oneByteCodesEscaped, 
      lettersNeedEscape = lettersNeedEscape,
      oneByteCodesNeedEscape = oneByteCodesNeedEscape, 
      additionalInfo = additionalInfo)
}

MOD_DNA_STRING_CODEC <- .new_ModStringCodec(MOD_DNA_BASE_CODES,
                                            c(.DNA_BASE_CODES, 
                                              additional_base_codes))
MOD_RNA_STRING_CODEC <- .new_ModStringCodec(MOD_RNA_BASE_CODES,
                                            c(.RNA_BASE_CODES,
                                              additional_base_codes))

# accessors --------------------------------------------------------------------

setMethod("letters", "ModStringCodec",
          function(x) x@letters)
setMethod("oneByteCodes", "ModStringCodec",
          function(x) x@oneByteCodes)
setMethod("conversion", "ModStringCodec",
          function(x) x@conversion)
setMethod("originatingBase", "ModStringCodec",
          function(x) x@originatingBase)
setMethod("values", "ModStringCodec",
          function(x) x@values)
setMethod("lettersEscaped", "ModStringCodec",
          function(x) x@lettersEscaped)
setMethod("oneByteCodesEscaped", "ModStringCodec",
          function(x) x@oneByteCodesEscaped)
setMethod("lettersNeedEscape", "ModStringCodec",
          function(x) x@lettersNeedEscape)
setMethod("oneByteCodesNeedEscape", "ModStringCodec",
          function(x) x@oneByteCodesNeedEscape)
setMethod("additionalInfo", "ModStringCodec",
          function(x) x@additionalInfo)
FelixErnst/Modstrings documentation built on March 26, 2024, 1:24 p.m.