R/chksum.R
In seguid: Sequence Globally Unique Identifier (SEGUID) Checksums

Documented in cdseguid csseguid ldseguid lsseguid seguid

#' @importFrom base64enc base64encode
b64encode <- function(s) {
  base64encode(s) 
}

b64encode_urlsafe <- function(s) {
  s <- b64encode(s)
  s <- gsub("+", "-", s, fixed = TRUE)
  s <- gsub("/", "_", s, fixed = TRUE)
  s
}

#' @import digest digest
sha1_b64encode <- function(seq) {
  checksum <- digest(seq, algo = "sha1", serialize = FALSE, raw = TRUE)
  checksum <- b64encode(checksum)

  ## Drop newlines (just in case)
  checksum <- sub("[\n]+$", "", checksum)

  ## SHA-1 (160 bits = 20 bytes = 40 hexadecimal character) needs
  ## at most 160/log2(64) = 26.6667 = 27 symbols. Base64 pads to
  ## multiples of 4 symbols, i.e. 28 symbols.  Thus, the last
  ## symbol is always a pad symbol, when using SHA-1. This is
  ## why we drop the last symbol.
  checksum <- sub("[=]$", "", checksum)

  checksum
}    

#' @import digest digest
sha1_b64encode_urlsafe <- function(seq) {
  checksum <- digest(seq, algo = "sha1", serialize = FALSE, raw = TRUE)
  checksum <- b64encode_urlsafe(checksum)

  ## Drop newlines (just in case)
  checksum <- sub("[\n]+$", "", checksum)

  ## SHA-1 (160 bits = 20 bytes = 40 hexadecimal character) needs
  ## at most 160/log2(64) = 26.6667 = 27 symbols. Base64 pads to
  ## multiples of 4 symbols, i.e. 28 symbols.  Thus, the last
  ## symbol is always a pad symbol, when using SHA-1. This is
  ## why we drop the last symbol.
  checksum <- sub("[=]$", "", checksum)

  checksum
}    


with_prefix <- function(s, prefix, form = c("long", "short", "both")) {
  form <- match.arg(form)
  
  checksum <- sub("^(|(l|c)(s|d))*seguid=", "", s)
  assert_checksum(checksum, prefix = "")

  if (form == "both") form <- c("short", "long")

  res <- character(0L)
  for (ff in form) {
    if (ff == "long") {
      res <- c(res, paste0(prefix, checksum))
    } else if (ff == "short") {
      res <- c(res, substr(checksum, start = 1L, stop = 6L))
    }
  }
  
  res
}

.seguid <- function(seq, alphabet, encoding, prefix = "") {
    assert_alphabet(alphabet)
    assert_in_alphabet(seq, alphabet = names(alphabet))
    stopifnot(is.function(encoding))
    stopifnot(length(prefix) == 1, is.character(prefix), !is.na(prefix))

    checksum <- encoding(seq)

    checksum <- paste0(prefix, checksum)
    assert_checksum(checksum, prefix = prefix)
    checksum
}


#' SEGUID checksums for linear, circular, single- and double-stranded sequences
#'
#' @param seq (character string) The sequence for which the checksum
#' should be calculated.  The sequence may only comprise of symbols
#' in the alphabet specified by the `alphabet` argument.
#'
#' @param alphabet (character string) The type of sequence used.
#' If `"{DNA}"` (default), then the input is a DNA sequence.
#' If `"{RNA}"`, then the input is an RNA sequence.
#' If `"{protein}"`, then the input is an amino-acid sequence.
#' If `"{DNA-extended}"` or `"{RNA-extended}"`, then the input is a
#' DNA or RNA sequence specified an extended set of symbols, including
#  IUPAC symbols (3).
#' If `"{protein-extended}"`, then the input is an amino-acid sequence
#' with an extended set of symbols, including IUPAC symbols (4).
#' A custom alphabet may also be used.
#' A non-complementary alphabet is specified as a comma-separated
#' set of single symbols, e.g. `"X,Y,Z"`.
#' A complementary alphabet is specified as a comma-separated
#' set of paired symbols, e.g. `"AT,CG"`.
#' It is also possible to extend a pre-defined alphabet, e.g.
#' `"{DNA},XY"`.
#'
#' @param form (character string) How the checksum is presented.
#' If `"long"` (default), the full-length checksum is outputted.
#' If `"short"`, the short, six-digit checksum is outputted.
#' If `"both"`, both the short and the long checksums are outputted.
#'
#' @return
#' The SEGUID functions return a single character string, if `form` is
#' either `"long"` or `"short"`. If `form` is `"both"`, then a character
#' vector of length two is return, where the first component holds the
#' "short" checksum and the second the "long" checksum.
#' The long checksum, without the prefix, is string with 27 characters.
#' The short checksum, without the prefix, is the first six characters
#' of the long checksum.
#' All checksums are prefixed with a label indicating which SEGUID
#' method was used.
#' Except for `seguid()`, which uses _base64_ encoding, all functions
#' produce checksums using the _base64url_ encoding ("Base 64 Encoding
#' with URL and Filename Safe Alphabet").
#'
#' `seguid()` calculates the SEGUID v1 checksum for a linear,
#' single-stranded sequence. 
#'
#' @section Base64 and Base64url encodings:
#' The base64url encoding is the base64 encoding with non-URL-safe characters
#' substituted with URL-safe ones. Specifically, the plus symbol (`+`) is
#' replaced by the minus symbol (`-`), and the forward slash (`/`) is
#' replaced by the underscore symbol (`_`).
#'
#' The Base64 checksum, which is used for the original SEGUID checksum,
#' is not guaranteed to comprise symbols that can
#' safely be used as-is in Uniform Resource Locator (URL). Specifically,
#' it may consist of forward slashes (`/`) and plus symbols (`+`), which
#' are characters that carry special meaning in a URL.
#' For the same reason, a Base64 checksum cannot safely be used
#' as a file or directory name, because it may have a forward slash.
#'
#' The checksum returned is always 27-character long. This is because the
#" SHA-1 hash (5) is 160-bit long (20 bytes), which result in the encoded
#' representation always end with a padding character (`=`) so that the 
#' length is a multiple of four character. We relax this requirement, by
#' dropping the padding character.
#'
#' @example incl/seguid.R
#'
#' @references
#' 1. Babnigg, G., Giometti, CS. A database of unique protein sequence
#'    identifiers for proteome studies. Proteomics.
#'    2006 Aug;6(16):4514-22. \doi{10.1002/pmic.200600032}.
#' 2. Josefsson, S., The Base16, Base32, and Base64 Data Encodings,
#'    RFC 4648, \doi{10.17487/RFC4648}, October 2006,
#'    <https://www.rfc-editor.org/info/rfc4648>.
#' 3. Wikpedia article 'Nucleic acid notation', February 2024.
#'    <https://en.wikipedia.org/wiki/Nucleic_acid_notation>.
#' 4. Wikpedia article 'Nucleic acid notation', February 2024,
#'    <https://en.wikipedia.org/wiki/Amino_acid>.
#' 5. Wikipedia article 'SHA-1' (Secure Hash Algorithm 1), December 2023.
#'    <https://en.wikipedia.org/wiki/SHA-1>.
#'
#' @importFrom base64enc base64encode
#' @importFrom digest digest
#' @export
seguid <- function(seq, alphabet = "{DNA}", form = c("long", "short", "both")) {
  if (nchar(seq) == 0) {
    stop("A sequence must not be empty")
  }
  
  alphabet2 <- get_alphabet(alphabet)
  with_prefix(.seguid(seq, alphabet = alphabet2, encoding = sha1_b64encode), prefix = "seguid=", form = form)
}


#' @return
#' `lsseguid()` calculates the SEGUID v2 checksum for a linear,
#' single-stranded sequence.
#'
#' @rdname seguid
#' @export
lsseguid <- function(seq, alphabet = "{DNA}", form = c("long", "short", "both")) {
  if (nchar(seq) == 0) {
    stop("A sequence must not be empty")
  }
  
  alphabet2 <- get_alphabet(alphabet)
  with_prefix(.seguid(seq, alphabet = alphabet2, encoding = sha1_b64encode_urlsafe), prefix = "lsseguid=", form = form)
}


#' @return
#' `csseguid()` calculates the SEGUID v2 checksum for a circular,
#' single-stranded sequence.
#'
#' @rdname seguid
#' @export
csseguid <- function(seq, alphabet = "{DNA}", form = c("long", "short", "both")) {
  if (nchar(seq) == 0) {
    stop("A sequence must not be empty")
  }
  
  with_prefix(lsseguid(rotate_to_min(seq), alphabet = alphabet), prefix = "csseguid=", form = form)
}


#' @param watson,crick (character strings) Two reverse-complementary DNA
#' sequences. Both sequences should be specified in the 5'-to-3' direction.
#'
#' @return
#' `ldseguid()` calculates the SEGUID v2 checksum for a linear,
#' double-stranded sequence.
#'
#' @rdname seguid
#' @export
ldseguid <- function(watson, crick, alphabet = "{DNA}", form = c("long", "short", "both")) {
  ## Make sure to collate in the 'C' locale
  old_locale <- Sys.getlocale("LC_COLLATE")
  on.exit(Sys.setlocale("LC_COLLATE", old_locale))
  Sys.setlocale("LC_COLLATE", "C")

  if (nchar(watson) == 0 || nchar(crick) == 0) {
    stop("A sequence must not be empty")
  }

  alphabet2 <- paste0(alphabet, "+[-;]")
  assert_complementary(watson, crick, alphabet = alphabet2)

  if (is_seq_less_than(watson, crick)) {
    spec <- paste(watson, crick, sep = ";")
  } else {
    spec <- paste(crick, watson, sep = ";")
  }
  with_prefix(lsseguid(spec, alphabet = alphabet2), prefix = "ldseguid=", form = form)
}


#' @return
#' `cdseguid()` calculates the SEGUID v2 checksum for a circular,
#' double-stranded sequence.
#'
#' @rdname seguid
#' @export
cdseguid <- function(watson, crick, alphabet = "{DNA}", form = c("long", "short", "both")) {
  if (nchar(watson) == 0 || nchar(crick) == 0) {
    stop("A sequence must not be empty")
  }
  
  stopifnot(nchar(watson) == nchar(crick))
  
  assert_complementary(watson, crick, alphabet = alphabet)

  amount_watson <- min_rotation(watson)
  watson_min <- rotate(watson, amount = amount_watson)
  
  amount_crick <- min_rotation(crick)
  crick_min <- rotate(crick, amount = amount_crick)

  ## Keep the "minimum" of the two variants
  if (is_seq_less_than(watson_min, crick_min)) {
      w <- watson_min
      c <- rotate(crick, amount = -amount_watson)
  } else {
      w <- crick_min
      c <- rotate(watson, amount = -amount_crick)
  }

  with_prefix(ldseguid(watson = w, crick = c, alphabet = alphabet), prefix = "cdseguid=", form = form)
}