#' Make a molecular formula object
#'
#' @description This function generates a molecular formula object.
#'
#' @param MolForm A molecular formula written as a string like "C6H12O6". Only
#' elements up to uranium are supported.
#'
#' @details A list of length 92 (Hydrogen to Uranium) is stored in the attributes
#' and used for quick mathematics. All numerics are rounded to the nearest whole
#' number.
#'
#' @examples
#' \dontrun{
#'
#' # Generate a molecular formula object
#' as.molform("C6H12O6")
#'
#' # Duplicates should be added
#' as.molform("CHCO3")
#'
#' # This should fail, as 'Ot' and 'Lp' are not elements
#' as.molform("C6H12Ot5Lp2")
#'
#' # Negative elements are allowed (useful for modifications where elements are lost)
#' as.molform("H-3N-1")
#'
#' }
#'
#' @export
as.molform <- function(MolForm) {
##################
## CHECK INPUTS ##
##################
# Check that the molecular formula is a string
if (!is.character(MolForm) | length(MolForm) != 1) {
stop("MolForm must be a single string.")
}
# Split out elements
CountElements <- CHNOSZ::count.elements(MolForm)
# List acceptable elements
AcceptableElements <- c(
"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al",
"Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe",
"Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr",
"Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
"Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm",
"Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W",
"Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn",
"Fr", "Ra", "Ac", "Th", "Pa", "U"
)
# Check for non-established elements
NonEstablished <- names(CountElements)[names(CountElements) %in% AcceptableElements == FALSE]
if (length(NonEstablished) != 0) {
stop(paste("The elements", paste0(NonEstablished, collapse = ", "),
"are not supported at this time."))
}
#########################
## GENERATE ATTRIBUTES ##
#########################
# Round counts
CountElements <- round(CountElements)
# Fill an atomic vector
AtomList <- rep(0, length(AcceptableElements))
names(AtomList) <- AcceptableElements
AtomList[names(CountElements)] <- CountElements
# Add a class
class(AtomList) <- c(class(AtomList), "molform")
# Return the object
return(AtomList)
}
#' Collapse a molform object to a simple molecular formula
#'
#' @description A simple printing option for an object of the as.molform class.
#'
#' @examples
#' \dontrun{
#' collapse_molform(as.molform("H12C6O6"))
#' }
#' @export
collapse_molform <- function(molform) {
###################
## CHECK INPUTS ##
##################
# Molform should be an object of the molform class
if (!inherits(molform, "molform")) {
stop("molform should be an object from the molform class.")
}
######################
## COLLAPSE MOLFORM ##
######################
# Order by Hill
HillOrder <- c("C", "H", names(molform) %>% .[. %in% c("H", "C") == FALSE] %>% sort())
# The order should be C, H, alphabetical
molform <- molform[HillOrder]
# Remove 0
collapsed <- molform[molform != 0]
# Paste results
return(paste0(names(collapsed), collapsed, sep = "", collapse = ""))
}
#' Get average mass from the molecular formula
#'
#' @description Calculate the average mass for a molform object.
#'
#' @param molform An object of the as.molform class
#'
#' @examples
#' \dontrun{
#'
#' get_mw(as.molform("C6H12O6"))
#'
#' }
#' @export
get_mw <- function(molform) {
##################
## CHECK INPUTS ##
##################
# Molform should be an object of the molform class
if (!inherits(molform, "molform")) {
stop("molform should be an object from the molform class.")
}
######################
## CALCULATE RESULT ##
######################
AverageMasses <- c(1.007940721855, 4.00260164839362, 6.9400370562, 9.012182, 10.811027768,
12.0107358985, 14.0067430888, 15.99940530471, 18.998403, 20.1800463039,
22.98977, 24.3050518651, 26.981538, 28.085413282599, 30.973762,
32.0660849916, 35.45253851, 39.098301439102, 39.947676594063,
40.07802265518, 44.95591, 47.8667497092, 50.9414719975, 51.99613763897,
54.93805, 55.84515013384, 58.693356464998, 58.9332, 63.5456439019,
65.3955668951, 69.72307154608, 72.6127589578, 74.921596, 78.9593889728,
79.9035286243, 83.7993250788, 85.4676637502, 87.6166459844, 88.905848,
91.2236473858, 92.906378, 95.9312915794, 97.907216, 101.0649451127,
102.905504, 106.4153280285, 107.86815069743, 112.4115526683,
114.8180858507, 118.7101106395, 121.7597883042, 126.904468, 127.6031253845,
131.2924806487, 132.905447, 137.32688569073, 138.9054486831,
140.11572154786, 140.907648, 144.236126978, 144.912744, 150.3663440037,
151.964366222, 157.2521192511, 158.925343, 162.4970300387, 164.930319,
167.2563010726, 168.934211, 173.0376918021, 174.9667175726, 178.4849709358,
180.9478759364, 183.8417786794, 186.20670567, 190.2248610803,
192.216053791, 195.07779131504, 196.966552, 200.5991493631, 204.38331701508,
207.21689158, 208.980383, 208.982416, 209.987131, 222.01757,
223.019731, 226.025403, 227.027747, 231.035879, 232.03805, 238.028913066965
)
return((molform * AverageMasses) %>% sum())
}
#' Get monoisotopic mass from the molecular formula
#'
#' @description Calculate the monoisotopic mass for a molform object. No charge is assumed.
#'
#' @param molform An object of the as.molform class
#'
#' @examples
#' \dontrun{
#'
#' get_monoisotopic(as.molform("C6H12O6"))
#'
#' }
#' @export
get_monoisotopic <- function(molform) {
##################
## CHECK INPUTS ##
##################
# Molform should be an object of the molform class
if (!inherits(molform, "molform")) {
stop("molform should be an object from the molform class.")
}
######################
## CALCULATE RESULT ##
######################
MonoisotopicMasses <- c(1.007825, 3.016029, 6.015122, 9.012182,
10.012937, 12, 14.003074, 15.994915, 18.998403, 19.99244, 22.98977,
23.985042, 26.981538, 27.976927, 30.973762, 31.972071, 34.968853,
35.967546, 38.963707, 39.962591, 44.95591, 45.952629, 49.947163,
49.94605, 54.93805, 53.939615, 58.9332, 57.935348, 62.929601,
63.929147, 68.925581, 69.92425, 74.921596, 73.922477, 78.918338,
77.920386, 84.911789, 83.913425, 88.905848, 89.904704, 92.906378,
91.90681, 97.907216, 95.907598, 102.905504, 101.905608, 106.905093,
105.906458, 112.904061, 111.904821, 120.903818, 119.90402, 126.904468,
123.905896, 132.905447, 129.90631, 137.907107, 135.907144, 140.907648,
141.907719, 144.912744, 143.911995, 150.919846, 151.919788, 158.925343,
155.924278, 164.930319, 161.928775, 168.934211, 167.933894, 174.940768,
173.94004, 179.947466, 179.946706, 184.952956, 183.952491, 190.960591,
189.95993, 196.966552, 195.965815, 202.972329, 203.973029, 208.980383,
208.982416, 209.987131, 222.01757, 223.019731, 226.025403, 227.027747,
232.03805, 231.035879, 234.040946)
return((molform * MonoisotopicMasses) %>% sum())
}
# Initialize a list of molecular formulas
MolFormDict <- list(
"I" = as.molform("C6H13N1O2"),
"L" = as.molform("C6H13N1O2"),
"K" = as.molform("C6H14N2O2"),
"M" = as.molform("C5H11N1O2S1"),
"F" = as.molform("C9H11N1O2"),
"T" = as.molform("C4H9N1O3"),
"W" = as.molform("C11H12N2O2"),
"V" = as.molform("C5H11N1O2"),
"R" = as.molform("C6H14N4O2"),
"H" = as.molform("C6H9N3O2"),
"A" = as.molform("C3H7N1O2"),
"N" = as.molform("C4H8N2O3"),
"D" = as.molform("C4H7N1O4"),
"C" = as.molform("C3H7N1O2S1"),
"E" = as.molform("C5H9N1O4"),
"Q" = as.molform("C5H10N2O3"),
"G" = as.molform("C2H5N1O2"),
"P" = as.molform("C5H9N1O2"),
"S" = as.molform("C3H7N1O3"),
"Y" = as.molform("C9H11N1O3")
)
#' Add molecular formula objects
#'
#' @description Add molform objects together
#'
#' @param ... Any number of molform objects from as.molform.
#' @param CapNegatives A TRUE/FALSE to indicate whether negative elements should be
#' capped at 0. Useful for modifications where elements are lost. Default is TRUE.
#'
#' @examples
#' \dontrun{
#'
#' add_molforms(as.molform("C6H12O6"), as.molform("H2K2F2Na5"), as.molform("H-1C-2"))
#'
#' }
#' @export
add_molforms <- function(..., CapNegatives = TRUE) {
##################
## CHECK INPUTS ##
##################
# Check that all inputs are molecular formula objects
InputTest <- lapply(list(...), function(x) {inherits(x, "molform")}) %>% unlist() %>% all()
if (!InputTest) {
stop("All input objects must be molform objects from as.molform.")
}
# Check that CapNegatives is a TRUE or FALSE
if (!is.logical(CapNegatives) | is.na(CapNegatives)) {
stop("CapNegatives must be a TRUE or FALSE.")
}
################
## ADD VALUES ##
################
# Reduce results
Added <- Reduce(`+`, list(...))
# Pull and add all AtomLists
if (!CapNegatives) {return(Added)} else{
Added[Added < 0] <- 0
return(Added)
}
}
#' Multiply molecular formula objects
#'
#' @description Multiply molecular formulas by a constant
#'
#' @param molform An object of the as.molform class
#' @param scalar A positive integer determining the number of times to multiply
#' a molecular formula by.
#'
#' @examples
#' \dontrun{
#'
#' multiply_molforms(as.molform("C6H12O6"), 3)
#'
#' }
#' @export
multiply_molforms <- function(molform, scalar) {
##################
## CHECK INPUTS ##
##################
# Check that molform is an appropriate object
if (!inherits(molform, "molform")) {
stop("molform should be a molform object from as.molform.")
}
# Check that the scalar is a single numeric
if (!is.numeric(scalar) | length(scalar) != 1) {
stop("scalar should be a single numeric.")
}
scalar <- abs(round(scalar))
if (scalar == 0) {return(NULL)}
#####################
## MULTIPLY VALUES ##
#####################
# Multiply the object
return(molform * scalar)
}
#' Calculate a molecular formula for an amino acid sequence
#'
#' @param sequence The amino acid sequence.
#'
#' @examples
#' \dontrun{
#'
#' get_aa_molform("TESTTESTTESTTEST")
#'
#' }
#' @export
get_aa_molform <- function(sequence) {
##################
## CHECK INPUTS ##
##################
if (!is_sequence(sequence)) {
stop(paste0(sequence, " is not an acceptable amino acid sequence. See ?is_sequence."))
}
#####################################
## CALCULATE THE MOLECULAR FORMULA ##
#####################################
# Get counts of amino acids
AA <- strsplit(sequence, "") %>% unlist()
# Generate a water
Water <- as.molform("H2O1")
# Add all counts together
return(Reduce(`+`, MolFormDict[AA]) - Water * (nchar(sequence) - 1))
}
# Stress test
# stress <- rep("ACDEFGHIKLNPQRSTVWYACDEFGHIKLNPQRSTVWY", 100000)
# tictoc::tic(); test <- lapply(stress, BRAIN::getAtomsFromSeq); tictoc::toc() 11.783 sec
# tictoc::tic(); test <- lapply(stress, getAAMolForm); tictoc::toc() 14.865 sec
RelativeAbundances <- data.frame(
element = c("H", "H", "He", "He", "Li", "Li", "Be", "B", "B", "C", "C", "N", "N", "O", "O",
"O", "F", "Ne", "Ne", "Ne", "Na", "Mg", "Mg", "Mg", "Al", "Si", "Si", "Si", "P",
"S", "S", "S", "S", "Cl", "Cl", "Ar", "Ar", "Ar", "K", "K", "K", "Ca", "Ca", "Ca",
"Ca", "Ca", "Ca", "Sc", "Ti", "Ti", "Ti", "Ti", "Ti", "V", "V", "Cr", "Cr", "Cr",
"Cr", "Mn", "Fe", "Fe", "Fe", "Fe", "Co", "Ni", "Ni", "Ni", "Ni", "Ni", "Cu",
"Cu", "Zn", "Zn", "Zn", "Zn", "Zn", "Ga", "Ga", "Ge", "Ge", "Ge", "Ge", "Ge",
"As", "Se", "Se", "Se", "Se", "Se", "Se", "Br", "Br", "Kr", "Kr", "Kr", "Kr",
"Kr", "Kr", "Rb", "Rb", "Sr", "Sr", "Sr", "Sr", "Y", "Zr", "Zr", "Zr", "Zr", "Zr",
"Nb", "Mo", "Mo", "Mo", "Mo", "Mo", "Mo", "Mo", "Tc", "Ru", "Ru", "Ru", "Ru",
"Ru", "Ru", "Ru", "Rh", "Pd", "Pd", "Pd", "Pd", "Pd", "Pd", "Ag", "Ag", "Cd",
"Cd", "Cd", "Cd", "Cd", "Cd", "Cd", "Cd", "In", "In", "Sn", "Sn", "Sn", "Sn",
"Sn", "Sn", "Sn", "Sn", "Sn", "Sn", "Sb", "Sb", "Te", "Te", "Te", "Te", "Te",
"Te", "Te", "Te", "I", "Xe", "Xe", "Xe", "Xe", "Xe", "Xe", "Xe", "Xe", "Xe",
"Cs", "Ba", "Ba", "Ba", "Ba", "Ba", "Ba", "Ba", "La", "La", "Ce", "Ce", "Ce",
"Ce", "Pr", "Nd", "Nd", "Nd", "Nd", "Nd", "Nd", "Nd", "Pm", "Sm", "Sm", "Sm",
"Sm", "Sm", "Sm", "Sm", "Eu", "Eu", "Gd", "Gd", "Gd", "Gd", "Gd", "Gd", "Gd",
"Tb", "Dy", "Dy", "Dy", "Dy", "Dy", "Dy", "Dy", "Ho", "Er", "Er", "Er", "Er",
"Er", "Er", "Tm", "Yb", "Yb", "Yb", "Yb", "Yb", "Yb", "Yb", "Lu", "Lu", "Hf",
"Hf", "Hf", "Hf", "Hf", "Hf", "Ta", "Ta", "W", "W", "W", "W", "W", "Re", "Re",
"Os", "Os", "Os", "Os", "Os", "Os", "Os", "Ir", "Ir", "Pt", "Pt", "Pt", "Pt",
"Pt", "Pt", "Au", "Hg", "Hg", "Hg", "Hg", "Hg", "Hg", "Hg", "Tl", "Tl", "Pb",
"Pb", "Pb", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "U", "U"),
isotope = c("H1", "H2", "He3", "He4", "Li6", "Li7", "Be9", "B10", "B11",
"C12", "C13", "N14", "N15", "O16", "O17", "O18", "F19", "Ne20",
"Ne21", "Ne22", "Na23", "Mg24", "Mg25", "Mg26", "Al27", "Si28",
"Si29", "Si30", "P31", "S32", "S33", "S34", "S36", "Cl35", "Cl37",
"Ar36", "Ar38", "Ar40", "K39", "K40", "K41", "Ca40", "Ca42",
"Ca43", "Ca44", "Ca46", "Ca48", "Sc45", "Ti46", "Ti47", "Ti48",
"Ti49", "Ti50", "V50", "V51", "Cr50", "Cr52", "Cr53", "Cr54",
"Mn55", "Fe54", "Fe56", "Fe57", "Fe58", "Co59", "Ni58", "Ni60",
"Ni61", "Ni62", "Ni64", "Cu63", "Cu65", "Zn64", "Zn66", "Zn67",
"Zn68", "Zn70", "Ga69", "Ga71", "Ge70", "Ge72", "Ge73", "Ge74",
"Ge76", "As75", "Se74", "Se76", "Se77", "Se78", "Se80", "Se82",
"Br79", "Br81", "Kr78", "Kr80", "Kr82", "Kr83", "Kr84", "Kr86",
"Rb85", "Rb87", "Sr84", "Sr86", "Sr87", "Sr88", "Y89", "Zr90",
"Zr91", "Zr92", "Zr94", "Zr96", "Nb93", "Mo92", "Mo94", "Mo95",
"Mo96", "Mo97", "Mo98", "Mo100", "Tc98", "Ru96", "Ru98", "Ru99",
"Ru100", "Ru101", "Ru102", "Ru104", "Rh103", "Pd102", "Pd104",
"Pd105", "Pd106", "Pd108", "Pd110", "Ag107", "Ag109", "Cd106",
"Cd108", "Cd110", "Cd111", "Cd112", "Cd113", "Cd114", "Cd116",
"In113", "In115", "Sn112", "Sn114", "Sn115", "Sn116", "Sn117",
"Sn118", "Sn119", "Sn120", "Sn122", "Sn124", "Sb121", "Sb123",
"Te120", "Te122", "Te123", "Te124", "Te125", "Te126", "Te128",
"Te130", "I127", "Xe124", "Xe126", "Xe128", "Xe129", "Xe130",
"Xe131", "Xe132", "Xe134", "Xe136", "Cs133", "Ba130", "Ba132",
"Ba134", "Ba135", "Ba136", "Ba137", "Ba138", "La138", "La139",
"Ce136", "Ce138", "Ce140", "Ce142", "Pr141", "Nd142", "Nd143",
"Nd144", "Nd145", "Nd146", "Nd148", "Nd150", "Pm145", "Sm144",
"Sm147", "Sm148", "Sm149", "Sm150", "Sm152", "Sm154", "Eu151",
"Eu153", "Gd152", "Gd154", "Gd155", "Gd156", "Gd157", "Gd158",
"Gd160", "Tb159", "Dy156", "Dy158", "Dy160", "Dy161", "Dy162",
"Dy163", "Dy164", "Ho165", "Er162", "Er164", "Er166", "Er167",
"Er168", "Er170", "Tm169", "Yb168", "Yb170", "Yb171", "Yb172",
"Yb173", "Yb174", "Yb176", "Lu175", "Lu176", "Hf174", "Hf176",
"Hf177", "Hf178", "Hf179", "Hf180", "Ta180", "Ta181", "W180",
"W182", "W183", "W184", "W186", "Re185", "Re187", "Os184", "Os186",
"Os187", "Os188", "Os189", "Os190", "Os192", "Ir191", "Ir193",
"Pt190", "Pt192", "Pt194", "Pt195", "Pt196", "Pt198", "Au197",
"Hg196", "Hg198", "Hg199", "Hg200", "Hg201", "Hg202", "Hg204",
"Tl203", "Tl205", "Pb204", "Pb206", "Pb207", "Pb208", "Bi209",
"Po209", "At210", "Rn222", "Fr223", "Ra226", "Ac227", "Th232",
"Pa231", "U234", "U235", "U238"),
mass = c(1.007825, 2.014102, 3.016029, 4.002603, 6.015122, 7.016004, 9.012182, 10.012937,
11.009305, 12, 13.003355, 14.003074, 15.000109, 15.994915, 16.999132, 17.99916,
18.998403, 19.99244, 20.993847, 21.991386, 22.98977, 23.985042, 24.985837,
25.982593, 26.981538, 27.976927, 28.976495, 29.97377, 30.973762, 31.972071,
32.971458, 33.967867, 35.967081, 34.968853, 36.965903, 35.967546, 37.962732,
39.962383, 38.963707, 39.963999, 40.961826, 39.962591, 41.958618, 42.958767,
43.955481, 45.953693, 47.952534, 44.95591, 45.952629, 46.951764, 47.947947,
48.947871, 49.944792, 49.947163, 50.943964, 49.94605, 51.940512, 52.940654,
53.938885, 54.93805, 53.939615, 55.934942, 56.935399, 57.93328, 58.9332,
57.935348, 59.930791, 60.93106, 61.928349, 63.92797, 62.929601, 64.927794,
63.929147, 65.926037, 66.927131, 67.924848, 69.925325, 68.925581, 70.924705,
69.92425, 71.922076, 72.923459, 73.921178, 75.921403, 74.921596, 73.922477,
75.919214, 76.919915, 77.91731, 79.916522, 81.9167, 78.918338, 80.916291,
77.920386, 79.916378, 81.913485, 82.914136, 83.911507, 85.91061, 84.911789,
86.909183, 83.913425, 85.909262, 86.908879, 87.905614, 88.905848, 89.904704,
90.905645, 91.90504, 93.906316, 95.908276, 92.906378, 91.90681, 93.905088,
94.905841, 95.904679, 96.906021, 97.905408, 99.907477, 97.907216, 95.907598,
97.905287, 98.905939, 99.90422, 100.905582, 101.90435, 103.90543, 102.905504,
101.905608, 103.904035, 104.905084, 105.903483, 107.903894, 109.905152,
106.905093, 108.904756, 105.906458, 107.904183, 109.903006, 110.904182,
111.902757, 112.904401, 113.903358, 115.904755, 112.904061, 114.903878,
111.904821, 113.902782, 114.903346, 115.901744, 116.902954, 117.901606,
118.903309, 119.902197, 121.90344, 123.905275, 120.903818, 122.904216,
119.90402, 121.903047, 122.904273, 123.902819, 124.904425, 125.903306,
127.904461, 129.906223, 126.904468, 123.905896, 125.904269, 127.90353,
128.904779, 129.903508, 130.905082, 131.904154, 133.905395, 135.90722,
132.905447, 129.90631, 131.905056, 133.904503, 134.905683, 135.90457,
136.905821, 137.905241, 137.907107, 138.906348, 135.907144, 137.905986,
139.905434, 141.90924, 140.907648, 141.907719, 142.90981, 143.910083,
144.912569, 145.913112, 147.916889, 149.920887, 144.912744, 143.911995,
146.914893, 147.914818, 148.91718, 149.917271, 151.919728, 153.922205,
150.919846, 152.921226, 151.919788, 153.920862, 154.922619, 155.92212,
156.923957, 157.924101, 159.927051, 158.925343, 155.924278, 157.924405,
159.925194, 160.92693, 161.926795, 162.928728, 163.929171, 164.930319,
161.928775, 163.929197, 165.93029, 166.932045, 167.932368, 169.93546,
168.934211, 167.933894, 169.934759, 170.936322, 171.936378, 172.938207,
173.938858, 175.942568, 174.940768, 175.942682, 173.94004, 175.941402,
176.94322, 177.943698, 178.945815, 179.946549, 179.947466, 180.947996,
179.946706, 181.948206, 182.950224, 183.950933, 185.954362, 184.952956,
186.955751, 183.952491, 185.953838, 186.955748, 187.955836, 188.958145,
189.958445, 191.961479, 190.960591, 192.962924, 189.95993, 191.961035,
193.962664, 194.964774, 195.964935, 197.967876, 196.966552, 195.965815,
197.966752, 198.968262, 199.968309, 200.970285, 201.970626, 203.973476,
202.972329, 204.974412, 203.973029, 205.974449, 206.975881, 207.976636,
208.980383, 208.982416, 209.987131, 222.01757, 223.019731, 226.025403,
227.027747, 232.03805, 231.035879, 234.040946, 235.043923, 238.050783),
abundance = c(99.9885, 0.0115, 0.000137, 99.999863, 7.59, 92.41, 100, 19.9, 80.1,
98.93, 1.07, 99.632, 0.368, 99.757, 0.038, 0.205, 100, 90.48, 0.27,
9.25, 100, 78.99, 10, 11.01, 100, 92.2297, 4.6832, 3.0872, 100,
94.93, 0.76, 4.29, 0.02, 75.78, 24.22, 0.3365, 0.0632, 99.6003,
93.2581, 0.0117, 6.7302, 96.941, 0.647, 0.135, 2.086, 0.004, 0.187,
100, 8.25, 7.44, 73.72, 5.41, 5.18, 0.25, 99.75, 4.345, 83.789,
9.501, 2.365, 100, 5.845, 91.754, 2.119, 0.282, 100, 68.0769,
26.2231, 1.1399, 3.6345, 0.9256, 69.17, 30.83, 48.63, 27.9, 4.1,
18.75, 0.62, 60.108, 39.892, 20.84, 27.54, 7.73, 36.28, 7.61, 100,
0.89, 9.37, 7.63, 23.77, 49.61, 8.73, 50.69, 49.31, 0.35, 2.28,
11.58, 11.49, 57, 17.3, 72.17, 27.83, 0.56, 9.86, 7, 82.58, 100,
51.45, 11.22, 17.15, 17.38, 2.8, 100, 14.84, 9.25, 15.92, 16.68,
9.55, 24.13, 9.63, 100, 5.54, 1.87, 12.76, 12.6, 17.06, 31.55,
18.62, 100, 1.02, 11.14, 22.33, 27.33, 26.46, 11.72, 51.839,
48.161, 1.25, 0.89, 12.49, 12.8, 24.13, 12.22, 28.73, 7.49,
4.29, 95.71, 0.97, 0.66, 0.34, 14.54, 7.68, 24.22, 8.59,
32.58, 4.63, 5.79, 57.21, 42.79, 0.09, 2.55, 0.89, 4.74, 7.07,
18.84, 31.74, 34.08, 100, 0.09, 0.09, 1.92, 26.44, 4.08, 21.18,
26.89, 10.44, 8.87, 100, 0.106, 0.101, 2.417, 6.592, 7.854,
11.232, 71.698, 0.09, 99.91, 0.185, 0.251, 88.45, 11.114, 100,
27.2, 12.2, 23.8, 8.3, 17.2, 5.7, 5.6, 100, 3.07, 14.99, 11.24,
13.82, 7.38, 26.75, 22.75, 47.81, 52.19, 0.2, 2.18, 14.8, 20.47,
15.65, 24.84, 21.86, 100, 0.06, 0.1, 2.34, 18.91, 25.51, 24.9,
28.18, 100, 0.14, 1.61, 33.61, 22.93, 26.78, 14.93, 100, 0.13,
3.04, 14.28, 21.83, 16.13, 31.83, 12.76, 97.41, 2.59, 0.16, 5.26,
18.6, 27.28, 13.62, 35.08, 0.012, 99.988, 0.12, 26.5, 14.31, 30.64,
28.43, 37.4, 62.6, 0.02, 1.59, 1.96, 13.24, 16.15, 26.26, 40.78,
37.3, 62.7, 0.014, 0.782, 32.967, 33.832, 25.242, 7.163, 100,
0.15, 9.97, 16.87, 23.1, 13.18, 29.86, 6.87, 29.524, 70.476,
1.4, 24.1, 22.1, 52.4, 100, 100, 100, 100, 100, 100, 100, 100,
100, 0.0055, 0.72, 99.2745) / 100
)
#' Calculate an isotope profile
#'
#' @description Generates an isotope profile using isopat or Rdisop
#'
#' @param molform An object of the as.molform class
#' @param algorithm "isopat" uses the isopat package to calculate isotopes, while
#' "Rdisop" uses the Rdisop package. Though more accurate, Rdisop has been
#' known to crash on Windows computers when called iteratively more than 1000
#' times. Default is Rdisop, though isopat is an alternative.
#' @param min_abundance Minimum abundance for calculating isotopes. Default is 1.
#' @param limit See ?isopat::isopattern for more details. 0.001 appears to calculate enough isotopes.
#'
#' @examples
#' \dontrun{
#'
#' calculate_iso_profile(molform = as.molform("H389C259N65O63S2"), min_abundance = 1)
#'
#' calculate_iso_profile(molform = as.molform("H389C259N65O63S2"), algorithm = "isopat", min_abundance = 1)
#'
#' }
#' @export
calculate_iso_profile <- function(molform,
algorithm = "Rdisop",
min_abundance = 1,
limit = 0.001) {
##################
## CHECK INPUTS ##
##################
# Molform should be an object of the molform class
if (!inherits(molform, "molform")) {
stop("molform should be an object from the molform class.")
}
# Check that the algorithm is an acceptable choice
if (!algorithm %in% c("isopat", "Rdisop")) {
stop("algorithm must either be 'isopat' or 'Rdisop'.")
}
# Min abundance should be a numeric between 0 and 100
if (!is.numeric(min_abundance) | length(min_abundance) != 1) {
stop("min_abundance should be a single numeric.")
}
if (min_abundance < 0 | min_abundance > 100) {
stop("The range of min_abundance is between 0 and 100.")
}
###############################
## CALCULATE ISOTOPE PROFILE ##
###############################
# Collapse the molform
MolForm <- collapse_molform(molform)
if (algorithm == "isopat") {
# Calculate isotope profile with isopat function
IsoProfile <- isopat::isopattern(RelativeAbundances, MolForm, limit) %>%
data.table::data.table() %>%
dplyr::select(mass, abundance) %>%
dplyr::mutate(
abundance = abundance / max(abundance) * 100,
massbin = round(mass)
) %>%
dplyr::group_by(massbin) %>%
dplyr::filter(abundance == max(abundance) & abundance > min_abundance) %>%
dplyr::ungroup() %>%
dplyr::select(-massbin) %>%
dplyr::mutate(
isotope = 0:(nrow(.)-1),
isolabel = paste("M+", isotope, sep = ""),
isolabel = ifelse(isolabel == "M+0", "M", isolabel)
)
} else if (algorithm == "Rdisop") {
# Calculate isotope profile with Rdisop function
IsoProfile <- Rdisop::getMolecule(MolForm, maxisotopes = 20)$isotopes[[1]] %>%
t() %>%
data.table::data.table() %>%
dplyr::rename(mass = V1, abundance = V2) %>%
dplyr::mutate(abundance = abundance / max(abundance) * 100) %>%
dplyr::filter(abundance >= min_abundance) %>%
dplyr::mutate(
isotope = 0:(nrow(.)-1),
isolabel = paste("M+", isotope, sep = ""),
isolabel = ifelse(isolabel == "M+0", "M", isolabel)
)
}
return(IsoProfile)
}
#' Ensures a string is an acceptable amino acid sequence
#'
#' @description A simple test function to ensure the provided sequence is an
#' acceptable amino acid sequence. ProForma sequences are accepted.
#'
#' @param Sequence An amino acid sequence which should be a single string. Required.
#' @param Message Explain why the test failed. Default is FALSE.
#' @param AlternativeGlossary Try a different glossary. See system.file("extdata", "Unimod_v20220602.csv", package = "pspecterlib)
#' for formatting.
#'
#' @details The output will either be a "TRUE" acceptable sequence, or "FALSE"
#' unacceptable sequence. An acceptable sequence cannot be NULL,
#' will not have any spaces or non-letter characters, will be longer than 1
#' amino acid, and will not contain the letters B, J, O, U, X, or Z. ProForma
#' notation (i.e. "TES[Acetyl]T") is now accepted.
#'
#' @examples
#' \dontrun{
#'
#' # An acceptable sequence will return TRUE
#' is_sequence("TEST")
#' is_sequence("TES[Acetyl]T")
#' is_sequence("T[58.77]E[Acetyl]ST")
#'
#' # Unacceptable sequences will return FALSE
#' is_sequence("TESTSEQUENCE")
#' is_sequence("TIRED $CIENTIST WORKING L8")
#' is_sequence("T")
#' is_sequence("T[UnfinishedEST")
#'
#' }
#'
#' @export
is_sequence <- function(Sequence, Message = FALSE, AlternativeGlossary = NULL) {
###########################
## RUN THROUGH THE TESTS ##
###########################
# If sequence is NULL, return FALSE
if (is.null(Sequence) || is.na(Sequence) || Sequence == "") {
if (Message) {message("No sequence provided.")}
return(FALSE)
}
# Test that the square brackets are added
if (grepl("\\[", Sequence) & !grepl("\\]", Sequence) | !grepl("\\[", Sequence) & grepl("\\]", Sequence)) {
if (Message) {message("Square bracket pairs must be completed.")}
return(FALSE)
}
# No spaces
if (grepl("[[:space:]]", Sequence)) {
if (Message) {message("Space not permitted.")}
return(FALSE)
}
# Extract modifications
if (grepl("\\[", Sequence)) {
# Pull modifications
Split <- Sequence %>% strsplit("\\[|\\]") %>% unlist()
Modifications <- Split[c(FALSE, TRUE)]
# Pull Glossary
if (is.null(AlternativeGlossary)) {
Glossary <- data.table::fread(
system.file("extdata", "Unimod_v20220602.csv", package = "pspecterlib")
)
} else {
Glossary <- AlternativeGlossary
}
# Iterate through modifications
Response <- lapply(Modifications, function(mod) {
# Test if numeric
if (suppressWarnings(!is.na(as.numeric(mod)))) {return(TRUE)} else {
return(mod %in% Glossary$Modification)
}
}) %>% unlist()
if (!any(Response)) {
if (Message) {message("Unrecognized modification format. See the Glossary for more details.")}
return(FALSE)
}
Sequence <- Split[c(TRUE, FALSE)] %>% paste0(collapse = "")
}
# Test the sequence is only letters
if (grepl("[^[:alpha:]]", Sequence)) {
if (Message) {message("The sequence with modifications and their brackets removed should be only letters.")}
return(FALSE)
}
# Test the sequence does not contain incorrect amino acids
if (grepl("[BbJjOoUuXxZz]", Sequence)) {
if (Message) {message("B, J, O, U, X, and Z are not acceptable amino acids at this time.")}
return(FALSE)
}
# Unless something else comes up that needs to expand this function, then at
# this step, it is an acceptable sequence.
return(TRUE)
}
#' Convert a proforma string to a PTM object
#'
#' @description Converts proforma strings to a PTM object that get_matched_peaks uses
#' to calculate fragmentation patterns. If there is no modification annotation,
#' the cleaned sequence is returned.
#'
#' @author Degnan, David. Flores, Javier.
#'
#' @param proforma A string written in the format "M.S[Methyl]S[22]S[23].V"
#' @param AlternativeGlossary Try a different glossary. See system.file("extdata", "Unimod_v20220602.csv", package = "pspecterlib)
#' for formatting.
#'
#' @examples
#' \dontrun{
#'
#' convert_proforma("M.(S)[Acetyl]ATNNIAQARKLVEQLRIEAGIERIKVSKAASDLMSYCEQHARNDPLLVGVPASENPFKDK(KPCIIL)[-52.9879].")
#'
#' convert_proforma("M.SS[Methyl]S.V")
#'
#' convert_proforma("M.S[Methyl]S[22]S[23].V")
#'
#' convert_proforma("TESTTESTTEST")
#'
#' }
#' @export
convert_proforma <- function(proforma, AlternativeGlossary = NULL) {
##################
## CHECK INPUTS ##
##################
# Proteoform should be a string
if (is.null(proforma) || !is.character(proforma)) {
stop("proforma must be a vector of characters.")
}
###########################
## CHECK SIMPLE SEQUENCE ##
###########################
# If no brackets, test the sequence is acceptable and then return that sequence
if (!grepl("\\[|\\]", proforma)) {
# Get the number of periods in the sequence
NumPeriods <- stringr::str_count(proforma, "\\.")
# There shouldn't be more than 2 periods at this point in the pipeline
if (NumPeriods > 2) {
stop(paste0("There are too many periods in the input proforma string", proforma))
}
# Split by the periods and take the largest chunk
SeqSplit <- strsplit(proforma, "\\.") %>% unlist()
SeqPosition <- lapply(SeqSplit, nchar) %>% unlist() %>% which.max()
Sequence <- SeqSplit[SeqPosition]
if (is_sequence(Sequence)) {return(Sequence)} else {
stop(paste0("The detected sequence: ", Sequence, " is not acceptable."))
}
}
###################
## LOAD GLOSSARY ##
###################
# Load backend glossary
if (is.null(AlternativeGlossary)) {
Glossary <- data.table::fread(
system.file("extdata", "Unimod_v20220602.csv", package = "pspecterlib")
)
} else {
Glossary <- AlternativeGlossary
}
#####################################
## OTHERWISE, BUILD THE PTM OBJECT ##
#####################################
## SEPARATE THE MODIFICATIONS ##
# Grab the data within each of the hard brackets []
Bracketed_Data <- proforma %>%
gsub(pattern = "[", replacement = "^[", fixed = T) %>%
gsub(pattern= "]", replacement = "]^", fixed = T) %>%
strsplit("^", fixed = T) %>%
unlist() %>%
lapply(function(x) {if (grepl("[", x, fixed = T)) {x}}) %>%
unlist()
# Separate out the modifications from the mass changes
Modifications <- NULL
MassChanges <- NULL
# Separate modifications and mass changes
for (Mod in Bracketed_Data) {
Mod <- gsub("\\[|\\]", "", Mod)
StringTest <- suppressWarnings(is.na(as.numeric(Mod)))
if (StringTest) {Modifications <- c(Modifications, Mod)} else {MassChanges <- c(MassChanges, Mod)}
}
# Check if modification is in the glossary
if (!is.null(Modifications)) {
# Ensure all modifications are in the library
if (all(Modifications %in% Glossary$Modification) == FALSE) {
stop(paste("Modification", Modifications[Modifications %in% Glossary$Modification == FALSE],
"is not in our library. If no input error is found, submit an issue",
"request on github.")
)
}
}
## CLEAN THE SEQUENCE ##
# First, use the proforma annotation
Sequence <- proforma
# Then, remove each of the bracketed pieces of information
for (Mod in Bracketed_Data) {
Sequence <- gsub(Mod, "", Sequence, fixed = TRUE)
}
# Then, remove any of the remaining parenthesis
Sequence <- gsub("\\(|\\)", "", Sequence)
# Get the number of periods in the sequence
NumPeriods <- stringr::str_count(Sequence, "\\.")
# There shouldn't be more than 2 periods at this point in the pipeline
if (NumPeriods > 2) {
stop(paste0("There are too many periods in the input proforma string", proforma))
}
# Split by the periods and take the largest chunk
SeqSplit <- strsplit(Sequence, "\\.") %>% unlist()
SeqPosition <- lapply(SeqSplit, nchar) %>% unlist() %>% which.max()
Sequence <- SeqSplit[SeqPosition]
## CONVERT SEQUENCE TO POSITIONS ##
SeqWithMods <- proforma
# First, convert the mass changes to a simple "MassChange"
for (mod in MassChanges) {
SeqWithMods <- gsub(paste0("[", mod, "]"), "[MassChange]", SeqWithMods, fixed = T)
}
# Get sequence cleaned with modifications
SeqWithMods <- SeqWithMods %>%
strsplit("\\.") %>%
unlist() %>%
.[SeqPosition] %>%
gsub(pattern = "\\(|\\)", replacement = "")
# Get modification starts and finishes
PTM_Object <- data.frame(
Start = stringr::str_locate_all(SeqWithMods, "\\[")[[1]][,1],
Finish = stringr::str_locate_all(SeqWithMods, "\\]")[[1]][,1]
) %>%
dplyr::mutate(
`Mod Number` = 1:nrow(.),
Span = Finish - Start,
`Rolling Span` = cumsum(Span),
`Rolling Span` = ifelse(is.na(`Rolling Span`) & `Mod Number` == 1, Span, `Rolling Span`),
`N Position` = Finish - `Mod Number` - `Rolling Span`,
Name = gsub("\\[|\\]", "", Bracketed_Data),
`AMU Change` = suppressWarnings(as.numeric(Name)),
) %>%
dplyr::select(Name, `AMU Change`, `N Position`) %>%
dplyr::mutate(
`AMU Change` = purrr::map2(`AMU Change`, Name, function(`AMU Change`, Name) {
ifelse(is.na(`AMU Change`), unlist(Glossary[Glossary$Modification == Name, "Mass Change"]), `AMU Change`)
}) %>% unlist()
)
rownames(PTM_Object) <- 1:nrow(PTM_Object)
######################
## BUILD THE OBJECT ##
######################
# Add proforma string input
attr(PTM_Object, "pspecter")$proforma <- proforma
attr(PTM_Object, "pspecter")$cleaned_sequence <- Sequence
attr(PTM_Object, "pspecter")$modifications <- Bracketed_Data
attr(PTM_Object, "pspecter")$mass_changes <- as.numeric(MassChanges)
attr(PTM_Object, "pspecter")$PTMs <- Modifications
# Add class
class(PTM_Object) <- c(class(PTM_Object), "modifications_pspecter")
# Return results
return(PTM_Object)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.