featureLibrary <- function(lipidType,
nameQsession,
nameLibrary,
indexFeature,
standard){
tempGT <- paste("Qsession\t", "Run\t",
"MQ.Q1\t", "MQ.Q3\t", "MQ.RT\t",
"Library.Q1\t", "Library.Q3\t", "Library.RT")
## Important library column names other than Mass Info and sample runs
libNames <- c("GeneralID",
"Family",
"Q1_Modification_Value",
"Barcode",
"Structural.ID.by.EPI-IDA",
"MolecularID",
"TotalCarbonID",
"ExactMass")
## Error checking inputs
if(!(lipidType %in% c("SPH", "GPC")) == T){
stop("lipidType class parameter must be 'SPH' or 'GPC'")
}
if(!(file.exists(nameQsession))){
stop("nameQsession spreadsheet does not exist or the path is incorrect.")
}
if(!(file.exists(nameLibrary))){
stop("nameLibrary spreadsheet does not exist or the path is incorrect.")
}
if(any(!(indexFeature %in% c(56, 38, 43, 64, 71, 82, 83))) == T ){
stop("indexFeature must be from the following: RT(56), Area Ratio(38), Height Ratio(43), Full Width 50Percent(64), Relative RT(71), Tailing Factor(82), Assymmetry Factor(83)")
}
if(any(indexFeature %in% c(38, 43, 71)) == T && !(exists("standard"))){
stop("standard parameter must be filled.")
}
## Organism/Matrix in order of library sample runs
## THIS IS HARD-CODED!
{
if(!(lipidType %in% c("SPH", "GPC"))){
stop("Lipid class must be from the following:\n'SPH' 'GPC' ")
} else if(lipidType == "SPH"){
libOrgMat <-c("1;_;Human.Blood", "2;_;Mouse.Brain",
"3;_;Human.Blood", "4;_;Mouse.Brain",
"5;_;Human.Fibroblast", "6;_;Human.Lymphoblast",
"7;_;Human.Brain", "8;_;Human.Blood",
"9;_;Human.CSF", "10;_;Mouse.Brain",
"11;_;Mouse.Brain", "12;_;Mouse.Brain",
"13;_;Mouse.Brain", "14;_;Mouse.Macrophage",
"15;_;Mouse.Brain", "16;_;Mouse.Brain",
"17;_;Mouse.Blood", "18;_;Mouse.Brain",
"19;_;Mouse.Brain", "20;_;Mouse.Synaptosome",
"21;_;Mouse.Brain", "22;_;Mouse.Brain",
"23;_;Media", "24;_;Human.Fibroblast",
"25;_;Mouse.Brain", "26;_;Mouse.Oligodendrocyte")
} else if(lipidType == "GPC"){
libOrgMat <- c("1;_;Mouse.Brain", "2;_;Human.Brain",
"3;_;Human.Blood", "4;_;Human.Blood",
"5;_;Mouse.Brain", "6;_;Mouse.Brain",
"7;_;Mouse.Brain", "8;_;Mouse.Brain",
"9;_;Mouse.Brain", "10;_;Mouse.Brain",
"11;_;Mouse.Blood", "12;_;Mouse.Blood",
"13;_;Mouse.Blood", "14;_;Mouse.Blood",
"15;_;Human.Blood", "16;_;Mouse.Macrophage",
"17;_;Human.Brain", "18;_;Mouse.Brain",
"19;_;Mouse.Synaptosome", "20;_;Mouse.Brain",
"21;_;Mouse.Brain", "22;_;Mouse.Brain",
"23;_;Mouse.Brain", "24;_;Mouse.Brain",
"25;_;Mouse.Brain", "26;_;Mouse.Blood",
"27;_;Mouse.Brain", "28;_;Mouse.Brain")
}
}
##Label all runs with acquisition method and matrix
{
library <- read.xlsx(nameLibrary, rows = c(1),
skipEmptyCols = T,
skipEmptyRows = F)
## HARD-CODED keywords to look for "spacer" column separating projects!
temp <- grep("Mass.Info.Method|Qtrap5500_Agilent1290", colnames(library))
## Project spacer columns are separated by at least 1 sample run. If
## 'temp' contains consecutive elements, they are disregarded as project
## spacer columns. First few columns in the library are excluded, for
## example
if(length(which(diff(temp) == 1)) > 0 && length(temp) != length(libOrgMat)){
temp <- c(temp[diff(temp) != 1][-1], temp[length(temp)])
}
if(length(libOrgMat) > length(temp) && grepl("testthat", nameLibrary) == F){
stop("Check library has same number of Mass Info spacer columns as data set summary columns. Data set summary columns are stored as a vector of organism/matrix descriptions in 'libOrgMat'.")
} else if(length(temp) == 0){
stop("Multiquant Qsession file(s) does not contain any sample runs that match the library.")
} else if(length(temp) < length(libOrgMat) && length(temp) > 0 &&
grepl("testthat", nameLibrary) == F){
stop(paste0("Library sample runs are not present within all Multiquant Qsession file(s). ", round(length(temp)/length(libOrgMat), 2), "% of the Multiquant sample runs are present in the library."))
}
## Concatenate acquisition method with all columns to the right of it
## This gives us the mass spec method and matrix information for each run
libNamesMethod <- colnames(library)
j <- 1
for(i in (temp[1]+1):length(colnames(library))){
if((i %in% temp) == TRUE){
j <- j + 1
} else{
libNamesMethod[i] <- paste(colnames(library[i]),
";_;",
libOrgMat[j],
";_;",
colnames(library)[temp[j]],
sep = "")
}
}
rm(temp, libOrgMat, i, j)
}
## Load and collapse multiple MRM methods to determine mass info
{
temp <- grep("Mass.Info", as.character(colnames(library)))
tempNames <- colnames(library)[temp]
rm(library)
libMethod <- read.xlsx(nameLibrary,
skipEmptyCols = T,
skipEmptyRows = F)
libMethod <- as.data.table(libMethod[, temp])
colnames(libMethod) <- tempNames
libMethod <- libMethod[, as.vector(lapply(libMethod, function(x) sum(!is.na(x)) / length(x) ) > 0.9), with=F]
libMethod[libMethod == "x"] <- NA
#library <- as.character(colnames(library))
#temp <- grep("Mass.Info", library)
#rm(library)
#libMethod <- read.xlsx(nameLibrary, cols = c(temp),
# skipEmptyCols = T,
# skipEmptyRows = F)
#libMethod <- libMethod[lapply(libMethod, function(x) sum(!is.na(x)) / length(x) ) > 0.9]
#libMethod[libMethod == "x"] <- NA
## Remove "HELPER" columns if they exist
dropRow <- 0 # Keep in environment
while(any(grep("/", libMethod[1,])) == F){
dropRow <- dropRow + 1
libMethod <- as.data.frame(libMethod[-1,])
}
## Create a 'temp' vector with a single mass info for each method
libMethod <- as.data.table(libMethod)
temp <- vector(mode = "character", length = nrow(libMethod))
for(i in seq_along(temp)){
if(all(is.na(unique(unlist(libMethod[i, ])))) == F){
temp[i] <- as.character(unique(unlist(libMethod[i, ]))[
which(!is.na(unique(unlist(libMethod[i, ]))))])
} else {
temp[i] = NA
}
}
## Specific format to GPC because there is only 1 MRM method
#if(lipidType == "GPC"){
# temp <- library
# colnames(temp) <- "temp"
#}
## Split "Mass.Info" into Q1 and Q3 as numerics
## 'libMassInfo' holds the Q1/Q3 columns in the library
libMassInfo <- as.data.table(temp)
libMassInfo[, c("library.Q1", "library.Q3") := tstrsplit(temp, "/")]
libMassInfo[, library.Q1 := as.numeric(library.Q1)]
libMassInfo[, library.Q3 := as.numeric(library.Q3)]
libMassInfo[, temp := NULL]
rm(temp, i)
}
## Get library columns like validation level, exact mass, GeneralID, family, etc.
## These columns are from 'libNames'
## Library columns of interest are stored in 'libCols'
{
libCols <- read.xlsx(nameLibrary, rows = c(1),
skipEmptyCols = T,
skipEmptyRows = F)
temp <- vector(mode = "character", length = length(libNames))
for(i in seq_along(temp)){
temp[i] <- grep(libNames[i], colnames(libCols), fixed = T)
}
libCols <- read.xlsx(nameLibrary, cols = c(temp),
skipEmptyCols = T,
skipEmptyRows = F)
## Remove "HELPER" columns if they exist
if(dropRow > 0){
libCols <- libCols[-(1:dropRow),]
}
rm(i)
}
## Merge mass info columns with library columns
## At this point, 'library' holds Q1/Q3 mass info and other important
## library columns except run information
{
if(nrow(libCols) == nrow(libMassInfo)){
library <- cbind(libMassInfo, libCols)
rm(libMassInfo, libCols, libNames)
} else{
stop("Number of mass info rows do not match the number of rows in
the 'libNames' columns.")
}
rm(libMethod)
}
## Get Q1 and Q3 for later below with 'libCols' and 'libColsOrder'
libBarcode <- data.table(library.Q1=as.character(library[, library.Q1]),
library.Q3=as.character(library[, library.Q3]))
## Load file containing all Qsession links and store names of unique runs
{
Qsession <- fread(nameQsession, sep = "\t",
header = T, strip.white = T)
temp <- vector("list", length = nrow(Qsession))
for(i in seq_along(temp)){
temp[i] <- unique(fread(as.character(Qsession[i,1, with=F]),
select = c(4), sep = "\t", header = T))
}
temp <- unlist(temp)
## Convert special characters for consistency for all unique column name runs
temp <- gsub("\\s+", ".", temp)
rm(i)
}
## Read library and convert special characters in column names for consistency
{
libCols <- read.xlsx(nameLibrary,
skipEmptyCols = T,
skipEmptyRows = F)
colnames(libCols) <- gsub("\\s+", ".", colnames(libCols))
}
## Find library indices of run names matching Qsession files
{
for(i in seq_along(temp)){
if(length(grep(temp[i], colnames(libCols), fixed=T)) == 0){
warning(paste0("Sample ", temp[i], " in", Qsession, " did not map to the library"))
temp[i] <- NA
} else if(length(grep(temp[i], colnames(libCols), fixed=T)) == 1){
temp[i] <- grep(temp[i], colnames(libCols), fixed = T)
} else if(length(grep(temp[i], colnames(libCols), fixed=T)) > 1){
stop("A sample run in the Qsession files mapped to two or more sample runs in the library. Check that the library contains no duplicated sample run columns with the same name.")
}
}
temp <- as.numeric(temp)
temp <- temp[!is.na(temp)]
## Only keep unique sample run indices in the library. It is possible that the same
## sample run is present in multiple Qsession files (as a template)
temp <- temp[!duplicated(temp)]
rm(i)
}
## Load library and only keep matching Qsession columns
{
## Note how the column names in 'libCols' are unchanged!
libCols <- read.xlsx(nameLibrary,
skipEmptyCols = T,
skipEmptyRows = F)
libCols <- as.data.table(libCols)
libCols <- libCols[, temp, with=F]
## Remove "HELPER" column if it exists
if(dropRow > 0){
libCols <- libCols[-(1:dropRow),]
}
rm(temp, dropRow)
}
## Convert to matrix
libCols <- as.matrix(libCols)
class(libCols) <- "numeric"
## 'indexFeatureName' holds the name of the feature of interest
{
if(indexFeature == 71){
fileQsession <- fread(as.character(Qsession[1, 1, with=F]),
sep="\t", header = T, na.strings = "N/A",
select = c(56))
} else{
fileQsession <- fread(as.character(Qsession[1, 1, with=F]),
sep="\t", header = T, na.strings = "N/A",
select = c(indexFeature))
}
indexFeatureName <- colnames(fileQsession)
}
## Check if 'indexFeature' is something we need to calculate with a
## standard (stored in column 17 of the Qsession)
if(length(grep("Area|Height", colnames(fileQsession))) > 0){
indexFeature <- c(17, indexFeature)
}
## For each Qsession file and individual run, map the retention times in the
## library with the feature of interest
for(i in 1:nrow(Qsession)){
## Load Qsession file
{
if(length(indexFeature) > 1){
fileQsession <- fread(as.character(Qsession[i, 1, with=F]),
sep="\t", header = T, na.strings = "N/A",
select = c(4, 21, 56, indexFeature))
} else if(indexFeature == 56){
fileQsession <- fread(as.character(Qsession[i, 1, with=F]),
sep="\t", header = T, na.strings = "N/A",
select = c(4, 21, 56))
} else if(indexFeature == 71){
# For relative RT
fileQsession <- fread(as.character(Qsession[i, 1, with=F]),
sep="\t", header = T, na.strings = "N/A",
select = c(4, 21, 56, 17))
} else {
fileQsession <- fread(as.character(Qsession[i, 1, with=F]),
sep="\t", header = T, na.strings = "N/A",
select = c(4, 21, 56, indexFeature))
}
}
## Split Qsession Mass Info into Q1 and Q3. Q3 is necessary for any normalization calculations
{
## Remove spaces
fileQsession[, "Mass Info" := gsub("\\s+", "", `Mass Info`)]
## Mass info must follow regex format: triple/quadruple number and
## optional period and/or number followed by forward slash and
## triple/quadruple number and optional period and/or number
#fileQsession[, "temp" := grepl("[0-9]{3,4}[\\.]?[0-9]{0,1}/[0-9]{3,4}[\\.]?[0-9]{0,1}", `Mass Info`)]
fileQsession[, "temp" := grepl("^[0-9]{3,4}[.][0-9]/[0-9]{3,4}[.][0-9]$", `Mass Info`)]
if(all(fileQsession[, temp]) == F){
stop(paste0("Check that 'Mass Info' column in ", Qsession[i,1], " is formatted correctly."))
} else{
fileQsession[, temp := NULL]
fileQsession[, c("library.Q1", "library.Q3") := tstrsplit(`Mass Info`, "/")[1:2]]
fileQsession[, library.Q1 := as.character(as.numeric(library.Q1))]
fileQsession[, library.Q3 := as.character(as.numeric(library.Q3))]
fileQsession[, `Mass Info` := NULL]
}
}
## For every Multiquant Qsession file:
## Enforced, strict rounding rule. Truncate to 7 decimals, convert to numeric,
## potentially lose trailing zeros, then round to two decimal places
suppressWarnings(
fileQsession[, `Retention Time` :=
as.character(
round(
as.numeric(
sprintf("%.7f", `Retention Time`)), 2))]
)
## Loop over each run within the Qsession
for(j in 1:length(fileQsession[, unique(`Sample Name`)])){
## Load individual run into a data table
{
runQsession <- fileQsession[`Sample Name` == unique(`Sample Name`)[j]]
runQsession <- runQsession[!is.na(`Retention Time`)]
runQsession[, `Retention Time` := as.numeric(`Retention Time`)]
}
## Create temporary copies of 'libCols' and 'runQsession'
## Convert any special characters into normal ones
{
matchLibCols <- colnames(libCols)
matchLibCols <- gsub("\\s+", ".", matchLibCols)
matchRunQsession <- runQsession[, unique(`Sample Name`)]
matchRunQsession <- gsub("\\s+", ".", matchRunQsession)
}
if(length(grep(matchRunQsession, matchLibCols, fixed = T)) == 0){
warning(paste0("Qsession file ", Qsession[i,1], " at run ", matchRunQsession, " did not match to the library."))
} else{
## Column index in 'libCols' that matches the Qsession run
indexLibCols <- grep(matchRunQsession, matchLibCols, fixed = T)
if(length(indexLibCols) > 1){
warning(paste0("Duplicate library sample run detected: "), matchRunQsession)
}
## Normalization based on input feature 'standard' if applicable
## CHECK LATER TO SEE IF THIS WORKS AS USUAL
if(length(grep("Area|Height", colnames(fileQsession), fixed = T)) > 0){
e <- paste("(", indexFeatureName, ")", sep="")
e <- parse(text=e)[[1]]
num <- grep(indexFeatureName, colnames(runQsession), fixed = T)
runQsession[, normalization := NA]
## Normalizing based on input parameter 'standard'
for(l in 1:nrow(standard)){
norm <- which(runQsession[, `Component Name` == standard[l][1]])[1] # Assumption of no duplicate sample names
norm <- as.numeric(runQsession[norm, indexFeatureName, with = F])
runQsession[library.Q3 == standard[l,2], "Normalization" := as.numeric(norm)]
}
runQsession[, colnames(runQsession)[num] := eval(e) / Normalization]
runQsession[, Normalization := NULL]
rm(norm)
}
## Normalize retention time to indicated standard
if(length(indexFeature) == 1 && indexFeature == 71){
# Normalizing based on input parameter 'standard'
for(l in 1:nrow(standard)){
norm <- which(runQsession[, `Component Name` == standard[l][1]])[1] # Assumption of no duplicate sample names
norm <- as.numeric(runQsession[norm, `Retention Time`])
runQsession[library.Q3 == standard[l,2], "Normalization" := as.numeric(norm)]
}
runQsession[, `Relative Retention Time` := `Retention Time` / Normalization]
runQsession[, Normalization := NULL]
}
## Matrices are easier to do value replacement below
runQsession <- as.matrix(runQsession)
## Find correct indices for lookup table to map new features to RT in 'libCols'
## NOTE: entries that do not match (like a mismatched RT between library/Multiquant
## file get converted to NA)
{
if(any(indexFeature %in% 71)){
oldMap <- grep("^Retention Time$", colnames(runQsession))
newMap <- grep("Relative Retention Time", colnames(runQsession), fixed = T)
} else {
oldMap <- grep("^Retention Time$", colnames(runQsession))
newMap <- grep(indexFeatureName, colnames(runQsession), fixed = T)
}
libQ1 <- grep("library.Q1", colnames(runQsession), fixed = T)
libQ3 <- grep("library.Q3", colnames(runQsession), fixed = T)
mapDT <- data.table(old=paste(runQsession[, libQ1],
runQsession[, libQ3],
runQsession[, oldMap],
sep = "\t"),
new=paste(runQsession[, libQ1],
runQsession[, libQ3],
runQsession[, newMap],
sep = "\t"))
rm(oldMap, newMap)
}
## For every matching library sample run:
## Enforced, strict rounding rule. Truncate to 7 decimals, convert to numeric,
## potentially lose trailing zeros, then round to two decimal places
suppressWarnings(
libFeature <- paste(library[, library.Q1],
library[, library.Q3],
sprintf("%.2f",as.numeric(as.character(
round(as.numeric(sprintf(
"%.7f", libCols[, indexLibCols[1]])), 2) ## Assumpton that if indexLibCols > 1, must be duplicates
))), sep = "\t"))
## Lookup table to replace library sample run transition RTs
## with the new feature
libFeature2 <- libFeature
libFeature <- mapDT[, new][match(unlist(libFeature), mapDT[, old])]
## Skip warning and feature mapping if we encounter duplicated
## sample run names in the Qsession files. Assumption that
## running the same Qsession file will fail on the second attempt
## because mapDT will yield only NA and therefore
## libCols[, indexLibCols] <- as.numeric(unlist(libFeature))
## will fail. This isn't necessarily true if the new feature is RT
## although I can't figure out exactly why. If you see an entire
## sample run with duplicate values, it must mean that there is another
## run in the Qsession file with the same matching name.
if(all(is.na(libFeature)) == T){
warning(paste0("Check that ", matchRunQsession, " is a unique sample run and not present in multiple Qsession files. Also check that this sample run is not duplicated in the library."))
} else {
if(sum(!(is.na(libFeature))) != nrow(mapDT)){
warning(paste0("The following transitions (Q1;_;Q3;_;RT) in ", Qsession[i, 1], " at run ", matchRunQsession, " did not map to the library: ", paste(mapDT[, old][which(!(mapDT[, old] %in% libFeature[which(!(is.na(libFeature)))]))], collapse = " AND ")))
## Loop over each Qsession transition that did not map to the library
for(a in 1:(nrow(mapDT)-sum(!(is.na(libFeature)))) ){
temp <- paste(strsplit(as.character(Qsession[i, 1]), "/")[[1]][length(strsplit(as.character(Qsession[i, 1]), "/")[[1]])],
matchRunQsession,
mapDT[, old][which(!(mapDT[, new] %in% libFeature[which(!(is.na(libFeature)))]))][a],
libFeature2[grep(strsplit(mapDT[, old][which(!(mapDT[, new] %in% libFeature[which(!(is.na(libFeature)))]))][a], "\t")[[1]][3], libFeature2, fixed = T)],
sep = "\t")
## Special case when mapDT (Qsession run) contains a duplicated Q1, Q3, and RT
## If this is the case, temp will return NA for Q1 and Q3 and blanks for everything else
## The dimension will be length(libFeature) long
## If this is the case, temp is set to mapDT$old and mapDT$new
if(is.na(mapDT[, old][which(!(mapDT[, new] %in% libFeature[which(!(is.na(libFeature)))]))][a])){
temp <- paste(strsplit(as.character(Qsession[i, 1]), "/")[[1]][length(strsplit(as.character(Qsession[i, 1]), "/")[[1]])],
paste0("duplicate_Qsession_Q1.Q3.RT_", matchRunQsession),
mapDT[which(duplicated(mapDT))][, old],
mapDT[which(duplicated(mapDT))][, new],
sep = "\t")
}
for(b in seq_along(temp)){
tempGT <- rbind(tempGT, temp[b])
}
#tempGT <- rbind(tempGT, temp)
}
}
## Only keep feature value; discard Q1 and Q3
libFeature <- as.data.table(libFeature)
libFeature[, libFeature := tstrsplit(libFeature, "\t")[3]]
libCols[, indexLibCols] <- as.numeric(unlist(libFeature))
}
}
}
}
cat(tempGT, file="~/Downloads/GT.txt", append=FALSE, sep = "\n")
## Replace library column names with library column names plus organism and
## matrix information
libCols <- as.data.table(libCols)
grepCol <- vector(mode = "numeric", length = ncol(libCols))
for(i in 1:ncol(libCols)){
if(length(grep(colnames(libCols)[i], libNamesMethod, fixed = T)) == 1){
grepCol[[i]] <- grep(colnames(libCols)[i], libNamesMethod, fixed = T)
} else{
stop("Weird. 'libCols' couldn't match to libNamesMethod. Check that the library sample run columns match the Qsession files and/or there are duplicate column names.")
}
}
## Re-map column names with matrix and mass spec method information
colnames(libCols) <- libNamesMethod[grepCol]
## Merge library columns with new feature information replacing RT
## Also add a brand new index column
library <- cbind(library, libCols)
library <- data.table(Index = c(1:dim(library)[1]), library)
rm(libCols, libNamesMethod, grepCol)
return(library)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.