R/featureLibrary.R

featureLibrary <- function(lipidType,
                           nameQsession,
                           nameLibrary,
                           indexFeature,
                           standard){

    tempGT <- paste("Qsession\t", "Run\t",
                    "MQ.Q1\t", "MQ.Q3\t", "MQ.RT\t",
                    "Library.Q1\t", "Library.Q3\t", "Library.RT")

    ## Important library column names other than Mass Info and sample runs
    libNames <- c("GeneralID",
                  "Family",
                  "Q1_Modification_Value",
                  "Barcode",
                  "Structural.ID.by.EPI-IDA",
                  "MolecularID",
                  "TotalCarbonID",
                  "ExactMass")

    ## Error checking inputs
    if(!(lipidType %in% c("SPH", "GPC")) == T){
        stop("lipidType class parameter must be 'SPH' or 'GPC'")
    }
    if(!(file.exists(nameQsession))){
        stop("nameQsession spreadsheet does not exist or the path is incorrect.")
    }
    if(!(file.exists(nameLibrary))){
        stop("nameLibrary spreadsheet does not exist or the path is incorrect.")
    }
    if(any(!(indexFeature %in% c(56, 38, 43, 64, 71, 82, 83))) == T ){
        stop("indexFeature must be from the following: RT(56), Area Ratio(38), Height Ratio(43), Full Width 50Percent(64), Relative RT(71), Tailing Factor(82), Assymmetry Factor(83)")
    }
    if(any(indexFeature %in% c(38, 43, 71)) == T && !(exists("standard"))){
        stop("standard parameter must be filled.")
    }

    ## Organism/Matrix in order of library sample runs
    ## THIS IS HARD-CODED!
    {
        if(!(lipidType %in% c("SPH", "GPC"))){
            stop("Lipid class must be from the following:\n'SPH' 'GPC' ")
        } else if(lipidType == "SPH"){
            libOrgMat <-c("1;_;Human.Blood", "2;_;Mouse.Brain",
                          "3;_;Human.Blood", "4;_;Mouse.Brain",
                          "5;_;Human.Fibroblast", "6;_;Human.Lymphoblast",
                          "7;_;Human.Brain", "8;_;Human.Blood",
                          "9;_;Human.CSF", "10;_;Mouse.Brain",
                          "11;_;Mouse.Brain", "12;_;Mouse.Brain",
                          "13;_;Mouse.Brain", "14;_;Mouse.Macrophage",
                          "15;_;Mouse.Brain", "16;_;Mouse.Brain",
                          "17;_;Mouse.Blood", "18;_;Mouse.Brain",
                          "19;_;Mouse.Brain", "20;_;Mouse.Synaptosome",
                          "21;_;Mouse.Brain", "22;_;Mouse.Brain",
                          "23;_;Media", "24;_;Human.Fibroblast",
                          "25;_;Mouse.Brain", "26;_;Mouse.Oligodendrocyte")
        } else if(lipidType == "GPC"){
            libOrgMat <- c("1;_;Mouse.Brain",        "2;_;Human.Brain",
                           "3;_;Human.Blood",        "4;_;Human.Blood",
                           "5;_;Mouse.Brain",        "6;_;Mouse.Brain",
                           "7;_;Mouse.Brain",        "8;_;Mouse.Brain",
                           "9;_;Mouse.Brain",        "10;_;Mouse.Brain",
                           "11;_;Mouse.Blood",       "12;_;Mouse.Blood",
                           "13;_;Mouse.Blood",       "14;_;Mouse.Blood",
                           "15;_;Human.Blood",       "16;_;Mouse.Macrophage",
                           "17;_;Human.Brain",       "18;_;Mouse.Brain",
                           "19;_;Mouse.Synaptosome", "20;_;Mouse.Brain",
                           "21;_;Mouse.Brain",       "22;_;Mouse.Brain",
                           "23;_;Mouse.Brain",       "24;_;Mouse.Brain",
                           "25;_;Mouse.Brain",       "26;_;Mouse.Blood",
                           "27;_;Mouse.Brain",       "28;_;Mouse.Brain")
        }
    }

    ##Label all runs with acquisition method and matrix
    {
        library <- read.xlsx(nameLibrary, rows = c(1),
                             skipEmptyCols = T,
                             skipEmptyRows = F)
        ## HARD-CODED keywords to look for "spacer" column separating projects!
        temp <- grep("Mass.Info.Method|Qtrap5500_Agilent1290", colnames(library))

        ## Project spacer columns are separated by at least 1 sample run. If
        ## 'temp' contains consecutive elements, they are disregarded as project
        ## spacer columns. First few columns in the library are excluded, for
        ## example
        if(length(which(diff(temp) == 1)) > 0 && length(temp) != length(libOrgMat)){
            temp <- c(temp[diff(temp) != 1][-1], temp[length(temp)])
        }

        if(length(libOrgMat) > length(temp) && grepl("testthat", nameLibrary) == F){
            stop("Check library has same number of Mass Info spacer columns as data set summary columns. Data set summary columns are stored as a vector of organism/matrix descriptions in 'libOrgMat'.")
        } else if(length(temp) == 0){
            stop("Multiquant Qsession file(s) does not contain any sample runs that match the library.")
        } else if(length(temp) < length(libOrgMat) && length(temp) > 0 &&
                grepl("testthat", nameLibrary) == F){
            stop(paste0("Library sample runs are not present within all Multiquant Qsession file(s). ", round(length(temp)/length(libOrgMat), 2), "% of the Multiquant sample runs are present in the library."))
        }

        ## Concatenate acquisition method with all columns to the right of it
        ## This gives us the mass spec method and matrix information for each run
        libNamesMethod <- colnames(library)
        j <- 1
        for(i in (temp[1]+1):length(colnames(library))){
            if((i %in% temp) == TRUE){
                j <- j + 1
            } else{
                libNamesMethod[i] <- paste(colnames(library[i]),
                                           ";_;",
                                           libOrgMat[j],
                                           ";_;",
                                           colnames(library)[temp[j]],
                                           sep = "")
            }
        }
        rm(temp, libOrgMat, i, j)
    }

    ## Load and collapse multiple MRM methods to determine mass info
    {
        temp <- grep("Mass.Info", as.character(colnames(library)))
        tempNames <- colnames(library)[temp]
        rm(library)
        libMethod <- read.xlsx(nameLibrary,
                               skipEmptyCols = T,
                               skipEmptyRows = F)
        libMethod <- as.data.table(libMethod[, temp])
        colnames(libMethod) <- tempNames
        libMethod <- libMethod[, as.vector(lapply(libMethod, function(x) sum(!is.na(x)) / length(x) ) > 0.9), with=F]
        libMethod[libMethod == "x"] <- NA
        #library <- as.character(colnames(library))
        #temp <- grep("Mass.Info", library)
        #rm(library)

        #libMethod <- read.xlsx(nameLibrary, cols = c(temp),
        #                       skipEmptyCols = T,
        #                       skipEmptyRows = F)
        #libMethod <- libMethod[lapply(libMethod, function(x) sum(!is.na(x)) / length(x) ) > 0.9]
        #libMethod[libMethod == "x"] <- NA

        ## Remove "HELPER" columns if they exist
        dropRow <- 0 # Keep in environment
        while(any(grep("/", libMethod[1,])) == F){
            dropRow <- dropRow + 1
            libMethod <- as.data.frame(libMethod[-1,])
        }

        ## Create a 'temp' vector with a single mass info for each method
        libMethod <- as.data.table(libMethod)
        temp <- vector(mode = "character", length = nrow(libMethod))
        for(i in seq_along(temp)){
            if(all(is.na(unique(unlist(libMethod[i, ])))) == F){
                temp[i] <- as.character(unique(unlist(libMethod[i, ]))[
                    which(!is.na(unique(unlist(libMethod[i, ]))))])
            } else {
                temp[i] = NA
            }
        }

        ## Specific format to GPC because there is only 1 MRM method
        #if(lipidType == "GPC"){
        #    temp <- library
        #    colnames(temp) <- "temp"
        #}

        ## Split "Mass.Info" into Q1 and Q3 as numerics
        ## 'libMassInfo' holds the Q1/Q3 columns in the library
        libMassInfo <- as.data.table(temp)
        libMassInfo[, c("library.Q1", "library.Q3") := tstrsplit(temp, "/")]
        libMassInfo[, library.Q1 := as.numeric(library.Q1)]
        libMassInfo[, library.Q3 := as.numeric(library.Q3)]
        libMassInfo[, temp := NULL]
        rm(temp, i)
    }

    ## Get library columns like validation level, exact mass, GeneralID, family, etc.
    ## These columns are from 'libNames'
    ## Library columns of interest are stored in 'libCols'
    {
        libCols <- read.xlsx(nameLibrary, rows = c(1),
                             skipEmptyCols = T,
                             skipEmptyRows = F)
        temp <- vector(mode = "character", length = length(libNames))
        for(i in seq_along(temp)){
            temp[i] <- grep(libNames[i], colnames(libCols), fixed = T)
        }

        libCols <- read.xlsx(nameLibrary, cols = c(temp),
                             skipEmptyCols = T,
                             skipEmptyRows = F)

        ## Remove "HELPER" columns if they exist
        if(dropRow > 0){
            libCols <- libCols[-(1:dropRow),]
        }
        rm(i)
    }

    ## Merge mass info columns with library columns
    ## At this point, 'library' holds Q1/Q3 mass info and other important
    ## library columns except run information
    {
        if(nrow(libCols) == nrow(libMassInfo)){
            library <- cbind(libMassInfo, libCols)
            rm(libMassInfo, libCols, libNames)
        } else{
            stop("Number of mass info rows do not match the number of rows in
                 the 'libNames' columns.")
        }
        rm(libMethod)
    }

    ## Get Q1 and Q3 for later below with 'libCols' and 'libColsOrder'
    libBarcode <- data.table(library.Q1=as.character(library[, library.Q1]),
                             library.Q3=as.character(library[, library.Q3]))

    ## Load file containing all Qsession links and store names of unique runs
    {
        Qsession <- fread(nameQsession, sep = "\t",
                          header = T, strip.white = T)
        temp <- vector("list", length = nrow(Qsession))
        for(i in seq_along(temp)){
            temp[i] <- unique(fread(as.character(Qsession[i,1, with=F]),
                                    select = c(4), sep = "\t", header = T))
        }
        temp <- unlist(temp)

        ## Convert special characters for consistency for all unique column name runs
        temp <- gsub("\\s+", ".", temp)
        rm(i)
    }

    ## Read library and convert special characters in column names for consistency
    {
        libCols <- read.xlsx(nameLibrary,
                             skipEmptyCols = T,
                             skipEmptyRows = F)
        colnames(libCols) <- gsub("\\s+", ".", colnames(libCols))
    }

    ## Find library indices of run names matching Qsession files
    {
        for(i in seq_along(temp)){
            if(length(grep(temp[i], colnames(libCols), fixed=T)) == 0){
                warning(paste0("Sample ", temp[i], " in", Qsession, " did not map to the library"))
                temp[i] <- NA
            } else if(length(grep(temp[i], colnames(libCols), fixed=T)) == 1){
                temp[i] <- grep(temp[i], colnames(libCols), fixed = T)
            } else if(length(grep(temp[i], colnames(libCols), fixed=T)) > 1){
                stop("A sample run in the Qsession files mapped to two or more sample runs in the library. Check that the library contains no duplicated sample run columns with the same name.")
            }
        }
        temp <- as.numeric(temp)
        temp <- temp[!is.na(temp)]

        ## Only keep unique sample run indices in the library. It is possible that the same
        ## sample run is present in multiple Qsession files (as a template)
        temp <- temp[!duplicated(temp)]
        rm(i)
    }

    ## Load library and only keep matching Qsession columns
    {
        ## Note how the column names in 'libCols' are unchanged!
        libCols <- read.xlsx(nameLibrary,
                             skipEmptyCols = T,
                             skipEmptyRows = F)
        libCols <- as.data.table(libCols)
        libCols <- libCols[, temp, with=F]

        ## Remove "HELPER" column if it exists
        if(dropRow > 0){
            libCols <- libCols[-(1:dropRow),]
        }
        rm(temp, dropRow)
    }

    ## Convert to matrix
    libCols <- as.matrix(libCols)
    class(libCols) <- "numeric"

    ## 'indexFeatureName' holds the name of the feature of interest
    {
        if(indexFeature == 71){
            fileQsession <- fread(as.character(Qsession[1, 1, with=F]),
                                  sep="\t", header = T, na.strings = "N/A",
                                  select = c(56))
        } else{
            fileQsession <- fread(as.character(Qsession[1, 1, with=F]),
                                  sep="\t", header = T, na.strings = "N/A",
                                  select = c(indexFeature))
        }
        indexFeatureName <- colnames(fileQsession)
    }

    ## Check if 'indexFeature' is something we need to calculate with a
    ## standard (stored in column 17 of the Qsession)
    if(length(grep("Area|Height", colnames(fileQsession))) > 0){
        indexFeature <- c(17, indexFeature)
    }

    ## For each Qsession file and individual run, map the retention times in the
    ## library with the feature of interest
    for(i in 1:nrow(Qsession)){
        ## Load Qsession file
        {
            if(length(indexFeature) > 1){
                fileQsession <- fread(as.character(Qsession[i, 1, with=F]),
                                      sep="\t", header = T, na.strings = "N/A",
                                      select = c(4, 21, 56, indexFeature))
            } else if(indexFeature == 56){
                fileQsession <- fread(as.character(Qsession[i, 1, with=F]),
                                      sep="\t", header = T, na.strings = "N/A",
                                      select = c(4, 21, 56))
            } else if(indexFeature == 71){
                # For relative RT
                fileQsession <- fread(as.character(Qsession[i, 1, with=F]),
                                      sep="\t", header = T, na.strings = "N/A",
                                      select = c(4, 21, 56, 17))
            } else {
                fileQsession <- fread(as.character(Qsession[i, 1, with=F]),
                                      sep="\t", header = T, na.strings = "N/A",
                                      select = c(4, 21, 56, indexFeature))
            }
        }

        ## Split Qsession Mass Info into Q1 and Q3. Q3 is necessary for any normalization calculations
        {
            ## Remove spaces
            fileQsession[, "Mass Info" := gsub("\\s+", "", `Mass Info`)]

            ## Mass info must follow regex format: triple/quadruple number and
            ## optional period and/or number followed by forward slash and
            ## triple/quadruple number and optional period and/or number
            #fileQsession[, "temp" := grepl("[0-9]{3,4}[\\.]?[0-9]{0,1}/[0-9]{3,4}[\\.]?[0-9]{0,1}", `Mass Info`)]
            fileQsession[, "temp" := grepl("^[0-9]{3,4}[.][0-9]/[0-9]{3,4}[.][0-9]$", `Mass Info`)]
            if(all(fileQsession[, temp]) == F){
                stop(paste0("Check that 'Mass Info' column in ", Qsession[i,1], " is formatted correctly."))
            } else{
                fileQsession[, temp := NULL]
                fileQsession[, c("library.Q1", "library.Q3") := tstrsplit(`Mass Info`, "/")[1:2]]
                fileQsession[, library.Q1 := as.character(as.numeric(library.Q1))]
                fileQsession[, library.Q3 := as.character(as.numeric(library.Q3))]
                fileQsession[, `Mass Info` := NULL]
            }
        }

        ## For every Multiquant Qsession file:
        ## Enforced, strict rounding rule. Truncate to 7 decimals, convert to numeric,
        ## potentially lose trailing zeros, then round to two decimal places
        suppressWarnings(
            fileQsession[, `Retention Time` :=
                             as.character(
                                 round(
                                     as.numeric(
                                         sprintf("%.7f", `Retention Time`)), 2))]
        )

        ## Loop over each run within the Qsession
        for(j in 1:length(fileQsession[, unique(`Sample Name`)])){

            ## Load individual run into a data table
            {
                runQsession <- fileQsession[`Sample Name` == unique(`Sample Name`)[j]]
                runQsession <- runQsession[!is.na(`Retention Time`)]
                runQsession[, `Retention Time` := as.numeric(`Retention Time`)]
            }

            ## Create temporary copies of 'libCols' and 'runQsession'
            ## Convert any special characters into normal ones
            {
                matchLibCols <- colnames(libCols)
                matchLibCols <- gsub("\\s+", ".", matchLibCols)
                matchRunQsession <- runQsession[, unique(`Sample Name`)]
                matchRunQsession <- gsub("\\s+", ".", matchRunQsession)
            }

            if(length(grep(matchRunQsession, matchLibCols, fixed = T)) == 0){
                warning(paste0("Qsession file ", Qsession[i,1], " at run ", matchRunQsession, " did not match to the library."))
            } else{
                ## Column index in 'libCols' that matches the Qsession run
                indexLibCols <- grep(matchRunQsession, matchLibCols, fixed = T)

                if(length(indexLibCols) > 1){
                    warning(paste0("Duplicate library sample run detected: "), matchRunQsession)
                }

                ## Normalization based on input feature 'standard' if applicable
                ## CHECK LATER TO SEE IF THIS WORKS AS USUAL
                if(length(grep("Area|Height", colnames(fileQsession), fixed = T)) > 0){
                    e <- paste("(", indexFeatureName, ")", sep="")
                    e <- parse(text=e)[[1]]
                    num <- grep(indexFeatureName, colnames(runQsession), fixed = T)
                    runQsession[, normalization := NA]

                    ## Normalizing based on input parameter 'standard'
                    for(l in 1:nrow(standard)){
                        norm <- which(runQsession[, `Component Name` == standard[l][1]])[1] # Assumption of no duplicate sample names
                        norm <- as.numeric(runQsession[norm, indexFeatureName, with = F])
                        runQsession[library.Q3 == standard[l,2], "Normalization" := as.numeric(norm)]
                    }
                    runQsession[, colnames(runQsession)[num] := eval(e) / Normalization]
                    runQsession[, Normalization := NULL]
                    rm(norm)
                }

                ## Normalize retention time to indicated standard
                if(length(indexFeature) == 1 && indexFeature == 71){

                    # Normalizing based on input parameter 'standard'
                    for(l in 1:nrow(standard)){
                        norm <- which(runQsession[, `Component Name` == standard[l][1]])[1] # Assumption of no duplicate sample names
                        norm <- as.numeric(runQsession[norm, `Retention Time`])
                        runQsession[library.Q3 == standard[l,2], "Normalization" := as.numeric(norm)]
                    }
                    runQsession[, `Relative Retention Time` := `Retention Time` / Normalization]
                    runQsession[, Normalization := NULL]
                }

                ## Matrices are easier to do value replacement below
                runQsession <- as.matrix(runQsession)

                ## Find correct indices for lookup table to map new features to RT in 'libCols'
                ## NOTE: entries that do not match (like a mismatched RT between library/Multiquant
                ## file get converted to NA)
                {
                    if(any(indexFeature %in% 71)){
                        oldMap <- grep("^Retention Time$", colnames(runQsession))
                        newMap <- grep("Relative Retention Time", colnames(runQsession), fixed = T)
                    } else {
                        oldMap <- grep("^Retention Time$", colnames(runQsession))
                        newMap <- grep(indexFeatureName, colnames(runQsession), fixed = T)
                    }
                    libQ1 <- grep("library.Q1", colnames(runQsession), fixed = T)
                    libQ3 <- grep("library.Q3", colnames(runQsession), fixed = T)
                    mapDT <- data.table(old=paste(runQsession[, libQ1],
                                                  runQsession[, libQ3],
                                                  runQsession[, oldMap],
                                                  sep = "\t"),
                                        new=paste(runQsession[, libQ1],
                                                  runQsession[, libQ3],
                                                  runQsession[, newMap],
                                                  sep = "\t"))
                    rm(oldMap, newMap)
                }

                ## For every matching library sample run:
                ## Enforced, strict rounding rule. Truncate to 7 decimals, convert to numeric,
                ## potentially lose trailing zeros, then round to two decimal places
                suppressWarnings(
                    libFeature <- paste(library[, library.Q1],
                                        library[, library.Q3],
                                        sprintf("%.2f",as.numeric(as.character(
                                        round(as.numeric(sprintf(
                                        "%.7f", libCols[, indexLibCols[1]])), 2) ## Assumpton that if indexLibCols > 1, must be duplicates
                                        ))), sep = "\t"))

                ## Lookup table to replace library sample run transition RTs
                ## with the new feature
                libFeature2 <- libFeature
                libFeature <- mapDT[, new][match(unlist(libFeature), mapDT[, old])]

                ## Skip warning and feature mapping if we encounter duplicated
                ## sample run names in the Qsession files. Assumption that
                ## running the same Qsession file will fail on the second attempt
                ## because mapDT will yield only NA and therefore
                ## libCols[, indexLibCols] <- as.numeric(unlist(libFeature))
                ## will fail. This isn't necessarily true if the new feature is RT
                ## although I can't figure out exactly why. If you see an entire
                ## sample run with duplicate values, it must mean that there is another
                ## run in the Qsession file with the same matching name.
                if(all(is.na(libFeature)) == T){
                    warning(paste0("Check that ", matchRunQsession, " is a unique sample run and not present in multiple Qsession files. Also check that this sample run is not duplicated in the library."))
                } else {
                    if(sum(!(is.na(libFeature))) != nrow(mapDT)){
                        warning(paste0("The following transitions (Q1;_;Q3;_;RT) in ", Qsession[i, 1], " at run ", matchRunQsession, " did not map to the library: ", paste(mapDT[, old][which(!(mapDT[, old] %in% libFeature[which(!(is.na(libFeature)))]))], collapse = " AND ")))
                        ## Loop over each Qsession transition that did not map to the library
                        for(a in 1:(nrow(mapDT)-sum(!(is.na(libFeature)))) ){
                            temp <- paste(strsplit(as.character(Qsession[i, 1]), "/")[[1]][length(strsplit(as.character(Qsession[i, 1]), "/")[[1]])],
                                          matchRunQsession,
                                          mapDT[, old][which(!(mapDT[, new] %in% libFeature[which(!(is.na(libFeature)))]))][a],
                                          libFeature2[grep(strsplit(mapDT[, old][which(!(mapDT[, new] %in% libFeature[which(!(is.na(libFeature)))]))][a], "\t")[[1]][3], libFeature2, fixed = T)],
                                          sep = "\t")
                            ## Special case when mapDT (Qsession run) contains a duplicated Q1, Q3, and RT
                            ## If this is the case, temp will return NA for Q1 and Q3 and blanks for everything else
                            ## The dimension will be length(libFeature) long
                            ## If this is the case, temp is set to mapDT$old and mapDT$new
                            if(is.na(mapDT[, old][which(!(mapDT[, new] %in% libFeature[which(!(is.na(libFeature)))]))][a])){
                                temp <- paste(strsplit(as.character(Qsession[i, 1]), "/")[[1]][length(strsplit(as.character(Qsession[i, 1]), "/")[[1]])],
                                              paste0("duplicate_Qsession_Q1.Q3.RT_", matchRunQsession),
                                              mapDT[which(duplicated(mapDT))][, old],
                                              mapDT[which(duplicated(mapDT))][, new],
                                              sep = "\t")
                            }
                            for(b in seq_along(temp)){
                                tempGT <- rbind(tempGT, temp[b])
                            }
                            #tempGT <- rbind(tempGT, temp)
                        }
                    }

                    ## Only keep feature value; discard Q1 and Q3
                    libFeature <- as.data.table(libFeature)
                    libFeature[, libFeature := tstrsplit(libFeature, "\t")[3]]
                    libCols[, indexLibCols] <- as.numeric(unlist(libFeature))
                }
            }
        }
    }

    cat(tempGT, file="~/Downloads/GT.txt", append=FALSE, sep = "\n")

    ## Replace library column names with library column names plus organism and
    ## matrix information
    libCols <- as.data.table(libCols)
    grepCol <- vector(mode = "numeric", length = ncol(libCols))
    for(i in 1:ncol(libCols)){
        if(length(grep(colnames(libCols)[i], libNamesMethod, fixed = T)) == 1){
            grepCol[[i]] <- grep(colnames(libCols)[i], libNamesMethod, fixed = T)
        } else{
            stop("Weird. 'libCols' couldn't match to libNamesMethod. Check that the library sample run columns match the Qsession files and/or there are duplicate column names.")
        }
    }

    ## Re-map column names with matrix and mass spec method information
    colnames(libCols) <- libNamesMethod[grepCol]

    ## Merge library columns with new feature information replacing RT
    ## Also add a brand new index column
    library <- cbind(library, libCols)
    library <- data.table(Index = c(1:dim(library)[1]), library)
    rm(libCols, libNamesMethod, grepCol)

    return(library)
}
jchitpin/blistR documentation built on July 8, 2019, 6:29 p.m.