R/studyDefinition.R

Defines functions studyDefinitionExtensionSettings quanRatios quanRatio replicateInfo samplesInfo sampleInfo sampleGroupIn studyDefinitionExtensions getCorrections studyDefinitionSamples studyDefinitionQuanMethods studyDefinitionFileSets studyDefinitionFactors analysisDefinition intoTable

Documented in analysisDefinition studyDefinitionExtensions studyDefinitionExtensionSettings studyDefinitionFactors studyDefinitionFileSets studyDefinitionQuanMethods studyDefinitionSamples

#' internal function that transforms/transposes a set of named lists into a
#'  data.frame with the list names as column names
#'  
#' @param xmlList the named set of lists (can be just a single one) to be
#'  transformed. Usually (part of) the output of xmlToList from the
#'  XML package. Please note that all lists should have the same element
#'  names. If not the resulting data.frame will most likely have a weird
#'  strucutre with (partially) NA filled columns
#' @param elementName allows for the selection of named elements of the list
#'  object
#' @param leaveout default value is ".attrs", usually this is the closing
#'  element of a set of lists coming from xmlToList
#'  
#' @return data.frame
#' 
#' @noRd
intoTable <- function(xmlList, elementName = NA, leaveOut = ".attrs"){
  if (identical(elementName, NA)){
    if (leaveOut %in% names(xmlList)){
      # multi line xmlTable
      numberofElements <- length(xmlList)-1
      return(dplyr::bind_rows(
        lapply(1:numberofElements,
               function(x){
                 xmlList[[x]]
               })))
    } else {
      # single line xmlTable
      return(dplyr::bind_rows(xmlList))
    }
  } else {
    return(dplyr::bind_rows(xmlList[names(xmlList) == elementName]))
  }
}

#' function that gets the first element of the AnalysisDefinitionXML
#'  column from the AnalysisDefinition table in a .pdResult file
#'  
#' @param db database access 'handle' pointing to a .pdResult file
#' @return a named tree like list that contains the info like file names,
#'  study factors, correction factors, etc etc
#'  
#' @export
analysisDefinition <- function(db){
  return(XML::xmlToList((
    dbGetTable(db = db,
               tablename = "AnalysisDefinition"))$AnalysisDefinitionXML[1]))
}

#' function that extracts the factors used in the study to generate the
#'  .pdResult file. The result contains some internal info in the form
#'  of columns named id (identifiers).
#'  
#' @param analysisDef generated by the analysisDefinition function
#' 
#' @return data.frame with the info
#' 
#' @export
studyDefinitionFactors <- function(analysisDef){
  if (length(analysisDef$StudyDefinition$Factors) == 0){
    return(NA)
  } else {
    result <- dplyr::bind_rows(
      lapply(
        analysisDef$StudyDefinition$Factors, function(x){x[[".attrs"]]})) %>%
      dplyr::select("Kind", "Name", "Description", "Id") %>%
      dplyr::rename_with(tolower) %>%
      dplyr::rename(type = "kind")
    result$factors <- unname(lapply(analysisDef$StudyDefinition$Factors,
                        function(x){dplyr::bind_rows(x$FactorOptions) %>%
                            dplyr::select("Kind", "Value", "Id") %>%
                            dplyr::rename_with(tolower) %>%
                            dplyr::rename(type = "kind")}))
    return(result)
  }
}

#' function that extracts file information on the original .raw files
#'  used to generate the .pdResult file. Information includes the
#'  original file name, location & size. It also contains some internal info in
#'  the form of columns named id (identifiers).
#'  
#' @param analysisDef generated by the analysisDefinition function
#' @param splitFileSize boolean (default: TRUE), specifies if the FileSize
#'  column should be split into the actual file size (still a character vector)
#'  and the file size format
#' @param joinedTables boolean (default: TRUE), specifies if all info should be
#'  put in a single data.frame. If FALSE it will generate a list of two
#'  data.frame objects; this might be useful in some scenarios
#'  
#' @return data.frame or list of two data.frame objects
#' 
#' @export 
studyDefinitionFileSets <- function(analysisDef,
                                    splitFileSize = TRUE,
                                    joinedTables = TRUE){
  if (length(analysisDef$StudyDefinition$FileSets) == 0){
    return(NA)
  } else {
    result <- list()
    result[[1]] <- dplyr::bind_rows(lapply(analysisDef$StudyDefinition$FileSets, function(x){x[[".attrs"]]}))
    result[[2]] <- dplyr::bind_rows(lapply(analysisDef$StudyDefinition$FileSets, function(x){x[["Files"]]$File}))
    if (splitFileSize){
      result[[2]] <- result[[2]] %>%
        tidyr::separate(col = "FileSize", into = c("FileSize","FileSizeFormat"), sep = " ")
    }
    if (joinedTables){
      return(dplyr::bind_cols(result))
    } else {
      return(result)
    }
  }
}

#' function that extracts quantification method information if a quantification
#'  method was used to generate the .pdResult file
#'  
#' @param analysisDef generated by the analysisDefinition function
#' 
#' @return A list of two data.frame objects. The first one will contain the
#'  name, description, etc. The second one will specify the names of the labels
#'  used. The result will be NA in the case that no quantification method was
#'  used.
#'  
#' @export
studyDefinitionQuanMethods <- function(analysisDef){
  if (length(analysisDef$StudyDefinition$QuanMethods) == 0){
    return(NA)
  } else {
    result <- list()
    result[[1]] <- dplyr::bind_rows(analysisDef$StudyDefinition$QuanMethods[["QuanMethod"]][[".attrs"]])
    result[[2]] <- dplyr::bind_rows(analysisDef$StudyDefinition$QuanMethods[["QuanMethod"]][["QuanChannels"]])
    return(result)
  }
}

#' function that extracts sample information. The information seems to be a bit
#'  redundant, as the info is also seen in other tables.
#'  
#' @param analysisDef generated by the analysisDefinition function
#' 
#' @return a data.frame
#'  
#' @export
studyDefinitionSamples <- function(analysisDef){
  return(dplyr::bind_rows(lapply(analysisDef$StudyDefinition$Samples,
                                 function(x){x[[".attrs"]]})))
}

#' internal function used by \empf{studyDefinitionExtensions}. Extracts the
#'  correction factors for the mass tags/reporter ions from the provided XML
#'  object
#'  
#' @param resultXML xmlobject containing the correction factor information
#'  
#' @return a data.frame of the isoptic correction factors
#' 
#' @noRd
getCorrections <- function(resultXML){
  elements <- lapply(resultXML[["MethodPart"]],
                     function(x){
                       if (".attrs" %in% names(x)){
                         return(x[[".attrs"]])
                       } else {
                         return(NA)
                       }})
  elements <- elements[!is.na(elements)]
  methodParts <- resultXML[["MethodPart"]][which(names(elements) == "MethodPart")]
  correctionTable <- data.frame()
  for (counter in 1:length(methodParts)){
    rowElements <- unlist(lapply(1:4, function(x){methodParts[[counter]][[x]][["text"]]}))
    names(rowElements) <- unname(unlist(lapply(1:4, function(x){methodParts[[counter]][[x]][[".attrs"]]})))
    correctionTable <- dplyr::bind_rows(correctionTable,rowElements)
  }
  correctionFactors <- list()
  for (counter in 1:length(methodParts)){
    correctionFactors[[counter]] <- data.frame()
    for (counter2 in 1:(length(methodParts[[counter]])-1)){
      rowElements <- lapply(1:(length(methodParts[[counter]][["MethodPart"]][[counter2]])-1),
                            function(z){methodParts[[counter]][["MethodPart"]][[counter2]][[z]][["text"]]})
      names(rowElements) <- unname(unlist(lapply(1:(length(methodParts[[counter]][["MethodPart"]][[counter2]])-1), function(z){methodParts[[counter]][["MethodPart"]][[counter2]][[z]][[".attrs"]]})))
      correctionFactors[[counter]] <- dplyr::bind_rows(correctionFactors[[counter]], dplyr::bind_rows(rowElements))
    }
  }
  for (counter in 1:length(correctionFactors)){
    namesCorrections <- correctionFactors[[counter]]$CorrectionFactorName 
    correctionFactors[[counter]] <- correctionFactors[[counter]]$Factor
    names(correctionFactors[[counter]]) <- namesCorrections
  }
  result <- dplyr::bind_cols(correctionTable, dplyr::bind_rows(correctionFactors)) %>%
    dplyr::select(-!!rlang::sym("TagID"),-rlang::sym("IsActive"))
  colnames(result)[1:2] <- c("MassTag","ReporterIon")
  return(result)
}

#' function that extracts information on isotope corrections (if available)
#' 
#' @param analysisDef generated by the analysisDefinition function
#' @param correctXML can only have two different values: NA or a two element
#'  character vector c("utf-16","utf-8"). During the research into the
#'  method descriptions in the XML object it was noticed that the
#'  XML::xmlToList gave an error Document labelled UTF-16 but has
#'  UTF-8 content. This was solved by replacing the 'utf-16' string by 'utf-8'
#'  string in the XML object. This may be a country specific issue, so the
#'  function allows setting this parameter to NA will not do the replacement.
#' 
#' @return NA or a list of two data.frame objects
#'  
#' @export
studyDefinitionExtensions <- function(analysisDef, correctXML = c("utf-16","utf-8")){
  if (length(analysisDef$StudyDefinition$StudyDefinitionExtensions[["StudyDefinitionExtension"]][["QuantitationMethods"]]) == 1){
    return(NA)
  } else {
    result <- analysisDef$StudyDefinition$StudyDefinitionExtensions[["StudyDefinitionExtension"]][["QuantitationMethods"]][["QuantitationMethod"]][["text"]]
    if (identical(correctXML, NA)){
      return(XML::xmlToList(result))
    } else {
      resultXML <- XML::xmlToList(stringr::str_replace(result,
                                                       pattern = correctXML[1],
                                                       replacement = correctXML[2]))
      result <- list()
      result[[1]] <- dplyr::bind_rows(resultXML[[".attrs"]])
      result[[2]] <- getCorrections(resultXML)
      names(result) <- c("Method","CorrectionFactors" )
      return(result)
    }
  }
}

#' internal function used by the quanRatio function to separate info from
#'  a quanRatio numerator or denominator
#'  
#' @param numOrdenom either a numerator or denominator element
#' @param numerator boolean: if TRUE then numOrdenom is a numerator, otherwise
#'  it's a denominator
#'  
#' @return a list of information to be further processed by quanratio
#' 
#' @noRd
sampleGroupIn <- function(numOrdenom, numerator){
  if (sum(
    unlist(
      lapply(names(numOrdenom),
             function(x){
               return(!(x %in% c("StudyVariablesForGrouping","Samples","Replicates",".attrs" )))}))) != 0){
    warning("Possible unusual numerator/denominator element")
  }
  x <- list(numOrdenom$StudyVariablesForGrouping %>%
              as.data.frame() %>%
              dplyr::slice(-1) %>% # remove Version
              dplyr::rename_at("StudyVariable",
                               ~ifelse(numerator,
                                       "Numerator",
                                       "Denominator")), # rename column
            numOrdenom$Samples,
            numOrdenom$Replicates)  # always seems to be NULL
  names(x) <- names(numOrdenom)[1:3]
  return(x)
}

#' internal function which extracts the (part of the) info on a numerator or
#'  denominator from a quanRatio element
#'  
#' @param sampleInfos numerator or denominator sample element from a quanRatio
#' 
#' @noRd
sampleInfo <- function(sampleInfos){
  result <- list()
  result[[1]] <- dplyr::bind_rows(sampleInfos$StudyVariables)
  result[[2]] <- dplyr::bind_rows(sampleInfos[[".attrs"]])
  names(result) <- c("info1", "info2")
  return(result)
}

#' internal function to extract info about the numerator or denominator samples
#'  (puts together also info from sampleInfos function)
#' @param numOrDenom numerator or denominator sample element from a quanRatio
#'
#' @return data.frame with the info on the ratio numerator or denominator,
#'  among other things it specifies the samples/files used
#' 
#' @noRd
samplesInfo <- function(numOrDenom){
  tempResult <- lapply(numOrDenom, sampleInfo)
  result <- dplyr::bind_rows(lapply(tempResult, function(x){x$info2})) %>%
    dplyr::select(-rlang::sym("Version")) %>%
    dplyr::select("SampleName", tidyselect::everything())
  for (counter in 1:length(tempResult)){
    tempResult[[counter]]$info1 <- tempResult[[counter]]$info1 %>%
      dplyr::select(-rlang::sym("Version"), -rlang::sym("Kind")) %>%
      dplyr::rename_at("Value", ~paste(c("Value",1), collapse = "")) %>%
      dplyr::mutate(Name =  stringr::str_replace_all(!!rlang::sym("Name"), pattern = " ",
                                                     replacement = "_")) %>%
      t() %>%
      data.frame()
    colnames(tempResult[[counter]]$info1) <- tempResult[[counter]]$info1[1,]
    tempResult[[counter]]$info1 <- tempResult[[counter]]$info1 %>%
      dplyr::slice(-1)
    rownames(tempResult[[counter]]$info1) <- NULL
  }
  tempResult <- dplyr::bind_rows(lapply(tempResult, function(x){x$info1}))
  result <- dplyr::bind_cols(result, tempResult)
  return(result)
}

#' internal function used by quanRatio to extract info about the 
#'  replicates in a quanRatio
#'  
#' @param replicateInfos coming from the quanRatiosElement parameter of the
#'  quanRatio function
#'  
#'  @return a list of numerator & denominator data.frame objects
#' 
#' @noRd
replicateInfo <- function(replicateInfos){
  result <- list()
  result[[1]] <- NA
  result[[2]] <- NA
  names(result) <- c("Numerator","Denominator")
  if (!is.null(replicateInfos$SampleGroupInNumerator$Replicates)){
    result[[1]] <- list()
    result[[1]][[1]] <- dplyr::bind_rows(purrr::map(replicateInfos$SampleGroupInNumerator$Replicates,
                                             ~samplesInfo(.x$Samples)))
    result[[1]][[2]] <- dplyr::bind_rows(purrr::map(replicateInfos$SampleGroupInNumerator$Replicates,
                                             ~dplyr::bind_rows(.x$ReplicateStudyVariable) %>%
                                               dplyr::select(-Version)))
    names(result[[1]]) <- c("Table","Factors")
  }
  if (!is.null(replicateInfos$SampleGroupInDenominator$Replicates)){
    result[[2]] <- list()
    result[[2]][[1]] <- dplyr::bind_rows(purrr::map(replicateInfos$SampleGroupInDenominator$Replicates,
                                             ~samplesInfo(.x$Samples)))
    result[[2]][[2]] <- dplyr::bind_rows(purrr::map(replicateInfos$SampleGroupInDenominator$Replicates,
                                             ~dplyr::bind_rows(.x$ReplicateStudyVariable) %>%
                                               dplyr::select(-Version)))
    names(result[[2]]) <- c("Table","Factors")
  }
  return(result)
}

#' internal function used for the extraction of information from the individual
#'  quanRatios
#'  
#' @param quanRatiosElement one of the quanRatio coming from the quanRatios
#'  element (see quanRatios)
#'  
#' @return list of information in a single quanRatio
#' 
#' @noRd
quanRatio <- function(quanRatiosElement){
  if (sum(
    unlist(
      lapply(names(quanRatiosElement),
             function(x){
               return(!(x %in% c("SampleGroupInNumerator",
                                 "SampleGroupInDenominator",
                                 "ReplicateRatios",".attrs")))}))) != 0){
    warning("Possible unusual quanRatio element")
  }
  x <- list(sampleGroupIn(quanRatiosElement$SampleGroupInNumerator$SampleGroup,
                          numerator = TRUE),
            sampleGroupIn(quanRatiosElement$SampleGroupInDenominator$SampleGroup,
                          numerator = FALSE),
            quanRatiosElement$ReplicaRatios) # not used, seems to always be NULL
  names(x) <- names(quanRatiosElement)[1:3]
  x[[length(x)+1]] <- 
    dplyr::bind_cols(x$SampleGroupInNumerator$StudyVariablesForGrouping,
                     x$SampleGroupInDenominator$StudyVariablesForGrouping)
  names(x)[length(x)] <- "RatioTable"
  x[[length(x)+1]] <- paste(c(x$RatioTable["Value","Numerator"],
                              "/",
                              x$RatioTable["Value","Denominator"]),
                            collapse = "")
  names(x)[length(x)] <- "RatioString"
  x[[length(x)+1]] <- samplesInfo(quanRatiosElement$SampleGroupInNumerator$SampleGroup$Samples)
  names(x)[length(x)] <- "NumeratorSamples"
  x[[length(x)+1]] <- samplesInfo(quanRatiosElement$SampleGroupInDenominator$SampleGroup$Samples)
  names(x)[length(x)] <- "DenominatorSamples"
  x[[length(x)+1]] <- replicateInfo(x)
  names(x)[length(x)] <- "Replicates" # note: seems to always contain redundant
  # info, that is to say: info that is
  # already in rest of quanRatiosElement
  return(x[-c(1,2,3)])
}

#' internal function for the extraction of the ratios for the quantification
#' 
#' @param XMLData should be StudyAnalysisExtensionSettings/
#'  AnalysisDefinitionExtensionSettings/ GroupingAndQuantificationExtension
#'  part of the analysisDefinition generated by the analysisDefinition
#'  function
#' 
#' @return QuanRatios list for the third list item returned by
#'  studyDefinitionExtensionSettings
#'  
#' @noRd
quanRatios <- function(XMLData){
  result <- lapply(XMLData$QuanRatios, quanRatio)
  names(result) <- unname(purrr::map_chr(result,~.x$RatioString))
  return(result)
}

#' function to extract sample/factor/ratio/replicate information.
#' 
#' @param analysisDef generated by the analysisDefinition function
#' 
#' @return a lits of 4 elements:
#' \enumerate{
#'   \item StudyVariablesForGrouping : a data.frame of factors used
#'   \item StudyVariablesForSorting : a data.frame of sorting specification for
#'    the factors
#'   \item QuanRatios : a list object of all ratios. Each ratio has the
#'    following elements: RatioTable (specifying numerator/denominator),
#'    RatioString (for easy info printing), NumeratorSamples &
#'    DenominatorSamples specifying which samples are in the numerator and
#'    denominator and finally Replicates which contains info on replicates.
#'   \item XML : the actual from which the information comes. This was included
#'    because the exact specification for all possible cases is not (yet) known
#' }
#' 
#' @note So far, this function has not been tested for all possible cases/
#'  scenarios.
#' 
#' @export
studyDefinitionExtensionSettings <- function(analysisDef){
  tempResult <- analysisDef$StudyAnalysisExtensionSettings$AnalysisDefinitionExtensionSettings$GroupingAndQuantificationExtension
  if (is.null(tempResult$StudyVariablesForGrouping) & is.null(tempResult$StudyVariablesForSorting) & is.null(tempResult$QuanRatios)){
    return(NA)
  } else {
    result <- list()
    result[[1]] <- intoTable(tempResult$StudyVariablesForGrouping$StudyVariable)
    result[[2]] <- 
      intoTable(tempResult$StudyVariablesForSorting$StudyVariableForSorting)
    if (!purrr::is_empty(tempResult$QuanRatios)){
      result[[3]] <- quanRatios(tempResult)
    } else {
      result[[3]] <- NA
    }
    result[[4]] <- tempResult
    names(result) <- c(names(tempResult)[1:3],"XML")
    return(result)
  }
}
BenBruyneel/proteinDiscover documentation built on March 16, 2024, 4:36 p.m.