knitr::opts_chunk$set( echo = TRUE )

Introduction

This document describes, how the lipidomer::cancerlipidome data set was prepared.

The data come from the project PR000742 in the Metabolomics Workbench.

Citation of the Data

Publication

Purwaha, P., et al. Unbiased Lipidomic Profiling of Triple-Negative Breast Cancer Tissues Reveals the Association of Sphingomyelin Levels with Patient Disease-Free Survival. Metabolites 2018, 8, 41. https://doi.org/10.3390/metabo8030041

Repository

This data is available at the NIH Common Fund's National Metabolomics Data Repository (NMDR) website, the Metabolomics Workbench, https://www.metabolomicsworkbench.org, where it has been assigned Project ID PR000742. The data can be accessed directly via it's Project DOI: 10.21228/M8RX01 This work is supported by NIH grant, U2C-DK119886.

URL: http://dx.doi.org/10.21228/M8RX01

Download the Data

Specify the File Paths

url.download <-
  c(
    "https://www.metabolomicsworkbench.org/data/study_textformat_view.php?STUDY_ID=ST001111&ANALYSIS_ID=AN001805&MODE=d",
    "https://www.metabolomicsworkbench.org/data/study_textformat_view.php?STUDY_ID=ST001111&ANALYSIS_ID=AN001806&MODE=d"

  )

Download the Data

sample.info.loaded <-
  read.table(
    file = url.download[ 1 ],
    check.names = FALSE,
    header = FALSE,
    nrows = 148-29-1,
    sep = "\t",
    skip = 29,
    stringsAsFactors = FALSE
  )

data.loaded.pos <-
  read.table(
    file = url.download[ 1 ],
    check.names = FALSE,
    header = TRUE,
    nrows = 618-171-2,
    sep = "\t",
    skip = 171,
    stringsAsFactors = FALSE
  )

data.loaded.neg <-
  read.table(
    file = url.download[ 2 ],
    check.names = FALSE,
    header = TRUE,
    nrows = 505-171-2,
    sep = "\t",
    skip = 171,
    stringsAsFactors = FALSE
  )

Prepare the Data

Reformat

Sample Information

sample.info <- sample.info.loaded

sample.info$"V3" <- as.character( sample.info$"V3" )

colnames( sample.info )[ colnames( sample.info ) == "V3" ] <- "ID"

sample.info$"V4" <-
  stringr::str_replace(
    string = sample.info$"V4",
    pattern = "[A-za-z]+\\:",
    replacement = ""
  )

colnames( sample.info )[ colnames( sample.info ) == "V4" ] <- "Group"

tmp <-
  stringr::str_split_fixed(
    string = sample.info$"V5",
    pattern = "; ",
    n = 3
  )

colnames( tmp ) <-
  stringr::str_extract(
    string = tmp[ 1, ],
    pattern = "[A-Za-z]+\\="
  )

colnames( tmp ) <-
  stringr::str_replace(
    string = colnames( tmp ),
    pattern = "\\=",
    replacement = ""
  )

tmp <-
  apply(
    X = tmp,
    MAR = 2,
    FUN = stringr::str_replace,
    # string = tmp,
    pattern = "[A-Za-z]+\\=",
    replacement = ""
  )

tmp <- data.frame( tmp, check.names = FALSE )

sample.info <-
  dplyr::bind_cols(
    sample.info,
    tmp
  )

sample.info[ sample.info$"Stage" == "NA", "Stage" ] <- NA
sample.info[ grepl( x = sample.info$"Type", pattern = "NA" ), "Type" ] <- NA
sample.info$"Stage" <- droplevels( sample.info$"Stage" )

sample.info <-
  sample.info[ 
    ,
    !grepl( x = colnames( sample.info ), pattern = "^V[0-9]" )
    ]

sample.info$"Race" <-
  stringr::str_replace_all(
    string = sample.info$"Race",
    pattern = "\\.",
    replacement = "-"
  )

sample.info$"Type" <-
  stringr::str_replace_all(
    string = sample.info$"Type",
    pattern = "\\.",
    replacement = "-"
  )

Lipids from the Positive Ion Mode

data.pos <- data.loaded.pos

rownames( data.pos ) <- data.pos$"Samples"

data.pos <- t( data.pos[ -1, -1 ] )

mode( data.pos ) <- "numeric"

Lipids from the Negative Ion Mode

data.neg <- data.loaded.neg

rownames( data.neg ) <- data.neg$"Samples"

data.neg <- t( data.neg[ -1, -1 ] )

mode( data.neg ) <- "numeric"

Merge Lipid Data

data <-
  merge(
    x = data.pos,
    y = data.neg,
    by = "row.names"
  )

rownames( data ) <- data$"Row.names"

data$"Row.names" <- NULL

Reformat Lipid Names

tmp <-
  stringr::str_detect(
    string = colnames( data ),
    pattern = "N\\-"
  )

colnames( data )[ tmp ] <-
  stringr::str_replace(
    string = colnames( data )[ tmp ],
    pattern = "N\\-\\(",
    replacement = "N-"
  )

colnames( data )[ tmp ] <-
  stringr::str_replace(
    string = colnames( data )[ tmp ],
    pattern = "noyl\\)",
    replacement = "noyl"
  )

colnames( data ) <-
  stringr::str_replace(
    string = colnames( data ),
    pattern = "\\([0-9].+$",
    replacement = ""
  )

tmp <-
  stringr::str_detect(
    string = colnames( data ),
    pattern = "[0-9]$"
  )

colnames( data )[ tmp ] <-
  paste0(
    colnames( data )[ tmp ],
    ")"
  )

tmp <-
  stringr::str_detect(
    string = colnames( data ),
    pattern = " [0-9]"
  )

colnames( data )[ tmp ] <-
  stringr::str_replace(
    string = colnames( data )[ tmp ],
    pattern = " ",
    replacement = "("
  )

tmp <-
  stringr::str_detect(
    string = colnames( data ),
    pattern = "[0-9] "
  )

colnames( data )[ tmp ] <-
  stringr::str_replace(
    string = colnames( data )[ tmp ],
    pattern = " ",
    replacement = ")"
  )

colnames( data ) <-
  stringr::str_replace(
    string = colnames( data ),
    pattern = "\\[.+\\]\\+",
    replacement = ""
  )

Combine Duplicate Lipids

names.duplicates <- names( which( table( colnames( data ) ) > 1 ) )

sum.duplicates <-
  lapply(
    X = names.duplicates,
    FUN =
      function( x ) {

        idx.i <- which( colnames( data ) == x )

        tmp <-
          apply(
            X = data[ , idx.i ],
            MAR = 1,
            FUN = prod,
            na.rm = TRUE
          )

        return( tmp )

      }
  )

sum.duplicates <- simplify2array( x = sum.duplicates )
colnames( sum.duplicates ) <- names.duplicates

sum.duplicates <-
  data.frame(
    sum.duplicates,
    check.names = FALSE
  )

data <-
  merge(
    x = data[ , !( colnames( data ) %in% names.duplicates ) ],
    y = sum.duplicates,
    by = "row.names"
  )

rownames( data ) <- data$"Row.names"

data$"Row.names" <- NULL

data <- data[ , order( colnames( data ), decreasing = FALSE ) ]

Omit Classes with Only One Lipid

data <- data[ , !grepl( x = colnames( data ), pattern = "SB N" ) ]

Round Observations

data <- signif( x = data, digits = 3 )

Merge Lipids and Sample Information

names.lipids <- colnames( data )

data <-
  merge(
    x = sample.info,
    y = data,
    by.x = "ID",
    by.y = "row.names"
  )

data$"Group" <- as.factor( x = data$"Group" )

Convert the Data into a Longer Format

cancerlipidome <- 
  tidyr::pivot_longer(
    data = data,
    cols = names.lipids,
    names_to = "Lipid_Name",
    values_to = "Lipid_Level"
  )

Save the Resulting Data

save(
  cancerlipidome,
  file = "../data/cancerlipidome.rda",
  compress = "xz"
)

SessionInfo

utils::sessionInfo()


tommi-s/lipidomeR documentation built on March 17, 2020, 12:14 a.m.