Tommi Suvitaival, Steno Diabetes Center Copenhagen, TSUV0001@RegionH.DK 2020-01-23
This document describes, how the lipidomer::liverlipidome data set was prepared.
The data come from the project PR000633 in the Metabolomics Workbench.
Gorden, D. Lee, et al. Biomarkers of NAFLD Progression: a Lipidomics Approach to an Epidemic. J Lip Res. 56(3) 722-36 (2015). https://dx.doi.org/10.1194/jlr.P056002
This data is available at the NIH Common Fund’s National Metabolomics Data Repository (NMDR) website, the Metabolomics Workbench, https://www.metabolomicsworkbench.org, where it has been assigned Project ID PR000633. The data can be accessed directly via it’s Project DOI: 10.21228/M8V961 This work is supported by NIH grant, U2C-DK119886.
URL: http://dx.doi.org/10.21228/M8V961
url.download <-
c(
"https://www.metabolomicsworkbench.org/data/study_textformat_view.php?STUDY_ID=ST000915&ANALYSIS_ID=AN001485&MODE=d",
"https://www.metabolomicsworkbench.org/data/study_textformat_view.php?STUDY_ID=ST000915&ANALYSIS_ID=AN001486&MODE=d",
"https://www.metabolomicsworkbench.org/data/study_textformat_view.php?STUDY_ID=ST000915&ANALYSIS_ID=AN001487&MODE=d",
"https://www.metabolomicsworkbench.org/data/study_textformat_view.php?STUDY_ID=ST000915&ANALYSIS_ID=AN001488&MODE=d",
"https://www.metabolomicsworkbench.org/data/study_textformat_view.php?STUDY_ID=ST000915&ANALYSIS_ID=AN001489&MODE=d",
"https://www.metabolomicsworkbench.org/data/study_textformat_view.php?STUDY_ID=ST000915&ANALYSIS_ID=AN001490&MODE=d"
)
sample.info.loaded <-
read.table(
file = url.download[ 1 ],
check.names = FALSE,
header = FALSE,
nrows = 158-69-1,
# sep = "\t",
skip = 69,
stringsAsFactors = FALSE
)
data.loaded <- list()
data.loaded[[ 1 ]] <-
read.table(
file = url.download[ 1 ],
check.names = FALSE,
header = TRUE,
nrows = 373-274-1,
sep = "\t",
skip = 274,
stringsAsFactors = FALSE
)
data.loaded[[ 2 ]] <-
read.table(
file = url.download[ 2 ],
check.names = FALSE,
header = TRUE,
na.strings = "ND",
nrows = 302-261-1,
sep = "\t",
skip = 261,
stringsAsFactors = FALSE
)
data.loaded[[ 3 ]] <-
read.table(
file = url.download[ 3 ],
check.names = FALSE,
header = TRUE,
nrows = 288-276-1,
sep = "\t",
skip = 276,
stringsAsFactors = FALSE
)
data.loaded[[ 4 ]] <-
read.table(
file = url.download[ 4 ],
check.names = FALSE,
header = TRUE,
nrows = 456-265-1,
sep = "\t",
skip = 265,
stringsAsFactors = FALSE
)
data.loaded[[ 5 ]] <-
read.table(
file = url.download[ 5 ],
check.names = FALSE,
header = TRUE,
nrows = 368-269-1,
sep = "\t",
skip = 269,
stringsAsFactors = FALSE
)
data.loaded[[ 6 ]] <-
read.table(
file = url.download[ 6 ],
check.names = FALSE,
header = TRUE,
nrows = 373-261-1,
sep = "\t",
skip = 261,
stringsAsFactors = FALSE
)
sample.info <- sample.info.loaded
for ( i in 4:ncol( sample.info ) ) {
tmp <-
stringr::str_split_fixed(
string = sample.info[ , i ],
pattern = "(\\:)|(\\=)",
n = 2
)
sample.info[ , i ] <- tmp[ , 2 ]
colnames( sample.info )[ i ] <- tmp[ 1, 1 ]
sample.info[ , i ] <-
stringr::str_replace_all(
string = sample.info[ , i ],
pattern = "\\;",
replacement = ""
)
}
sample.info[ sample.info == "-"] <- NA
colnames( sample.info )[ colnames( sample.info ) == "V3" ] <- "ID"
sample.info <- sample.info[ , 3:ncol( sample.info ) ]
sample.info$"Diagnosis" <-
factor(
x = sample.info$"Diagnosis",
levels = c( "Normal", "Steatosis", "NASH", "Cirrhosis" )
)
names.numeric <- c( "BMI", "AGE", "AST", "ALT", "ALKP", "TBIL", "GLUCOSE" )
for ( i in 1:length( names.numeric ) ) {
sample.info[[ names.numeric[ i ] ]] <-
as.numeric( sample.info[[ names.numeric[ i ] ]] )
}
data <- data.loaded
data <-
lapply(
X = data,
FUN =
function( x ) {
rownames( x ) <- x$"Samples"
x <- t( x[ -1, -1 ] )
y <-
apply(
X = x,
MAR = 2,
FUN = as.numeric
)
rownames( y ) <- rownames( x )
return( y )
}
)
## Warning in apply(X = x, MAR = 2, FUN = as.numeric): NAs introduced by coercion
## Warning in apply(X = x, MAR = 2, FUN = as.numeric): NAs introduced by coercion
## Warning in apply(X = x, MAR = 2, FUN = as.numeric): NAs introduced by coercion
data.merged <- data[[ 1 ]]
for ( i in 2:length( data ) ) {
data.merged <-
merge(
x = data.merged,
y = data[[ i ]],
by = "row.names"
)
rownames( data.merged ) <- data.merged$"Row.names"
data.merged$"Row.names" <- NULL
}
data <- data.merged
is.C.name <-
stringr::str_detect(
string = colnames( data ),
pattern = "C[0-9]+" # "C[0-9]+:?[0-9]+?D?H?"
)
if ( any( is.C.name ) ) {
colnames( data )[ is.C.name ] <-
stringr::str_replace(
string = colnames( data )[ is.C.name ],
pattern = "DH ",
replacement = " DH-"
)
name.split <-
stringr::str_split_fixed(
string = colnames( data )[ is.C.name ],
pattern = " ",
n = 3
)
number <-
stringr::str_replace_all(
string = name.split[ , 1 ],
pattern = "[A-Z]+",
replacement = ""
)
number.split <-
stringr::str_split_fixed(
string = number,
pattern = ":",
n = 2
)
number.split[ which( number.split[ , 2 ] == "" ), 2 ] <- "0"
number.split[ , 1 ] <- as.numeric( number.split[ , 1 ] ) + 18
name.split[ , 2 ] <-
stringr::str_replace(
string = name.split[ , 2 ],
pattern = "Sphingomyelin",
replacement = "SM"
)
colnames( data )[ is.C.name ] <-
paste0(
name.split[ , 2 ],
"(",
number.split[ , 1 ],
":",
number.split[ , 2 ],
")"
)
}
is.numbered <-
grepl(
x = colnames( data ),
pattern = "[A-Za-z]\\s?\\("
)
data <- data[ , is.numbered ]
# Fixes to names
is.N3 <-
grepl(
x = colnames( data ),
pattern = " N3" )
colnames( data )[ is.N3 ] <-
stringr::str_replace(
string = colnames( data )[ is.N3 ],
pattern = " N3",
replacement = ""
)
colnames( data )[ is.N3 ] <-
stringr::str_replace(
string = colnames( data )[ is.N3 ],
pattern = "\\(",
replacement = "-N3("
)
is.N6 <-
grepl(
x = colnames( data ),
pattern = " N6" )
colnames( data )[ is.N6 ] <-
stringr::str_replace(
string = colnames( data )[ is.N6 ],
pattern = " N6",
replacement = ""
)
colnames( data )[ is.N6 ] <-
stringr::str_replace(
string = colnames( data )[ is.N6 ],
pattern = "\\(",
replacement = "-N6("
)
is.N9 <-
grepl(
x = colnames( data ),
pattern = " N9" )
colnames( data )[ is.N9 ] <-
stringr::str_replace(
string = colnames( data )[ is.N9 ],
pattern = " N9",
replacement = ""
)
colnames( data )[ is.N9 ] <-
stringr::str_replace(
string = colnames( data )[ is.N9 ],
pattern = "\\(",
replacement = "-N9("
)
tmp <-
grep(
x = colnames( data ),
pattern = "p\\)"
)
colnames( data )[ tmp ] <-
stringr::str_replace(
string = colnames( data )[ tmp ],
pattern = "\\(",
replacement = "-P("
)
colnames( data )[ tmp ] <-
stringr::str_replace(
string = colnames( data )[ tmp ],
pattern = "p\\)",
replacement = ")"
)
colnames( data ) <-
stringr::str_replace(
string = colnames( data ),
pattern = " \\(",
replacement = "("
)
colnames( data ) <-
stringr::str_replace(
string = colnames( data ),
pattern = "\\.[0-9]",
replacement = ""
)
data <- signif( x = data, digits = 3 )
names.lipids <- colnames( data )
data <-
merge(
x = sample.info,
y = data,
by.x = "ID",
by.y = "row.names"
)
N.found <-
by(
data = !is.na( data[ , names.lipids ] ),
INDICES = data$"Diagnosis",
FUN = colSums
)
N.found <- simplify2array( N.found )
is.found <- ( N.found > 1 )
is.found <-
apply(
X = is.found,
MAR = 1,
FUN = all
)
names.missing <- names.lipids
names.missing <- names.missing[ !is.found ]
print( names.missing )
## [1] "CE(14:0)" "CE(15:1)" "CE(15:0)" "CE(16:2)" "CE(16:1)"
## [6] "CE(17:1)" "CE(17:0)" "CE(18:0)" "CE(20:1)" "CE(22:1)"
## [11] "CE(22:0)" "TG(40:2)" "TG(40:1)" "TG(40:0)" "TG(41:5)"
## [16] "TG(41:4)" "TG(41:3)" "TG(41:2)" "TG(41:1)" "TG(41:0)"
## [21] "TG(42:5)" "TG(42:4)" "TG(42:3)" "TG(42:2)" "TG(43:2)"
## [26] "TG(43:1)" "TG(43:0)" "TG(44:5)" "TG(44:4)" "TG(46:5)"
## [31] "TG(47:5)" "TG(47:4)" "TG(49:4)" "TG(56:0)" "TG(58:0)"
## [36] "TG(59:2)" "TG(59:1)" "TG(59:0)" "TG(60:2)" "TG(60:1)"
## [41] "TG(60:0)" "1,2-DG(32:5)" "1,2-DG(37:5)" "1,2-DG(37:4)" "PA(34:0)"
data <- data[ , !( colnames( data ) %in% names.missing ) ]
names.lipids <- names.lipids[ is.found ]
liverlipidome <-
tidyr::pivot_longer(
data = data,
cols = names.lipids,
names_to = "Lipid_Name",
values_to = "Lipid_Level"
)
save(
liverlipidome,
file = "../data/liverlipidome.rda",
compress = "xz"
)
utils::sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17763)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3 knitr_1.27 magrittr_1.5 tidyselect_0.2.5
## [5] R6_2.4.1 rlang_0.4.2 stringr_1.4.0 dplyr_0.8.3
## [9] tools_3.6.2 xfun_0.12 htmltools_0.4.0 yaml_2.2.0
## [13] digest_0.6.23 assertthat_0.2.1 tibble_2.1.3 lifecycle_0.1.0
## [17] crayon_1.3.4 purrr_0.3.3 tidyr_1.0.0 vctrs_0.2.1
## [21] zeallot_0.1.0 glue_1.3.1 evaluate_0.14 rmarkdown_2.1
## [25] stringi_1.4.4 compiler_3.6.2 pillar_1.4.3 backports_1.1.5
## [29] pkgconfig_2.0.3
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.