knitr::opts_chunk$set(echo = FALSE)
knitr::opts_chunk$set(warning=FALSE, collapse = TRUE, message=FALSE, comment = "")
suppressMessages(library(OmicSelector))
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
suppressMessages(library(knitr))
suppressMessages(library(rmarkdown))
suppressMessages(library(kableExtra))
suppressMessages(library(naniar))
suppressMessages(library(VIM))
setwd("/OmicSelector")
set.seed(1)
require(XML)
config <- xmlParse("1_preprocessing.xml")
config <- xmlToList(config)

knitr::opts_chunk$set(fig.width=13, fig.height=9) 
knitr::opts_knit$set(root.dir = '/OmicSelector')
message(getwd())
dataset = fread("data_start.csv")
datasetx = dplyr::select(dataset, starts_with("hsa"))
metadane = dplyr::select(dataset, -starts_with("hsa"))

Initial data check:

readLines("initial_check.txt") %>% paste0(collapse="\n") %>% cat

Names correction:

if(config[["correct_names"]] == "yes") {
    old = colnames(dataset)
    dataset = OmicSelector_correct_miRNA_names(dataset)
    new = colnames(dataset)
    zmiany = data.frame(`Old name` = old[old != new], `New name` = new[old != new])
    if(nrow(zmiany) > 0) {
        kable(zmiany, col.names = c("Old feature name", "New feature name")) %>%
            kable_styling(bootstrap_options = c("striped", "hover"))
        fwrite(dataset, "data_start.csv")
        dataset = fread("data_start.csv")
        datasetx = dplyr::select(dataset, starts_with("hsa"))
        metadane = dplyr::select(dataset, -starts_with("hsa"))
    } else { cat("Correction was performed, but no names were changed.") }

} else { cat("No names correction was performed.") }
colnames(dataset) = make.names(colnames(dataset), unique=TRUE)
colnames(datasetx) = make.names(colnames(datasetx), unique=TRUE)
colnames(metadane) = make.names(colnames(metadane), unique=TRUE)

Missing values

Initial pattern of missing values:

vis_miss(datasetx, warn_large_data = FALSE)
gg_miss_var(datasetx)
if (n_var_miss(datasetx) > 2) { gg_miss_upset(datasetx) }
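
A quick numeric summary of overall missingness complements the plots above. A minimal sketch using naniar (already loaded above), assuming datasetx is the feature matrix:

# Overall number and proportion of missing cells in the feature matrix
n_miss(datasetx)
prop_miss(datasetx)
# Per-feature summary: number and percentage of missing values
miss_var_summary(datasetx)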

Missing data imputation

if(is.null(config[["missing_imput"]])) {
    cat("No missing data imputation was performed.")
} else if(config[["missing_imput"]] == "pmm") {
    suppressMessages(library(mice))
    temp1 = mice(datasetx, m=1, maxit = 50)
    datasetx = complete(temp1)
    fwrite(cbind(metadane, datasetx), "data_start.csv")
    # Check missingness again after imputation
    cat("\nAfter imputation:")
    vis_miss(datasetx)
    gg_miss_var(datasetx)
    if (n_var_miss(datasetx) > 2) { gg_miss_upset(datasetx) }
} else if(config[["missing_imput"]] == "median") {
    suppressMessages(library(imputeMissings))
    datasetx = impute(datasetx, method = "median/mode")
    fwrite(cbind(metadane, datasetx), "data_start.csv")
    # Check missingness again after imputation
    cat("\nAfter imputation:")
    vis_miss(datasetx)
    gg_miss_var(datasetx)
    if (n_var_miss(datasetx) > 2) { gg_miss_upset(datasetx) }
} else if(config[["missing_imput"]] == "mean") {
    # Replace missing values in each feature with that feature's mean
    for(i in 1:ncol(datasetx)) {
        datasetx[[i]][is.na(datasetx[[i]])] <- mean(datasetx[[i]], na.rm = TRUE)
    }
    fwrite(cbind(metadane, datasetx), "data_start.csv")
    # Check missingness again after imputation
    cat("\nAfter imputation:")
    vis_miss(datasetx)
    gg_miss_var(datasetx)
    if (n_var_miss(datasetx) > 2) { gg_miss_upset(datasetx) }
} else if(config[["missing_imput"]] == "delete") { # assumed option name; this branch removes cases with any missing value
    do_zostawienia = complete.cases(datasetx)
    datasetx = datasetx[do_zostawienia,]
    metadane = metadane[do_zostawienia,]
    fwrite(cbind(metadane, datasetx), "data_start.csv")
    kable(table(do_zostawienia), col.names = c("Complete case","Number of cases")) %>% kable_styling(bootstrap_options = c("striped", "hover"))
    cat("Cases with missing values (FALSE above) were deleted.")
  }
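
Regardless of the chosen strategy, a final sanity check that no missing values remain before normalization is worthwhile. A minimal sketch using base R and naniar:

# Total number of remaining NA cells; should be 0 after imputation or case deletion
sum(is.na(datasetx))
# Stop early if any missing values slipped through
stopifnot(n_var_miss(datasetx) == 0)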

Normalization:

if(config[["input_format"]] == "counts") {
    # Apply low-expression filtering before normalization?
if(config[["filtration"]] == "yes") { 
  ttpm = OmicSelector_counts_to_log10tpm(datasetx, metadane, ids = metadane$sample, filtr = T, filtr_minimalcounts = config[["filtration_mincounts"]], filtr_howmany = config[["filtration_propotion"]]) 
  fwrite(cbind(metadane, ttpm), "data_start.csv")
  dataset = fread("data_start.csv")
  datasetx = dplyr::select(dataset, starts_with("hsa"))
  metadane = dplyr::select(dataset, -starts_with("hsa"))
} else { 
  ttpm = OmicSelector_counts_to_log10tpm(datasetx, metadane, ids = metadane$sample, filtr = F) 
  fwrite(cbind(metadane, ttpm), "data_start.csv")
  dataset = fread("data_start.csv")
  datasetx = dplyr::select(dataset, starts_with("hsa"))
  metadane = dplyr::select(dataset, -starts_with("hsa"))
}  
    } else { 
cat("No normalization was performed (per pipeline config).")
    }
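
A quick look at the feature distributions can confirm that the transformation behaved as expected. A small sketch using base graphics, limited to the first 20 features for readability (the cutoff of 20 is an arbitrary choice for illustration):

# Distributions of the first few features after preprocessing;
# for log10-TPM input these should sit on a comparable scale
boxplot(as.data.frame(datasetx)[, 1:min(20, ncol(datasetx)), drop = FALSE], las = 2, cex.axis = 0.6)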

Download starting (normalized) dataset: data_start.csv

Exploratory analysis

Class frequencies:

barplot(table(dataset$Class), horiz=TRUE)
kable(table(dataset$Class), col.names = c("Class","Number of cases")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Check whether the dataset is balanced using a binomial test (vs. p = 0.5):

cancer_cases = sum(dataset$Class == "Case")
all_cases = length(dataset$Class)
binom.test(n = all_cases, x = cancer_cases, p = 0.5, alternative = "two.sided")
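
For context alongside the test, the observed class proportions can be reported directly. A brief base R sketch:

# Observed proportion of each class; the binomial test above compares
# the proportion of cases against the expected 0.5
prop.table(table(dataset$Class))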

Features

OmicSelector_table(psych::describe(datasetx))
fwrite(psych::describe(datasetx), "result_preprocessing_descriptivestats.csv")

Download result_preprocessing_descriptivestats.csv

Correlations between features:

suppressMessages(library(ComplexHeatmap))
suppressMessages(library(circlize))
correlations = cor(datasetx, method = "pearson")
fwrite(as.data.frame(correlations), "result_preprocessing_correlationmatrix.csv", row.names = TRUE)
correlations[lower.tri(correlations)] = 0   # keep only the upper triangle for the heatmap
col_fun = colorRamp2(c(-1, 0, 1), c("blue", "white", "red"), transparency = 0.5)
Heatmap(correlations, name = "corr", col = col_fun, cluster_rows = FALSE, cluster_columns = FALSE, show_row_names = FALSE, show_column_names = FALSE)

flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor = cormat[ut],
    p = pmat[ut]
    )
}

suppressMessages(library(Hmisc)) 
cor_p = rcorr(as.matrix(datasetx), type="pearson")
cor_p2 = flattenCorrMatrix(cor_p$r, cor_p$P)
fwrite(cor_p2, "result_preprocessing_correlationmatrix_details.csv")
OmicSelector_table(cor_p2)
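
The flattened table also makes it easy to list strongly correlated feature pairs, which are often relevant for downstream feature selection. A minimal sketch (the 0.9 cutoff is an arbitrary example, not a pipeline default):

# Feature pairs with a strong linear relationship (|r| > 0.9), strongest first
strong_pairs = cor_p2 %>% dplyr::filter(abs(cor) > 0.9) %>% dplyr::arrange(dplyr::desc(abs(cor)))
OmicSelector_table(strong_pairs)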

Correlation matrix (correlation coef.): Download result_preprocessing_correlationmatrix.csv

Details: Download result_preprocessing_correlationmatrix_details.csv

Analysis (whole dataset)

Differential expression:

if(config[["input_format"]] == "transformed" | config[["input_format"]] == "counts") {
  DE = OmicSelector_differential_expression_ttest(datasetx, metadane$Class, mode = "logtpm")
} else if(config[["input_format"]] == "deltact") {
  DE = OmicSelector_differential_expression_ttest(datasetx, metadane$Class, mode = "deltact")
}
fwrite(DE, "result_preprocessing_wholedataset_DE.csv")
OmicSelector_table(DE)

Prepare split

Splitting the dataset into training, test, and validation sets.

mixed = OmicSelector_prepare_split(metadane = metadane, ttpm = datasetx, train_proc = as.numeric(config[["training_split"]]))

Comparison of splits
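
The comparison itself is not embedded in this report. A minimal sketch of how class balance across the three splits could be checked, assuming OmicSelector_prepare_split() writes mixed_train.csv, mixed_test.csv and mixed_valid.csv to the working directory (these file names are an assumption, not confirmed here):

# NOTE: assumed output file names; adjust if the split files are named differently
splits = list(training = "mixed_train.csv", test = "mixed_test.csv", validation = "mixed_valid.csv")
class_balance = lapply(splits, function(f) table(fread(f)$Class))
class_balance
# Chi-squared test of homogeneity of class proportions across the splits
chisq.test(do.call(rbind, class_balance))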

Closing credits

packageDescription("OmicSelector")
sessionInfo()

