DOAGDC: A Package to Download, Organize and Analyze Genomic Data Commons (GDC) Data

Documented in groups_identification_cox_hr

#' Separate patients in groups
#'
#' \code{groups_identification_cox} is a function designed to separate patients
#' in groups, based in clinical data, and coxHR.
#'
#' @param name
#' @param data_type
#' @param width,height,res,unit,image_format
#' @param save_data
#' @param env
#' @param tumor
#' @param data_base
#' @param work_dir
#' @inheritParams concatenate_exon
#' @inheritParams download_gdc
#' @inheritParams groups_identification_mclust
#'
#' @return the groups generated after using the coxHR analysis.
#' @export
#'
#' @importFrom survival coxph
#' @importFrom survival Surv
#' @importFrom survival survfit
#' @importFrom survminer ggsurvplot
#' @importFrom XML xmlParse
#' @importFrom XML xmlToList
#' @importFrom yarrr pirateplot
#' @importFrom pROC roc
#' @importFrom pROC plot.roc
#'
#' @examples
#' library(DOAGDC)
#'
#' concatenate_expression("gene",
#'     name = "HIF3A",
#'     data_base = "legacy",
#'     tumor = "CHOL",
#'     work_dir = "~/Desktop"
#' )
#'
#' groups_identification_cox_hr("HIF3A", "gene",
#'     tumor = "CHOL",
#'     work_dir = "~/Desktop",
#'     data_base = "legacy",
#'     env = CHOL_LEGACY_gene_tumor_data
#' )
groups_identification_cox_hr <- function(name,
                                         data_type,
                                         width = 3000,
                                         height = 2000,
                                         res = 400,
                                         unit = "px",
                                         image_format = "png",
                                         save_data = TRUE, env,
                                         tumor, data_base,
                                         work_dir) {

    # Library load (xlsx tools httr jsonlite data.table XML ggplot2 survival
    # survminer ggthemes grid yarrr reshape extrafont scales cgdsr) (plyr png)

    # localFUN
    expander <- function(big_list) {
        tmp_list <- vector("list")

        for (j in seq_len(length(big_list))) {
            name <- as.character(names(big_list[j]))
            tmp_list <- append(tmp_list, name)
        }

        return(tmp_list)
    }

    ty <- function(a) {
        ifelse(is.null(a), "", a)
    }

    open_xml <- function() {
        message("Extracting clinical data from XML...")
        pb <- txtProgressBar(min = 0, max = length(clinical), style = 3)
        for (i in seq_len(length(clinical))) {
            if (i > 1) {
                list[[i]] <- XML::xmlToDataFrame(
                    clinical[i],
                    stringsAsFactors = FALSE
                )
            } else {
                list <- list(XML::xmlToDataFrame(
                    clinical[i],
                    stringsAsFactors = FALSE
                ))
            }
            setTxtProgressBar(pb, i)
        }
        close(pb)

        fds <- as.data.frame(matrix(nrow = length(clinical), ncol = 90))
        fd <- as.data.frame(matrix(nrow = 1, ncol = 74))

        message("Exporting to csv...")
        pb <- txtProgressBar(min = 0, max = length(clinical), style = 3)
        for (patient in seq_len(length(clinical))) {
            for (col in seq_len(74)) {
                fd[1, col] <- na.omit(list[[patient]][col])
            }

            # stage_event
            data <- XML::xmlParse(clinical[patient])
            xml_data <- XML::xmlToList(data)$patient
            stage_event <- xml_data$stage_event
            tnm <- stage_event$tnm_categories$pathologic_categories
            fd$system_version <- ty(stage_event$system_version[[1]])
            fd$pathologic_stage <- ty(stage_event$pathologic_stage[[1]])
            fd$pathologic_t <- ty(tnm$pathologic_t[[1]])
            fd$pathologic_n <- ty(tnm$pathologic_n[[1]])
            fd$pathologic_m <- ty(tnm$pathologic_m[[1]])

            # follow_ups
            follow_ups <- xml_data$follow_ups$follow_up
            fd$days_to_death_followup <- ty(follow_ups$days_to_death[[1]])
            fd$days_to_last_followup_followup <- ty(
                follow_ups$days_to_last_followup[[1]]
            )

            if (ty(
                follow_ups$days_to_new_tumor_event_after_initial_treatment[[1]]
            ) == "") {
                fd$days_to_new_tumor_event_after_initial_treatment_followup <- ty(
                    follow_ups[["new_tumor_events"]][["new_tumor_event"]][["days_to_new_tumor_event_after_initial_treatment"]][[1]]
                )
            } else {
                fd$days_to_new_tumor_event_after_initial_treatment_followup <- ty(
                    follow_ups$days_to_new_tumor_event_after_initial_treatment[[1]]
                )
            }

            fd$day_of_form_completion2 <- ty(
                follow_ups$day_of_form_completion[[1]]
            )
            fd$month_of_form_completion2 <- ty(
                follow_ups$month_of_form_completion[[1]]
            )
            fd$year_of_form_completion2 <- ty(
                follow_ups$year_of_form_completion[[1]]
            )

            if (ty(follow_ups$new_neoplasm_occurrence_anatomic_site_text[[1]]) == "") {
                fd$new_neoplasm_event_occurrence_anatomic_site_followup <- ty(
                    follow_ups[["new_tumor_events"]][["new_tumor_event"]][["new_neoplasm_event_occurrence_anatomic_site"]][[1]]
                )
            } else {
                fd$new_neoplasm_event_occurrence_anatomic_site_followup <- ty(
                    follow_ups$new_neoplasm_occurrence_anatomic_site_text[[1]]
                )
            }

            fd$new_neoplasm_event_type_followup <- ty(
                follow_ups[["new_tumor_events"]][["new_tumor_event"]][["new_neoplasm_event_type"]][[1]]
            )

            if (ty(follow_ups$new_tumor_event_after_initial_treatment[[1]]) == "") {
                fd$new_tumor_event_after_initial_treatment_followup <- ty(
                    follow_ups$new_tumor_events$new_tumor_event_after_initial_treatment[[1]]
                )
            } else {
                fd$new_tumor_event_after_initial_treatment_followup <- ty(
                    follow_ups$new_tumor_event_after_initial_treatment[[1]]
                )
            }
            fd$person_neoplasm_cancer_status_followup <- ty(
                follow_ups$person_neoplasm_cancer_status[[1]]
            )
            fd$vital_status_followup <- ty(follow_ups$vital_status[[1]])

            fds[patient, ] <- fd

            setTxtProgressBar(pb, patient)
        }
        close(pb)

        colnames(fds)[1:74] <- names(list[[1]])
        colnames(fds)[75:90] <- colnames(fd)[75:90]
        rownames(fds) <- fds$bcr_patient_barcode

        fds$form_completion_ymd <- paste(fds$year_of_form_completion,
            fds$month_of_form_completion, fds$day_of_form_completion,
            sep = "/"
        )

        fds$form_completion_ymd_followup <- paste(
            fds$day_of_form_completion2,
            fds$month_of_form_completion2,
            fds$day_of_form_completion2,
            sep = "/"
        )

        fds$new_tumor_event_after_initial_treatment_initial <- fds$new_tumor_events
        fds$days_to_new_tumor_event_after_initial_treatment_initial <- NA
        for (patient in seq_len(nrow(fds))) {
            if (!(fds$new_tumor_events[patient] %in% c("", "NO"))) {
                tmp <- stringr::str_extract_all(
                    fds$new_tumor_events[patient],
                    "^(YES)\\d+"
                )[[1]][1]
                fds[patient, 93] <- "YES"
                fds[patient, 94] <- as.numeric(gsub("YES", "", tmp))
            }
        }

        # final fix
        tmp <- fds$days_to_new_tumor_event_after_initial_treatment_followup == "day"
        fds$days_to_new_tumor_event_after_initial_treatment_followup[tmp] <- ""
        tmp <- fds$days_to_new_tumor_event_after_initial_treatment_followup == "false"
        fds$days_to_new_tumor_event_after_initial_treatment_followup[tmp] <- ""

        fds$final_vital_status <- fds$vital_status
        tmp <- !(fds$vital_status_followup %in% c("", "vital_status"))
        fds$final_vital_status[tmp] <- fds$vital_status_followup[tmp]

        fds$final_vital_status_times <- ifelse(
            fds$final_vital_status == "Dead",
            ifelse(
                fds$days_to_death == "",
                fds$days_to_death_followup,
                fds$days_to_death
            ),
            fds$days_to_last_followup
        )

        fds$final_vital_status_times <- as.numeric(
            fds$final_vital_status_times
        )
        fds$final_vital_status_times[is.na(
            fds$final_vital_status_times
        )] <- 0

        fds$final_rfs_status <- ifelse(
            fds$new_tumor_event_after_initial_treatment_initial == "YES",
            "relapse",
            ifelse(
                fds$new_tumor_event_after_initial_treatment_followup == "YES",
                "relapse",
                "censored"
            )
        )

        tmp <- is.na(
            fds$days_to_new_tumor_event_after_initial_treatment_initial
        )
        fds$days_to_new_tumor_event_after_initial_treatment_initial[tmp] <- ""

        fds$final_rfs_status_times <- ifelse(
            fds$days_to_new_tumor_event_after_initial_treatment_initial == "",
            fds$days_to_new_tumor_event_after_initial_treatment_followup, ""
        )

        fds$final_rfs_status_times <- as.numeric(
            fds$final_rfs_status_times
        )
        fds$final_rfs_status_times[is.na(fds$final_rfs_status_times)] <- 0

        # sort before save
        fds <- fds[, order(colnames(fds))]

        return(fds)
    }

    roc_auc <- function(dir, name, width, height, res, unit, image_format) {
        values <- framel[, "log2p1"]
        groups <- framel[, "group"]

        # area under the curve...
        save_plot("coxHR", "_AUC_LR")
        par(pty = "s")
        pROC::roc(groups, glm_fit$fitted.values,
            plot = TRUE, legacy.axes = TRUE, percent = TRUE, quiet = TRUE,
            xlab = "False Positive Percentage",
            ylab = "True Postive Percentage",
            col = "#377eb8", lwd = 4, print.auc = TRUE
        )
        dev.off()

        rf_model <- randomForest::randomForest(groups ~ values)

        # ROC for random forest
        save_plot("coxHR", "_AUC_RF")
        par(pty = "s")
        pROC::roc(groups, rf_model$votes[, 1],
            plot = TRUE, legacy.axes = TRUE, percent = TRUE, quiet = TRUE,
            xlab = "False Positive Percentage",
            ylab = "True Postive Percentage",
            col = "#4daf4a", lwd = 4, print.auc = TRUE
        )
        dev.off()

        save_plot("coxHR", "_LR_RF")
        par(pty = "s")
        pROC::roc(groups, glm_fit$fitted.values,
            plot = TRUE, legacy.axes = TRUE, percent = TRUE, quiet = TRUE,
            xlab = "False Positive Percentage",
            ylab = "True Postive Percentage",
            col = "#377eb8", lwd = 4, print.auc = TRUE
        )
        pROC::plot.roc(groups, rf_model$votes[, 1],
            percent = TRUE, col = "#4daf4a", lwd = 4,
            print.auc = TRUE, add = TRUE, print.auc.y = 40
        )
        par(
            fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0),
            new = TRUE
        )
        plot(0, 0,
            type = "n", bty = "n", xaxt = "n", yaxt = "n",
            ylab = "", xlab = ""
        )
        legend("top",
            legend = c("Logisitic Regression", "Random Forest"),
            col = c("#377eb8", "#4daf4a"), lwd = 4,
            xpd = TRUE, horiz = TRUE, inset = c(0, 0), bty = "n",
            cex = 1, lty = 1
        )
        dev.off()
    }

    save_plot <- function(dir_name, file_name) {
        if (tolower(image_format) == "png") {
            png(
                filename = file.path(
                    dir, dir_name,
                    paste0(
                        name,
                        file_name, ".png"
                    )
                ),
                width = width, height = height, res = res, units = unit
            )
        } else if (tolower(image_format) == "svg") {
            svg(
                filename = file.path(
                    dir, dir_name,
                    paste0(
                        name,
                        file_name, ".svg"
                    )
                ),
                width = width, height = height, onefile = TRUE
            )
        } else {
            stop(message(
                "Please, Insert a valid image_format! ('png' or 'svg')"
            ))
        }
    }

    cox_ph <- function(step) {
        path_capture <- file.path(
            dir, "kaplan_meier", paste(
                name, toupper(step), "coxHR_summary.txt",
                sep = "_"
            )
        )

        if (step == "overall") {
            coxmodel <- summary(survival::coxph(survival::Surv(
                final_vital_status_times,
                final_vital_status
            ) ~ group_factor,
            data = kaplan_now
            ))

            surv <- summary(survival::Surv(
                final_vital_status_times,
                final_vital_status
            ) ~ group_factor,
            data = kaplan_now
            )
        } else if (step == "rfs") {
            coxmodel <- summary(survival::coxph(survival::Surv(
                final_rfs_status_times,
                final_rfs_status
            ) ~ group_factor,
            data = kaplan_now
            ))

            surv <- summary(survival::Surv(
                final_rfs_status_times,
                final_rfs_status
            ) ~ group_factor,
            data = kaplan_now
            )
        } else {
            coxmodel <- summary(survival::coxph(survival::Surv(
                final_dmfs_status_times,
                final_dmfs_status
            ) ~ group_factor,
            data = kaplan_now
            ))

            surv <- summary(survival::Surv(
                final_dmfs_status_times,
                final_dmfs_status
            ) ~ group_factor,
            data = kaplan_now
            )
        }

        cat("CoxHR summary\n", file = path_capture)
        capture.output(print(coxmodel), file = path_capture, append = TRUE)

        cat("\n********************\n\n", file = path_capture, append = TRUE)
        cat("CoxHR value\n", file = path_capture, append = TRUE)
        capture.output(print(coxmodel$conf.int[1]),
            file = path_capture,
            append = TRUE
        )

        # logrank pvalue
        cat("\n********************\n\n", file = path_capture, append = TRUE)
        cat("logrank pvalue\n", file = path_capture, append = TRUE)
        capture.output(print(coxmodel$logtest[3]),
            file = path_capture,
            append = TRUE
        )

        cat("\n\nSurv only\n", file = path_capture, append = TRUE)
        capture.output(print(surv), file = path_capture, append = TRUE)
        cat("\n\nlogrank_p\n", file = path_capture, append = TRUE)
        capture.output(as.numeric(surv$sctest["pvalue"]))
    }

    surv_plot <- function(dir, name, step,
                                    width, height, res, unit, image_format) {
        amount <- table(kaplan_now$group_factor)
        percent <- amount / nrow(kaplan_now)

        if (step == "overall") {
            fit <- survival::survfit(survival::Surv(
                final_vital_status_times,
                final_vital_status
            ) ~ group_factor,
            data = kaplan_now
            )
        } else if (step == "rfs") {
            fit <- survival::survfit(survival::Surv(
                final_rfs_status_times,
                final_rfs_status
            ) ~ group_factor,
            data = kaplan_now
            )
        } else {
            fit <- survival::survfit(survival::Surv(
                final_dmfs_status_times,
                final_dmfs_status
            ) ~ group_factor,
            data = kaplan_now
            )
        }

        # Plot data
        color <- RColorBrewer::brewer.pal(
            length(levels(kaplan_now$group_factor)), "Set2"
        )

        save_plot("kaplan_meier", paste0("Kaplan_", step))
        par(mar = c(3, 3, 3, 6))
        print(survminer::ggsurvplot(fit,
            data = kaplan_now, risk.table = TRUE,
            linetype = rep(1, length(color)),
            # conf.int = TRUE,
            palette = color, xlab = "Days",
            ggtheme = ggplot2::theme_bw(),
            size = 2,
            font.main = 30,
            font.x = 26,
            font.y = 26,
            font.tickslab = 24
        ))
        dev.off()

        # Create 5yr status
        kaplan_now$yr5_status <- "unknown"
        tmp <- kaplan_now$final_vital_status_times >= (365 * 5)
        kaplan_now$yr5_status[tmp] <- "alive_5yr"
        tmp1 <- kaplan_now$final_vital_status_times < (365 * 5)
        tmp2 <- kaplan_now$final_vital_status == 1
        kaplan_now$yr5_status[tmp1 & tmp2] <- "dead_5yr"
        kaplan_now$yr5_status <- as.factor(kaplan_now$yr5_status)

        # define the formula now
        pirate_formula <- formula(expression ~ yr5_status)

        message("Pirate plotting \n")
        if (nrow(kaplan_now) > 0) {
            save_plot(
                "kaplan_meier", paste0("PiratePlot_log2_expression_5yr_", step)
            )
            par(mar = c(3, 4.8, 4, 2))
            yarrr::pirateplot(
                formula = pirate_formula,
                data = kaplan_now,
                ylab = expression("Log"[2] * "(Expression + 1)"),
                inf.method = "iqr", cex.lab = 1.2, cex.axis = 1.8,
                inf.disp = "rect", jitter.val = 0.08, theme = 2,
                cex.names = 1.5,
                pal = "pony", avg.line.fun = median, point.cex = 1.3,
                inf.f.col = "#969696", xlab = ""
            )
            dev.off()
        }

        # stats
        path_capture <- file.path(
            dir, "kaplan_meier",
            paste0(
                name, "PiratePlot_log2Expression_5yr_", step, "_stats.txt"
            )
        )

        fit <- aov(pirate_formula, data = kaplan_now)

        cat("********************\n", file = path_capture, append = TRUE)
        cat("A - ANOVA\n", file = path_capture, append = TRUE)
        capture.output(summary(fit), file = path_capture, append = TRUE)

        cat("\n********************\n", file = path_capture, append = TRUE)
        cat("B - Shapiro Test for Normality\n\n",
            file = path_capture,
            append = TRUE
        )
        cat("*B1 - all values test\n", file = path_capture, append = TRUE)
        tmp <- shapiro.test(kaplan_now$expression)
        capture.output(print(tmp), file = path_capture, append = TRUE)

        cat("\n*B2 - fit residuals test\n", file = path_capture, append = TRUE)
        tmp <- shapiro.test(residuals(fit))
        capture.output(print(tmp), file = path_capture, append = TRUE)

        cat("\n********************\n\n", file = path_capture, append = TRUE)
        cat("C - Tukey Honest Significant Differences post-test\n",
            file = path_capture, append = TRUE
        )
        tmp <- TukeyHSD(fit)
        capture.output(print(tmp), file = path_capture, append = TRUE)

        if (length(levels(kaplan_now$group_factor)) == 2) {
            cat("\n********************\n", file = path_capture, append = TRUE)
            cat("D - unpaired t-test\n", file = path_capture, append = TRUE)
            test <- var.test(expression ~ yr5_status, kaplan_now,
                alternative = "two.sided"
            )
            tmp <- t.test(pirate_formula,
                data = kaplan_now, alternative = "two.sided",
                mu = 0, paired = FALSE, var.equal = as.numeric(test[3]) <= 0.05,
                conf.level = 0.95
            )
            capture.output(print(tmp), file = path_capture, append = TRUE)
        }

        cat("\n********************\n", file = path_capture, append = TRUE)
        cat("non-parametric assumption:\n", file = path_capture, append = TRUE)
        cat("********************\n", file = path_capture, append = TRUE)
        cat("A - Kruskal-Wallis test\n", file = path_capture, append = TRUE)
        tmp <- kruskal.test(data = kaplan_now, pirate_formula)
        capture.output(print(tmp), file = path_capture, append = TRUE)

        cat("\n********************\n", file = path_capture, append = TRUE)
        cat("B - Non-adjusted Wilcoxon (Mann-Whitney U)\n",
            file = path_capture,
            append = TRUE
        )
        tmp <- pairwise.wilcox.test(
            x = kaplan_now$expression,
            g = kaplan_now$yr5_status,
            p.adjust.method = "none",
            paired = FALSE,
            exact = TRUE,
            correct = TRUE,
            alternative = "two.sided"
        )
        capture.output(print(tmp), file = path_capture, append = TRUE)

        cat("\n********************\n\n", file = path_capture, append = TRUE)
        cat("C - Benjamini FDR adjusted Wilcoxon (Mann-Whitney U)\n",
            file = path_capture, append = TRUE
        )
        tmp <- pairwise.wilcox.test(
            x = kaplan_now$expression,
            g = kaplan_now$yr5_status,
            p.adjust.method = "fdr",
            paired = FALSE,
            exact = TRUE,
            correct = TRUE,
            alternative = "two.sided"
        )
        capture.output(print(tmp), file = path_capture, append = TRUE)
    }

    # code
    data_type <- gsub(" ", "_", data_type)
    name <- gsub("-", "_", name)

    data_base_boo <- tolower(data_base) == "legacy"

    path <- file.path(
        work_dir, "DOAGDC", toupper(tumor), "Analyses",
        toupper(name)
    )

    dir.create(
        path = file.path(work_dir, "DOAGDC", toupper(tumor), "Analyses"),
        showWarnings = FALSE
    )

    dir.create(
        path = file.path(
            work_dir, "DOAGDC", toupper(tumor), "Analyses",
            toupper(name)
        ),
        showWarnings = FALSE
    )

    dir.create(file.path(
        path,
        paste0("/survival_Results_", toupper(name))
    ),
    showWarnings = FALSE
    )
    dir <- paste0(path, "/survival_Results_", toupper(name))
    dir.create(file.path(dir, "coxHR"), showWarnings = FALSE)
    dir.create(file.path(dir, "kaplan_meier"), showWarnings = FALSE)

    # Download clinical
    download_gdc(
        data_type = "clinical_supplement", tumor = tumor,
        data_base = ifelse(
            data_base_boo, "legacy", "gdc"
        ), work_dir = work_dir
    )

    folder_name <- ifelse(
        data_base_boo,
        "clinical_supplement_data",
        "gdc_clinical_supplement_data"
    )

    manifest <- data.frame(read.table(
        file = file.path(
            work_dir, "DOAGDC",
            toupper(tumor),
            folder_name,
            "manifest.sdrf"
        ),
        stringsAsFactors = FALSE, header = TRUE,
        sep = "\t"
    ))

    available_files <- manifest$file_name

    clinical <- file.path(
        workDir, "DOAGDC", tumor, folder_name, available_files
    )

    final_df  <- open_xml()

    # Save table
    write.csv(final_df, file.path(dir, paste0(tumor, "_clinical_xml", ".csv")))

    # Table File position
    if (missing(env)) {
        stop(message("Insert the Environment name as 'env' argument!"))
    } else {
        ev <- deparse(substitute(env))
    }

    assign("clinical_xml", final_df, envir = get(ev))

    # get expression values
    if (tolower(data_type) == "methylation") {
        tmp <- paste0("methylation_tumor_filtered_selected_", name)
    } else {
        tmp <- paste0(
            tolower(data_type), "_tumor_normalized_selected_",
            toupper(name)
        )
    }

    string_vars <- list(ev = get(ev))
    framel <- eval(parse(text = paste0(
        'string_vars[["ev"]]$',
        paste0(tmp)
    )))

    framel <- as.data.frame(framel)

    # remove undisered tissues
    tmp <- c("Hypopharynx", "Larynx", "Lip")
    boolean_tmp <- final_df$anatomic_neoplasm_subdivision %in% tmp
    final_df <- final_df[!boolean_tmp, ]

    # NOTE Start Cutoff Finder
    # https://www.ncbi.nlm.nih.gov/pubmed/23251644

    # clinical
    rownames(final_df) <- final_df$bcr_patient_barcode

    best_z_table <- NULL

    # add patient code
    framel$patient_code <- rownames(framel)

    # collect ordering
    matching_ordering <- match(
        framel[, "patient_code"],
        rownames(final_df)
    )

    # Add clinical values to the frames list
    framel <- cbind(framel, final_df[
        matching_ordering,
        c(
            "race", "gender",
            "pathologic_stage",
            "pathologic_t",
            "pathologic_n",
            "pathologic_m",
            "final_vital_status",
            "final_vital_status_times",
            "final_rfs_status",
            "final_rfs_status_times",
            "final_dmfs_status",
            "final_dmfs_status_times"
        )
    ])

    tissue_type_simple <- unname(sapply(rownames(framel), function(w) {
        paste(unlist(strsplit(w, "-"))[4], collapse = "-")
    }))
    tissue_type_simple <- unname(sapply(tissue_type_simple, function(w) {
        paste(unlist(strsplit(w, ""))[1:2], collapse = "")
    }))
    framel[, "tissue_type_simple"] <- tissue_type_simple

    # Calculate z-scores
    tumor_valueszscore <- as.numeric(scale(framel[, name]))

    # Save values to table
    framel[, "log2p1"] <- log2(framel[, name] + 1)
    framel[, "zscore"] <- NA
    framel[, "zscore"] <- tumor_valueszscore

    # Collect gene with
    table <- framel

    # Sort by expression level
    table <- table[order(table[, "zscore"]), ]

    ### Remove missing data
    table <- table[(!table$final_rfs_status == "unknown"), ]
    tmp <- table$final_rfs_status == "dead_other_cause_before_relapse"
    table <- table[!tmp, ]
    table <- table[(!table$final_rfs_status == "vital_status"), ]
    table <- table[!is.na(table$final_rfs_status), ]
    table <- table[!is.nan(table$final_rfs_status), ]
    table <- table[(!table$final_rfs_status_times == "unknown"), ]
    tmp <- table$final_rfs_status_times == "dead_other_cause_before_relapse"
    table <- table[!tmp, ]
    table <- table[(!table$final_rfs_status_times == "day"), ]
    table <- table[!is.na(table$final_rfs_status_times), ]
    table <- table[!is.nan(table$final_rfs_status_times), ]
    table$final_rfs_status_times <- as.numeric(
        table$final_rfs_status_times
    )

    # remove patients without relapse information
    table$final_rfs_status <- ifelse(
        table$final_rfs_status == "relapse", 1, 0
    )

    nmin <- round(0.04 * dim(table)[1])
    if (nmin < 10) {
        nmin <- 10
    }
    nmax <- dim(table)[1] - nmin
    patient_cuts_sequence <- nmin:nmax

    # create matrix to receive the results
    cutoff_opt <- matrix(
        nrow = length(patient_cuts_sequence),
        ncol = 6
    )
    colnames(cutoff_opt) <- c(
        "patient_no", "zscorecut", "HR",
        "HR_min95", "HR_max95", "logrank_p"
    )
    cutoff_opt <- as.data.frame(cutoff_opt)
    cutoff_opt$patient_no <- patient_cuts_sequence

    for (cut in patient_cuts_sequence) {
        if (table[, 1][cut] == 0) {
            # check if it isnt the last zero
            if (table[, 1][cut + 1] == 0) {
                # CoxHR
                cutoff_opt[cut, "HR"] <- Inf

                # CoxHR min .95
                cutoff_opt[cut, "HRmin95"] <- Inf

                # CoxHR max .95
                cutoff_opt[cut, "HRmax95"] <- Inf

                # logrank pvalue
                cutoff_opt[cut, "logrank_p"] <- 1

                # if is the last zero, do normally
            } else {
                # Create the classification vector for now
                classification_vector <- c(
                    rep("low", cut),
                    rep(
                        "high",
                        dim(table)[1] - cut
                    )
                )
                classification_vector <- factor(classification_vector,
                    levels = c("low", "high")
                )

                # create the model summary
                coxmodel <- suppressWarnings(summary(
                    survival::coxph(survival::Surv(
                        final_rfs_status_times, final_rfs_status
                    ) ~ classification_vector,
                    data = table
                    )
                ))

                # collect line now for table
                line_now <- match(cut, cutoff_opt[, "patient_no"])

                cutoff_opt[line_now, "zscorecut"] <- table[cut, "zscore"]

                # CoxHR
                cutoff_opt[line_now, "HR"] <- coxmodel$conf.int[1]

                # CoxHR min .95
                cutoff_opt[line_now, "HR_min95"] <- coxmodel$conf.int[3]

                # CoxHR max .95
                cutoff_opt[line_now, "HR_max95"] <- coxmodel$conf.int[4]

                # logrank pvalue
                cutoff_opt[line_now, "logrank_p"] <- coxmodel$logtest[3]
            }
        } else {

            # Create the classification vector for now
            classification_vector <- c(
                rep("low", cut),
                rep(
                    "high",
                    dim(table)[1] - cut
                )
            )
            classification_vector <- factor(classification_vector,
                levels = c("low", "high")
            )

            # create the model summary
            coxmodel <- suppressWarnings(
                summary(
                    survival::coxph(survival::Surv(
                        final_rfs_status_times, final_rfs_status
                    ) ~ classification_vector,
                    data = table
                    )
                )
            )

            # collect line now for table
            line_now <- match(cut, cutoff_opt[, "patient_no"])

            cutoff_opt[line_now, "zscorecut"] <- table[
                cut,
                "zscore"
            ]

            # CoxHR
            cutoff_opt[line_now, "HR"] <- coxmodel$conf.int[1]
            # CoxHR min .95
            cutoff_opt[line_now, "HR_min95"] <- coxmodel$conf.int[3]
            # CoxHR max .95
            cutoff_opt[line_now, "HR_max95"] <- coxmodel$conf.int[4]
            # logrank pvalue
            cutoff_opt[line_now, "logrank_p"] <- coxmodel$logtest[3]
        }
    }

    # remove infinite rows
    # pick infinite rows
    remove_rows <- -sort(unique(c(
        which(is.infinite(cutoff_opt$HR)),
        which(is.infinite(cutoff_opt$HR_min95)),
        which(is.infinite(cutoff_opt$HR_max95)),
        which(is.na(cutoff_opt$logrank_p))
    )))

    cutoff_opt <- cutoff_opt[!is.na(cutoff_opt$HR_min95), ]

    # save optim table
    write.csv(cutoff_opt,
        file = file.path(dir, "coxHR", paste0(
            "Optimization_",
            name, ".csv"
        ))
    )

    if (nrow(cutoff_opt) == 0) {
        stop(message(
            "\nLow expression values. Please, ",
            "try another expression data!\n"
        ))
    }

    # Collect best z score
    best_z_table <- suppressWarnings(
        cutoff_opt$zscorecut[which(min(cutoff_opt$logrank_p,
            na.rm = TRUE
        ) == cutoff_opt$logrank_p)][1]
    )

    if (nrow(cutoff_opt[remove_rows, ]) > 0) {
        save_plot("coxHR", "_with_confidence_interval")
        par(mar = c(5, 5, 3, 5))
        plot(
            x = cutoff_opt$zscorecut,
            y = cutoff_opt$HR, type = "l",
            ylim = c(
                min(cutoff_opt$HR_min95, na.rm = TRUE),
                max(cutoff_opt$HR_max95, na.rm = TRUE)
            ),
            xlim = c(
                min(cutoff_opt$zscorecut, na.rm = TRUE),
                max(cutoff_opt$zscorecut, na.rm = TRUE)
            ),
            lwd = 2, lty = 1, xlab = "Expression z-score",
            ylab = "Cox HR RFS", axes = FALSE,
            col = hsv(.58, .95, .62)
        )

        axis(
            side = 1, lwd = 2, cex.axis = 1.5, cex = 1.5,
            at = round(seq(min(cutoff_opt$zscorecut, na.rm = TRUE),
                max(cutoff_opt$zscorecut, na.rm = TRUE),
                by = 0.1
            ), 2)
        )
        axis(
            side = 2, lwd = 2, cex.axis = 1.5, cex = 1.5,
            col = hsv(.58, .95, .62), las = 1
        )

        lines(
            x = cutoff_opt$zscorecut,
            y = cutoff_opt$HR_min95,
            lwd = 1, lty = 3, col = hsv(.34, .88, .60, 0.78)
        )
        lines(
            x = cutoff_opt$zscorecut,
            y = cutoff_opt$HR_max95,
            lwd = 1, lty = 3, col = hsv(.34, .88, .60, 0.78)
        )

        text(best_z_table, max(cutoff_opt$HR, na.rm = TRUE) + 0.12,
            paste0("z-score = ", round(best_z_table, 2)),
            cex = 0.8
        )
        points(best_z_table, max(cutoff_opt$HR, na.rm = TRUE))

        points(table[, "zscore"],
            rep(
                min(cutoff_opt$HR_min95, na.rm = TRUE),
                length(table[, "zscore"])
            ),
            pch = "|", col = hsv(.0, .0, .15, .35)
        )

        par(new = TRUE)
        plot(
            x = cutoff_opt$zscorecut,
            y = cutoff_opt$logrank_p, type = "l",
            ylim = c(
                max(cutoff_opt$logrank_p, na.rm = TRUE),
                min(cutoff_opt$logrank_p, na.rm = TRUE) - 0.1
            ),
            xlim = c(
                min(cutoff_opt$zscorecut, na.rm = TRUE),
                max(cutoff_opt$zscorecut, na.rm = TRUE)
            ),
            lwd = 2, lty = 1, xlab = NA, ylab = NA,
            axes = FALSE, cex = 1.5, col = hsv(.028, .83, .99, .59)
        )
        axis(
            side = 4, lwd = 2, cex.axis = 1.5, cex = 1.5,
            col = hsv(.028, .83, .99), las = 1
        )
        mtext(side = 4, line = 3.8, "p logrank", cex = 1.2)

        min_p <- min(cutoff_opt$logrank_p, na.rm = TRUE)

        text(best_z_table, min_p - 0.01, paste0(
            "p logrank = ",
            round(min_p, 4)
        ), cex = 0.8)
        points(best_z_table, min_p)
        dev.off()

        save_plot("coxHR", "_log2_with_confidence_interval")
        par(mar = c(5, 5, 3, 5))
        plot(
            x = cutoff_opt$zscorecut,
            y = log2(cutoff_opt$HR), type = "l",
            ylim = c(
                min(log2(cutoff_opt$HR_min95), na.rm = TRUE),
                max(log2(cutoff_opt$HR_max95), na.rm = TRUE)
            ),
            xlim = c(
                min(cutoff_opt$zscorecut, na.rm = TRUE),
                max(cutoff_opt$zscorecut, na.rm = TRUE)
            ),
            lwd = 2, lty = 1, xlab = "Expression z-score",
            ylab = expression("Log"[2] * "(Cox HR RFS)"), axes = FALSE,
            col = hsv(.58, .95, .62)
        )

        axis(
            side = 1, lwd = 2, cex.axis = 1.5, cex = 1.5,
            at = round(seq(min(cutoff_opt$zscorecut, na.rm = TRUE),
                max(cutoff_opt$zscorecut, na.rm = TRUE),
                by = 0.1
            ), 2)
        )
        axis(
            side = 2, lwd = 2, cex.axis = 1.5, cex = 1.5,
            col = hsv(.58, .95, .62), las = 1
        )

        lines(
            x = cutoff_opt$zscorecut,
            y = log2(cutoff_opt$HR_min95),
            lwd = 1, lty = 3, col = hsv(.34, .88, .60, 0.78)
        )
        lines(
            x = cutoff_opt$zscorecut,
            y = log2(cutoff_opt$HR_max95),
            lwd = 1, lty = 3, col = hsv(.34, .88, .60, 0.78)
        )

        text(best_z_table, max(log2(cutoff_opt$HR),
            na.rm = TRUE
        ) + 0.105,
        paste0("z-score = ", round(best_z_table, 2)),
        cex = 0.8
        )
        points(best_z_table, max(log2(cutoff_opt$HR), na.rm = TRUE))

        points(table[, "zscore"],
            rep(
                min(log2(cutoff_opt$HR_min95), na.rm = TRUE),
                length(table[, "zscore"])
            ),
            pch = "|", col = hsv(.0, .0, .15, .35)
        )
        par(new = TRUE)

        plot(
            x = cutoff_opt$zscorecut,
            y = cutoff_opt$logrank_p, type = "l",
            ylim = c(
                max(cutoff_opt$logrank_p, na.rm = TRUE),
                min(cutoff_opt$logrank_p, na.rm = TRUE)
            ),
            xlim = c(
                min(cutoff_opt$zscorecut, na.rm = TRUE),
                max(cutoff_opt$zscorecut, na.rm = TRUE)
            ),
            lwd = 2, lty = 1, xlab = NA, ylab = NA,
            axes = FALSE, cex = 1.5, col = hsv(.028, .83, .99, .59)
        )
        axis(
            side = 4, lwd = 2, cex.axis = 1.5, cex = 1.5,
            col = hsv(.028, .83, .99), las = 1
        )
        mtext(side = 4, line = 3.8, "p logrank", cex = 1.2)

        min_p <- min(cutoff_opt$logrank_p, na.rm = TRUE)

        text(best_z_table, min_p - 0.015, paste0(
            "p logrank = ",
            round(min_p, 4)
        ), cex = 0.8)
        points(best_z_table, min_p)

        dev.off()
    }

    save_plot("coxHR", "_hide_confidence_interval")
    par(mar = c(5, 5, 3, 5))
    plot(
        x = cutoff_opt$zscorecut,
        y = cutoff_opt$HR, type = "l",
        lwd = 2, lty = 1, xlab = "Expression z-score", ylab = "Cox HR RFS",
        axes = FALSE,
        col = hsv(.58, .95, .62, .39)
    )

    axis(
        side = 1, lwd = 2, cex.axis = 1.5, cex = 1.5,
        at = round(seq(min(cutoff_opt$zscorecut, na.rm = TRUE),
            max(cutoff_opt$zscorecut, na.rm = TRUE),
            by = 0.1
        ), 2)
    )
    axis(
        side = 2, lwd = 2, cex.axis = 1.5, cex = 1.5,
        col = hsv(.58, .95, .62), las = 1
    )
    text(best_z_table, max(cutoff_opt$HR, na.rm = TRUE) + 0.12,
        paste0("z-score = ", round(best_z_table, 2)),
        cex = 0.8
    )
    points(best_z_table, max(cutoff_opt$HR, na.rm = TRUE))

    points(cutoff_opt$zscorecut,
        rep(
            min(cutoff_opt$HR, na.rm = TRUE),
            length(cutoff_opt$zscorecut)
        ),
        pch = "|", col = hsv(.0, .0, .15, .35)
    )

    par(new = TRUE)
    plot(
        x = cutoff_opt$zscorecut,
        y = cutoff_opt$logrank_p, type = "l",
        ylim = c(
            max(cutoff_opt$logrank_p, na.rm = TRUE),
            min(cutoff_opt$logrank_p, na.rm = TRUE)
        ),
        lwd = 3, lty = 1, xlab = NA, ylab = NA,
        axes = FALSE, cex = 1.5, col = hsv(.028, .83, .99, .39)
    )
    axis(
        side = 4, lwd = 2, cex.axis = 1.5, cex = 1.5,
        col = hsv(.028, .83, .99), las = 1
    )
    mtext(side = 4, line = 3.8, "p logrank", cex = 1.2)

    min_p <- min(cutoff_opt$logrank_p, na.rm = TRUE)
    text(best_z_table, min_p - 0.01, paste0("p logrank = ", round(min_p, 4)),
        cex = 0.8
    )
    points(best_z_table, min_p)

    dev.off()

    save_plot("coxHR", "_log2_hide_confidence_interval")
    par(mar = c(5, 5, 3, 5))
    plot(
        x = cutoff_opt$zscorecut,
        y = log2(cutoff_opt$HR), type = "l",
        lwd = 2, lty = 1, xlab = "Expression z-score",
        ylab = expression("Log"[2] * "(Cox HR RFS)"), axes = FALSE,
        col = hsv(.58, .95, .62, .39)
    )

    axis(
        side = 1, lwd = 2, cex.axis = 1.5, cex = 1.5,
        at = round(seq(min(cutoff_opt$zscorecut, na.rm = TRUE),
            max(cutoff_opt$zscorecut, na.rm = TRUE),
            by = 0.1
        ), 2)
    )
    axis(
        side = 2, lwd = 2, cex.axis = 1.5, cex = 1.5,
        col = hsv(.58, .95, .62), las = 1
    )

    text(best_z_table, max(log2(cutoff_opt$HR), na.rm = TRUE) + 0.105,
        paste0("z-score = ", round(best_z_table, 2)),
        cex = 0.8
    )
    points(best_z_table, max(log2(cutoff_opt$HR), na.rm = TRUE))

    points(cutoff_opt$zscorecut,
        rep(
            min(log2(cutoff_opt$HR), na.rm = TRUE),
            length(cutoff_opt$zscorecut)
        ),
        pch = "|", col = hsv(.0, .0, .15, .35)
    )

    par(new = TRUE)

    plot(
        x = cutoff_opt$zscorecut,
        y = cutoff_opt$logrank_p, type = "l",
        ylim = c(
            max(cutoff_opt$logrank_p, na.rm = TRUE),
            min(cutoff_opt$logrank_p, na.rm = TRUE)
        ),
        lwd = 3, lty = 1, xlab = NA, ylab = NA,
        axes = FALSE, cex = 1.5, col = hsv(.028, .83, .99, .39)
    )
    axis(
        side = 4, lwd = 2, cex.axis = 1.5, cex = 1.5,
        col = hsv(.028, .83, .99), las = 1
    )
    mtext(side = 4, line = 3.8, "p logrank", cex = 1.2)

    min_p <- min(cutoff_opt$logrank_p, na.rm = TRUE)
    text(best_z_table, min_p - 0.01, paste0(
        "p logrank = ",
        round(min_p, 4)
    ), cex = 0.8)
    points(best_z_table, min_p)

    dev.off()

    # Save best z table
    # save optim table
    write.csv(best_z_table, file = file.path(
        dir, "coxHR",
        paste0("Best_z_Table", ".csv")
    ))

    # NOTE A
    # add unknown to everyone
    framel[, "classification"] <- "unknown"
    # add low expressing tag
    framel[
        which(framel[, "zscore"] <= best_z_table),
        "classification"
    ] <- "low"
    # add high expressing tag
    framel[
        which(framel[, "zscore"] > best_z_table),
        "classification"
    ] <- "high"
    # create group tissue_type
    framel[, "group"] <- framel[, "classification"]

    move <- which(framel[, "classification"] == "unknown")
    framel[move, "group"] <- framel[move, "tissue_type_simple"]
    framel[, "group"] <- factor(framel[, "group"],
        levels = c("low", "high")
    )

    roc_auc()

    color <- RColorBrewer::brewer.pal(length(levels(framel$group)), "Set2")

    plot_pie <- data.frame(
        Group = names(table(framel$group)),
        n = as.numeric(table(framel$group)),
        prop = round(as.numeric(table(framel$group) / nrow(framel)) * 100, 2)
    )
    plot_pie <- plot_pie[order(plot_pie$Group, decreasing = TRUE), ]
    plot_pie$lab.ypos <- cumsum(plot_pie$prop) - (0.5 * plot_pie$prop)

    save_plot("coxHR", "_patients_all_conditions")
    a <- ggplot2::ggplot(plot_pie, ggplot2::aes(2, y = prop, fill = Group)) +
        ggplot2::geom_bar(stat = "identity", color = "white") +
        ggplot2::coord_polar("y", start = 0) +
        ggplot2::geom_text(ggplot2::aes(y = lab.ypos, label = n),
            color = "white"
        ) +
        ggplot2::scale_fill_manual(values = color) +
        ggplot2::theme_void() +
        ggplot2::xlim(0.5, 2.5)
    print(a)
    dev.off()

    # define the formula now
    formula_now <- formula(framel[, "log2p1"] ~ framel[, "group"])

    save_plot("coxHR", "_PiratePlot_log2_expression_all_conditions")
    par(mar = c(3, 4.8, 4, 2))
    yarrr::pirateplot(
        formula = formula_now,
        data = framel, gl.lwd = 0,
        ylab = expression("Log"[2] * "(Expression + 1)"),
        inf.method = "iqr", cex.lab = 1.2, cex.axis = 1.8,
        inf.disp = "rect", jitter.val = 0.08, theme = 2,
        cex.names = 1.5,
        pal = "pony", avg.line.fun = median, point.cex = 1.3,
        inf.f.col = hsv(0, 0, .78), xlab = "",
        inf.b.col = hsv(0, 0, .47)
    )
    dev.off()

    path_capture <- file.path(
        dir, "coxHR",
        paste0(
            name, "_PiratePlot_log2_expression",
            "_all_conditions_stats.txt"
        )
    )

    fit <- aov(formula_now, data = framel)
    cat("********************\n", file = path_capture, append = TRUE)
    cat("A - ANOVA\n", file = path_capture, append = TRUE)
    capture.output(summary(fit), file = path_capture, append = TRUE)

    cat("\n********************\n", file = path_capture, append = TRUE)
    cat("B - Shapiro Test for Normality\n\n",
        file = path_capture,
        append = TRUE
    )
    cat("*B1 - all values test\n", file = path_capture, append = TRUE)
    tmp <- shapiro.test(framel[, "log2p1"])
    capture.output(print(tmp), file = path_capture, append = TRUE)

    cat("\n*B2 - fit residuals test\n", file = path_capture, append = TRUE)
    tmp <- shapiro.test(residuals(fit))
    capture.output(print(tmp), file = path_capture, append = TRUE)

    cat("\n********************\n\n", file = path_capture, append = TRUE)
    cat("C - Tukey Honest Significant Differences post-test\n",
        file = path_capture, append = TRUE
    )
    tmp <- TukeyHSD(fit)
    capture.output(print(tmp), file = path_capture, append = TRUE)

    if (length(levels(framel[, "group"])) == 2) {
        cat("\n********************\n", file = path_capture, append = TRUE)
        cat("D - unpaired t-test\n", file = path_capture, append = TRUE)
        test <- var.test(expression ~ yr5_status, framel,
            alternative = "two.sided"
        )
        tmp <- t.test(pirate_formula,
            data = framel, alternative = "two.sided",
            mu = 0, paired = FALSE, var.equal = as.numeric(test[3]) <= 0.05,
            conf.level = 0.95
        )
        capture.output(print(tmp), file = path_capture, append = TRUE)
    }

    cat("\n********************\n", file = path_capture, append = TRUE)
    cat("non-parametric assumption:\n", file = path_capture, append = TRUE)
    cat("********************\n", file = path_capture, append = TRUE)
    cat("A - Kruskal-Wallis test\n", file = path_capture, append = TRUE)
    tmp <- kruskal.test(data = framel, pirate_formula)
    capture.output(print(tmp), file = path_capture, append = TRUE)

    cat("\n********************\n", file = path_capture, append = TRUE)
    cat("B - Non-adjusted Wilcoxon (Mann-Whitney U)\n",
        file = path_capture,
        append = TRUE
    )
    tmp <- pairwise.wilcox.test(
        x = framel[, "log2p1"],
        g = framel[, "group"],
        p.adjust.method = "none",
        paired = FALSE,
        exact = TRUE,
        correct = TRUE,
        alternative = "two.sided"
    )
    capture.output(print(tmp), file = path_capture, append = TRUE)

    cat("\n********************\n\n", file = path_capture, append = TRUE)
    cat("C - Benjamini FDR adjusted Wilcoxon (Mann-Whitney U)\n",
        file = path_capture, append = TRUE
    )
    tmp <- pairwise.wilcox.test(
        x = framel[, "log2p1"],
        g = framel[, "group"],
        p.adjust.method = "fdr",
        paired = FALSE,
        exact = TRUE,
        correct = TRUE,
        alternative = "two.sided"
    )
    capture.output(print(tmp), file = path_capture, append = TRUE)


    # NOTE Overall
    # Create table for patient numbers
    patient_amount_table <- matrix(ncol = 4, nrow = 8)
    colnames(patient_amount_table) <- c(
        "type", "classification",
        "amount", "percent"
    )
    patient_amount_table <- data.frame(patient_amount_table)

    # use previously defined markers

    # Copy patient_amount_table
    patient_n <- patient_amount_table

    # OVERALL SURVIVAL ANALYSIS
    # Kaplan meyer using best scenario
    kaplan_now <- framel[, c(
        name,
        "classification",
        "final_vital_status_times",
        "final_vital_status",
        "final_rfs_status",
        "final_rfs_status_times",
        "final_dmfs_status",
        "final_dmfs_status_times"
    )]

    # Kaplan
    colnames(kaplan_now)[2] <- "classification"

    # Collect just rows with classification
    kaplan_now <- kaplan_now[c(
        which(kaplan_now[, "classification"] == "low"),
        which(kaplan_now[, "classification"] == "high")
    ), ]


    # Factorize classification
    kaplan_now$group_factor <- factor(
        kaplan_now$classification,
        levels = c("low", "high")
    )

    ############# COLLECT DATA

    # Fill table with patient amount - ALL
    rows_now <- c(1, 2)
    patient_n[rows_now, "type"] <- "All"
    patient_n[rows_now, "classification"] <- c("low", "high")
    patient_n[rows_now[1], "amount"] <- as.numeric(
        table(kaplan_now[, "classification"])["low"]
    )
    patient_n[rows_now[2], "amount"] <- as.numeric(
        table(kaplan_now[, "classification"])["high"]
    )

    tmp <- patient_n[rows_now[1], "amount"] + patient_n[rows_now[2], "amount"]
    patient_n[rows_now[1], "percent"] <- patient_n[rows_now[1], "amount"] / tmp
    patient_n[rows_now[2], "percent"] <- patient_n[rows_now[2], "amount"] / tmp


    # Retrieve non-NA entries for final vital status
    kaplan_now <- kaplan_now[(!kaplan_now$final_vital_status == "unknown"), ]
    tmp <- kaplan_now$final_vital_status == "vital_status"
    kaplan_now <- kaplan_now[!tmp, ]
    kaplan_now <- kaplan_now[!is.na(kaplan_now$final_vital_status), ]
    kaplan_now <- kaplan_now[!is.nan(kaplan_now$final_vital_status), ]
    tmp <- kaplan_now$final_vital_status_times == "unknown"
    kaplan_now <- kaplan_now[!tmp, ]
    kaplan_now <- kaplan_now[(!kaplan_now$final_vital_status_times == "day"), ]
    kaplan_now <- kaplan_now[!is.na(kaplan_now$final_vital_status_times), ]
    kaplan_now <- kaplan_now[!is.nan(kaplan_now$final_vital_status_times), ]
    kaplan_now$final_vital_status_times <- as.numeric(
        kaplan_now$final_vital_status_times
    )

    kaplan_now$final_vital_status <- ifelse(
        kaplan_now$final_vital_status == "Dead", 1, 0
    )

    kaplan_now$expression <- log2(kaplan_now[, name] + 1)

    # Fill table with patient amount - OSurv
    rows_now <- c(3, 4)
    patient_n[rows_now, "type"] <- "OSurv"
    patient_n[rows_now, "classification"] <- c("low", "high")
    patient_n[rows_now[1], "amount"] <- as.numeric(
        table(kaplan_now[, "classification"])["low"]
    )
    patient_n[rows_now[2], "amount"] <- as.numeric(
        table(kaplan_now[, "classification"])["high"]
    )

    patient_n[rows_now[1], "percent"] <- patient_n[rows_now[1], "amount"] / tmp
    patient_n[rows_now[2], "percent"] <- patient_n[rows_now[2], "amount"] / tmp

    cox_ph(step = "overall")
    surv_plot(step = "overall")


    # NOTE RFS
    kaplan_now <- framel[, c(
        name,
        "classification",
        "final_vital_status_times",
        "final_vital_status",
        "final_rfs_status",
        "final_rfs_status_times",
        "final_dmfs_status",
        "final_dmfs_status_times"
    )]

    # Kaplan
    colnames(kaplan_now)[2] <- "classification"

    # Collect just rows with classification
    kaplan_now <- kaplan_now[c(
        which(kaplan_now[, "classification"] == "low"),
        which(kaplan_now[, "classification"] == "high")
    ), ]

    # Factorize classification
    kaplan_now$group_factor <- factor(
        kaplan_now$classification,
        levels = c("low", "high")
    )

    kaplan_now <- kaplan_now[(!kaplan_now$final_rfs_status == "unknown"), ]
    kaplan_now <- kaplan_now[!is.na(kaplan_now$final_rfs_status), ]
    kaplan_now <- kaplan_now[!is.nan(kaplan_now$final_rfs_status), ]

    kaplan_now <- kaplan_now[!kaplan_now$final_rfs_status_times == "unknown", ]
    tmp <- kaplan_now$final_rfs_status_times == "dead_other_cause_before_relapse"
    kaplan_now <- kaplan_now[!tmp, ]
    kaplan_now <- kaplan_now[(!kaplan_now$final_rfs_status_times == "day"), ]
    kaplan_now <- kaplan_now[!is.na(kaplan_now$final_rfs_status_times), ]
    kaplan_now <- kaplan_now[!is.nan(kaplan_now$final_rfs_status_times), ]
    kaplan_now <- kaplan_now[!is.infinite(kaplan_now$final_rfs_status_times), ]
    kaplan_now$final_rfs_status_times <- as.numeric(
        kaplan_now$final_rfs_status_times
    )

    kaplan_now$final_rfs_status <- ifelse(
        kaplan_now$final_rfs_status == "relapse", 1, 0
    )

    kaplan_now$expression <- log2(kaplan_now[, name] + 1)

    # Fill table with patient amount - Relapse
    rows_now <- c(5, 6)
    patient_n[rows_now, "type"] <- "Relapse"
    patient_n[rows_now, "classification"] <- c("low", "high")
    patient_n[rows_now[1], "amount"] <- as.numeric(
        table(kaplan_now[, "classification"])["low"]
    )
    patient_n[rows_now[2], "amount"] <- as.numeric(
        table(kaplan_now[, "classification"])["high"]
    )

    tmp <- patient_n[rows_now[1], "amount"] + patient_n[rows_now[2], "amount"]
    patient_n[rows_now[1], "percent"] <- patient_n[rows_now[1], "amount"] / tmp
    patient_n[rows_now[2], "percent"] <- patient_n[rows_now[2], "amount"] / tmp
    cox_ph(step = "rfs")
    surv_plot(step = "rfs")


    # NOTE dmfs

    # dmfs
    # Kaplan meyer using best scenario
    kaplan_now <- framel[, c(
        name,
        "classification",
        "final_vital_status_times",
        "final_vital_status",
        "final_rfs_status",
        "final_rfs_status_times",
        "final_dmfs_status",
        "final_dmfs_status_times"
    )]

    # Kaplan
    colnames(kaplan_now)[2] <- "classification"

    # Collect just rows with classification
    kaplan_now <- kaplan_now[c(
        which(kaplan_now[, "classification"] == "low"),
        which(kaplan_now[, "classification"] == "high")
    ), ]

    # Factorize classification
    kaplan_now$group_factor <- factor(
        kaplan_now$classification,
        levels = c("low", "high")
    )

    # Retrieve non-NA entries for final vital status
    kaplan_now <- kaplan_now[(!kaplan_now$final_dmfs_status == "unknown"), ]
    kaplan_now <- kaplan_now[!is.na(kaplan_now$final_dmfs_status), ]
    kaplan_now <- kaplan_now[!is.nan(kaplan_now$final_dmfs_status), ]
    kaplan_now <- kaplan_now[!kaplan_now$final_dmfs_status_times == "unknown", ]
    tmp <- kaplan_now$final_dmfs_status_times == "dead_other_cause_before_relapse"
    kaplan_now <- kaplan_now[!tmp, ]
    kaplan_now <- kaplan_now[(!kaplan_now$final_dmfs_status_times == "day"), ]
    kaplan_now <- kaplan_now[!is.na(kaplan_now$final_dmfs_status_times), ]
    kaplan_now <- kaplan_now[!is.nan(kaplan_now$final_dmfs_status_times), ]
    kaplan_now <- kaplan_now[!is.infinite(kaplan_now$final_dmfs_status_times), ]
    kaplan_now$final_dmfs_status_times <- as.numeric(
        kaplan_now$final_dmfs_status_times
    )

    # Overall survival
    kaplan_now$final_dmfs_status <- as.integer(gsub(
        "metastasis", 1, gsub("censored", 0, kaplan_now$final_dmfs_status)
    ))

    kaplan_now$expression <- log2(kaplan_now[, name] + 1)


    # Fill table with patient amount - DMFS
    rows_now <- c(7, 8)
    patient_n[rows_now, "type"] <- "DMFS"
    patient_n[rows_now, "classification"] <- c("low", "high")
    patient_n[rows_now[1], "amount"] <- as.numeric(
        table(kaplan_now[, "classification"])["low"]
    )
    patient_n[rows_now[2], "amount"] <- as.numeric(
        table(kaplan_now[, "classification"])["high"]
    )

    tmp <- patient_n[rows_now[1], "amount"] + patient_n[rows_now[2], "amount"]
    patient_n[rows_now[1], "percent"] <- patient_n[rows_now[1], "amount"] / tmp
    patient_n[rows_now[2], "percent"] <- patient_n[rows_now[2], "amount"] / tmp

    if (sum(kaplan_now$final_dmfs_status) > 0) {
        cox_ph(step = "dmfs")
        surv_plot(step = "dmfs")
    }


    # NOTE PatientAmount
    patient_n$type <- factor(
        patient_n$type,
        levels = c("All", "OSurv", "Relapse", "DMFS")
    )
    patient_n$classification <- factor(
        patient_n$classification,
        levels = c("high", "low")
    )

    # Add position for plotting
    patient_n$pos <- 0
    for (type in levels(patient_n$type)) {
        lines <- patient_n$type == type
        numbers <- patient_n$amount[lines]
        patient_n$pos[lines] <- cumsum(numbers) - (0.5 * numbers)
    }

    # Add percents
    patient_n$percent <- (patient_n$percent * 100)
    patient_n$percent <- round(patient_n$percent, 0)

    save_plot("kaplan_meier", "_PatientAmount")
    p4 <- ggplot2::ggplot() +
        ggplot2::theme_bw() +
        ggplot2::geom_bar(ggplot2::aes(
            y = amount, x = type,
            fill = classification
        ),
        data = patient_n,
        stat = "identity"
        ) +
        ggplot2::geom_text(
            data = patient_n,
            ggplot2::aes(
                x = type, y = pos,
                label = paste0(percent, "%")
            ),
            size = 3.6
        ) +
        ggplot2::scale_fill_manual(values = c(
            hsv(.008, .82, .84, .86),
            hsv(.61, .66, .78, .70)
        )) +
        ggplot2::theme(
            legend.position = "bottom",
            legend.direction = "horizontal",
            legend.title = ggplot2::element_blank(),
            panel.grid.major = ggplot2::element_blank(),
            panel.grid.minor = ggplot2::element_blank(),
            axis.text.x = ggplot2::element_text(
                colour = "black",
                size = 14
            ),
            axis.text.y = ggplot2::element_text(
                colour = "black",
                size = 13
            )
        )
    print(p4)
    dev.off()


    # Save table with patient amounts
    write.csv(patient_n, file.path(
        dir, "kaplan_meier",
        paste0(
            "PatientAmount_",
            name, ".csv"
        )
    ))

    clinical_groups <- framel[, c("group", "classification")]
    rownames(clinical_groups) <- framel[, "patient_code"]

    assign("clinical_groups", clinical_groups, envir = get(ev))

    write.csv(clinical_groups, file = file.path(dir, "clinical_groups"))

    gc()
    message("Done!\n")
}
Facottons/DOAGDC documentation built on April 7, 2020, 3:17 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Facottons/DOAGDC
A Package to Download, Organize and Analyze Genomic Data Commons (GDC) Data

R/function_coxph.R
In Facottons/DOAGDC: A Package to Download, Organize and Analyze Genomic Data Commons (GDC) Data

Defines functions groups_identification_cox_hr

Documented in groups_identification_cox_hr

R Package Documentation

Browse R Packages

We want your feedback!

Facottons/DOAGDC A Package to Download, Organize and Analyze Genomic Data Commons (GDC) Data

R/function_coxph.R In Facottons/DOAGDC: A Package to Download, Organize and Analyze Genomic Data Commons (GDC) Data

Defines functions groups_identification_cox_hr

Documented in groups_identification_cox_hr

R Package Documentation

Browse R Packages

We want your feedback!

Facottons/DOAGDC
A Package to Download, Organize and Analyze Genomic Data Commons (GDC) Data

R/function_coxph.R
In Facottons/DOAGDC: A Package to Download, Organize and Analyze Genomic Data Commons (GDC) Data