R/smoothCoxph_man.R

#' Create a correlation of gene expression to log hazard ratio
#'
#' @param x 1 or multiple Genes, as a character vector list, to analyze
#' @param Eset The Expression Set, containing the expression data in columns and the survival indicators in the columns X_OS and X_OS_IND
#' @param exclude_values add a vector with following style c("column of clinical data to look for exclusion", "exclude group 1 from this column", "exclude group 2 from this column", ...)
#' @param xlimit Manually define the x-axis limit
#' @param ylimit Manually define the y-axis limit
#' @param xlabel Manually define the x-axis label
#' @param ylabel manually define the y-axis label
#' @param logrisk don't exactly know... need to check
#' @param average values: `mean` or `median`, defines how >1 genes are averaged
#' @param survival "overall", "DFS"
#' @param ... additional variables that can be added
#'
#' @return Plots the log Hazard ratio in correlation to its relative gene expression level using ggplot2
#' @export
#'
#' @examples
#' \dontrun{
#' smoothCoxph_man(x = c("BAMBI", "CD44"), Eset=Eset,
#' exclude_values=c("pathologic_stage", "Stage II", "Stage I", "Stage IV"),
#' logrisk=T, average="median")
#' }
#'
smoothCoxph_man <- function (x, Eset, exclude_values,
                               xlimit, ylimit, xlabel, ylabel,
                               logrisk = TRUE, survival="overall", average = "median", ...) {

  if(survival=="overall") {
    time <- Biobase::pData(Eset)$X_OS
    event <- Biobase::pData(Eset)$X_OS_IND
  } else if(survival=="DFS"){
    time <- Biobase::pData(Eset)$DFS_MONTH
    event <- Biobase::pData(Eset)$DFS_STATUS
    event <- ifelse(event=="Recurred/Progressed",1,0)
  } else {
    stop(paste("You cannot use \"", survival, "\" as censoring"), sep="")
  }
  # x="Age"
  if(x=="Age"){
    gene <- "Age"
  } else {
    gene <- x
  }


  # for package reasons: Define variables first:
  hazard_ratio <- NULL
  upper95 <- NULL
  lower95 <- NULL

  if(length(gene)>1){
    z <- data.frame(time = time, event = event)
    z[,gene] <- t(Biobase::exprs(Eset)[gene,])

    # z-score the tcga data

    for(i in 3:length(z)){
      Z_score_1 <- z[,i] - mean(z[,i])
      Z_score <- Z_score_1 / stats::sd(z[,i])
      z[,i] <- Z_score
    }

    df_matrix <- matrix(t(z[,gene]), ncol = length(gene), byrow=TRUE)

    z$Median <- Biobase::rowMedians(df_matrix)
    z$Mean <- base::rowMeans(df_matrix)

  } else {
    if(x=="Age"){
      q <- as.numeric(Biobase::pData(Eset)$AGE)
    } else {
      q <- Biobase::exprs(Eset)[gene[1],]
    }

    z <- data.frame(time = time, event = event, q = q)

    if(x!="Age"){
      # z-score the data
      for(i in 3:length(z)){
        Z_score_1 <- z[,i] - mean(z[,i])
        Z_score <- Z_score_1 / stats::sd(z[,i])
        z[,i] <- Z_score
        q <- Z_score
      }
    }
  }

  if (!missing(exclude_values)){
    if (exclude_values[1] %in% colnames(Biobase::pData(Eset))){
      z$exclude_values <- Biobase::pData(Eset)[,exclude_values[1]]
    } else {
      stop(paste(exclude_values[1],"does not exist in the phenoData"))
    }
    if(length(exclude_values)<2){
      stop(paste("Please state values to be excluded in",exclude_values[1]))
    } else {
      for(i in 2:length(exclude_values)){
        z <- z[!z$exclude_values==exclude_values[i], ]
      }
    }
  }

  z <- z[!is.na(z$event) & !is.na(z$time), ]

  if(length(gene)>1){
    for(i in 1:length(gene)){
      z <- z[!is.na(z[,gene[i]]), ]
    }
    if(average=="mean"){
      z <- z[order(z$Mean), ]
      coxph1 <- survival::coxph(survival::Surv(time, event) ~ survival::pspline(Mean, df = 4), data = z)
    } else {
      z <- z[order(z$Median), ]
      coxph1 <- survival::coxph(survival::Surv(time, event) ~ survival::pspline(Median, df = 4), data = z)
    }
  } else {
    z <- z[!is.na(z$q), ]
    z <- z[order(z$q), ]
    coxph1 <- survival::coxph(survival::Surv(time, event) ~ survival::pspline(q, df = 4), data = z)
  }

  if (logrisk) {
    pred <- stats::predict(coxph1, type = "lp", se.fit = TRUE)
    y <- cbind(pred$fit, pred$fit - 1.96 * pred$se.fit, pred$fit +
                 1.96 * pred$se.fit)
  } else {
    pred <- stats::predict(coxph1, type = "risk", se.fit = TRUE)
    y <- cbind(log(pred$fit), log(pred$fit - 1.96 * pred$se.fit),
               log(pred$fit + 1.96 * pred$se.fit))
  }
  if (missing(xlimit)){
    if(length(gene)>1){
      if(average=="mean"){
        xlimit <- range(z$Mean)
      } else {
        xlimit <- range(z$Median)
      }
    } else {
      xlimit <- range(q)
    }
  }
  if (missing(ylimit)) ylimit <- c(min(y[, 1], stats::median(y[, 2]), na.rm = T), max(y[, 1], stats::median(y[, 3]), na.rm = T)) * 1.5
  if (missing(xlabel)) {
    if(x=="Age"){
      xlabel <- "Age"
    } else {
      xlabel <- "Relative gene expression level"
    }
  }
  if (missing(ylabel)) ylabel <- "log Hazard Ratio"

  if(length(gene)>1){
    if(average=="mean"){
      coxph2 <- survival::coxph(survival::Surv(time, event) ~ Mean, data = z)
      data_smooth <- data.frame(expression = z$Mean[1:length(pred$fit)],
                                hazard_ratio = y[, 1],
                                lower95 = y[, 2],
                                upper95 = y[, 3])
    } else {
      coxph2 <- survival::coxph(survival::Surv(time, event) ~ Median, data = z)
      data_smooth <- data.frame(expression = z$Median[1:length(pred$fit)],
                                hazard_ratio = y[, 1],
                                lower95 = y[, 2],
                                upper95 = y[, 3])
    }

  } else {
    coxph2 <- survival::coxph(survival::Surv(time, event) ~ q, data = z)
    data_smooth <- data.frame(expression = z$q[1:length(pred$fit)],
                              hazard_ratio = y[, 1],
                              lower95 = y[, 2],
                              upper95 = y[, 3])
  }

  coeffs <- stats::coef(summary(coxph2))
  if(length(gene)>1){
    label_mean <- ifelse(average=="mean","Mean","Median")
    if(length(gene)>5){
      length_gene_rounded <- floor(length(gene)/5)
      gene_names <- c()
      for(i in 1:length_gene_rounded){
        start <- 1+((i-1)*5)
        end <- i*5
        gene_names <- paste(gene_names, paste(gene[start:end],collapse=" & "), "\n")
      }
      if(length(gene)%%5!=0){
        end_length <- length_gene_rounded*5
        gene_names <- paste(gene_names, paste(gene[end_length:length(gene)],collapse=" & "), "\n")
      }
    } else {
      gene_names <- paste(gene,collapse=" & ")
    }
    gene_name <- paste(label_mean, "of", gene_names)
  } else {
    gene_name <- gene[1]
  }

  Plot_Title <- paste("Correlation of ", gene_name,
                      if(x!="Age") {
                        " expression levels\n"
                      }," to log hazard ratio",sep="")

  ggplot2::ggplot(data_smooth, ggplot2::aes(x = expression)) +
    ggplot2::ggtitle(Plot_Title) +
    ggplot2::theme(plot.title = ggplot2::element_text(size=11,
                                    face="bold",
                                    margin = ggplot2::margin(10, 0, 20, 0))) +
    ggplot2::theme_bw() +
    ggplot2::geom_hline(yintercept=0, size=1, color = "grey") +
    ggplot2::geom_line(ggplot2::aes(y = hazard_ratio), color = "dark blue", lwd = 1) +
    ggplot2::geom_line(ggplot2::aes(y = upper95), color = "dark grey", linetype = "dashed") +
    ggplot2::geom_line(ggplot2::aes(y = lower95), color = "dark grey", linetype = "dashed") +
    ggplot2::geom_ribbon(ggplot2::aes(ymin = lower95, ymax = upper95), fill = "steelblue2", color=NA, alpha = 0.1) +
    ggplot2::ylim(ylimit) +
    ggplot2::xlim(xlimit) +
    ggplot2::ylab(label=ylabel) +
    ggplot2::xlab(label=xlabel) +
    ggplot2::annotate("text", x=xlimit[2], y=(ylimit[2]-(ylimit[2]-ylimit[1])*0.95), hjust = 1, label=paste("HR =", round(coeffs[,2], digits=4), "+/-", round(coeffs[,3], digits=4))) +
    ggplot2::annotate("text", x=xlimit[2], y=(ylimit[2]-(ylimit[2]-ylimit[1])*0.99), hjust = 1, label=paste("p <", ifelse(coeffs[,5]>0.05, "NS", ifelse(coeffs[,5]>0.01, "0.05", ifelse(coeffs[,5]>0.005, "0.01", "0.005")))))
}
w2felix/TCGADataFelix documentation built on May 19, 2019, 8:24 a.m.