R/models.R

Defines functions generateRFM generateCPHM generateGLM

Documented in generateCPHM generateGLM generateRFM

#' @title Generation a generalized linear models of cluster abundances
#' 
#' @description
#' This function generates a generalized linear model between a provided vector of values associated to samples and the abundance of each cluster.
#'
#' @details 
#' The 'clusters' parameter provides the name of the clusters to include in the model. 
#' 
#' The number of clusters allowed in the model can vary depending the number of values to infer. Please refer to the documentation of R coxph and coxph.predict functions for more details.
#' 
#' Firstly, the function computed the glm model using all clusters as terms of the model. Then, the model is iteratively regenerated by discarding at each step the terms with the highest p-value higher than the 'th.pvalue' thershold. In this way, the model can correctly fit the data while being parsimonious. By default, the 'p-value' thershold parameter is set to 1 in order to include all terms in the model. If no terms having a p-value below the threshold, both the returned model and the prediction are set to NULL.
#' 
#' @param Results a 'Results' object
#' @param variable a numerical named vector providing the correspondence between sample names and specific phenotypes (or NA values to infer the phenotypes)
#' @param use.percentages a logical specifying if the computations should be performed on percentage
#' @param clusters a character vector specifying the names of the clusters used to compute the linear model (all clusters are selected by default)
#' @param th.pvalue a numeric between 0 and 1 specifying the maximal p-value of each term in the returned model
#' @param show.error a logical indicating if error bars should be used to display the coefficients standard deviations
#' @param verbose a logical indicating if debug messages must be displayed
#' @param ... further parameters passed to the R glm method
#' 
#' @return a list of 5 elements corresponding to: a generalized linear model object as provided by the R glm function ('model' element), a named vector of predicted values ('variable.predictions' element), a named vector of predicted cluster abundance coefficiants ('cluster.coefficients' element), the representation of clusters coeficients ('plot.cluster' element), and the representation of samples predictions values ('plot.samples' element). 
#'
#' @export
generateGLM <- function(Results,
                        variable,
						use.percentages = FALSE,
                        clusters        = NULL,
                        th.pvalue       = 1,
						show.error      = FALSE,
                        verbose         = FALSE,
						...){

	y = variable
	
    if (is.null(Results)) {
        stop("Error in generateGLM: 'Results' parameter can not be NULL")
    } else if (class(Results)[1] != "Results") {
        stop("Error in generateGLM: 'Results' parameter must be a 'Results' object")
    }
    
    if (is.null(y)) {
        stop("Error in generateGLM: 'variable' parameter can not be NULL")
    }
    
    data <- Results@cluster.abundances
	
	if (use.percentages) {
        data <- prop.table(as.matrix(data), 2)
        data <- data * 100
    } else {
        data <- data
    }
	
    if (is.null(names(y))) {
        data <- data
    } else if (!all(names(y) %in% Results@sample.names)) {
        stop("Error in generateGLM: names of the y parameter must contain only samples names\n Unknown sample names: ",
             paste(setdiff(unique(names(y)), Results@sample.names), collapse = " "))
    } else {
        data <- data[, names(y), drop = FALSE]
    }
    
    if (is.null(clusters)) {
        clusters <- Results@cluster.names
    } else if (all(clusters %in% Results@cluster.names)) {
        if (typeof(clusters) != "character") {
            stop("Error in generateGLM: 'clusters' parameter must be a character vector")
        }
        clusters <- unique(clusters)
        data <- data[clusters, , drop = FALSE]
    } else {
        stop("Error in generateGLM:\nUnknown clusters : ", paste(setdiff(unique(clusters), Results@cluster.names), collapse = " "))
    }
    
    if (length(clusters) >= (length(y)-1)){
        stop("Error in generateGLM: the numer of included terms (clusters) must be inferior to the number of observation (samples) minus 1")
    }
    
    data   <- as.data.frame(t(data))
	
    data$y <- y

    res.glm     <- NULL
    res.predict <- NULL
    count <- 1
	repeat{
        res.glm      <- stats::glm(y~.,data = data, ...)
        if(verbose){
				cat(paste("********************\n"))
				cat(paste("step ",count,"\n"))
				cat(paste("model summary: \n"))
				print(summary(res.glm))
			}
		res.predict  <- stats::predict.glm(res.glm, newdata=data)
        pvalues      <- summary(res.glm)$coefficients[-1,4]
        to.remove    <- which.max(pvalues)

        if (pvalues[to.remove] > th.pvalue) {
			if(verbose){
				cat(paste0("removing clusters: ",colnames(data)[to.remove]))
			}
            data <- data[,-to.remove]
            if (!is.data.frame(data)) {
                res.glm     <- NULL
                res.predict <- NULL
                break
            }

        } else {
            break
        }
		count <- count+1
		if(verbose){
			cat(paste("********************\n"))
			cat("\n")
		}
    }
    
    if (!is.null(res.glm)) {
        data           <- as.data.frame(summary(res.glm)$coefficients)
        data           <- data [-1,]
        colnames(data) <- c("estimate", "sd", "tvalue", "pvalue")
        data$clusters  <- gsub("`", "", rownames(data))
        data$ymin      <- data$estimate - data$sd/2
        data$ymax      <- data$estimate + data$sd/2
        order          <- order(data$estimate, decreasing = TRUE)
        data$clusters  <- factor(data$clusters,levels = data$clusters[order])
        
		clusters = data$clusters
		estimate = data$estimate
		data.old     = data
		
        plot.clusters <- ggplot2::ggplot(data = data) +
					     ggplot2::ggtitle("Coefficiants of the Generalized Linear Model") +
                         ggplot2::geom_bar(ggplot2::aes_string(x = "clusters", y = "estimate", fill = "pvalue"),stat = "identity", position = "identity") +   ggplot2::scale_fill_gradient(low = "red", high = "white", limit = c(0,th.pvalue), name = "p-value") +
						 ggplot2::ylab("coefficient") +
                         ggplot2::theme(legend.text = ggplot2::element_text(size = 6),
                                        axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust = 0.5),plot.title=ggplot2::element_text(hjust=0.5))
										
		if(show.error)
			plot.clusters <- plot.clusters + 
						ggplot2::geom_errorbar(ggplot2::aes_string(x = "clusters", ymin = "ymin", ymax = "ymax"), width = 0.25)
                         
						
    }
    
    if (!is.null(res.predict)) {

        data   <- data.frame(samples = names(y), y = y, variable = "provided")
        data   <- data[!is.na(data$y),]

        data.predict <- data.frame(samples = names(res.predict), y = res.predict, variable = "predicted")
        data         <- rbind(data,data.predict)
        data$y       <- round(data$y, 2)
        
        data$predict <- !(duplicated(data$samples) | duplicated(data$samples, fromLast = TRUE))

        data$samples <- factor(data$samples,levels = names(y))
        
        residues <- plyr::ddply(data, "samples", function(df){return(c(min(df$y), max(df$y)))})

        plot.samples <- ggplot2::ggplot() +
						ggplot2::ggtitle("Predictions of the Generalized Linear Model") +
                        ggplot2::geom_errorbar(data = residues, ggplot2::aes_string(x = "samples", ymin = "V1", ymax = "V2"), width = 0, color = "blue") +
                        ggplot2::geom_point(data = data,  ggplot2::aes_string(x = "samples", y = "y", shape = "variable", color= "predict"), size = 3) +
                        ggplot2::scale_color_manual(values = c("TRUE" = "orange", "FALSE" = "#33CC33"), guide=FALSE) +
                        ggplot2::theme(legend.text = ggplot2::element_text(size = 6),
                                       axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust = 0.5),plot.title=ggplot2::element_text(hjust=0.5))
        
		
    }

	names(estimate) <- clusters
	
    return(list(model = res.glm, variable.predictions = res.predict, cluster.coefficients=estimate, plot.samples = plot.samples, plot.clusters = plot.clusters))

}


#' @title Generation of a Cox proportional hazards regression model of cluster abundances
#' 
#' @description
#' This function generates a Cox proportional hazards regression model between a provided vector of values associated to samples and the abundance of each clusters. This generated model can then be used to predict biological outcomes in the context of survival/progression studies.
#'
#' @details 
#' The 'clusters' parameter provides the name of the clusters to include in the model. 
#'
#' The number of clusters allowed in the model can vary depending the number of values to infer. Please refer to the documentation of R coxph and coxph.predict functions for more details.
#'
#' Firstly, the function computed the glm model using all clusters as terms of the model. Then, the model is iteratively regenerated by discarding at each step the terms with the highest p-value higher than the 'th.pvalue' thershold. In this way, the model can correctly fit the data while being parsimonious. By default, the 'p-value' thershold parameter is set to 1 in order to include all terms in the model. If no terms having a p-value below the threshold, both the returned model and the prediction are set to NULL.
#'  
#' @param Results a 'Results' object
#' @param variable a numerical named vector providing the correspondence between sample names and specific phenotypes (or NA values to infer the phenotypes)
#' @param status a numeric vector providing the correspondence between sample names and specific status (or NA values to infer the status)
#' @param use.percentages a logical specifying if the computations should be performed on percentage
#' @param clusters a character vector specifying the names of the clusters used to compute the Cox model (all clusters are selected by default)
#' @param th.pvalue a numeric between 0 and 1 specifying the maximal p-value of each term in the returned model
#' @param show.error a logical indicating if error bars should be used to display the coefficients standard deviations
#' @param ... further parameters passed to the R coxph method
#' 
#' @return a list of 5 elements corresponding to: a Cox proportional hazards regression model object as provided by the R coxph function ('model' element), a named vector of predicted values for each sample ('variable.predictions' element), the representation of clusters coeficients ('plot.clusters' element), and the representation of samples predictions values ('plot.samples' element), and a representation of the provided survival curve ('plot.provided_survival_curve' element). 
#'
#' @import survival ggfortify
#'
#' @export
generateCPHM <- function(Results,
                        variable,
						status,
						use.percentages = FALSE,
                        clusters        = NULL,
						th.pvalue       = 1,
						show.error      = FALSE,
                        ...){
	
	if (is.null(Results)) {
        stop("Error in generateCPHM: 'Results' parameter can not be NULL")
    } else if (class(Results)[1] != "Results") {
        stop("Error in generateCPHM: 'Results' parameter must be a 'Results' object")
    }
    
    if (is.null(variable)) {
        stop("Error in generateCPHM: 'variable' parameter can not be NULL")
    }

    data <- Results@cluster.abundances
	
	if (use.percentages) {
        data <- prop.table(as.matrix(data), 2)
        data <- data * 100
    } else {
        data <- data
    }
	
	if (is.null(names(variable))) {
        data <- data
    } else if (!all(names(variable) %in% Results@sample.names)) {
        stop("Error in generateCPHM: names of the variable parameter must contain only samples names\n Unknown sample names: ",
             paste(setdiff(unique(names(variable)), Results@sample.names), collapse = " "))
    } else {
        data <- data[, names(variable), drop = FALSE]
    }
    
    if (is.null(names(status))) {
        stop("Error in generateCPHM: 'status' parameter can not be NULL")
    } else if (!all(names(status) %in% Results@sample.names)) {
        stop("Error in generateCPHM: names of the 'status' parameter must contain only samples names\n Unknown sample names: ",
             paste(setdiff(unique(names(status)), Results@sample.names), collapse = " "))
    } else if (!identical(names(variable),names(status))){
        stop("Error in generateCPHM: names of the 'status' and names of the 'variable' parameter must be identical and in the same order")
    } else if (!identical(is.na(variable),is.na(status))){
        stop("Error in generateCPHM: NA of the 'status' and NA of the 'variable' parameter must be synchronised")
    }
    
    if (is.null(clusters)) {
        clusters <- Results@cluster.names
    } else if (all(clusters %in% Results@cluster.names)) {
        if (typeof(clusters) != "character") {
            stop("Error in generateCPHM: 'clusters' parameter must be a character vector")
        }
        clusters <- unique(clusters)
        data <- data[clusters, , drop = FALSE]
    } else {
        stop("Error in generateCPHM:\nUnknown clusters : ", paste(setdiff(unique(clusters), Results@cluster.names), collapse = " "))
    }
	
	data <- t(data)
	data <- data.frame(data,check.names=FALSE)
	data <- cbind(data,variable,status)

	data.training <- data[!is.na(data$variable),]

	repeat{
	    tryCatch(coxph <- survival::coxph(survival::Surv(variable, status) ~ ., data = data.training, ...),
	             error = function(x){
	                 stop("Error in generateCPHM: Cox model cannot provide a model. Please reduce the number of provided clusters.")
	             })
	    coef              <- coxph$coefficients
	    se                <- sqrt(diag(coxph$var))
	    pvalues           <- as.data.frame(cbind(coef, se, coef/se, 1 - stats::pchisq((coef/se)^2, 1)))
	    dimnames(pvalues) <- list(names(coef), c("coef", "se", "z", "p"))
	    to.remove         <- which.max(pvalues$p)
	    
	    if (any(is.na(pvalues$p))) {
	        stop("Error in generateCPHM: Cox model ran out of iterations and did not converge. Please reduce the number of provided clusters.")
	    }
	    
	    if (pvalues[to.remove,"p"] > th.pvalue) {
	        data.training <- data.training[,-to.remove]
	        if (ncol(data.training) == 2) {
	            coxph     <- NULL
	            break
	        }
	    } else {
	        break
	    }
	}

	if (!is.null(coxph)) {
		suppressWarnings(predict <- stats::predict(coxph, newdata = data, type = "lp"))
		names(predict)                <- names(variable)
		data$type                     <- ifelse(is.na(data$variable),"predicted","real")
		
		plot.data <- data
		plot.data <- stats::na.omit(plot.data)
		fit                           <- survival::survfit(survival::Surv(variable, status) ~ 1, data = plot.data)
		plot.provided_survival_curve  <- ggfortify:::autoplot.survfit(fit,surv.colour="green",censor.size=5,censor.colour="darkgreen",xlab="variable")
		### sould be replaced by: plot.provided_survival_curve  <- ggplot2::autoplot(fit,surv.colour="green",censor.size=5,censor.colour="darkgreen",xlab="variable")
		
		coef <- coxph$coefficients
	    se   <- sqrt(diag(coxph$var))
	    
		if (is.null(coxph$naive.var)) {
	        data           <- as.data.frame(cbind(coef, exp(coef), se, coef/se, 1 - stats::pchisq((coef/se)^2, 1)))
	        dimnames(data) <- list(names(coef), c("coef", "exp.coef", "se", "z", "p"))
	    } else {
	        nse            <- sqrt(diag(coxph$naive.var))
	        data           <- as.data.frame(cbind(coef, exp(coef), nse, se, coef/se, 1 - stats::pchisq((coef/se)^2, 1)))
	        dimnames(data) <- list(names(coef), c("coef", "exp.coef", "se", "robust se", "z", "p"))
	    }
	    
	    data$clusters  <- gsub("`", "", rownames(data))

	    order          <- order(data$coef, decreasing = TRUE)
	    data$clusters  <- factor(data$clusters,levels = data$clusters[order])
	    
	    data$ymin      <- data$coef - data$se/2
	    data$ymax      <- data$coef + data$se/2

	    plot.clusters <- ggplot2::ggplot(data = data) +
						 ggplot2::ggtitle("Coefficiants of the Cox proportional hazards regression model") +
              	         ggplot2::geom_bar(ggplot2::aes_string(x = "clusters", y = "coef", fill = "p"), stat = "identity", position = "identity") +   
             	         ggplot2::scale_fill_gradient(low = "red", high = "white", limit = c(0, max(data$p)), name = "p-value") +
            	         ggplot2::theme(legend.text = ggplot2::element_text(size = 6),
            	                        axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust = 0.5),
										plot.title=ggplot2::element_text(hjust=0.5))
		
		if(show.error)
			plot.clusters <- plot.clusters + 
						ggplot2::geom_errorbar(ggplot2::aes_string(x = "clusters", ymin = "ymin", ymax = "ymax"), width = 0.25)
	
	    data        <- data.frame(samples = names(variable), variable = variable)
	    data        <- cbind(data, predict = predict)
		data$color  <- "green" 
		
		data[is.na(data$variable), "color"]    <- "orange" 
		data[is.na(data$variable), "variable"] <- 0
		
		data$predict <- -(data$predict-max(abs(data$predict)))
		
		plot.samples <- ggplot2::ggplot() +
						ggplot2::ggtitle("Predictions of the Cox proportional hazards regression model") +
            	        ggplot2::geom_point(data = data,  ggplot2::aes_string(x = "variable", y = "predict", color = "color"), size = 3) +
            		    ggplot2::geom_smooth(data = data[data$color != "orange",],  ggplot2::aes_string(x = "variable", y = "predict"), method = "lm", se = FALSE, color = "grey40") +
            		    ggplot2::scale_colour_identity() +
            			ggrepel::geom_text_repel(data = data,ggplot2::aes_string(label = "samples",x = "variable", y = "predict"), size = 3, color = "blue") +
						ggplot2::ylab("-predicted") +
            			ggplot2::theme(legend.text = ggplot2::element_text(size = 6),
            	                       axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust = 0.5),
            						   aspect.ratio = 1,
            						   legend.position = "none",
									   plot.title=ggplot2::element_text(hjust=0.5))
	    
	} else{
	    predict                       <- NULL
	    plot.samples                  <- NULL
	    plot.clusters                 <- NULL
	    plot.provided_survival_curve  <- NULL
	    plot.predicted_survival_curve <- NULL
	}
	
	return(list(model = coxph, variable.predictions = predict, plot.samples = plot.samples, plot.clusters = plot.clusters, plot.provided_survival_curve = plot.provided_survival_curve))
}


#' @title Generation of a random forest of cluster abundances
#' 
#' @description
#' This function generates a random forest model between the values associated to samples and the abundance of each clusters. This generated model can then be used to predict biological outcomes in the context of survival/progression studies.
#'
#' @details 
#' The 'clusters' parameter provide the name of the clusters to include in the model. 
#' 
#' The named vector containg the value associated to each sample are provided to the 'variable' parameter. In order to infer unknown values associated to samples, it is possible to set to NA for these samples. In this way, the function will infer, based on the computed model, all values associated to these samples.
#'   
#' The function involve the randomForestSRC to compute a random forest model using all clusters as variable of the model. 
#' Several visualisation based on the "ggfortify" package are returned allowing to identify the most important variables, their minimum depth, the OOB Error Rate, as well as the variables predictions.
#'
#' @param Results a 'Results' object
#' @param variable a numerical named vector providing the correspondence between sample names and specific phenotypes (or NA values to infer the phenotypes)
#' @param status a numerical named vector providing the correspondence between sample names and specific status (or NA values to infer the status)
#' @param use.percentages a logical specifying if the computations should be performed on percentage
#' @param clusters a character vector specifying the names of the clusters used to compute the model (all clusters are selected by default)
#' @param ntree a numerical value specifying the number of tree to generate
#' @param ... further parameters passed to the R randomForest method
#' 
#' @return a list of 4 elements corresponding to: random forest model object as provided by the R randomForest function ('model' element), and a named vector of predicted values ('variable.predictions' element), the representation of clusters coeficients ('plot.vimp' element), and the representation of samples predictions values ('plot.samples' element)
#'
#' @import randomForestSRC survival
#'
#' @export
generateRFM <- function(Results,
                        variable,
						status,
						use.percentages = FALSE,
                        clusters        = NULL,
						ntree           = 1000,
                        ...){
	
	set.seed(4242)
	
	if (is.null(Results)) {
        stop("Error in generateRFM: 'Results' parameter can not be NULL")
    } else if (class(Results)[1] != "Results") {
        stop("Error in generateRFM: 'Results' parameter must be a 'Results' object")
    }
    
    if (is.null(variable)) {
        stop("Error in generateRFM: 'variable' parameter can not be NULL")
    }
	
	if (is.null(status)) {
        stop("Error in generateRFM: 'status' parameter can not be NULL")
    }
    
    data <- Results@cluster.abundances
	
	if (use.percentages) {
        data <- prop.table(as.matrix(data), 2)
        data <- data * 100
    } else {
        data <- data
    }
	
	if (is.null(names(variable))) {
        data <- data
    } else if (!all(names(variable) %in% Results@sample.names)) {
        stop("Error in generateRFM: names of the variable parameter must contain only samples names\n Unknown sample names: ",
             paste(setdiff(unique(names(variable)), Results@sample.names), collapse = " "))
    } else {
        data <- data[, names(variable), drop = FALSE]
    }
    
    if (is.null(names(status))) {
        stop("Error in generateRFM: 'status' parameter can not be NULL")
    } else if (!all(names(status) %in% Results@sample.names)) {
        stop("Error in generateRFM: names of the 'status' parameter must contain only samples names\n Unknown sample names: ",
             paste(setdiff(unique(names(status)), Results@sample.names), collapse = " "))
    } else if (!identical(names(variable),names(status))){
        stop("Error in generateRFM: names of the 'status' and names of the 'variable' parameter must be identical and in the same order")
    } else if (!identical(is.na(variable),is.na(status))){
        stop("Error in generateRFM: NA of the 'status' and NA of the 'variable' parameter must be synchronised")
    }
    
    if (is.null(clusters)) {
        clusters <- Results@cluster.names
    } else if (all(clusters %in% Results@cluster.names)) {
        if (typeof(clusters) != "character") {
            stop("Error in generateRFM: 'clusters' parameter must be a character vector")
        }
        clusters <- unique(clusters)
        data <- data[clusters, , drop = FALSE]
    } else if (!identical(is.na(variable),is.na(status))){
        stop("Error in generateRFM: NA of the 'status' and NA of the 'variable' parameter must be synchronised")
    }else {
        stop("Error in generateRFM:\nUnknown clusters : ", paste(setdiff(unique(clusters), Results@cluster.names), collapse = " "))
    }

	data <- t(data)
	data <- data.frame(data,check.names=FALSE)
	data <- cbind(data,variable,status)
	
	data.training <- data[!is.na(data$variable),]

	forest  <- randomForestSRC::rfsrc(Surv(variable, status) ~ ., data = data.training, ntree = ntree, forest = TRUE, na.action = "na.impute", tree.err=TRUE, ...)
	predict <- randomForestSRC::predict.rfsrc(forest, data, na.action = "na.impute")
	
	predict        <- predict$predicted
	names(predict) <- names(variable)
	
	if(!is.null(forest)){
	    
		data         <- data.frame(samples = names(variable), variable = variable)
	    data         <- cbind(data, predict = predict)
		data$predict <- -(data$predict-max(abs(data$predict)))
		data$color   <- "green" 
		data[is.na(data$variable), "color"]    <- "orange" 
		data[is.na(data$variable), "variable"] <- 0
		
		plot.samples <- ggplot2::ggplot() +
						ggplot2::ggtitle("Predictions of the Random Forest Model") +
            	        ggplot2::geom_point(data = data,  ggplot2::aes_string(x = "variable", y = "predict", color = "color"), size = 3) +
            		    ggplot2::geom_smooth(data = data[data$color != "orange",],  ggplot2::aes_string(x = "variable", y = "predict"), method = "lm", se = FALSE, color = "grey40") +
            		    ggplot2::scale_colour_identity() +
            			ggrepel::geom_text_repel(data = data,ggplot2::aes_string(label = "samples",x = "variable", y = "predict"), size = 3, color = "blue") +
						ggplot2::ylab("-predicted") +
            			ggplot2::theme(legend.text = ggplot2::element_text(size = 6),
            	                       axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust = 0.5),
            						   aspect.ratio = 1,
            						   legend.position = "none",
									   plot.title=ggplot2::element_text(hjust=0.5))
	
	
		#gg_error <- plot(ggRandomForests::gg_error(forest))
		suppressWarnings(gg_vimp <- plot(ggRandomForests::gg_vimp(forest)))
		utils::capture.output(gg_minimal_depth <- plot(ggRandomForests::gg_minimal_depth(forest)))
		
	}
	
	return(list(model = forest, variable.predictions = predict, plot.samples = plot.samples, plot.vimp = gg_vimp, plot.minimal_depth = gg_minimal_depth))
	
}
tchitchek-lab/SPADEVizR documentation built on Jan. 27, 2024, 8:58 p.m.