R/corrigraph.R

Defines functions corrigraph

Documented in corrigraph

#' igraph of correlated variables global or in relation to y
#'
#' @param data a data.frame
#' @param colX a vector of indices or variables to follow. We will only keep the variables that are connected to them on 1 or more levels (level parameter).
#' @param colY a vector of indices or variables to predict. To force the correlogram to display only the variables correlated to a selection of Y.
#' @param type "x" or "y". To force the display in correlogram mode (colX, type = "x") or in prediction mode (colY, type = "y").
#' @param alpha the maximum permissible p-value for the display
#' @param exclude the minimum threshold of displayed correlations - or a vector of threshold in this order : c(cor,mu,prop)
#' @param ampli coefficient of amplification of vertices
#' @param return if return=TRUE, returns the correlation matrix of significant correlation.
#' @param wash automatically eliminates variables using differents methods when there are too many variables (method = NA, stn (signal-to-noise ratio), sum, length).
#' @param multi to ignore multiple regressions and control only single regressions.
#' @param mu to display the effect on median/mean identified by m.test().
#' @param prop to display the dependencies between categorical variables identified by GTest().
#' @param layout to choose the network organization method - choose "fr", "circle", "kk" or "3d".
#' @param cluster to make automatic clustering of variables or not.
#' @param verbose to see the comments.
#' @param NAfreq from 0 to 1. NA part allowed in the variables. 1 by default (100% of NA tolerate).
#' @param NAcat TRUE or FALSE. Requires recognition of missing data as categories.
#' @param level to be used with colY. Number of variable layers allowed (minimum 2, default 5).
#' @param evolreg TRUE or FALSE. Not yet available. Allows you to use the evolreg function to improve the predictive ability (R squared) for the variables specified in colY.
#'
#' @return Depending on the parameters:
#' \describe{
#'   \item{igraph}{A correlation graph network (igraph) of the variables of a data.frame. Non-numeric variables or missing data may be present. Vertices (circles) represent variables, with size indicating connectivity. The color of the edges reflects the nature of the correlation (positive in blue, negative in red). The width of the edges represents the strength of the correlation.}
#'   \item{mu/prop}{If \code{mu} is TRUE or \code{prop} is specified: Connections display mean effects (orange) and dependencies between categorical variables (pink). The edge sizes depend on p-values from kruskal.test() and GTest().}
#'   \item{Y specification}{When \code{colY} is specified: The correlogram identifies X variables correlated to Y, iterating through layers specified by \code{level}. X variables not related to Y are excluded.}
#'   \item{vertex colors}{The color of vertices indicates significant correlations (blue for positive, red for negative, purple for both).}
#'   \item{max predictive capacity}{Values displayed next to Y variables (\code{colY}) indicate the maximum predictive capacity by one or two variables.}
#'   \item{correlation matrix}{If \code{return} is TRUE, the function returns the correlation matrix of significant correlations.}
#' }
#'
#' @import utils
#' @rawNamespace import(igraph, except = c(decompose,spectrum))
#' @importFrom stats na.omit
#' @importFrom stats cor
#' @importFrom DescTools GTest
#'
#' @export
#'
#' @examples
#' # Example 1
#' data(swiss)
#' corrigraph(swiss)
#' # Example 2
#' data(airquality)
#' corrigraph(airquality,layout="3d")
#' # Example 3
#' data(airquality)
#' corrigraph(airquality,c("Ozone","Wind"),type="y")
#' # Example 4
#' data(iris)
#' corrigraph(iris,mu=TRUE)
#' # Example 5
#' require(MASS) ; data(Aids2)
#' corrigraph(Aids2 ,prop=TRUE,mu=TRUE,exclude=c(0.3,0.3,0))
#' # Example 6
#' data(airquality)
#' corrigraph(airquality,c("Ozone","Wind"),type="x")
corrigraph <- function(data,colY=c(),colX=c(),type="x",alpha=0.05,exclude=c(0,0,0), ampli=4,return=FALSE,wash="stn",multi=TRUE,
					   mu=FALSE,prop=FALSE,layout="fr",cluster=TRUE,verbose=FALSE,NAfreq=1,NAcat=FALSE,level=2,evolreg=FALSE) {
  # Fonction réalisée par Antoine Massé
  # Ctrl Alt Shift R
  # Version 06
  # May 2023
  #igraph::graph_from_adjacency_matrix
  databrut<-data # saving
  # Eviter les tableaux au format matrice
  if (class(data)[1]=="matrix"){data<-data.frame(data)}
  # Control 1 - is.numeric ?
  which(sapply(data, is.numeric)) -> temp_id_num ; temp_id_factor <- c()
  if ((mu==FALSE)&(prop==FALSE)){
	if (length(temp_id_num)<2) {stop("Error! Not enough numerical variable. Try mu=TRUE or prop=TRUE ?\n")}
	#print("temp_id_num")
	#print(temp_id_num)
	data <- data[,temp_id_num]
	which(sapply(data, is.numeric)) -> temp_id_num
	if (length(temp_id_num)!=length(1:ncol(data))) {warning("Warning! Presence of non-numeric variables that cannot be taken into account.\n")}
  } else {
	temp_id_factor <- setdiff(1:ncol(data),temp_id_num)
  }
  # temp_id_factor = categorical values identified
  # Control 2 - var is NULL ?
  which(sapply(data.frame(data[,temp_id_num]), var, na.rm=T)!=0)-> temp_var ; temp_var	<- temp_id_num[temp_var]
  if (length(temp_var) != length(temp_id_num)) {warning("Warning! Some variables have a null variance and cannot be taken into account.\n")}
  data <- data[,union(temp_var,temp_id_factor)] ; temp_id_num <- which(sapply(data, is.numeric)) ; temp_id_factor  <- setdiff(1:ncol(data),temp_id_num)
  # Control 3 - is.na ?
  if (any(is.na(data))==TRUE) {if (NAfreq == 1) {warning("Warning ! Presence of missing values. Please check NAfreq argument.\n")}}
  if (ncol(data)>50) {warning("Warning! The calculation time increases exponentially with the number of variables (do not exceed 50).\n")}
  ##################################################
  # Prediction mode (colY)
  ##################################################
  if (length(colY)>0) {
	#print("mode 2")
	# If modelization of Y
	if (is.numeric(colY)) {
	  colnames(databrut)[colY] -> colY
	}
	which(colnames(data)%in%colY) -> indY
	if (length(colY)!=length(indY)){stop("ERROR! some Y's provided cannot be analyzed.\n")}
	if (length(indY)==0) {stop()}
	if (type=="x"){
		indX <- indY
		colX <- colY
	} else {
		type<-"y"
		if((mu==TRUE)|(prop==TRUE)){stop("Error! mu and prop can't be used width colY.\n")}
	}
}
if ((type=="y")&(length(colY)>0)){
	indices_all <- 1:ncol(data)
	# Matrice de correlation
	matrix(rep(0,ncol(data)^2),ncol(data),ncol(data))->mymat
	rownames(mymat) <- colnames(data)
	colnames(mymat) <- colnames(data)
	#mymat2 <- mymat
	sum_mymat2 <- ncol(data)^2
	levels <- rep(0,ncol(mymat))
	borders <- rep("black",ncol(mymat))
	vertices <- rep("white",ncol(mymat))
	level_count <- 1
	levels[indY] <- level_count
	warning<-0
	##############################
	# Function for calculation of different BUC for different interaction X1 & X2 for Y
	##############################
	check_mdl <- function(x) {
	  data_temp <- data.frame("data[,i]"=data[,i],"data[,j]"=data[,j],"x"=x)
	  total <- nrow(data_temp)
	  data_temp <- na.omit(data_temp)
	  total_NA <- total - nrow(data_temp)
	  seuil_NA <- total_NA/total
	  nb_subset <- nrow(data_temp)
	  #cat("seuil_NA",NAfreq,"\n")
	  #print(seuil_NA)
	  if ((min(apply(data_temp,2,var))!=0)&(nb_subset>3)&(seuil_NA<=NAfreq)) {
		#cat("seuil_NA",seuil_NA,"\n")
		regA <- lm(data_temp[,1]~data_temp[,2]+data_temp[,3])
		regB <- lm(data_temp[,1]~data_temp[,2]+data_temp[,2]:data_temp[,3])
		regC <- lm(data_temp[,1]~data_temp[,2]*data_temp[,3])
		synthese <- data.frame(R2 = NA,BIC = NA, R2_2 = NA, BIC_2 = NA)
		oldw <- getOption("warn")
		options(warn = -1)
		pvalsA <- c(
		  pf(summary(regA)$fstatistic[1],summary(regA)$fstatistic[2],summary(regA)$fstatistic[3],lower.tail=FALSE),
		  summary(regA)[[4]][,4]	)
		pvalsB <- c(
		  pf(summary(regB)$fstatistic[1],summary(regB)$fstatistic[2],summary(regB)$fstatistic[3],lower.tail=FALSE),
		  summary(regB)[[4]][,4]	)
		pvalsC <- c(
		  pf(summary(regC)$fstatistic[1],summary(regC)$fstatistic[2],summary(regC)$fstatistic[3],lower.tail=FALSE),
		  summary(regC)[[4]][,4]	)
		options(warn = oldw)
		# Comparison with k alone
		data_temp2 <- data.frame(data[,i],x)
		data_temp2 <- na.omit(data_temp2)
		nb_subset2 <- nrow(data_temp2)
		if ((min(apply(data_temp2,2,var))!=0)&(nb_subset2>3)) {
		  temp2 <- cor.test(temp_tab[,1],temp_tab[,2],na.rm=T)
		  temp2 <- ifelse(temp2$p.value<= alpha,abs(temp2$estimate),0)
		  options(warn=-1)
		  if (temp2 != 0 ) {
			temp2 <- mean(c(summary(lm(temp_tab[,1]~temp_tab[,2]))$adj.r.squared,summary(lm(temp_tab[,2]~temp_tab[,1]))$adj.r.squared))
		  }
		  if (nrow(na.omit(data.frame(data[,i],data[,j]))) < length(data[,i])&temp>0) {
				temp <- mean(c(summary(lm(data[,i]~data[,j]))$adj.r.squared,summary(lm(data[,j]~data[,i]))$adj.r.squared))
			}
		  if (temp2>0) {
			reg2 <- lm(data_temp2[,1]~data_temp2[,2],data_temp2)
			oldw <- getOption("warn")
			options(warn = -1)
			pvals2 <- try(c(
			  pf(summary(reg2)$fstatistic[1],summary(reg2)$fstatistic[2],summary(reg2)$fstatistic[3],lower.tail=FALSE),
			  summary(reg2)[[4]][,4]	))
			options(warn = oldw)
			if ((any(is.na(pvals2)))|(length(pvals2)<=1)){synthese2 <- data.frame(R2_2 = 0,BIC_2 = myBIC)
			} else {synthese2 <- data.frame(R2_2 = abs(summary(reg2)$adj.r.squared),BIC_2 = BIC(reg2))}
		  } else {synthese2 <- data.frame(R2_2 = 0,BIC_2 = myBIC)}
		}else {synthese2 <- data.frame(R2 = NA,BIC = NA, R2_2 = NA, BIC_2 = NA)}
		# Agglomeration of datas
		if (any(is.na(pvalsA))){	rbind(synthese,data.frame(R2 = NA,BIC = NA, R2_2 = NA, BIC_2 = NA))
		} else if (max(pvalsA)>alpha){rbind(synthese,data.frame(R2 = NA,BIC = NA, R2_2 = NA, BIC_2 = NA))
		} else {
		  tablo <- data.frame(R2 = abs(summary(regA)$adj.r.squared),BIC = BIC(regA))
		  synthese <- rbind(synthese,cbind(tablo,synthese2))}
		if (any(is.na(pvalsB))){	rbind(synthese,data.frame(R2 = NA,BIC = NA, R2_2 = NA, BIC_2 = NA))
		} else if (max(pvalsB)>alpha){rbind(synthese,data.frame(R2 = NA,BIC = NA, R2_2 = NA, BIC_2 = NA))
		} else {
		  tablo <- data.frame(R2 = abs(summary(regB)$adj.r.squared),BIC = BIC(regB))
		  synthese <- rbind(synthese,cbind(tablo,synthese2))
		}
		if (any(is.na(pvalsC))){	rbind(synthese,data.frame(R2 = NA,BIC = NA, R2_2 = NA, BIC_2 = NA))
		} else if (max(pvalsC)>alpha){rbind(synthese,data.frame(R2 = NA,BIC = NA, R2_2 = NA, BIC_2 = NA))
		} else {
		  tablo <- data.frame(R2 = abs(summary(regC)$adj.r.squared),BIC = BIC(regC))
		  synthese <- rbind(synthese,cbind(tablo,synthese2))}
	  } else {return(data.frame(R2 = NA,BIC = NA, R2_2 = NA, BIC_2 = NA))}
	  return(synthese)
	}
	mymat_interconnect <- mymat
	while ((sum_mymat2  != sum(mymat))|(level_count <= level)) { # Tant que la matrice evolue à chaque cycle...
	  sum_mymat2 <- sum(mymat)
	  indices <- c()
	  for (i in indY) { # Modelization of Y (i = Y)
		if (evolreg==TRUE) {
			level <- 2
			X_retenus <- evolreg(data,colnames(data)[i],NAfreq=NAfreq, interaction=FALSE, multix=FALSE,multidiv=FALSE)
			# ATTENTION DE PROTEGER SI PAS DE R2 EN SORTIE
			if ((length(X_retenus$R2)>0)&(X_retenus$R2!=0)){ # Coloration selon lien neg ou pos
				temp <- X_retenus$R2
				for (j in X_retenus$indices) {
						indices <- c(indices,j) ; mymat[i,j] <- temp ; mymat[j,i] <- temp
						#if (abs(temp) > abs(temp_save)){
						mymat_interconnect[i,j]<-2;mymat_interconnect[j,i]<-2
						#} else if (temp_save<0) {
						#  mymat_interconnect[i,j]<- -1;mymat_interconnect[j,i]<- -1;
						#} else if (temp_save>0){
						#  mymat_interconnect[i,j]<-1;mymat_interconnect[j,i]<- 1;
						#}
				}
			}
		} else {
		indices_X <- setdiff(indices_all,indY)
		if (length(indices_X)==0){break}
		for (j in indices_X) { # En fonction d'un X
			temp_tab <- data.frame(data[,i],data[,j])
		  	total <- nrow(temp_tab)
			na.omit(temp_tab) -> temp_tab
			total_NA <- total - nrow(temp_tab)
			seuil_NA <- total_NA/total
			nb_subset <- nrow(temp_tab)
			if ((min(apply(temp_tab,2,var)) != 0)&(nb_subset>3)&(seuil_NA<=NAfreq)) {
				temp <- cor.test(data[,i],data[,j],na.rm=T)
				temp_save <- ifelse(temp$p.value<= alpha,temp$estimate,0) # saving cor neg or pos
				temp <- ifelse(temp$p.value<= alpha,abs(temp$estimate),0) # saving abs(cor)
				options(warn=-1)
				if (temp != 0) {
					temp <- mean(c(summary(lm(data[,i]~data[,j]))$adj.r.squared,summary(lm(data[,j]~data[,i]))$adj.r.squared))
				}
				options(warn=0)
				if (temp_save > 0) {  # coloring vertices if significant correlation neg or pos // Y
				  if (borders[j]=="purple") {
				  } else if (borders[j]=="black") {
					borders[j]<-"blue" ; vertices[j] <- "cyan"
				  } else if (borders[j]=="red") {
					borders[j]<-"purple"  ;  vertices[j] <- "#FBC5F8"
				  }
				} else if (temp_save < 0) {
				  if (borders[j]=="purple") { # do nothing
				  } else if (borders[j]=="black") {
					borders[j]<-"red" ; vertices[j] <- "#F9A09E"
				  } else if (borders[j]=="blue") {
					borders[j]<-"purple";  vertices[j] <- "#FBC5F8"
				  }
				}
				# Regresiv analysis (only for cor < 0.9) , for acceleration
				k <- setdiff(indices_all,c(indY,j))
				if ((temp<0.90)&(length(k)>0)&(multi==TRUE)) {
				  reg <- lm(data[,i]~data[,j])
				  myBIC <- BIC(reg)
				  temps <- data.frame(R2=temp,BIC=myBIC,R2_2=0,BIC_2=myBIC)
				  dataij <- data.frame(data[,i],data[,j])
				  datak <- data.frame(data[,k])
				  temps <- rbind(temps,do.call("rbind", apply(datak,2,check_mdl)))
				  if (all(is.na(temps))) {# do nothing
				  } else {
					temps <- na.omit(temps)
					temps <- temps[union(1,which(temps$R2>temps$R2_2)),]
					temp_tp <- temps$R2[which((temps$BIC == min(temps$BIC))&(temps$BIC <= min(temps$BIC_2)))]
					if (length(temp_tp )>=1){temp <- max(temp,temp_tp)}
				  }
				}
			}else{
				temp<-0
				if(warning==0){
				  warning <- 1
				  warning("Warning! Failure to account for missing data generated zero variances on some variables that had to be ignored.\n")}
			}
			if (temp!=0){ # Coloration selon lien neg ou pos
				indices <- c(indices,j) ; mymat[i,j] <- temp ; mymat[j,i] <- temp
				if (abs(temp) > abs(temp_save)){
				  mymat_interconnect[i,j]<-2;mymat_interconnect[j,i]<-2
				} else if (temp_save<0) {
				  mymat_interconnect[i,j]<- -1;mymat_interconnect[j,i]<- -1;
				} else if (temp_save>0){
				  mymat_interconnect[i,j]<-1;mymat_interconnect[j,i]<- 1;
				}
			}
		}
		}
	  }
	  indices_all <- setdiff(indices_all,indY)
	  indY <- unique(indices)
	  level_count <- level_count+1
	  levels[indY] <- level_count
	  if (level_count==level) {break}
	}
	indices <- which(levels!=0)
	mymat <- mymat[indices,indices]
	if (sum(mymat)==0) {stop("Error! No x reliebable to Y\n")}
	mymat_interconnect <- mymat_interconnect[indices,indices]
	levels <- levels[indices]
	borders <- borders[indices]
	vertices <- vertices[indices]
	layout <- matrix(rep(0,length(levels)*2),length(levels),2)
	maximum <- max(table(levels))
	largeur = 4*maximum
	for (i in unique(levels)) {
	  rev(which(levels==i)) -> pos
	  layout[pos,1] <- (max(levels)-i)*(largeur/(max(levels)))+1 # Positionnement horiz
	  step <- maximum/(length(pos)+1)
	  layout[pos,2] <- seq(maximum/(length(pos)+1),to = step*length(pos),by=step)  # Positionnement vertic
	}
	net_interconnect <- graph_from_adjacency_matrix(mymat_interconnect, weighted=TRUE,mode="directed")
	net <- graph_from_adjacency_matrix(mymat, weighted=TRUE,mode="directed")
	net <- simplify(net, remove.multiple = T, remove.loops = TRUE) # elaguer les liens redondants
	net_interconnect  <- delete.edges(net_interconnect , E(net_interconnect)[ abs(E(net)$weight) < exclude[1] ])
	net <-               delete.edges(net, E(net)[ abs(weight) < exclude[1] ])
	E(net)$colour <- ifelse(E(net_interconnect)$weight<0,"red",ifelse(E(net_interconnect)$weight==2,"green","blue"))
	E(net)$weight <- abs(E(net)$weight)
	plot.igraph(net,layout=layout,vertex.size=8*ampli,vertex.color=vertices,vertex.frame.color=borders,
				edge.arrow.size =0,arrow.mode=0,edge.width=(abs(E(net)$weight)*2)^3, edge.color =E(net)$colour)
	# Display max correlation
	tabulation <-  table(levels)
	seq_max <- layout[which(levels==as.numeric(names(tabulation)[which(tabulation==max(tabulation))])),2]
	seq <- layout[layout[,1]==max(layout[,1]),2]
	seq <- seq -min(seq_max)
	seq <- seq/(max(seq_max)-min(seq_max))
	seq <- (seq*2)-1
	correl  <- apply(mymat,2,max)
	correl <- round(correl[which(levels==1)],2)
	ifelse(correl>0.9,">0.9",correl)-> correl
	text(rep(1.3,length(seq_max)),seq,
		 correl)
	if (return==TRUE) {return(mymat)}
  } else {
##################################################
# Correlation matrice for basal corrigraph()
##################################################
	#print("mode1")
	#	 Matrice de correlation - Initialization
	matrix(rep(0,ncol(data)^2),ncol(data),ncol(data))->mymat
	rownames(mymat) <- colnames(data)
	colnames(mymat) <- colnames(data)
	matrix(rep(1,ncol(data)^2),ncol(data),ncol(data))->mymat2
	rownames(mymat2) <- colnames(data)
	colnames(mymat2) <- colnames(data)
	warning<-0
	mymat_interconnect <- mymat
	vertices <- rep("white",ncol(mymat)) ; vertices[temp_id_factor]<-"grey"
	if ((prop==TRUE)|(mu==TRUE)) {
	  # Factor interactions
	  if (verbose==TRUE) {
		cat("Non-numeric variables.\n")
		print(colnames(data)[temp_id_factor])
	  }
		if (length(temp_id_factor)>5){
			barre <- txtProgressBar(min=0,max=length(temp_id_factor),width=50)
			cat("\nprop calculation\n")
			cat(paste(rep("=",50),collapse=""),"\n")
		}
	  # For each categorical
	  for (i in temp_id_factor) {
		if ((length(temp_id_factor)>5)&(prop==TRUE)){setTxtProgressBar(barre,i)}
		if (min(table(data[,i]))<3) {
		  temp_id_factor <- setdiff(temp_id_factor,i)
		  next
		}
		#########################
		#	prop=T
		#########################
		if ((prop==TRUE)&(length(temp_id_factor)>1)) {
		  if (verbose==TRUE) {cat("\nAnalysis of the cat:") ; cat(colnames(data)[i])}
		  for (j in setdiff(temp_id_factor,i)) {
			if (verbose==TRUE) {
				cat("\n\t vs categorical : ") ; cat(colnames(data)[j])
			}

			data_temp <- data.frame(data[,i],data[,j])
			#print(colnames(table(data_temp)))
			#print(head(data_temp))
			#print(which(is.na(data_temp[,1])))
			#print(which(is.na(data_temp[,2])))


			if (identical(which(is.na(data[,i])),which(is.na(data[,j])))==TRUE) {
				#cat("na omit")
				data_temp <- na.omit(data_temp)
				#print(table(data_temp))
			}
			if (NAcat==TRUE) {
				for (k in 1:2){
					data_temp[which(is.na(data_temp[,k])),k]<-"MANQUANTES"
				}
			}
			# Pourquoi plante qd mu activé seul sur big_fusion

			# NA are automatically eliminated by table()
			tablo <- table(data_temp)

			if ((ncol(tablo)>=2) & (nrow(tablo)>=2)) {
			  #cat("Test entre",i,"&",j,"\n")
			  #print(tablo)
			  if (min(tablo)<5) {
				#cat("Effectif insuffisant\n")
				if (quantile(tablo,probs=c(0.2))<5) {
				  if (verbose==TRUE) {
				    cat("\nNot many individuals per category.\n")
				  }
				  #next
				}
			  }
			  oldw <- getOption("warn")
			  options(warn = -1)
			  pvals <- try(GTest(tablo)$p.value,silent=TRUE)
			  options(warn = oldw)
			  #print("GTEST")
			  #print(pvals)
			  #cat("Test entre",colnames(data)[i],"&",colnames(data)[j],"pval",pvals,"\n")
			  if (pvals <= alpha) {
				#print(pvals)
				log10(1/pvals)/10 -> temp ; ifelse(temp>1 ,1 ,temp)-> temp
				mymat[i,j] <- temp ; mymat[j,i] <- temp
				mymat2[i,j] <- pvals ; mymat2[j,i] <- pvals
				mymat_interconnect[i,j] <- 2 ; mymat_interconnect[j,i] <- 2
			  }
			} else {next}
		  }
		}
	  }
	  if ((length(temp_id_factor)>5)&(prop==TRUE)){close(barre)}
	  #print(colnames(data)[temp_id_factor])
	  #########################
	  #	mu=TRUE
	  #########################
	  if ((mu == TRUE)&(length(temp_id_factor)>0)) {
		if (length(temp_id_factor)>5){
			barre <- txtProgressBar(min=0,max=length(temp_id_factor),width=50)
			cat("\nmu calculation\n")
			cat(paste(rep("=",50),collapse=""),"\n")
		}
		for (i in temp_id_factor) {
			if ((length(temp_id_factor)>5)&(mu==TRUE)){
				#print("Barre de progression")
				setTxtProgressBar(barre,i)
			}
		  if (verbose==TRUE) {cat("\nAnalysis of the cat:") ; print(colnames(data)[i])}
		  for (j in temp_id_num) {
			if (verbose==TRUE) {cat("\n\tvs numerical: ") ; print(colnames(data)[j])}
			data_temp <- data.frame(data[,i],data[,j])
			if (NAcat==TRUE) {
				data_temp[which(is.na(data_temp[,1])),1]<-"MANQUANTES"
			}
			pvals <- m.test(data_temp[,2],data_temp[,1],alpha=alpha,return=FALSE,verbose=FALSE,plot=FALSE,boot=FALSE)
			#print("Message de contrôle")
			#print(pvals)
			#print(data_temp[1:10,2])
			#print(data_temp[1:10,1])
			# Plante si une variable numérique a été déguisée en facteur
			if (pvals < alpha) {
			  log10(1/pvals)/10 -> temp ; ifelse(temp>1 ,1 ,temp)-> temp
			  mymat[i,j] <- temp ; mymat[j,i] <- temp
			  mymat2[i,j] <- pvals ; mymat2[j,i] <- pvals
			  mymat_interconnect[i,j] <- 3 ; mymat_interconnect[j,i] <- 3
			}
		  }
		}
		if (length(temp_id_factor)>5){close(barre)}
	  }
	}

	###############################
	# Numerical correlations
	###############################
	if (length(temp_id_num)>1) {
	  barre <- txtProgressBar(min=0,max=length(temp_id_num),width=50)
	  cat("\ncor calculation\n")
	  cat(paste(rep("=",50),collapse=""),"\n")
	  for (i in temp_id_num) {
		setTxtProgressBar(barre,i)
		for (j in setdiff(temp_id_num,i)) {
			temp_tab <- data.frame(data[,i],data[,j])
			total <- nrow(temp_tab)
			na.omit(temp_tab) -> temp_tab
			total_NA <- total - nrow(temp_tab)
			seuil_NA <- total_NA/total
			# L'ajout de NAfreq fait planter prop = T et mu = T?
			# Ajoutons que si mu=T (prop = T) apparait d'office si NAfreq < 0
			# Etablir le lien entre NAfreq et prop et mu...






		  if ((var(temp_tab[,1])!=0)&(var(temp_tab[,2])!=0)&(nrow(temp_tab)>2)&(seuil_NA<=NAfreq)){
			tempA <- cor.test(data[,i],data[,j],na.rm=T)
			#temp <- ifelse(temp$p.value<= alpha,temp$estimate,0) # Working with correlation
			pvals  <- ifelse(tempA$p.value<= alpha,tempA$p.value,1) # Working with p-value
			log10(1/pvals)/10 -> temp ; ifelse(temp>1 ,1 ,temp)-> temp
			ifelse(tempA$estimate<0,-temp,temp)->temp
		  }else{
			temp<-0
			pvals <- 1
			if(warning==0){warning <- 1}
		  }
		  mymat[i,j] <- temp ; mymat[j,i] <- temp
		  mymat2[i,j] <- pvals ; mymat2[j,i] <- pvals
		  mymat_interconnect[i,j] <- temp ; mymat_interconnect[j,i] <- temp
		}
	  }
	  close(barre)
	}
	cat("\n")
	if (warning==1) {warning("Warning! Failure to account for missing data generated zero variances on some variables that had to be ignored.\n")}
	#print(mymat)
	#print(mymat_interconnect)
	#########################
	#	colX
	#########################
	# Modelization arround X
	if ((type=="x")&(length(colY)>0)){
		#if (length(colX)!=length(indX)){stop("ERROR! some X's provided cannot be analyzed.\n")}
		#if (level<=1){stop("ERROR! Level must exceed 1.")
		#} else {
			if (level<2) {level<-2}
			for (lev in 1:(level-1)) {
				#print(mymat[indX,])
				indX_temp <- union(indX, which(apply(data.frame(mymat[indX,]),1,sum)!=0))
				if (length(indX_temp)>length(indX)){
					indX <- unique(sort(indX_temp))
				} else {
					break
				}
			}
		#}
		mymat <- mymat[indX,indX]
		mymat2 <- mymat2[indX,indX]
		mymat_interconnect <- mymat_interconnect[indX,indX]

	}
	#########################
	#	Wash
	#########################
	if (verbose ==TRUE) {cat("	Reduction of the number of variables (wash).\n")}
	pas = (ncol(mymat)-50)/20
	if ((is.na(wash)!=TRUE)&(wash!=FALSE)) {
	  indices <- apply(abs(mymat),2,sum)
	  indices <- which(indices>0)
	  mymat <- mymat[indices,indices]
	  mymat2 <- mymat2[indices,indices]
	  mymat_interconnect <- mymat_interconnect[indices,indices]
	  vertices <- vertices[indices]
	  while(ncol(mymat)>50){
		#mymat <- ifelse(abs(mymat)< exclude,0,mymat)
		if (wash=="length"){
			if(verbose==T) {cat("Cleaning of the variables (wash parameter) in length mode.\n")}
		  taille <- apply(abs(mymat),2,function(x){return(length(x[x>0]))})
		  indices <- 1:ncol(mymat)
		  indices [order(taille ,decreasing=T)]
		  indices <- indices[1:(ncol(mymat)-pas)]
		  mymat <- mymat[indices,indices]
		  mymat2 <- mymat2[indices,indices]
		  mymat_interconnect <- mymat_interconnect[indices,indices] ; vertices <- vertices[indices]
		  indices <- apply(abs(mymat),2,sum)
		  indices <- which(indices>1)
		  mymat <- mymat[indices,indices]
		  mymat2 <- mymat2[indices,indices]
		  mymat_interconnect <- mymat_interconnect[indices,indices] ; vertices <- vertices[indices]
		} else if (wash=="sum"){
			if(verbose==T) {cat("Cleaning of the variables (wash parameter) in sum mode.\n")}
		  somme <- apply(abs(mymat),2,sum)
		  indices <- 1:ncol(mymat)
		  indices [order(somme,decreasing=T)]
		  indices <- indices[1:(ncol(mymat)-pas)]
		  mymat <- mymat[indices,indices]
		  mymat2 <- mymat2[indices,indices]
		  mymat_interconnect <- mymat_interconnect[indices,indices] ; vertices <- vertices[indices]
		  indices <- apply(abs(mymat),2,sum)
		  indices <- which(indices>1)
		  mymat <- mymat[indices,indices]
		  mymat2 <- mymat2[indices,indices]
		  mymat_interconnect <- mymat_interconnect[indices,indices] ; vertices <- vertices[indices]
		} else { #(wash=="stn"){
			if(verbose==T) {cat("Cleaning of the variables (wash parameter) in stn mode.\n")}
		  if (ncol(mymat) > 50) {
			moyennes <- apply(abs(mymat),2,mean)
			deviation <- apply(abs(mymat),2,sd)
			snp <- moyennes/deviation
			indices <- 1:ncol(mymat)
			indices [order(snp,decreasing=T)]
			indices <- indices[1:(ncol(mymat)-pas)]
			mymat <- mymat[indices,indices]
			mymat2 <- mymat2[indices,indices]
			mymat_interconnect <- mymat_interconnect[indices,indices] ; vertices <- vertices[indices]
			indices <- apply(abs(mymat),2,sum)
			indices <- which(indices>1)
			mymat <- mymat[indices,indices]
			mymat2 <- mymat2[indices,indices]
			mymat_interconnect <- mymat_interconnect[indices,indices] ; vertices <- vertices[indices]
		  }
		}
	  }
	}
	if (ncol(mymat)==0) {stop("No interaction identified.\n")}
	if ((mu==FALSE)&(prop==FALSE)) {
	  net <- graph_from_adjacency_matrix(mymat, weighted=T,mode="lower")
	  net <- simplify(net, remove.multiple = T, remove.loops = TRUE) # élaguer les liens redondants
	  net <- delete_edges(net, E(net)[ abs(weight) < exclude[1] ])
	  E(net)$colour <- ifelse(E(net)$weight<0,"red","blue")
	  E(net)$weight <- abs(E(net)$weight)
	  if (layout=="circle") {l <- layout_in_circle(net)
	  } else if (layout=="kk"){l <- layout_with_kk(net)
	  } else  {l <- layout_with_fr(net)}
	  if ((layout=="3d") |(layout=="3D")) {
		l <- layout_with_fr(net,dim=3)
		rglplot( net, layout = l, vertex.size = sqrt(betweenness(net))*ampli/4+10,
				 vertex.color = vertices, edge.arrow.size = 0,
				 arrow.mode = 0, edge.width = (abs(E(net)$weight) * 2)^2,
				 edge.color = E(net)$colour,label.dist=sqrt(betweenness(net))*ampli/4+10)
	  }else if (ncol(mymat)<50) {
		clp <- cluster_optimal(net)
		class(clp)
		plot(clp, net, layout = l,vertex.size=sqrt(betweenness(net))*ampli+10,vertex.color="yellow",
			 edge.arrow.size =0,arrow.mode=0,edge.width=(abs(E(net)$weight)*2)^2, edge.color =E(net)$colour)
	  } else {
		plot(net,layout=l,vertex.size=sqrt(betweenness(net))*ampli+10,vertex.color=vertices,
			 edge.arrow.size =0,arrow.mode=0,edge.width=(abs(E(net)$weight)*2)^2, edge.color =E(net)$colour)
	  }
	} else {
	  ######################
	  # mu = TRUE or prop = TRUE
	  ######################
	  net <- graph_from_adjacency_matrix(mymat, weighted=T,mode="directed")
	  #net <- simplify(net, remove.multiple = TRUE, remove.loops = TRUE) # elaguer les liens redondants
	  net_interconnect <- graph_from_adjacency_matrix(mymat_interconnect, weighted=T,mode="directed")
	  #net_interconnect <- simplify(net_interconnect, remove.multiple = TRUE, remove.loops = TRUE) # elaguer les liens redondants
	  ##############################
	  #		Nettoyage différentielle des liens en fonction de la p-value
	  ##############################
	  if (length(exclude)==3) {
		net2 <- net ; net_interconnect2 <- net_interconnect
		net_interconnect  <- delete.edges(net_interconnect , E(net_interconnect)[ abs(E(net2)$weight) < exclude[1] & abs(E(net_interconnect2)$weight) < 2 ])
		net <- 	delete.edges(net, E(net)[ abs(E(net2)$weight) < exclude[1] & abs(E(net_interconnect2)$weight) < 2])
		net2 <- net ; net_interconnect2 <- net_interconnect
		net_interconnect  <- delete.edges(net_interconnect , E(net_interconnect)[ abs(E(net2)$weight) < exclude[2] & abs(E(net_interconnect2)$weight) == 3 ])
		net <- 	delete.edges(net, E(net)[ abs(E(net2)$weight) < exclude[2] & abs(E(net_interconnect2)$weight) ==3])
		net2 <- net ; net_interconnect2 <- net_interconnect
		net_interconnect  <- delete.edges(net_interconnect , E(net_interconnect)[ abs(E(net2)$weight) < exclude[3] & abs(E(net_interconnect2)$weight) == 2 ])
		net <- 	delete.edges(net, E(net)[ (abs(E(net2)$weight) < exclude[3]) & (abs(E(net_interconnect2)$weight) ==2)])
	  } else {
		net_interconnect  <- delete.edges(net_interconnect , E(net_interconnect)[ abs(E(net)$weight) < exclude[1]])
		net <- 	delete.edges(net, E(net)[ abs(E(net)$weight) < exclude[1] ])
	  }
	  # Coloration des liens
	  E(net)$weight <- abs(E(net)$weight)
	  #print(E(net_interconnect)$weight)
	  E(net)$colour <- ifelse(E(net_interconnect)$weight<0,"red",
							  ifelse(E(net_interconnect)$weight==2,"#FF99BB",
									 ifelse(E(net_interconnect)$weight==3,"orange","blue")))
	  # Mise en place de la layout
	  if (layout=="circle") {l <- layout_in_circle(net)
	  } else if (layout=="kk"){l <- layout_with_kk(net)
	  } else  {l <- layout_with_fr(net)}
	  # Basculer en affichage 3D ou non
	  if ((layout=="3d") |(layout=="3D")) {
		l <- layout_with_fr(net,dim=3)
		rglplot( net, layout = l, vertex.size = sqrt(betweenness(net))*ampli/4+10,
				 vertex.color = vertices, edge.arrow.size = 0,
				 arrow.mode = 0, edge.width = (abs(E(net)$weight) * 2)^2,
				 edge.color = E(net)$colour,label.dist=sqrt(betweenness(net))*ampli/4+10)
	  } else {
		# Si AFFICHAGE Non-3D : clustering sur les p-values
		#plot(net,layout=l,vertex.size=sqrt(betweenness(net))*ampli+10,vertex.color=vertices,
		#	 edge.arrow.size =0,arrow.mode=0,edge.width=(abs(E(net)$weight)*2)^2, edge.color =E(net)$colour)
		# Matrice de distances
		as.dist(mymat2) -> mydi
		if ((length(mydi)>1)&(cluster==TRUE)) {
			# Clustering
			myclust <- hclust(mydi, method="ward.D2")
			inertie <- sort(myclust$height, decreasing = TRUE)
			inertie <- inertie[2:(length(inertie)-1)]-inertie[3:length(inertie)]
			which(inertie==max(inertie,na.rm=T))+1->n_cluster
			#print(n_cluster)
			mytree = cutree(myclust, k=n_cluster)
			# ATTENTION NE MARCHE QUE SI WASH = TRUE, faire en sorte que mymat2 suiva mymat
			# Mettre les communautés en option pour ne pas masquer la couleur des vertices catego vs num
			# INACTIVE OU PAS LE CLUSTERING POUR QU'ON PUISSE FAIRE LA DISTINCTION ENTRE VARIABLE CATEGORIELLES ET QUANTITATIVES.
			##method1: communities object type clone
			members <- as.double(mytree)
			nam <- as.character(names(mytree))
			comms <- list(membership=members, vcount=vcount(net), names=nam, algorithm="by.hand")
			class(comms) <- "communities"
			#try(modularity(net, membership(comms) ),silent=TRUE)
			plot(comms,net,layout=l,vertex.size=sqrt(betweenness(net))*ampli+10,vertex.color=vertices,
				 edge.arrow.size =0,arrow.mode=0,edge.width=(abs(E(net)$weight)*2)^2, edge.color =E(net)$colour)
		} else {
			plot(net,layout=l,vertex.size=sqrt(betweenness(net))*ampli+10,vertex.color=vertices,
				 edge.arrow.size =0,arrow.mode=0,edge.width=(abs(E(net)$weight)*2)^2, edge.color =E(net)$colour)
		}
	  }
	}
	if (return==TRUE){return(mymat2)}
  }
}
Antoine-Masse/KefiR documentation built on July 4, 2024, 11:40 a.m.