Autre/4_etude_hypertension.R

rm(list=ls())

library(mice)
library(VIM)
library(data.table)

# Chargement du fichier transcodifié, enlevement de la 1ere colonne X et renommer la variable de sortie en Y
nhanes_hyper <- read.csv("data/nhanes_hyper_transcodifie.csv")
nhanes_hyper$X <- NULL
names(nhanes_hyper)[ncol(nhanes_hyper)] <- "Y"
nhanes_hyper$Y <- as.factor(nhanes_hyper$Y)
levels(nhanes_hyper$Y) <- c("Yes","No",NA)

# on remet les noms des champs propre
names(nhanes_hyper) <- gsub("\\.\\.?\\.?","_",names(nhanes_hyper))
names(nhanes_hyper) <- gsub("_$","", names(nhanes_hyper))
names(nhanes_hyper)[89] <- "Total_plain_water_drank_yesterday"
names(nhanes_hyper)[57] <- paste0("alpha_",names(nhanes_hyper)[57])
names(nhanes_hyper)[58] <- paste0("beta_",names(nhanes_hyper)[58])
names(nhanes_hyper)[58] <- sub("_1$","",names(nhanes_hyper)[58])

# pour les codes qui ont et des soucis de transcodification, transco manuel
levels(nhanes_hyper$Ever_told_you_have_Hepatitis_B) <- c("Yes","No","Refused","Don't know")
levels(nhanes_hyper$Ever_told_you_have_Hepatitis_C) <- c("Yes","No","Don't know")
levels(nhanes_hyper$Received_Hepatitis_A_vaccine) <- c("At least 2 doses","Less than 2 doses",
                                                       "No doses","Refused", "Don't know")
levels(nhanes_hyper$Received_Hepatitis_B_3_dose_series) <- c("At least 2 doses","Less than 2 doses",
                                                             "No doses","Refused", "Don't know")
levels(nhanes_hyper$Ever_been_told_you_have_asthma) <- c("Yes", "No", "Don't know")
levels(nhanes_hyper$X_of_people_who_live_here_smoke_tobacco) <- c("No one in houseold is a smoker",
                                                                  "1 household member is a smoker",
                                                                  "2 household members are smokers",
                                                                  "3 or more household members are smokers",
                                                                  "Refused",NA)
levels(nhanes_hyper$Doctor_ever_said_you_were_overweight) <- c("Yes","No","Don't know")
levels(nhanes_hyper$Home_owned_bought_rented_other)[2] <- NA
levels(nhanes_hyper$Overall_recommendation_for_care)[2] <- NA
levels(nhanes_hyper$Overall_recommendation_for_care)[3] <- NA
levels(nhanes_hyper$VAR_TRAVAIL) <- c("No","Yes")

#Alleger un peu la description
levels(nhanes_hyper$Total_number_of_people_in_the_Household)[7] <- "7 or more"
levels(nhanes_hyper$Total_number_of_people_in_the_Family)[7] <- "7 or more"

#Qlqs nettoyage des variables non utile
nhanes_hyper$fed_infant_either_day <- NULL
nhanes_hyper$Number_of_days_of_intake <- NULL
nhanes_hyper$SEQN <- NULL

for (i in 1:ncol(nhanes_hyper)){
  if (class(nhanes_hyper[,i])=="factor"){
    for (j in 1:length(levels(nhanes_hyper[,i]))){
      if (levels(nhanes_hyper[,i])[j]=="Don't know"){
        levels(nhanes_hyper[,i])[j]=NA
        break
      }
    }
  }
}
for (i in 1:ncol(nhanes_hyper)){
  if (class(nhanes_hyper[,i])=="factor"){
    for (j in 1:length(levels(nhanes_hyper[,i]))){
      if (levels(nhanes_hyper[,i])[j]=="Refused"){
        levels(nhanes_hyper[,i])[j]=NA
        break
      }
    }
  }
}

# Etude du jeu de donnée nhanes_hyper
don <- nhanes_hyper

md.pattern(don)
aggr_plot <- aggr(don, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE,
                  labels=names(data), cex.axis=.7, gap=3,
                  ylab=c("Histogram of missing data","Pattern"))

don$Received_Hepatitis_A_vaccine <- NULL
don$Received_Hepatitis_B_3_dose_series <- NULL

# Remplacement des valeurs NA à l'aide de la méthode Mice
tempdata <- mice(don,m=1,seed=1234)
tempdata$imp
don <- complete(tempdata,1)



write.csv(don,"data/nhanes_hyper_mice.csv")


############################################################################
# Etude sur la prediction hypertension
############################################################################
library(glmnet)#contrainte
library(randomForest)#Foret
library(gbm)#Boosting
library(e1071)#SVM
library(rpart)#Arbre
library(foreach)#parallel

don <- read.csv("data/nhanes_hyper_mice.csv")
don$X <- NULL
levels(don$Y) <- c(0,1)

XX <- as.matrix(model.matrix(~.,don)[,-ncol(model.matrix(~.,don))])
YY <- as.matrix(model.matrix(~.,don)[,ncol(model.matrix(~.,don))])

bloc <- 4
ind= sample(1:nrow(don) %% 4+1)
res <- data.frame(Y=don$Y, log=0, logReduit=0, ridge=0, lasso=0, elastic=0,
                  foret=0, adabo=0, logibo=0, svm=0, pm=0, arbre=0)
set.seed(1234)
deb <-   Sys.time()
# foreach (i = 1:bloc, .packages = c("gbm","e1071","glmnet","randomForest", "rpart")) %dopar% {
for (i in 1:bloc){
  # logisque
  mod <- glm(Y~.,data=don[ind!=i,],family="binomial")
  res[ind==i,"log"] <- predict(mod,don[ind==i,],type="response")
  # logistique 9 variable
  mod <- glm(Y~Age_in_years_at_screening+
  Systolic_Blood_pres_2nd_rdg_mm_Hg+
  high_cholesterol_level+
  Body_Mass_Index_kg_m_2+
  Doctor_ever_said_you_were_overweight+
  Ever_told_doctor_had_trouble_sleeping+
  Phosphorus_mg+
  Diastolic_Blood_pres_1st_rdg_mm_Hg+
  Sodium_mg,data=don[ind!=i,],family="binomial")
  res[ind==i,"logReduit"] <- predict(mod,don[ind==i,],type="response")
  XXA <- XX[ind!=i,]
  YYA <- YY[ind!=i,]
  XXT <- XX[ind==i,]
  # ridge
  tmp <- cv.glmnet(XXA,YYA,alpha=0,family="binomial")
  mod <- glmnet(XXA,YYA,alpha=0,lambda=tmp$lambda.min, family="binomial")
  res[ind==i,"ridge"] <- predict(mod,newx=XXT,type="response")
  # lass0
  tmp <- cv.glmnet(XXA,YYA, alpha=1, family="binomial")
  mod <- glmnet(XXA,YYA,alpha=1, lambda =tmp$lambda.1se,family="binomial" )
  #colnames(don[mod$beta@i])
  res[ind==i,"lasso"] <- predict(mod,newx=XXT, type="response")
  # elasticnet
  tmp <- cv.glmnet(XXA,YYA, alpha=0.5, family="binomial")
  mod <- glmnet(XXA,YYA,alpha = 0.5, lambda = tmp$lambda.min, family="binomial")
  res[ind==i,"elastic"] <- predict(mod,newx=XXT,type="response")
  # foret
  mod <- randomForest(Y~., data = don[ind!=i,], ntree=500)
  res[ind==i, "foret"] <- predict(mod, don[ind==i,], type="prob")[,2]
  # Adaboost
  tmp <- gbm(as.numeric(Y)-1~.,data = don[ind!=i,], distribution = "adaboost", interaction.depth = 2,
             shrinkage = 0.1,n.trees = 500)
  M <- gbm.perf(tmp)[1]
  mod <- gbm(as.numeric(Y)-1~.,data = don[ind!=i,], distribution = "adaboost", interaction.depth = 2,
             shrinkage = 0.1,n.trees = M)
  res[ind==i, "adabo"] <- predict(mod, newdata=don[ind==i,], type = "response", n.trees = M)
  # Logiboost
  tmp <- gbm(as.numeric(Y)-1~.,data=don[ind!=i,], distribution="bernoulli", interaction.depth = 2,
             shrinkage=0.1,n.trees=500)
  M <- gbm.perf(tmp)[1]
  mod <- gbm(as.numeric(Y)-1~.,data=don[ind!=i,], distribution="bernoulli", interaction.depth = 2,
             shrinkage=0.1,n.trees=M)
  res[ind==i, "logibo"] <- predict(mod,newdata=don[ind==i,], type= "response", n.trees = M)
  # SVM
  mod <- svm(Y~.,data=don[ind!=i,], kernel="linear",probability=T)
  # tmp <- tune(svm,Y~.,data=don[ind!=i,], kernel="linear",probability=T,ranges =
  #        list(cost=c(0.1,1,10,100)))
  # mod <- tmp$best.model
  res[ind==i,"svm"] <- attr(predict(mod,newdata = don[ind==i,],probability = T),"probabilities")[,2]
  # arbre
  mod <- rpart(Y~., data = don[ind!=i,], method="class")
  res[ind==i, "arbre"] <- predict(mod, don[ind==i,], type="prob")[,2]
}
fin <- Sys.time()
fin-deb
i=1

newdon <- as.data.frame(cbind(model.matrix(Y~.,don),Y=as.numeric(don$Y)-1))
class(newdon)
newdon$Y <- as.factor(newdon$Y)
library(neuralnet)
# Binary classification
nn <- neuralnet(Y== 1 ~ GenderMale+ Age_in_years_at_screening, newdon,  linear.output = FALSE)
dput(colnames(newdon))
nn <- neuralnet(Y== "1" ~ On_special_diet , don, linear.output = FALSE)
data(iris)
train_idx <- sample(nrow(iris), 2/3 * nrow(iris))
iris_train <- iris[train_idx, ]
iris_test <- iris[-train_idx, ]
# Binary classification
nn <- neuralnet(Species == "setosa" ~ Petal.Length + Petal.Width, iris_train, linear.output = FALSE)
pred <- predict(nn, iris_test)

colnames(newdon)
############################################
# Perceptron multicouche
############################################
library(keras)

for (i in 1:bloc){

  # instanciation du model
  pm.keras <- keras_model_sequential()

  # architecture
  pm.keras %>%
    layer_dense(units = 8, input_shape=ncol(XXA), activation = "sigmoid") %>%
    layer_dense(units = 1, activation = "sigmoid")

  # configuration de l'apprentissage
  pm.keras %>% compile(
    loss="mean_squared_error",
    optimizer="sgd",
    metrics="mae"
  )

  # lancement de l'apprentissage
  pm.keras %>% fit(
    XXA <- XX[ind!=i,],
    YYA <- YY[ind!=i,],
    epochs=15,
    batch_size=8
  )

  # proba prediction
  res[ind==i,"pm"] <- pm.keras %>% predict(XX[ind==i,])
}

write.csv2(res,"res_hyp.csv")
############################################
monerreur <- function(X, Y, seuil=0.5){
  table(cut(X, breaks = c(0,seuil,1)), Y)
}

# matrice de confusion
monerreur(res[,2],res[,1])#log
monerreur(res[,3],res[,1])#Ridge
monerreur(res[,4],res[,1])#Lasso
monerreur(res[,5],res[,1])#Elastic
monerreur(res[,6],res[,1])#Foret
monerreur(res[,7],res[,1])#Adabo
monerreur(res[,8],res[,1])#Logibo
monerreur(res[,9],res[,1])#SVM
monerreur(res[,10],res[,1])#perceptron
monerreur(res[,11],res[,1])#arbre

# erreur
monerreurb <- function(X,Y,seuil=0.5){
  Xc <- cut(X,breaks=c(0,seuil,1),labels=c(0,1))
  sum(as.factor(Y)!=Xc)
}
apply(res[,-1],2,monerreurb,Y=res[,1],seuil=0.5)

library(pROC)
auc(res[,1],res[,2])#log
auc(res[,1],res[,3])#Ridge
auc(res[,1],res[,4])#Lasso
auc(res[,1],res[,5])#Elastic
auc(res[,1],res[,6])#Foret
auc(res[,1],res[,7])#Adabo
auc(res[,1],res[,8])#Logibo
auc(res[,1],res[,9])#SVM
auc(res[,1],res[,10])#perceptron
auc(res[,1],res[,11])#arbre

plot(roc(res[,1],res[,2]), print.thres = "best", print.thres.best.method = "closest.topleft")
lines(roc(res[,1],res[,3]), col="red")#log
points(coords(roc(res[,1],res[,3]), "best", best.method = "closest.topleft")[2],
       coords(roc(res[,1],res[,3]), "best", best.method = "closest.topleft")[3],
       col="red")
lines(roc(res[,1],res[,4]), col="blue")#Ridge
lines(roc(res[,1],res[,5]), col="green")#Lasso
lines(roc(res[,1],res[,6]), col="brown")#Elastic
lines(roc(res[,1],res[,7]), col="yellow")#Adabo
lines(roc(res[,1],res[,8]), col="purple")#Logibo
lines(roc(res[,1],res[,9]), col="grey")#SVM
lines(roc(res[,1],res[,9]), col="black")#perceptron
lines(roc(res[,1],res[,9]), col="orange")#arbre

coords(roc(res[,1],res[,4]), "best", ret=c("threshold", "specificity", "sensitivity", "accuracy"))
coords(roc(res[,1],res[,3]), "best", best.method = "closest.topleft")
plot(roc(res[,1],res[,4]), print.thres = "best", print.thres.best.method = "closest.topleft")
coords(roc(res[,1],res[,4]), x = "local maximas", ret='threshold')
Ensae2018/nhanesv2 documentation built on May 29, 2019, 9:16 a.m.