Preparation

We can install this package by devtools::install_github("hobbitish1028/Logistic").

library(Logistic)
library(Rcpp)
library(glm2)
library(dplyr)
library(ggplot2)

Introduction of Logistic model

The binomial logistic regression assume that data have the following distributions: $$ P(Y_i=1) = \frac{exp(x_i^T\beta)}{1+exp(x_i^T\beta)}$$ $$ P(Y_i=0) = \frac{1}{1+exp(x_i^T\beta)}$$ We want to maxmize the object function$$Loss = \Pi_{i=1}^{n} [P(Y_i=1)]^{y_i}[P(Y_i=0)]^{1-y_i}$$

Generating binomial data sample

The latent model of sample data here is Gaussian mixture model:

There are two groups, for samples in group i (i can be 0 or 1), the distribution is

$$X \sim N(\mu_i,\sigma^2)$$

The following is an toy example with 20000 samples and 100 features (we use it as training model). Among them, 10000 samples are labeled with 0 and the rest are labeled as 1. And the testing model follows the same design as training model.

sigma<-4
set.seed(123)
n<-1e4
p<-1e2

mu1<-rnorm(p)
mu2<-rnorm(p)
X1<-matrix(mu1+rnorm(n*p,0,sigma),n,p,byrow = TRUE)
X2<-matrix(mu2+rnorm(n*p,0,sigma),n,p,byrow = TRUE)
### Train data
X<-rbind(X1,X2)
y<-rep(c(1,0),each=n)
### Test data
test_x<-rbind( matrix(mu1+rnorm(n*p,0,sigma),n,p,byrow = TRUE), 
               matrix(mu2+rnorm(n*p,0,sigma),n,p,byrow = TRUE)  )
test_y<-rep(c(1,0),each=n)

Then we can use the package to fit the above model:

How to use LogReg Function

Under most of the condition, we only need to input the n by p data matrix $X_{n,p}$ and binomial result $Y_n$ whose value is 0 or 1 (This function also works when Y is binary character vector like a & b).

t0<-proc.time()
fit<-Logreg(X,y)
t1<-proc.time()-t0

We can judge the convergence of algorithm by plotting the loss function. My function will judge by itself and stop when it converges. And the default maximal iteration number is 10000. In the application, if the results diverge according to the plot, we can tune the parameter maxit until it converges (increase the maxit value), which is important to achieve a great prediction. (As for the optimization detail, we use adam-like algorithm, which is quite stable and converges quickly. Different from SGD, it is less sensitive to parameter, and the default parameter can already deal with most of the condition.)

plot(21:length(fit$loss),fit$loss[-(1:20)],main = "Convergence",xlab = "iteration",ylab = "-loglikelihood")

How to predict with my package

We need to get the trained model(fit) first by using the above LogReg function. Then we just need two inputs (fit and the test matrix) to make a new prediction. The training accuracy is saved in fit$accuracy and the training result (the estimate of parameter) is saved in fit$x.

train_acc_mine<-fit$accuracy
my_prediction<-My_predict(fit,newx = test_x)
test_acc_mine<-mean(my_prediction == test_y)

glm

glm is well-recognized package for logistic regression without penalty term. Thus we use it as comparison.

t0<-proc.time()
dat<-as.data.frame(cbind(y,X))
fit0<-glm(y~.,data=dat,family=binomial(link=logit))
t2<-proc.time()-t0

train_acc_glm<- (mean((fit0$fitted.values[1:n]>0.5) ) + mean((fit0$fitted.values[n+(1:n)]<0.5) ) )/2

pred_glm <-cbind(rep(1,n),test_x) %*% fit0$coefficients
test_acc_glm<- mean( pred_glm*rep(c(1,-1),each=n)>0  )

Comparison between two methods

class <- rep(c("train","test"),2)
accuracy<-c(train_acc_mine,test_acc_mine,train_acc_glm,test_acc_glm)
method<- rep(c("Mine","glm"),each=2)
dat<-data.frame(class,accuracy,method)

ggplot(
  dat %>%
    filter(
      method %in% c("Mine","glm")
    ) %>% group_by(class,method) %>% summarize(
      acc = accuracy
    )
)+ aes(
  x=class,
  y=acc)+
  labs(
    x="class",
    y="Accuracy"
  )+ geom_col(
    aes(fill=factor(method)),
    position='dodge'
  )+coord_cartesian( ylim = c(0.945, 0.965))+ggtitle("Comparison (n=1e4,p=1e2)") +
  theme(plot.title = element_text(hjust = 0.5))

A<-matrix(c(t1[3],t2[3]),1,2)
colnames(A)<-c("My LogReg","glm")
rownames(A)<-"time (s)"
B<-matrix(accuracy,2,2)
colnames(B)<-c("My LogReg","glm")
rownames(B)<-c("train","test")
A
B

Although logistic regression is a strict convex question with a global maximum, since both model use stochastic optimization, they will converge to different solutions according to their stopping criterions and optimization rules. Thus we don't use benchmark here, and three digits of precision is already enough to make sure their convergence and consistency!

More samples

Sample of Iris (first two flowers)

X<-iris[1:100,1:4]
y<-iris[1:100,5]

n<-50
p<-dim(X)[2]
t0<-proc.time()
fit<-Logreg(X,y)
t1<-proc.time()-t0
train_acc_mine<-fit$accuracy

t0<-proc.time()
dat<-as.data.frame(cbind(y,X))
fit0<-glm(y~.,data=dat,family=binomial(link=logit))
t2<-proc.time()-t0

train_acc_glm<- (mean((fit0$fitted.values[1:n]>0.5) ) + mean((fit0$fitted.values[n+(1:n)]<0.5) ) )/2
train_acc_glm<-max(train_acc_glm,1-train_acc_glm)
accuracy<-c(train_acc_mine,train_acc_glm)
method<- c("Mine","glm")
dat<-data.frame(accuracy,method)

ggplot(
  dat 
)+ aes(
  x=method,
  y=accuracy)+
  labs(
    x="method",
    y="Accuracy"
  )+ geom_col(
    aes(fill=factor(method)),
    position='dodge'
  )+coord_cartesian( ylim = c(0.9, 1.1))+ggtitle("iris: 2 flowers") +
  theme(plot.title = element_text(hjust = 0.5))

A<-matrix(c(t1[3],t2[3]),1,2)
colnames(A)<-c("My LogReg","glm")
rownames(A)<-"time (s)"
B<-matrix(accuracy,1,2)
colnames(B)<-c("My LogReg","glm")
rownames(B)<-c("accuracy")
A
B

Continue : gaussian mixture model with higher dimension

sigma<-8
set.seed(123)
n<-1e4
p<-3e2

mu1<-rnorm(p)
mu2<-rnorm(p)
X1<-matrix(mu1+rnorm(n*p,0,sigma),n,p,byrow = TRUE)
X2<-matrix(mu2+rnorm(n*p,0,sigma),n,p,byrow = TRUE)
### Train data
X<-rbind(X1,X2)
y<-rep(c(1,0),each=n)
### Test data
test_x<-rbind( matrix(mu1+rnorm(n*p,0,sigma),n,p,byrow = TRUE), 
               matrix(mu2+rnorm(n*p,0,sigma),n,p,byrow = TRUE)  )
test_y<-rep(c(1,0),each=n)
t0<-proc.time()
fit<-Logreg(X,y)
t1<-proc.time()-t0
train_acc_mine<-fit$accuracy
my_prediction<-My_predict(fit,newx = test_x)
test_acc_mine<-mean(my_prediction == test_y)
t0<-proc.time()
dat<-as.data.frame(cbind(y,X))
fit0<-glm(y~.,data=dat,family=binomial(link=logit))
t2<-proc.time()-t0

train_acc_glm<- (mean((fit0$fitted.values[1:n]>0.5) ) + mean((fit0$fitted.values[n+(1:n)]<0.5) ) )/2

pred_glm <-cbind(rep(1,n),test_x) %*% fit0$coefficients
test_acc_glm<- mean( pred_glm*rep(c(1,-1),each=n)>0  )

class <- rep(c("train","test"),2)
accuracy<-c(train_acc_mine,test_acc_mine,train_acc_glm,test_acc_glm)
method<- rep(c("Mine","glm"),each=2)
dat<-data.frame(class,accuracy,method)

ggplot(
  dat %>%
    filter(
      method %in% c("Mine","glm")
    ) %>% group_by(class,method) %>% summarize(
      acc = accuracy
    )
)+ aes(
  x=class,
  y=acc)+
  labs(
    x="class",
    y="Accuracy"
  )+ geom_col(
    aes(fill=factor(method)),
    position='dodge'
  )+coord_cartesian( ylim = c(0.925, 0.945))+ggtitle("Comparison (n=1e4,p=3e2)") +
  theme(plot.title = element_text(hjust = 0.5))

A<-matrix(c(t1[3],t2[3]),1,2)
colnames(A)<-c("My LogReg","glm")
rownames(A)<-"time (s)"
B<-matrix(accuracy,2,2)
colnames(B)<-c("My LogReg","glm")
rownames(B)<-c("train","test")
A
B
sigma<-8
set.seed(123)
n<-1e4
p<-5e2

mu1<-rnorm(p)
mu2<-rnorm(p)
X1<-matrix(mu1+rnorm(n*p,0,sigma),n,p,byrow = TRUE)
X2<-matrix(mu2+rnorm(n*p,0,sigma),n,p,byrow = TRUE)
### Train data
X<-rbind(X1,X2)
y<-rep(c(1,0),each=n)
### Test data
test_x<-rbind( matrix(mu1+rnorm(n*p,0,sigma),n,p,byrow = TRUE), 
               matrix(mu2+rnorm(n*p,0,sigma),n,p,byrow = TRUE)  )
test_y<-rep(c(1,0),each=n)
t0<-proc.time()
fit<-Logreg(X,y)
t1<-proc.time()-t0
train_acc_mine<-fit$accuracy
my_prediction<-My_predict(fit,newx = test_x)
test_acc_mine<-mean(my_prediction == test_y)
t0<-proc.time()
dat<-as.data.frame(cbind(y,X))
fit0<-glm(y~.,data=dat,family=binomial(link=logit))
t2<-proc.time()-t0

train_acc_glm<- (mean((fit0$fitted.values[1:n]>0.5) ) + mean((fit0$fitted.values[n+(1:n)]<0.5) ) )/2

pred_glm <-cbind(rep(1,n),test_x) %*% fit0$coefficients
test_acc_glm<- mean( pred_glm*rep(c(1,-1),each=n)>0  )

class <- rep(c("train","test"),2)
accuracy<-c(train_acc_mine,test_acc_mine,train_acc_glm,test_acc_glm)
method<- rep(c("Mine","glm"),each=2)
dat<-data.frame(class,accuracy,method)

ggplot(
  dat %>%
    filter(
      method %in% c("Mine","glm")
    ) %>% group_by(class,method) %>% summarize(
      acc = accuracy
    )
)+ aes(
  x=class,
  y=acc)+
  labs(
    x="class",
    y="Accuracy"
  )+ geom_col(
    aes(fill=factor(method)),
    position='dodge'
  )+coord_cartesian( ylim = c(0.96, 1))+ggtitle("Comparison (n=1e4,p=5e2)") +
  theme(plot.title = element_text(hjust = 0.5))

A<-matrix(c(t1[3],t2[3]),1,2)
colnames(A)<-c("My LogReg","glm")
rownames(A)<-"time (s)"
B<-matrix(accuracy,2,2)
colnames(B)<-c("My LogReg","glm")
rownames(B)<-c("train","test")
A
B

From the graphs, we can see that when variable number p is small, two models's efficiencies are similar, but when p is increasing, my model's speed outperform that of glm, duo to the application of different optimization.



hobbitish1028/Logistic documentation built on Nov. 26, 2019, 8:58 a.m.