library(learnr)
library(gradethis)
s301gradedist <- UBCstat406labs::s301gradedist

tutorial_options(exercise.timelimit = 5, exercise.checker = gradethis::grade_learnr)

knitr::opts_chunk$set(echo = FALSE)
library(knitr)
library(tidyverse)
library(cowplot)

Logistic Regression

Suppose $Y=1$ with probability $p(x)$ and $Y=0$ with probability $1-p(x)$. I want to model $P(Y=1 \mid X=x)$. I'll assume the log-odds are linear in $x$: $\log\big(p(x)/(1-p(x))\big) = ax$ for some scalar $a$.

We're going to estimate $a$ given data.
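Equivalently, solving the log-odds assumption for $p(x)$ gives the logistic form used in the code below:

$$ \log\frac{p(x)}{1-p(x)} = a x \quad\Longleftrightarrow\quad p(x) = \frac{e^{a x}}{1 + e^{a x}}. $$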

Generate Data

set.seed(20200405)
n = 100
a = 2
x = runif(n) * 10 - 5                        # n points uniform on (-5, 5)
logit <- function(x) exp(x) / (1 + exp(x))   # despite the name, this is the inverse logit (logistic) function
p = logit(a * x)                             # P(Y = 1 | X = x)
y = rbinom(n, 1, p)                          # simulate the binary responses
df = tibble(x = x, y = y)
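As a quick, purely illustrative sanity check, the simulated labels should be roughly balanced, since $x$ is symmetric about 0:

table(y)  # roughly half 0s and half 1s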

Now that the data are generated, let's take a look at them.

ggplot(df, aes(x, y)) + geom_point(color = "red") +
  stat_function(fun = function(x) logit(a * x)) +  # overlay the true p(x)
  theme_cowplot(14)

The likelihood is given by

$$ L(y \mid a, x) = \prod_{i=1}^n p(x_i)^{y_i}\,\big(1-p(x_i)\big)^{1-y_i} $$

(Simple) gradient ascent (to maximize $L(a)$) works as follows (a toy illustration of the recipe appears just after the list):

  1. Input $a_0$, $\gamma > 0$, $j_{\max}$, $\epsilon > 0$, and $\nabla L(a)$.
  2. For $j = 1, 2, \ldots, j_{\max}$, set $a_j = a_{j-1} + \gamma\, \nabla L(a_{j-1})$.
  3. Stop if $\epsilon > |a_j - a_{j-1}|$.
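Before applying this to the likelihood, here is a minimal sketch of the same recipe on a made-up objective, $f(a) = -(a-3)^2$, whose gradient is $-2(a-3)$; the function `grad_f` and all settings below are purely illustrative.

grad_f <- function(a) -2 * (a - 3)     # gradient of the toy objective f(a) = -(a - 3)^2
a_cur = 0; gam = 0.1; jmax = 100; eps = 1e-6
for (j in 1:jmax) {
  a_new = a_cur + gam * grad_f(a_cur)  # step 2: move uphill along the gradient
  if (abs(a_new - a_cur) < eps) break  # step 3: stop when the steps become tiny
  a_cur = a_new
}
a_cur                                  # ends up very close to 3, the maximizer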

Write a function to find $a_{mle}$

Note that if we work with the log-likelihood $\ell(a) = \log L(y \mid a, x)$, then $\nabla \ell(a) = \sum_{i=1}^n (y_i - p_i)\, x_i$, where $p_i = p(x_i)$ is as above (a short derivation appears below). Write a function that stores the entire chain of iterates $a_j$, not just the final value.
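For reference, the gradient comes from differentiating the log-likelihood, using $p(x_i) = e^{a x_i}/(1 + e^{a x_i})$ from above:

$$ \ell(a) = \sum_{i=1}^n \left[ y_i \log p(x_i) + (1-y_i)\log\big(1-p(x_i)\big) \right] = \sum_{i=1}^n \left[ y_i a x_i - \log\big(1 + e^{a x_i}\big) \right], $$

$$ \nabla \ell(a) = \sum_{i=1}^n \left[ y_i x_i - \frac{e^{a x_i}}{1 + e^{a x_i}}\, x_i \right] = \sum_{i=1}^n (y_i - p_i)\, x_i. $$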

amle <- function(x, y, a0, gam=0.5, jmax=100, eps=1e-6){
  err = 1e8
  j = 1
  a = double(jmax)         # storage for the whole chain of iterates
  a[1] = a0
  while(j < jmax && err>eps){
    j    = ____________    # increment the iteration counter
    p    = ____________    # fitted probabilities at the current estimate a[j-1]
    ell  = ____________    # gradient of the log-likelihood at a[j-1]
    a[j] = ____________    # gradient ascent update
    err = abs(a[j] - a[j-1])
  }
  a[1:j]
}

If done correctly, your code should look something like:

amle <- function(x, y, a0, gam=0.5, jmax=100, eps=1e-6){
  err = 1e8
  j = 1
  a = double(jmax)             # storage for the whole chain of iterates
  a[1] = a0
  while(j < jmax && err>eps){
    j = j+1
    p = logit(a[j-1]*x)        # fitted probabilities at the current estimate
    ell = sum((y-p)*x)         # gradient of the log-likelihood
    a[j] = a[j-1] + gam * ell  # gradient ascent step
    err = abs(a[j] - a[j-1])
  }
  a[1:j]                       # return only the iterates actually computed
}

Run your function and report the results


Let's start by running the function with the default inputs and an initialization of $a_0 = 5$.

amle(x, y, 5)

Next, let's try changing the initialization to $a_0 = 0.1$.

amle(x, y, .1)

Third, let's change the initialization back to $a_0 = 5$ but set `gam = 0.1` (a smaller step size).

amle(x, y, 5, .1)

Finally, let's leave the initialization at $a_0 = 5$ but set `gam = 1` (a larger step size).

amle(x, y, 5, 1)
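To see how the step size affects convergence, one option (just a sketch, reusing `amle()`, `x`, and `y` from above) is to plot the chains of iterates from a few of these runs:

chains <- list(`gam = 0.5` = amle(x, y, 5),
               `gam = 0.1` = amle(x, y, 5, .1),
               `gam = 1`   = amle(x, y, 5, 1))
chain_df <- bind_rows(lapply(chains, function(ch) tibble(j = seq_along(ch), a = ch)),
                      .id = "step size")
ggplot(chain_df, aes(j, a, color = `step size`)) + geom_line() + theme_cowplot(14)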

We can compute the actual value of $a_{mle}$ by using the `glm()` function. What value does this return?

glm(y ~ x - 1, family = binomial)$coef  # no intercept; just to check against amle()
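If you want a numerical self-check before answering the quiz (again just a sketch reusing the objects above), put the final iterate of each run next to the `glm()` estimate:

c(glm  = unname(coef(glm(y ~ x - 1, family = binomial))),
  run1 = tail(amle(x, y, 5), 1),
  run2 = tail(amle(x, y, .1), 1),
  run3 = tail(amle(x, y, 5, .1), 1),
  run4 = tail(amle(x, y, 5, 1), 1))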

quiz(caption="Quiz",
  question("Based on the output from `glm()`, which of the inputs returned the correct estimate of $a_{mle}$?",
    answer("`amle(x, y, 5)`"),
    answer("`amle(x, y, .1)`"),
    answer("`amle(x, y, 5, .1)`",correct=TRUE),
    answer("`amle(x, y, 5, 1)`"),
    allow_retry = TRUE
  )
)

