[Computational Statistics] 7. Optimization

Computational Statistics

Table of Contents

  1. Computer Arithmetic
  2. LU decomposition
  3. Cholesky Decomposition
  4. QR Decomposition
  5. Iterative Methods
  6. Non-linear Equations
  7. Optimization
  8. Numerical Integration
  9. Random Number Generation
  10. Monte Carlo Integration


sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] fansi_0.5.0       digest_0.6.27     utf8_1.2.1        crayon_1.4.1     
 [5] IRdisplay_1.0     repr_1.1.3        lifecycle_1.0.0   jsonlite_1.7.2   
 [9] evaluate_0.14     pillar_1.6.1      rlang_0.4.11      uuid_0.1-4       
[13] vctrs_0.3.8       ellipsis_0.3.2    IRkernel_1.1      tools_3.6.3      
[17] compiler_3.6.3    base64enc_0.1-3   pbdZMQ_0.3-5      htmltools_0.5.1.1

Optimization

Unconstrained optimization

goldensection <- function(f, a, b, tol=1e-6) {
    alpha <- (3 - sqrt(5)) * 0.5  # = 0.381966..., so each bracket keeps a fraction 0.618...
    iter <- 0
    while ( b - a > tol ) {
        x0 <- a + alpha * (b - a)  # left interior point
        x1 <- b - alpha * (b - a)  # right interior point
        print(c(a, x1, b))
        if ( f(x0) > f(x1) ) {          # minimum cannot lie in [a, x0)
            a <- x0
        } else if ( f(x0) < f(x1) ) {   # minimum cannot lie in (x1, b]
            b <- x1
        } else {                        # tie: shrink from the worse endpoint
            if ( f(b) < f(a) ) {
                a <- x0
            } else {
                b <- x1
            }
        }
        iter <- iter + 1
    }
    invisible(list(val = (a + b) / 2, iter = iter))  # assign the call to capture the result
}
# negated log-likelihood of a binomial sample with 7 successes and 3 failures;
# the minimizer is the MLE, 7/10 = 0.7

F <- function(x) { -7 * log(x) - 3 * log(1 - x) }
goldensection(F, 0.01, 0.99, tol=1e-8)
[1] 0.0100000 0.6156733 0.9900000
[1] 0.3843267 0.7586534 0.9900000
[1] 0.6156733 0.8470199 0.9900000
[1] 0.6156733 0.7586534 0.8470199
[1] 0.6156733 0.7040399 0.7586534
[1] 0.6702868 0.7249004 0.7586534
[1] 0.6702868 0.7040399 0.7249004
[1] 0.6911473 0.7120079 0.7249004
[1] 0.6911473 0.7040399 0.7120079
[1] 0.6911473 0.6991154 0.7040399
[1] 0.6960718 0.7009963 0.7040399
[1] 0.6960718 0.6991154 0.7009963
[1] 0.6979528 0.6998338 0.7009963
[1] 0.6991154 0.7002779 0.7009963
[1] 0.6991154 0.6998338 0.7002779
[1] 0.6995594 0.7000034 0.7002779
[1] 0.6998338 0.7001083 0.7002779
[1] 0.6998338 0.7000034 0.7001083
[1] 0.6999387 0.7000435 0.7001083
[1] 0.6999387 0.7000034 0.7000435
[1] 0.6999787 0.7000187 0.7000435
[1] 0.6999787 0.7000034 0.7000187
[1] 0.6999940 0.7000093 0.7000187
[1] 0.6999940 0.7000034 0.7000093
[1] 0.6999940 0.6999998 0.7000034
[1] 0.6999976 0.7000012 0.7000034
[1] 0.6999976 0.6999998 0.7000012
[1] 0.6999990 0.7000004 0.7000012
[1] 0.6999990 0.6999998 0.7000004
[1] 0.6999995 0.7000000 0.7000004
[1] 0.6999998 0.7000002 0.7000004
[1] 0.6999998 0.7000000 0.7000002
[1] 0.7000000 0.7000001 0.7000002
[1] 0.7000000 0.7000000 0.7000001
[1] 0.7 0.7 0.7
[1] 0.7 0.7 0.7
[1] 0.7 0.7 0.7
[1] 0.7 0.7 0.7
[1] 0.7 0.7 0.7

Convergence of golden section search is slow but sure: it is guaranteed to find the minimum provided the assumptions are met, chiefly that $f$ is unimodal on the initial bracket $[a, b]$.
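As a quick check of the rate: each iteration retains a fraction $1 - \alpha = (\sqrt{5} - 1)/2 \approx 0.618$ of the bracket, so

\[
(b - a)\rho^k \le \epsilon, \quad \rho \approx 0.618 \quad \Longrightarrow \quad k \ge \frac{\log(\epsilon/(b-a))}{\log \rho} = \frac{\log(10^{-8}/0.98)}{\log 0.618} \approx 38.3,
\]

in agreement with the 39 brackets printed above.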

Newton’s Method

We call this the naive (pure) Newton's method: iterate $x \leftarrow x - f'(x)/f''(x)$ with no safeguards, i.e., Newton's root-finding method applied to $f'$.

purenewton <- function(f, df, d2f, x0, maxiter=10, tol=1e-6) {
    # note: f itself is never evaluated; the pure iteration uses only df and d2f
    xold <- x0
    stop <- FALSE
    iter <- 1
    x <- x0
    while ((!stop) && (iter < maxiter)) {
        x <- x - df(x) / d2f(x)   # Newton update: a zero of df is a stationary point
        print(x)
        xdiff <- x - xold
        if (abs(xdiff) < tol) stop <- TRUE
        xold <- x
        iter <- iter + 1
    }
    return(list(val=x, iter=iter))
}
f <- function(x) sin(x)   # objective function
df <- function(x) cos(x)  # gradient
d2f <- function(x) -sin(x) # hessian
purenewton(f, df, d2f, 2.0)
[1] 1.542342
[1] 1.570804
[1] 1.570796
[1] 1.570796
$val
1.5707963267949
$iter
5

purenewton(f, df, d2f, 2.75)
[1] 0.328211
[1] 3.264834
[1] 11.33789
[1] 10.98155
[1] 10.99558
[1] 10.99557
$val
10.9955742875643
$iter
7

purenewton(f, df, d2f, 4.0)
[1] 4.863691
[1] 4.711224
[1] 4.712389
[1] 4.712389
$val
4.71238898038469
$iter
5

Note that pure Newton merely finds zeros of the gradient: started from 2.0 it converged to $\pi/2 \approx 1.5708$, a local maximum of $\sin x$, and from 2.75 it jumped to the remote minimum $7\pi/2 \approx 10.9956$ instead of a nearby one. Only the run from 4.0 found the closest minimum, $3\pi/2 \approx 4.7124$.

Practical Newton

In practice, Newton's method is safeguarded by a line search such as backtracking: start from the full step $t = 1$ and shrink $t$ by a constant factor until $f(x + t\Delta x) \le f(x) + \alpha t \nabla f(x)^T \Delta x$ for a fixed $\alpha \in (0, 1/2)$. In the standard illustration of this condition (cf. Boyd & Vandenberghe, Convex Optimization), the lower dashed line shows the linear extrapolation of $f$, and the upper dashed line has a slope a factor of $\alpha$ smaller; the backtracking condition is that $f$ lies below the upper dashed line, i.e., $0 \le t \le t_0$.
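A minimal sketch of this safeguard in R, reusing the f, df, d2f interface from purenewton above; the function name, the simple decrease test, and the halving limit are illustrative choices, not the notes' own implementation:

# Newton's method safeguarded by backtracking (step halving):
# try the full Newton step and halve it until f decreases.
newton_backtrack <- function(f, df, d2f, x0, maxiter=100, maxhalf=20, tol=1e-6) {
    x <- x0
    for (iter in 1:maxiter) {
        d <- -df(x) / d2f(x)            # Newton direction (1-dimensional)
        t <- 1.0                        # start from the full step
        xnew <- x + t * d
        for (h in 1:maxhalf) {
            if (f(xnew) < f(x)) break   # simple decrease test (the Armijo rule
            t <- t * 0.5                # would demand a fraction of the predicted drop)
            xnew <- x + t * d
        }
        if (abs(xnew - x) < tol) return(list(val=xnew, iter=iter))
        x <- xnew
    }
    list(val=x, iter=maxiter)
}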

Fisher scoring

Fisher scoring replaces the observed Hessian $-\nabla^2 \ell(\beta)$ in Newton's method with its expectation, the Fisher information $\mathbf{I}(\beta) = \mathbf{E}[-\nabla^2 \ell(\beta)]$, which is always positive semidefinite: \(\beta^{(t+1)} = \beta^{(t)} + \mathbf{I}(\beta^{(t)})^{-1} \nabla \ell(\beta^{(t)}).\)

Example: logistic regression
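The notes give no code for this example; below is a minimal Fisher scoring (IRLS) sketch for logistic regression in the same style as the Poisson example that follows. The function name and the simulated data in the comments are illustrative, not from the original.

# Logistic regression via Fisher scoring (IRLS)
# Model: logit(p) = beta0 + beta1 * x
logisticreg <- function(x, y, maxiter=25, tol=1e-6) {
    beta <- c(0, 0)
    for (iter in 1:maxiter) {
        eta <- beta[1] + beta[2] * x
        p <- 1 / (1 + exp(-eta))     # mean: inverse logit
        w <- p * (1 - p)             # working weights = variance function
        z <- eta + (y - p) / w       # working response
        m <- lm(z ~ x, weights=w)    # weighted least squares
        betanew <- as.numeric(coef(m))
        if (sum(abs(betanew - beta)) < tol) break
        beta <- betanew
    }
    list(val=betanew, iter=iter)
}
# sanity check against glm():
# set.seed(1); x <- rnorm(100); y <- rbinom(100, 1, plogis(0.5 + x))
# logisticreg(x, y); coef(glm(y ~ x, family=binomial()))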

Example: Poisson regression

# Quarterly count of AIDS deaths in Australia (from Dobson, 1990)
deaths <- c(0, 1, 2, 3, 1, 4, 9, 18, 23, 31, 20, 25, 37, 45)
(quarters <- seq_along(deaths))

 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14

# Poisson regression using Fisher scoring (IRLS) and step halving
# Model: lambda = exp(beta0 + beta1 * quarter), or deaths ~ quarter
poissonreg <- function(x, y, maxiter=10, tol=1e-6) {
    beta0 <- matrix(0, nrow=2, ncol=1)   # initial point
    betaold <- beta0
    stop <- FALSE
    iter <- 1
    inneriter <- rep(0, maxiter)  # to count no. step halving
    beta <- beta0
    lik <- function(bet) {eta <- bet[1] + bet[2] * x; sum( y * eta - exp(eta) ) } # log-likelihood
    likold <- lik(betaold)
    while ((!stop) && (iter < maxiter)) {
        eta <- beta[1] + x * beta[2]
        w <- exp(eta)  # lambda
        # line search by step halving
        s <- 1.0
        for (i in 1:5) {
            z <- eta + s * (y / w - 1) # working response
            m <- lm(z ~ x, weights=w)  # weighted least squares
            beta <- as.matrix(coef(m))
            curlik <- lik(beta)
            if (curlik > likold) break
            s <- s * 0.5
            inneriter[iter] <- inneriter[iter] + 1
        }
        print(c(as.numeric(beta), inneriter[iter], curlik))
        betadiff <- beta - betaold
        if (norm(betadiff, "F") < tol) stop <- TRUE 
        likold <- curlik
        betaold <- beta
        iter <- iter + 1
    }
    return(list(val=as.numeric(beta), iter=iter, inneriter=inneriter[1:iter]))
}
poissonreg(quarters, deaths)
[1]  -1.3076923   0.4184066   3.0000000 443.4763188
[1]   0.6456032   0.2401380   0.0000000 469.9257845
[1]   0.3743785   0.2541525   0.0000000 472.0483495
[1]   0.3400344   0.2564929   0.0000000 472.0625465
[1]   0.3396340   0.2565236   0.0000000 472.0625479
[1]   0.3396339   0.2565236   0.0000000 472.0625479
$val
[1] 0.3396339 0.2565236

$iter
[1] 7

$inneriter
[1] 3 0 0 0 0 0 0
m <- glm(deaths ~ quarters, family = poisson())
coef(m)

(Intercept)    quarters 
  0.3396339   0.2565236 

Homework: repeat this using the Armijo rule.
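One common statement of the Armijo rule: accept step size $s$ along ascent direction $\mathbf{d}$ as soon as \(\ell(\beta + s\mathbf{d}) \ge \ell(\beta) + c\, s\, \nabla \ell(\beta)^T \mathbf{d}\) for a fixed $c \in (0, 1)$ (often $c = 10^{-4}$), halving $s$ otherwise; this replaces the bare improvement test curlik > likold in the step-halving loop above.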

Generalized Linear Models (GLM)

That IRLS == Fisher scoring == Newton's method for both logistic and Poisson regression is not a coincidence: both models use the canonical link, for which the observed and expected information coincide. Let's consider the more general class of generalized linear models (GLM).

Exponential families

| Family | Canonical Link | Variance Function |
|---|---|---|
| Normal (unit variance) | $\eta = \mu$ | $1$ |
| Poisson | $\eta = \log \mu$ | $\mu$ |
| Binomial | $\eta = \log\left(\frac{\mu}{1 - \mu}\right)$ | $\mu(1 - \mu)$ |
| Gamma | $\eta = \mu^{-1}$ | $\mu^2$ |
| Inverse Gaussian | $\eta = \mu^{-2}$ | $\mu^3$ |
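In R, these canonical links are bundled by stats::make.link, which returns the link function, its inverse, and $d\mu/d\eta$; a small illustration (not part of the original notes):

# canonical links from the table above
l <- make.link("logit")        # Binomial
l$linkfun(0.7)                 # eta = log(0.7/0.3), about 0.8473
l$linkinv(0.8473)              # back to mu, about 0.7
l$mu.eta(0.8473)               # d mu / d eta = mu(1 - mu), about 0.21
make.link("log")$linkfun(2)    # Poisson canonical link: log(2)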

Generalized linear models

GLM models the conditional distribution of $Y$ given predictors $\mathbf{x}$ through the conditional mean $\mu = \mathbf{E}(Y|\mathbf{x})$ via a strictly increasing link function \(g(\mu) = \mathbf{x}^T \beta, \quad \mu = g^{-1}(\mathbf{x}^T\beta) = b'(\eta).\)

From these relations we have (assuming no overdispersion, i.e., $a(\phi) \equiv 1$) \(\mathbf{x}^T d\beta = g'(\mu)\,d\mu, \quad d\mu = b''(\eta)\,d\eta, \quad b''(\eta) = \mathbf{Var}[Y] = \sigma^2, \quad d\eta = \frac{1}{b''(\eta)}\,d\mu = \frac{1}{b''(\eta)g'(\mu)}\mathbf{x}^T d\beta.\)

Then, after some work with matrix calculus, we have for $n$ samples:

\begin{eqnarray}
\nabla\ell(\beta) &=& \sum_{i=1}^n \frac{(y_i-\mu_i) [1/g'(\mu_i)]}{\sigma^2} \mathbf{x}_i, \quad g(\mu_i) = \mathbf{x}_i^T\beta, \\
-\nabla^2 \ell(\beta) &=& \sum_{i=1}^n \frac{[1/g'(\mu_i)]^2}{\sigma^2} \mathbf{x}_i \mathbf{x}_i^T - \sum_{i=1}^n \frac{(y_i - \mu_i)\left[ b'''(\eta_i)/[b''(\eta_i)g'(\mu_i)]^2 + g''(\mu_i)/[g'(\mu_i)]^3 \right]}{\sigma^2} \mathbf{x}_i \mathbf{x}_i^T, \quad \eta_i = [b']^{-1}(\mu_i), \\
\mathbf{I}(\beta) &=& \mathbf{E} [- \nabla^2 \ell(\beta)] = \sum_{i=1}^n \frac{[1/g'(\mu_i)]^2}{\sigma^2} \mathbf{x}_i \mathbf{x}_i^T = \mathbf{X}^T \mathbf{W} \mathbf{X}.
\end{eqnarray}
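In particular, writing $\mathbf{W} = \mathrm{diag}([1/g'(\mu_i)]^2/\sigma^2)$ and $r_i = (y_i - \mu_i)\,g'(\mu_i)$ so that $\nabla \ell(\beta) = \mathbf{X}^T \mathbf{W} \mathbf{r}$, the Fisher scoring update becomes a weighted least squares solve with a working response $\mathbf{z}$, which is exactly the IRLS iteration in the Poisson code above:

\[
\beta^{(t+1)} = \beta^{(t)} + (\mathbf{X}^T \mathbf{W} \mathbf{X})^{-1} \mathbf{X}^T \mathbf{W} \mathbf{r} = (\mathbf{X}^T \mathbf{W} \mathbf{X})^{-1} \mathbf{X}^T \mathbf{W} \mathbf{z}, \qquad z_i = \mathbf{x}_i^T \beta^{(t)} + (y_i - \mu_i)\,g'(\mu_i).
\]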

Nonlinear regression - Gauss-Newton method

Quasi-Newton methods

Gradient descent method

Convergence

Examples

# Poisson regression using gradient descent
# Model: lambda = exp(beta0 + beta1 * quarter), or deaths ~ quarter
poissonreg_grad <- function(x, y, maxiter=10, tol=1e-6) {
    beta0 <- matrix(0, nrow=2, ncol=1)   # initial point
    betaold <- beta0
    iter <- 1
    inneriter <- rep(0, maxiter)  # to count no. step halvings
    betamat <- matrix(NA, nrow=maxiter, ncol=length(beta0) + 2)
    colnames(betamat) <- c("beta0", "beta1", "inneriter", "lik")
    beta <- beta0
    lik <- function(bet) { eta <- bet[1] + bet[2] * x; sum( y * eta - exp(eta) ) } # log-likelihood
    likold <- lik(betaold)
    while (iter < maxiter) {
        eta <- beta[1] + x * beta[2]
        lam <- exp(eta)  # lambda
        grad <- as.matrix(c(sum(lam - y), sum((lam - y) * x)))  # gradient of the negative log-likelihood
        # line search by step halving, always restarting from the current iterate
        s <- 0.00005
        for (i in 1:5) {
            betanew <- beta - s * grad
            curlik <- lik(betanew)
            if (curlik > likold) break
            s <- s * 0.5
            inneriter[iter] <- inneriter[iter] + 1
        }
        beta <- betanew
        betamat[iter,] <- c(as.numeric(beta), inneriter[iter], curlik)
        betadiff <- beta - betaold
        if (norm(betadiff, "F") < tol) break
        likold <- curlik
        betaold <- beta
        iter <- iter + 1
    }
    return(betamat[1:iter,])
}

# Australian AIDS data from the IRLS example above
pm <- poissonreg_grad(quarters, deaths, tol=1e-8, maxiter=20000)
head(pm, 10)
tail(pm, 10)
    
A matrix: 10 × 4 of type dbl

           beta0     beta1 inneriter      lik
 [1,] 0.01025000 0.1149500         0 241.3754
 [2,] 0.01933954 0.2178634         0 423.2101
 [3,] 0.02505110 0.2826001         0 471.3151
 [4,] 0.02532401 0.2830995         0 471.3176
 [5,] 0.02553404 0.2828481         0 471.3190
 [6,] 0.02577206 0.2829335         0 471.3201
 [7,] 0.02599725 0.2828674         0 471.3212
 [8,] 0.02622798 0.2828694         0 471.3222
 [9,] 0.02645600 0.2828408         0 471.3233
[10,] 0.02668501 0.2828259         0 471.3243
A matrix: 10 × 4 of type dbl

             beta0     beta1 inneriter      lik
[12951,] 0.3396212 0.2565247         0 472.0625
[12952,] 0.3396212 0.2565247         0 472.0625
[12953,] 0.3396212 0.2565247         0 472.0625
[12954,] 0.3396212 0.2565247         0 472.0625
[12955,] 0.3396212 0.2565247         0 472.0625
[12956,] 0.3396212 0.2565247         0 472.0625
[12957,] 0.3396212 0.2565247         0 472.0625
[12958,] 0.3396213 0.2565247         0 472.0625
[12959,] 0.3396213 0.2565247         0 472.0625
[12960,] 0.3396213 0.2565247         0 472.0625

Gradient descent thus needed 12,960 iterations to reach the optimum that Fisher scoring attained in 7: cheap iterations, but far more of them.

Recall the glm() fit:

m <- glm(deaths ~ quarters, family = poisson())
coef(m)

(Intercept)    quarters 
  0.3396339   0.2565236 

Coordinate descent
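A minimal sketch of cyclic coordinate descent on the running Poisson example, minimizing the negative log-likelihood over one coordinate at a time with stats::optimize; the function name and the search interval are illustrative assumptions:

# Cyclic coordinate descent for the Poisson regression above:
# each sweep exactly minimizes the negative log-likelihood in one
# coordinate while holding the other fixed.
poissonreg_cd <- function(x, y, maxsweep=1000, tol=1e-8) {
    nll <- function(b0, b1) { eta <- b0 + b1 * x; sum(exp(eta) - y * eta) }
    beta <- c(0, 0)
    for (sweep in 1:maxsweep) {
        betaold <- beta
        beta[1] <- optimize(function(b) nll(b, beta[2]), c(-10, 10), tol=1e-10)$minimum
        beta[2] <- optimize(function(b) nll(beta[1], b), c(-10, 10), tol=1e-10)$minimum
        if (sum(abs(beta - betaold)) < tol) break
    }
    list(val=beta, sweeps=sweep)
}
# poissonreg_cd(quarters, deaths)  # should approach glm()'s (0.3396, 0.2565)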