STA 243 Computational Statistics

Discussion 7 (Python): Gradient Descent Methods for a Quadratic Minimization Problem

TA: Tesi Xiao

Let us start with a quadratic minimization problem in $\mathbb{R}^2$.

\[\min_{x\in \mathbb{R}^2} ~ \frac{1}{2} (x-x^\star)^\top A (x-x^\star)\]

Denote the objective by $f(x)$ and assume $x^\star=0$ for simplicity. The gradient and Hessian of $f$ are then given by

\[\nabla f(x) = A(x - x^\star) = Ax, \qquad \nabla^2 f(x) = A.\]

In addition, assume $A$ is positive definite. Then one can easily show that $f$ is $\lambda_{\min}(A)$-strongly convex and $\lambda_{\max}(A)$-smooth.

The condition number of $A$ is $\kappa(A) = \lambda_{\max}(A)/\lambda_{\min}(A)$; it governs the convergence rates below.
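
As a quick check, the smoothness and strong-convexity constants can be read directly off the spectrum of $A$. Below is a minimal sketch (the random matrix and seed are only illustrative):

import numpy as np

rng = np.random.default_rng(0)              # illustrative seed
B = rng.random((2, 2))
A = B.T @ B                                 # A = B^T B is symmetric positive (semi)definite
lam = np.linalg.eigvalsh(A)                 # eigenvalues of the symmetric matrix A, ascending
lam_min, lam_max = lam[0], lam[-1]          # strong convexity and smoothness constants of f
print(lam_min, lam_max, lam_max / lam_min)  # the last value is the condition number kappa(A)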

The GD, Heavy Ball, Nesterov’s accelerated GD, and SGD methods are implemented below. Note that the SGD update for this problem uses noisy gradient information, i.e., $g(x_t) = \nabla f(x_t) + \epsilon$ with $\epsilon\sim N(0, \sigma^2 I)$.

  • Gradient Descent Method:

    \[x_{t+1} = x_t - \eta \nabla f(x_t)\]

    Convergence: $\eta = \frac{2}{\lambda_{\max}(A) + \lambda_{\min}(A)},\quad \Vert x_{t+1} - x^\star \Vert_2 \leq \frac{\kappa-1}{\kappa+1} \Vert x_t - x^\star \Vert_2$ (see the short derivation after this list)

  • Heavy Ball Method:

    \[x_{t+1} = x_t - \eta \nabla f(x_t) + \gamma (x_t - x_{t-1})\]

    Convergence: $\eta = \frac{4}{(\sqrt{\lambda_{\max}(A)} + \sqrt{\lambda_{\min}(A)})^2}, \gamma = \left(\frac{\sqrt{\kappa}-1}{\sqrt{\kappa}+1}\right)^2; \quad$ asymptotically, $\Vert x_{t+1} - x^\star \Vert_2 \lesssim \frac{\sqrt{\kappa}-1}{\sqrt{\kappa}+1} \Vert x_t - x^\star \Vert_2$

  • Nesterov’s Accelerated GD Method:

    \[x_{t+1} = y_t - \eta \nabla f(y_t), \quad y_{t+1} = x_{t+1} + \frac{t}{t + 3} (x_{t+1} - x_t)\]

    Convergence: in the convex (not necessarily strongly convex) setting, with $\eta = \frac{1}{L}$, where $L = \lambda_{\max}(A)$ is the smoothness constant, $f(x_t) - f^\star \leq \frac{c}{t^2}$ for some constant $c$.
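
For the quadratic objective, the GD rate above follows from a one-line calculation: since $\nabla f(x_t) = A(x_t - x^\star)$,

\[x_{t+1} - x^\star = (I - \eta A)(x_t - x^\star), \qquad \Vert x_{t+1} - x^\star \Vert_2 \leq \Vert I - \eta A \Vert_2 \, \Vert x_t - x^\star \Vert_2,\]

and with $\eta = \frac{2}{\lambda_{\max}(A) + \lambda_{\min}(A)}$ one gets $\Vert I - \eta A \Vert_2 = \max\{|1 - \eta\lambda_{\min}(A)|, |1 - \eta\lambda_{\max}(A)|\} = \frac{\kappa-1}{\kappa+1}$.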

import numpy as np
import matplotlib.pyplot as plt

# Quadratic objective f(x) = (x - x*)^T A (x - x*) / 2 with gradient A (x - x*)
class quadratic_function(object):

    def __init__(self, d=2, A=None, xstar=None):
        self.d = d
        if A is None:
            A = np.random.rand(d,d)
            A = A.T @ A
        if xstar is None:
            xstar = np.zeros(d)
        self.A = A
        self.xstar = xstar

    def fun_value(self, x):
        return (x-self.xstar).T @ self.A @ (x-self.xstar)/2

    def grad_value(self, x):
        return self.A @ (x - self.xstar)        
# Base class for the iterative methods: stores the objective, the current iterate,
# and the list of all iterates visited so far (the trajectory).
class ALG(object):

    def __init__(self, fun, x0):
        self.fun = fun
        self.x = x0
        self.trajectory = [x0.copy()]
        self.t = 0

    def update(self):
        pass

# Gradient descent: x_{t+1} = x_t - eta * grad f(x_t)
class GD(ALG):

    def __init__(self, fun, x0):
        ALG.__init__(self, fun, x0)

    def update(self, eta):
        self.x -= eta*self.fun.grad_value(self.x)
        self.trajectory.append(self.x.copy())
        self.t += 1


# Heavy ball: x_{t+1} = x_t - eta * grad f(x_t) + gamma * (x_t - x_{t-1})
class HeavyBall(ALG):

    def __init__(self, fun, x0):
        ALG.__init__(self, fun, x0)

    def update(self, eta, gamma):
        if self.t == 0:
            self.x -= eta*self.fun.grad_value(self.x)
        else:
            # self.trajectory[-2] is the previous iterate x_{t-1}
            self.x -= eta*self.fun.grad_value(self.x) - gamma*(self.x - self.trajectory[-2])
        self.trajectory.append(self.x.copy())
        self.t += 1

# Nesterov's accelerated GD: gradient step from y_t, then momentum extrapolation
class NestrovAcc(ALG):

    def __init__(self, fun, x0):
        ALG.__init__(self, fun, x0)
        self.y = x0.copy()

    def update(self, eta):
        self.x = self.y - eta*self.fun.grad_value(self.y)
        self.y = self.x + self.t/(self.t + 3) * (self.x - self.trajectory[-1])
        self.trajectory.append(self.x.copy())
        self.t += 1

# SGD: gradient step with additive Gaussian noise N(0, sigma^2 I) on the gradient
class SGD(ALG):

    def __init__(self, fun, x0):
        ALG.__init__(self, fun, x0)

    def update(self, eta, sigma=1):
        self.x -= eta* (self.fun.grad_value(self.x) + np.random.normal(scale=sigma, size=self.fun.d))
        self.trajectory.append(self.x.copy())
        self.t += 1
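
Each algorithm appends a copy of every iterate to self.trajectory; this list is used below for plotting the iterates on contour plots and for computing the distances $\Vert x_t - x^\star \Vert_2$.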

We start by applying GD to the problem with $A=I$, so $f(x_1, x_2) = (x_1^2+x_2^2)/2$ and $\kappa = 1$. Setting $x_0 = [1,1]^\top$ and $\eta = 1$ (the prescribed step size $\frac{2}{\lambda_{\max}+\lambda_{\min}}$), GD finds $x^\star$ in a single step.

f = quadratic_function(d=2, A=np.array(([1,0],[0,1])))
alg = GD(f, x0=np.array([1.,1.]))
eta = 1  # = 2/(lambda_max + lambda_min) since A = I

for t in range(5):
    alg.update(eta)

print(alg.trajectory)

x1 = np.arange(-1, 1, 0.1)
x2 = np.arange(-1, 1, 0.1)
X1, X2 = np.meshgrid(x1, x2)
Y = 0.5*X1**2 + 0.5*X2**2
fig, ax = plt.subplots()
CS = ax.contour(X1, X2, Y, levels=4)
ax.clabel(CS, inline=True, fontsize=10)
ax.set_title('Gradient Descent Method for kappa=1')

for t, x in enumerate(alg.trajectory):
    ax.scatter(x[0], x[1], s=10, c='black', marker='o')
    if t < len(alg.trajectory)-1:
        x_next = alg.trajectory[t+1]
        ax.arrow(x[0], x[1], x_next[0]-x[0], x_next[1]-x[1], shape='right', color='r', width=0.01, length_includes_head=True)

[array([1., 1.]), array([0., 0.]), array([0., 0.]), array([0., 0.]), array([0., 0.]), array([0., 0.])]

[Figure: Gradient Descent Method for kappa=1 (iterates on the contours of f)]

Now consider the case $A = \text{diag}(9,1)$, i.e., $f(x_1, x_2) = (9x_1^2+x_2^2)/2$ and $\kappa = 9$. Setting $x_0 = [1,1]^\top$ and $\eta = 1/5 = \frac{2}{\lambda_{\max}+\lambda_{\min}}$, the trajectory of GD iterates zigzags toward $x^\star$. In addition, we apply the heavy ball method (with $\eta = 1/4$ and $\gamma = 1/4$ as prescribed above) to the same problem; the added momentum visibly speeds up convergence.

f = quadratic_function(d=2, A=np.array(([9,0],[0,1])))
alg1 = GD(f, x0=np.array([1.,1.]))
alg2 = HeavyBall(f, x0=np.array([1.,1.]))
eta1 = 1/5    # GD: 2/(lambda_max + lambda_min) = 2/10
eta2 = 1/4    # heavy ball: 4/(sqrt(lambda_max) + sqrt(lambda_min))^2 = 4/16
gamma2 = 1/4  # heavy ball momentum: ((sqrt(kappa)-1)/(sqrt(kappa)+1))^2

for t in range(30):
    alg1.update(eta1)
    alg2.update(eta2, gamma2)


err_GD = np.linalg.norm(alg1.trajectory, 2, 1)
err_HB = np.linalg.norm(alg2.trajectory, 2, 1)

x1 = np.arange(-1, 1, 0.1)
x2 = np.arange(-1, 1, 0.1)
X1, X2 = np.meshgrid(x1, x2)
Y = 4.5*X1**2 + 0.5*X2**2
fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(16, 5))
CS = ax1.contour(X1, X2, Y, levels=4)
ax1.clabel(CS, inline=True, fontsize=10)
ax1.set_title('Gradient Descent Method for kappa=9')

for t, x in enumerate(alg1.trajectory):
    ax1.scatter(x[0], x[1], s=10, c='black', marker='o')
    if t < len(alg1.trajectory)-1:
        x_next = alg1.trajectory[t+1]
        ax1.arrow(x[0], x[1], x_next[0]-x[0], x_next[1]-x[1], shape='right', color='r', width=0.01, length_includes_head=True)

CS = ax2.contour(X1, X2, Y, levels=4)
ax2.clabel(CS, inline=True, fontsize=10)
ax2.set_title('Heavy Ball Method for kappa=9')

for t, x in enumerate(alg2.trajectory):
    ax2.scatter(x[0], x[1], s=10, c='black', marker='o')
    if t < len(alg2.trajectory)-1:
        x_next = alg2.trajectory[t+1]
        ax2.arrow(x[0], x[1], x_next[0]-x[0], x_next[1]-x[1], shape='right', color='r', width=0.01, length_includes_head=True)

t = np.arange(31)
ax3.plot(t, err_GD, label='Gradient Descent')
ax3.plot(t, err_HB, label='Heavy Ball Method')
ax3.legend()
ax3.set_title('Convergence to 0')
ax3.set_yscale('log')

[Figure: Gradient Descent (kappa=9) and Heavy Ball trajectories, with their errors on a log scale]
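
Next, we apply Nesterov’s accelerated GD with step size $\eta = 1/L = 1/9$ to the same $\kappa = 9$ problem and compare its convergence with that of plain GD.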

f = quadratic_function(d=2, A=np.array(([9,0],[0,1])))
alg = NestrovAcc(f, x0=np.array([1.,1.]))
eta = 1/9  # 1/L with L = lambda_max(A) = 9

for t in range(30):
    alg.update(eta)

err_NestrovAcc = np.linalg.norm(alg.trajectory, 2, 1)

x1 = np.arange(-1, 1, 0.1)
x2 = np.arange(-1, 1, 0.1)
X1, X2 = np.meshgrid(x1, x2)
Y = 4.5*X1**2 + 0.5*X2**2
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,5))
CS = ax1.contour(X1, X2, Y, levels=4)
ax1.clabel(CS, inline=True, fontsize=10)
ax1.set_title("Nestrov's Accelerated GD with kappa=9")

for t, x in enumerate(alg.trajectory):
    ax1.scatter(x[0], x[1], s=10, c='black', marker='o')
    if t < len(alg.trajectory)-1:
        x_next = alg.trajectory[t+1]
        ax1.arrow(x[0], x[1], x_next[0]-x[0], x_next[1]-x[1], shape='right', color='r', width=0.01, length_includes_head=True)


t = np.arange(31)
ax2.plot(t, err_GD, label='Gradient Descent')
ax2.plot(t, err_NestrovAcc, label="Nesterov's Accelerated GD")
ax2.legend()
ax2.set_title('Convergence to 0')
ax2.set_yscale('log')

[Figure: Nesterov's Accelerated GD trajectory (kappa=9) and convergence comparison with GD]
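
Finally, we apply SGD to the $\kappa = 1$ problem with noise level $\sigma = 0.1$. Because the gradients are noisy, the iterates no longer reach $x^\star$ exactly but keep fluctuating around it.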

f = quadratic_function(d=2, A=np.array(([1,0],[0,1])))
alg = SGD(f, x0=np.array([1.,1.]))
eta = 1

for t in range(5):
    alg.update(eta, sigma=0.1)

print(alg.trajectory)

x1 = np.arange(-1, 1, 0.1)
x2 = np.arange(-1, 1, 0.1)
X1, X2 = np.meshgrid(x1, x2)
Y = 0.5*X1**2 + 0.5*X2**2
fig, ax = plt.subplots()
CS = ax.contour(X1, X2, Y, levels=4)
ax.clabel(CS, inline=True, fontsize=10)
ax.set_title('Noisy Gradient Descent Method for kappa=1')
ax.scatter(0,0,s=20, c='red', marker='*')
for t, x in enumerate(alg.trajectory):
    ax.scatter(x[0], x[1], s=10, c='black', marker='o')
    if t < len(alg.trajectory)-1:
        x_next = alg.trajectory[t+1]
        ax.arrow(x[0], x[1], x_next[0]-x[0], x_next[1]-x[1], shape='right', color='r', width=0.01, length_includes_head=True)

[array([1., 1.]), array([0.05638466, 0.05660117]), array([-0.0959119 , -0.07248226]), array([-0.00137746, -0.11649844]), array([0.00086801, 0.21639027]), array([ 0.12597407, -0.02129214])]

[Figure: Noisy Gradient Descent Method for kappa=1; the red star marks x*]
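
With $A = I$ and $\eta = 1$, the deterministic part of the update cancels the current iterate exactly, so $x_{t+1} = -\epsilon_t$; the iterates therefore hover around $x^\star$ at a scale set by $\eta\sigma = 0.1$ rather than converging.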