How well does optimization algorithms fight again local optima?

In [1]:
def show_video(fname, mimetype):
    """Load the video in the file `fname`, with given mimetype, and display as HTML5 video.
    """
    from IPython.display import HTML
    video_encoded = open(fname, "rb").read().encode("base64")
    video_tag = '<video controls alt="test" src="data:video/{0};base64,{1}">'.format(mimetype, video_encoded)
    return HTML(data=video_tag)

Libraries

In [89]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D
from pylab import cm
from mpl_toolkits.mplot3d.axes3d import Axes3D

# Theano function
import theano
import theano.tensor as T
import copy
import random
In [15]:
theano.config.floatX = "float32"

Scene functions

In [169]:
alpha = 0.7
phi_ext = 2 * 3.14 * 0.5


_x = T.fmatrix()
_y = T.fmatrix()

# Surface function
def plot_surface(x, y):
    return 2 + alpha - 2 * T.cos(x)*T.cos(y) - 3 + ((T.abs_(x - 5*3.14))*(T.abs_(y - 5*3.14)))*0.1 - alpha * T.cos(phi_ext - 2*x)
surface_func = theano.function([_x, _y], [plot_surface(_x, _y)])

# Surface grid
phi_m = np.linspace(0, 10*3.14, 100)
phi_p = np.linspace(0, 10*3.14, 100)
X,Y = np.meshgrid(phi_p, phi_m)
X, Y = map(lambda d: d.astype("float32"), [X,Y])
Z = surface_func(X, Y)[0]

position = theano.shared(np.array([0.,0.], dtype="float32"))
cost = plot_surface(position[0], position[1])
cost_func = theano.function([], [cost]) 

def arr(v):
    return np.array([[v]], dtype="float32")

def print_scene():
    global ax, plt, fig
    global x, y, X, Y, Z
    fig.clear()
    ax = fig.add_subplot(1,1,1, projection='3d')
    plt.axis('off')
    plt.xlim(2*3.14, 8*3.14)
    plt.ylim(2*3.14, 8*3.14)
    ax.plot_surface(X, Y, -Z, linewidth=0, rstride=3, cstride=3, alpha=0.8, cmap="coolwarm")
    ax.view_init(100, 30)

def init_scene():
    global position, fig
    fig = plt.figure(figsize=(8,6))
    # Start point
    position.set_value(np.array([0.6*3.14, 1.9*3.14], dtype="float32"))

def draw_point():
    global ax, plt, fig
    global x, y, X, Y, Z
    x, y = position.get_value()
    ax.scatter([x], [y], [cost_func()[0] + 10], s=30, color="grey", marker="o", edgecolor="k", animated=True)
    
def show_animation(n=20, fps=2):
    global ax, plt, fig
    ani = animation.FuncAnimation(fig, print_one, init_func=print_scene, frames=n, blit=True)
    writer = animation.FFMpegWriter(fps=fps, extra_args=['-vcodec', 'libx264'])
    ani.save('/tmp/animation.mp4', writer=writer);
    return show_video('/tmp/animation.mp4', 'mp4')

How many steps?

In [180]:
N = 300
FPS = 30

SGD

In [181]:
sgd_update = theano.function([], [], updates={position: position - 0.2*T.grad(cost, position)})

init_scene()
def print_one(seq):
    print ".",
    print_scene()
    # Update and draw point
    if seq > 0:
        sgd_update()
    draw_point()

show_animation(n=N, fps=FPS)
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Out[181]:

Momentum

In [182]:
def momentum_updates(cost, params, learning_rate, momentum):
    updates = []
    for param in params:
        param_update = theano.shared(param.get_value()*0., broadcastable=param.broadcastable)
        updates.append((param, param - learning_rate*param_update))
        updates.append((param_update, momentum*param_update + (1. - momentum)*T.grad(cost, param)))
    return updates

momentum_update = theano.function([], [],
                                  updates=momentum_updates(cost, [position], 0.2, 0.2))

init_scene()
def print_one(seq):
    print ".",
    print_scene()
    # Update and draw point
    if seq > 0:
        momentum_update()
    draw_point()

show_animation(n=N, fps=FPS)
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Out[182]:

Functions for AdaGrad, AdaDelta ....

In [173]:
from collections import OrderedDict
FLOATX = "float32"

def ada_updates(params, gparams, shapes=None, max_norm = 5.0, lr = 0.01, eps= 1e-6, rho=0.95, method="ADADELTA",
                        beta=0.0, gsum_regularization = 0, weight_l2 = 0, clip = True):

    if method == "FINETUNING_ADAGRAD":
        method = "ADAGRAD"
        gsum_regularization = 0

    if not shapes:
        shapes = params
        
    oneMinusBeta = 1 - beta
    
    gsums   = [theano.shared(np.zeros_like(param.get_value(borrow=True), dtype=FLOATX), name="gsum_%s" % param.name) if (method == 'ADADELTA' or method == 'ADAGRAD') else None for param in shapes]
    xsums   = [theano.shared(np.zeros_like(param.get_value(borrow=True), dtype=FLOATX), name="xsum_%s" % param.name) if method == 'ADADELTA' else None for param in shapes]

    # Fix for AdaGrad, init gsum to 1
    if method == 'ADAGRAD':
        for gsum in gsums:
            gsum.set_value(gsum.get_value() ** 0)

    updates = OrderedDict()

    for gparam, param, gsum, xsum in zip(gparams, params, gsums, xsums):
        # clip gradients if they get too big
        if max_norm and clip:
            grad_norm = gparam.norm(L=2)
            gparam = (T.minimum(T.constant(max_norm, dtype=FLOATX), grad_norm)/ grad_norm) * gparam

        if method == 'ADADELTA':
            if weight_l2 > 0:
                gparam += (2 * weight_l2 * param)
            updates[gsum] = rho * gsum + (1. - rho) * (gparam **2)
            dparam = -T.sqrt((xsum + eps) / (updates[gsum] + eps)) * gparam
            updates[xsum] =rho * xsum + (1. - rho) * (dparam **2)
            updates[param] = param * oneMinusBeta + dparam
        elif method == 'ADAGRAD':
            # updates[gsum] =  gsum + (gparam ** 2)
            updates[gsum] = gsum + (gparam **2) - gsum_regularization * gsum
            updates[param] =  param * oneMinusBeta - lr * (gparam / (T.sqrt(updates[gsum] + eps)) + (2 * weight_l2 * param))
        else:
            updates[param] = param * oneMinusBeta - (gparam  + (2 * weight_l2 * param)) * lr

    return updates.items()

AdaGrad with gsum regularization

In [183]:
adagrad_update = theano.function([], [],
    updates=ada_updates([position], [T.cast(T.grad(cost, position), "float32")], method="ADAGRAD", lr=0.5,
                       gsum_regularization=0.1))

init_scene()
def print_one(seq):
    print ".",
    print_scene()
    # Update and draw point
    if seq > 0:
        adagrad_update()
    draw_point()

show_animation(n=N, fps=FPS)
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Out[183]:

Fine-tuning AdaGrad

In [184]:
adagrad_update = theano.function([], [],
    updates=ada_updates([position], [T.cast(T.grad(cost, position), "float32")], method="FINETUNING_ADAGRAD", lr=0.3))

init_scene()
def print_one(seq):
    print ".",
    print_scene()
    # Update and draw point
    if seq > 0:
        adagrad_update()
    draw_point()

show_animation(n=N, fps=FPS)
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Out[184]:

AdaDelta

In [200]:
adadelta_update = theano.function([], [],
    updates=ada_updates([position], [T.cast(T.grad(cost, position), "float32")], method="ADADELTA", rho=0.5))

init_scene()
def print_one(seq):
    print ".",
    print_scene()
    # Update and draw point
    if seq > 0:
        # Don't know why AdaDelta is slow in this setting
        for _ in range(10):
            adadelta_update()
    draw_point()

show_animation(n=N, fps=FPS)
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Out[200]:

RMSProp

In [186]:
def rmsprop_updates(cost, params, momentum, learning_rate):
    for param in params:
        grad = T.grad(cost, param)
        rms_ = theano.shared(
            np.zeros_like(param.get_value()))
        rms = momentum * rms_ + (1 - momentum) * grad * grad
        yield rms_, rms
        yield param, param - learning_rate * grad / T.sqrt(rms + 1e-8)

rmsprop_update = theano.function([], [],
    updates=list(rmsprop_updates(cost, [position], momentum=0.1, learning_rate=0.1)))

init_scene()
def print_one(seq):
    print ".",
    print_scene()
    # Update and draw point
    if seq > 0:
        rmsprop_update()
    draw_point()

show_animation(n=N, fps=FPS)
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Out[186]:

Adam

In [188]:
# Code: https://github.com/benanne/Lasagne/issues/144
def adam(loss, all_params, learning_rate=0.0002, beta1=0.1, beta2=0.001, 
         epsilon=1e-8, gamma=1-1e-8):
    updates = []
    all_grads = theano.grad(loss,all_params)

    i = theano.shared(np.float32(1))  
    i_t = i + 1.
    fix1 = 1. - (1. - beta1)**i_t
    fix2 = 1. - (1. - beta2)**i_t
    beta1_t = 1-(1-beta1)*gamma**(i_t-1)
    learning_rate_t = learning_rate * (T.sqrt(fix2) / fix1)

    for param_i, g in zip(all_params, all_grads):
        m = theano.shared(
            np.zeros(param_i.get_value().shape, dtype=theano.config.floatX))
        v = theano.shared(
            np.zeros(param_i.get_value().shape, dtype=theano.config.floatX))

        m_t = (beta1_t * g) + ((1. - beta1_t) * m) 
        v_t = (beta2 * g**2) + ((1. - beta2) * v)
        g_t = m_t / (T.sqrt(v_t) + epsilon)
        param_i_t = param_i - (learning_rate_t * g_t)

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((param_i, param_i_t) )
    updates.append((i, i_t))
    return updates

adam_update = theano.function([], [],
    updates=adam(cost, [position], learning_rate=0.2))

init_scene()
def print_one(seq):
    print ".",
    print_scene()
    # Update and draw point
    if seq > 0:
        adam_update()
    draw_point()

show_animation(n=N, fps=FPS)
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Out[188]:

BFGS

In [219]:
import scipy.optimize

p = T.fvector()
scipy_fun = theano.function([p], [plot_surface(p[0], p[1])])
scipy_g = theano.function([p], [T.grad(plot_surface(p[0], p[1]), p)])

def scipy_update_callback(p):
    global position
    position.set_value(p.astype("float32"))

def scipy_update(method):
    global position
    scipy.optimize.minimize(
                fun=lambda p: scipy_fun(p.astype("float32"))[0],
                jac=lambda p: scipy_g(p.astype("float32"))[0],
                x0=[position.get_value()],
                method=method,
                callback=scipy_update_callback,
                options=dict(maxiter=1),
            )

init_scene()
def print_one(seq):
    print ".",
    print_scene()
    # Update and draw point
    if seq > 0:
        scipy_update("bfgs")
    draw_point()

show_animation(n=N, fps=FPS)
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Out[219]:

Newton-CG

CG

In [221]:
init_scene()
def print_one(seq):
    print ".",
    print_scene()
    # Update and draw point
    if seq > 0:
        scipy_update("cg")
    draw_point()

show_animation(n=N, fps=FPS)
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Out[221]: