{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculus - Week 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div style=\"display:none;\">Dive into the fundamentals of calculus for machine learning and data science. This week you'll learn about partial derivatives, the gradient and multivariate optimization with gradient descent.</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from itertools import product\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "import plotly.io as pio\n",
    "import sympy as sp\n",
    "from IPython.core.getipython import get_ipython\n",
    "from IPython.display import display, HTML\n",
    "from matplotlib.animation import FuncAnimation\n",
    "\n",
    "plt.style.use(\"seaborn-v0_8-whitegrid\")\n",
    "pio.renderers.default = \"plotly_mimetype+notebook\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multivariate optimization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tangent plane"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The equation of the tangent line to $f(x)$ at point $x=a$ is\n",
    "\n",
    "> 📐 $y = \\cfrac{d}{dx}f(a)(x-a) + f(a)$\n",
    "\n",
    "This is derived from the point-slope form of a line\n",
    "\n",
    "$y-y_1 = m(x-x_1)$\n",
    "\n",
    "The equation of the tangent plane to $f(x, y)$ at point $x=a$ and $y=b$ is\n",
    "\n",
    "> 📐 $z = \\cfrac{\\partial}{\\partial x}f(a)(x-a) +  \\cfrac{\\partial}{\\partial y}f(b)(y-b) + f(a, b)$\n",
    "\n",
    "which, similarly, is derived from the point-slope form of a plane\n",
    "\n",
    "$z-z_1 = m_1(x-x_1) + m_2(y-y_1)$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tangent_line(dfa, a, sym_a, a_range, b, sym_b, f):\n",
    "    return (\n",
    "        dfa.evalf(subs={sym_a: a, sym_b: b}) * (a_range - a)\n",
    "        + f.evalf(subs={x: a, y: b})\n",
    "    ).astype(\"float32\")\n",
    "\n",
    "\n",
    "def tangent_plane(dfa, dfb, a, a_range, b, b_range, f):\n",
    "    return (\n",
    "        dfa.evalf(subs={x: a, y: b}) * (a_range - a)\n",
    "        + dfb.evalf(subs={x: a, y: b}) * (b_range - b)\n",
    "        + f.evalf(subs={x: a, y: b})\n",
    "    ).astype(\"float32\")\n",
    "\n",
    "\n",
    "x, y = sp.symbols(\"x, y\")\n",
    "parabloid = x**2 + y**2\n",
    "dfx = sp.diff(parabloid, x)\n",
    "dfy = sp.diff(parabloid, y)\n",
    "x0 = 2\n",
    "y0 = 4\n",
    "full_range = np.linspace(-8, 8, 100)\n",
    "y_cut_xx, y_cut_yy = np.meshgrid(full_range, np.linspace(-8, y0, 100))\n",
    "xcut_xx, xcut_yy = np.meshgrid(np.linspace(-8, x0, 100), full_range)\n",
    "full_xx, full_yy = np.meshgrid(full_range, full_range)\n",
    "tan_x = np.linspace(x0 - 4, x0 + 4, 100)\n",
    "tan_y = np.linspace(y0 - 4, y0 + 4, 100)\n",
    "tan_xx, tan_yy = np.meshgrid(tan_x, tan_y)\n",
    "const_x = np.full(100, x0)\n",
    "const_y = np.full(100, y0)\n",
    "\n",
    "ycut_parabloid_surface = go.Surface(\n",
    "    z=sp.lambdify((x, y), parabloid, \"numpy\")(y_cut_xx, y_cut_yy),\n",
    "    x=y_cut_xx,\n",
    "    y=y_cut_yy,\n",
    "    colorscale=\"Blues\",\n",
    "    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),\n",
    "    colorbar=dict(orientation=\"h\", y=-0.2, title=dict(text=\"z\", side=\"top\")),\n",
    "    showlegend=True,\n",
    "    legendgrouptitle_text=\"Partial derivative wrt x\",\n",
    "    legendgroup=\"x\",\n",
    "    name=\"y-cut parabloid\",\n",
    ")\n",
    "xcut_parabloid_surface = go.Surface(\n",
    "    z=sp.lambdify((x, y), parabloid, \"numpy\")(xcut_xx, xcut_yy),\n",
    "    x=xcut_xx,\n",
    "    y=xcut_yy,\n",
    "    colorscale=\"Blues\",\n",
    "    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),\n",
    "    colorbar=dict(orientation=\"h\", y=-0.2, title=dict(text=\"z\", side=\"top\")),\n",
    "    showlegend=True,\n",
    "    legendgrouptitle_text=\"Partial derivative wrt y\",\n",
    "    legendgroup=\"y\",\n",
    "    name=\"x-cut parabloid\",\n",
    ")\n",
    "full_parabloid_surface = go.Surface(\n",
    "    z=sp.lambdify((x, y), parabloid, \"numpy\")(full_xx, full_yy),\n",
    "    x=full_xx,\n",
    "    y=full_yy,\n",
    "    colorscale=\"Blues\",\n",
    "    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),\n",
    "    colorbar=dict(orientation=\"h\", y=-0.2, title=dict(text=\"z\", side=\"top\")),\n",
    "    showlegend=True,\n",
    "    name=\"full parabloid\",\n",
    ")\n",
    "poi = go.Scatter3d(\n",
    "    x=[x0],\n",
    "    y=[y0],\n",
    "    z=[sp.lambdify((x, y), parabloid, \"numpy\")(x0, y0)],\n",
    "    marker=dict(color=\"#000000\"),\n",
    "    showlegend=True,\n",
    "    name=\"x=2 y=4\",\n",
    ")\n",
    "yparabola = go.Scatter3d(\n",
    "    x=const_x,\n",
    "    y=full_range,\n",
    "    z=sp.lambdify((x, y), parabloid, \"numpy\")(const_x, full_range),\n",
    "    mode=\"lines\",\n",
    "    line=dict(color=\"#000000\", width=5),\n",
    "    showlegend=True,\n",
    "    legendgroup=\"y\",\n",
    "    name=\"y parabola\",\n",
    ")\n",
    "ytangent = go.Scatter3d(\n",
    "    x=const_x,\n",
    "    y=tan_y,\n",
    "    z=tangent_line(dfa=dfy, a=y0, sym_a=y, a_range=tan_y, b=x0, sym_b=x, f=parabloid),\n",
    "    mode=\"lines\",\n",
    "    line=dict(color=\"#000000\"),\n",
    "    showlegend=True,\n",
    "    legendgroup=\"y\",\n",
    "    name=\"y tangent\",\n",
    ")\n",
    "xparabola = go.Scatter3d(\n",
    "    x=full_range,\n",
    "    y=const_y,\n",
    "    z=sp.lambdify((x, y), parabloid, \"numpy\")(full_range, const_y),\n",
    "    mode=\"lines\",\n",
    "    line=dict(color=\"#000000\", width=5),\n",
    "    showlegend=True,\n",
    "    legendgroup=\"x\",\n",
    "    name=\"x parabola\",\n",
    ")\n",
    "xtangent = go.Scatter3d(\n",
    "    x=tan_x,\n",
    "    y=const_y,\n",
    "    z=tangent_line(dfa=dfx, a=x0, sym_a=x, a_range=tan_x, b=y0, sym_b=y, f=parabloid),\n",
    "    mode=\"lines\",\n",
    "    line=dict(color=\"#000000\"),\n",
    "    showlegend=True,\n",
    "    legendgroup=\"x\",\n",
    "    name=\"x tangent\",\n",
    ")\n",
    "tangent_surface = go.Surface(\n",
    "    z=tangent_plane(\n",
    "        dfa=dfx, dfb=dfy, a=x0, a_range=tan_xx, b=y0, b_range=tan_yy, f=parabloid\n",
    "    ),\n",
    "    x=tan_xx,\n",
    "    y=tan_yy,\n",
    "    colorscale=[[0, \"#FF8920\"], [1, \"#FF8920\"]],\n",
    "    showscale=False,\n",
    "    name=\"tangent plane\",\n",
    "    showlegend=True,\n",
    ")\n",
    "fig = go.Figure(\n",
    "    data=[\n",
    "        full_parabloid_surface,\n",
    "        xcut_parabloid_surface,\n",
    "        ycut_parabloid_surface,\n",
    "        poi,\n",
    "        xtangent,\n",
    "        xparabola,\n",
    "        ytangent,\n",
    "        yparabola,\n",
    "        tangent_surface,\n",
    "    ]\n",
    ")\n",
    "fig.update_layout(\n",
    "    title=\"Tangent plane of parabloid at x=2 and y=4\",\n",
    "    autosize=False,\n",
    "    width=600,\n",
    "    height=600,\n",
    "    margin=dict(l=10, r=10, b=10, t=30),\n",
    "    legend=dict(groupclick=\"togglegroup\", itemclick=\"toggleothers\"),\n",
    "    scene_camera=dict(\n",
    "        eye=dict(x=1.5, y=1.5, z=0.5),\n",
    "    ),\n",
    ")\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Partial derivatives"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we look at the tangent plane on the previous plot from a certain angle we'll see two orthogonal lines, as if they were the axes of the plane.\n",
    "\n",
    "😉 click on `Reset camera to last save` on the plot's navbar to reset the eye to (x=1.5, y=1.5, z=.5)\n",
    "\n",
    "The two lines that form the tangent plane are the tangent lines to the point and their respective slopes are called partial derivatives."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To visualize (or at least get a sense of it) the partial derivative:\n",
    "\n",
    "$\\cfrac{\\partial}{\\partial x}f(x)$ at $x=2$ select the legend group `Partial derivative wrt x`.\n",
    "\n",
    "$\\cfrac{\\partial}{\\partial y}f(y)$ at $y=4$ select the legend group `Partial derivative wrt y`.\n",
    "\n",
    "In either case, we can see that the partial derivative is just the derivative of the imaginary 2D parabola that results from $f(x, y)$ while keeping one of the two variables constant.\n",
    "\n",
    "In fact, calculating partial derivatives is a 2-step process:\n",
    "\n",
    "1. treat all the other variables as constants\n",
    "2. apply the same rules of differentiations\n",
    "\n",
    "For example, let $f(x, y) = x^2+y^2$. To calculate $\\cfrac{\\partial}{\\partial x}x^2+y^2$ we\n",
    "\n",
    "1. treat $y$ as a constant, let's say 1, but we don't usually do this substitution in practice. We'll do it here to drive the point home.\n",
    "\n",
    "$\\cfrac{\\partial f(x,y)}{\\partial x} = x^2+1^2$\n",
    "\n",
    "2. apply the same rules of differentiations (in this case, power, constant and sum rules)\n",
    "\n",
    "$\\cfrac{\\partial f(x,y)}{\\partial x} = 2x + 0$\n",
    "\n",
    "Let's do another example. Let let $f(x, y) = x^2y^2$.\n",
    "\n",
    "We could do the same as before and replace $y$ with 1, but in this case and more complex cases it might create more confusion than be helpful, as we'll have to revert it back to $y$ if it didn't go away like it did in the previous example.\n",
    "\n",
    "Let's leave $y$ as is and just treat as a constant. For the power and multiple constant rules we have\n",
    "\n",
    "$\\cfrac{\\partial f(x,y)}{\\partial x} = 2xy^2$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Equaling partial derivatives to 0 to find the minima and maxima"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's imagine we are in a sauna 5 meters wide and 5 meters long. We want to find the coldest place in the room.\n",
    "\n",
    "Conveniently we know the function of the temperature in terms of the room coordinates."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x, y = sp.symbols(\"x, y\")\n",
    "temp = 50 - sp.Rational(1, 50) * x**2 * (x - 6) * y**2 * (y - 6)\n",
    "temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "room_size = 5\n",
    "xx, yy = np.meshgrid(np.linspace(0, room_size, 100), np.linspace(0, room_size, 100))\n",
    "\n",
    "surface = go.Surface(\n",
    "    z=sp.lambdify((x, y), temp, \"numpy\")(xx, yy),\n",
    "    x=xx,\n",
    "    y=yy,\n",
    "    colorscale=\"RdBu_r\",\n",
    "    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),\n",
    "    colorbar=dict(title=\"Temperature\"),\n",
    "    name=\"temperature function\",\n",
    ")\n",
    "\n",
    "fig = go.Figure(surface)\n",
    "fig.update_layout(\n",
    "    title=\"Function of the sauna temperature\",\n",
    "    autosize=False,\n",
    "    width=600,\n",
    "    height=600,\n",
    "    scene_aspectmode=\"cube\",\n",
    "    margin=dict(l=10, r=10, b=10, t=30),\n",
    "    scene_camera=dict(\n",
    "        eye=dict(x=2.1, y=0.1, z=0.7),\n",
    "    ),\n",
    ")\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's calculate the partial derivative wrt x."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfx = sp.diff(temp, x).factor()\n",
    "dfx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's calculate the partial derivative wrt y."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfy = sp.diff(temp, y).factor()\n",
    "dfy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's check where the partial derivatives are 0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "solutions = {\"x\": set(), \"y\": set()}\n",
    "for s in sp.solve(dfx) + sp.solve(dfy):\n",
    "    for k, v in s.items():\n",
    "        if v <= room_size:\n",
    "            solutions[str(k)].add(float(v))\n",
    "solutions = list(product(solutions[\"x\"], solutions[\"y\"]))\n",
    "\n",
    "zs = []\n",
    "for s in solutions:\n",
    "    z = sp.lambdify((x, y), temp, \"numpy\")(s[0], s[1])\n",
    "    zs.append(z)\n",
    "    fig.add_scatter3d(\n",
    "        x=[s[0]],\n",
    "        y=[s[1]],\n",
    "        z=[z],\n",
    "        marker=dict(color=\"#67001F\" if z > 40 else \"#053061\"),\n",
    "        name=\"maximum\" if z > 40 else \"minimum\",\n",
    "    )\n",
    "fig.update_layout(\n",
    "    title=\"Maxima and minima of the function\",\n",
    "    showlegend=False,\n",
    "    scene_camera=dict(\n",
    "        eye=dict(x=2.0, y=-1.0, z=0.2),\n",
    "    ),\n",
    ")\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's show the tangent plane at the minimum point."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x0, y0 = solutions[np.argmin(zs)]\n",
    "tan_x = np.linspace(x0 - 1, x0 + 1, 100)\n",
    "tan_y = np.linspace(y0 - 1, y0 + 1, 100)\n",
    "tan_xx, tan_yy = np.meshgrid(tan_x, tan_y)\n",
    "const_x = np.full(100, x0)\n",
    "const_y = np.full(100, y0)\n",
    "\n",
    "ytangent = go.Scatter3d(\n",
    "    x=const_x,\n",
    "    y=tan_y,\n",
    "    z=tangent_line(dfa=dfy, a=y0, sym_a=y, a_range=tan_y, b=x0, sym_b=x, f=temp),\n",
    "    mode=\"lines\",\n",
    "    line=dict(color=\"#000000\"),\n",
    "    showlegend=True,\n",
    "    legendgroup=\"y\",\n",
    "    name=\"y tangent\",\n",
    ")\n",
    "\n",
    "xtangent = go.Scatter3d(\n",
    "    x=tan_x,\n",
    "    y=const_y,\n",
    "    z=tangent_line(dfa=dfx, a=x0, sym_a=x, a_range=tan_x, b=y0, sym_b=y, f=temp),\n",
    "    mode=\"lines\",\n",
    "    line=dict(color=\"#000000\"),\n",
    "    showlegend=True,\n",
    "    legendgroup=\"x\",\n",
    "    name=\"x tangent\",\n",
    ")\n",
    "\n",
    "tangent_surface = go.Surface(\n",
    "    z=tangent_plane(\n",
    "        dfa=dfx, dfb=dfy, a=x0, a_range=tan_xx, b=y0, b_range=tan_yy, f=temp\n",
    "    ),\n",
    "    x=tan_xx,\n",
    "    y=tan_yy,\n",
    "    colorscale=[[0, \"#FF8920\"], [1, \"#FF8920\"]],\n",
    "    showscale=False,\n",
    "    name=\"tangent plane\",\n",
    "    showlegend=True,\n",
    ")\n",
    "\n",
    "fig.add_traces([ytangent, xtangent, tangent_surface])\n",
    "fig.update_layout(\n",
    "    title=\"Tangent plane at the minimum\",\n",
    ")\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Gradient"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> 🔑 The gradient of $f$ is a vector, each element of which is a partial derivative of $f$\n",
    "\n",
    "$\\nabla f(x_1, x_2, \\cdots, x_n) = \\langle \\cfrac{\\partial f}{\\partial x_1}, \\cfrac{\\partial f}{\\partial x_2}, \\cdots, \\cfrac{\\partial f}{\\partial x_n} \\rangle$\n",
    "\n",
    "Assume we are at $\\vec{p}$ in the domain space of $f$ within $\\mathbb{R}^3$:\n",
    "\n",
    "$\\vec{p} = \\begin{bmatrix}0.3\\\\0.2\\\\0.8\\\\\\end{bmatrix}$\n",
    "\n",
    "Upon calculating the partial derivatives of $f(\\vec{p})$, suppose we obtain the gradient:\n",
    "\n",
    "$\\nabla f(\\vec{p}) = \\begin{bmatrix}0.05\\\\-0.2\\\\-0.1\\end{bmatrix}$\n",
    "\n",
    "This implies that:\n",
    "\n",
    "- The function $f$ is increasing if we move to the **right** in the first dimension because the slope is positive\n",
    "\n",
    "- The function $f$ is increasing if we move to the **left** in the other two dimensions because the slope is negative\n",
    "\n",
    "Furthermore, the second dimension has the steepest ascent, while the first dimension has the flattest.\n",
    "\n",
    "> 🔑 $\\nabla f$ provides the direction and rate of fastest **increase** of $f$, while $-\\nabla f$ provides the direction and rate of fastest **decrease** of $f$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nabla = sp.lambdify(\n",
    "    (x, y), sp.Matrix([sp.diff(temp, x), sp.diff(temp, y)]), \"numpy\"\n",
    ")\n",
    "p0 = np.array([1.0, 3.0])\n",
    "g0 = nabla(*p0)\n",
    "xx, yy = np.meshgrid(np.linspace(0, 5, 100), np.linspace(0, 5, 100))\n",
    "cs = plt.contourf(\n",
    "    xx, yy, sp.lambdify((x, y), temp, \"numpy\")(xx, yy), levels=20, cmap=\"RdBu_r\"\n",
    ")\n",
    "plt.scatter(p0[0], p0[1], color=\"k\")\n",
    "plt.quiver(\n",
    "    p0[0],\n",
    "    p0[1],\n",
    "    -g0[0],\n",
    "    -g0[1],\n",
    "    angles=\"xy\",\n",
    "    scale_units=\"xy\",\n",
    "    scale=10,\n",
    "    color=\"k\",\n",
    ")\n",
    "plt.xlabel(\"x\")\n",
    "plt.ylabel(\"y\")\n",
    "plt.gca().set_aspect(\"equal\")\n",
    "plt.title(f\"Negative gradient stemming from ${sp.latex(p0)}$\")\n",
    "plt.colorbar(cs, label=\"z\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Continuing the previous example in $\\mathbb{R}^3$, if our goal is to find a minimum of the function $f$, one approach is to update our position by taking a step of size $S$ in the opposite direction of the gradient:\n",
    "\n",
    "$\\vec{p}_{next} = \\langle 0.3, 0.2, 0.8 \\rangle - S \\cdot \\text{sign}(\\langle 0.05, -0.2, -0.1 \\rangle)$\n",
    "\n",
    "This approach doesn't use all the information contained in the gradient, just the direction. It's important to use also the steepness of the slope, so that we can take a larger step size if it's very steep (far from the minimum) and a smaller step size if it's flat (near the minimum.)\n",
    "\n",
    "A better approach involves taking a step proportional to the value of the negative gradient.\n",
    "\n",
    "$\\vec{p}_{next} = \\langle 0.3, 0.2, 0.8 \\rangle - \\alpha \\cdot \\langle0.05, -0.2, -0.1 \\rangle$\n",
    "\n",
    "> 🔑 $\\alpha$ is called learning rate and controls the step size\n",
    "\n",
    "The above is the main idea of the gradient descent algorithm.\n",
    "\n",
    "Below the pseudocode of a typical implementation of the gradient descent algorithm for $T$ steps or until convergence, where convergence is define in terms of the norm of the gradient.\n",
    "\n",
    "$\\begin{array}{l}\n",
    "\\textbf{Algorithm: } \\text{Gradient Descent} \\\\\n",
    "\\textbf{Input: } \\text{initial parameters } p_0, \\text{ learning rate } \\alpha, \\text{ number of iterations } T, \\text{ convergence } \\epsilon \\\\\n",
    "\\textbf{Output: } \\text{final parameters } p\\\\\n",
    "\\phantom{0}1 : p \\gets p_0 \\\\\n",
    "\\phantom{0}2 : \\text{evaluate gradient } \\nabla f(p) \\\\\n",
    "\\phantom{0}3 : \\text{for i = 1 to T do} \\\\\n",
    "\\phantom{0}4 : \\quad p_{new} \\gets p - \\alpha \\nabla f(p) \\\\\n",
    "\\phantom{0}5 : \\quad \\text{evaluate gradient } \\nabla f(p_{new}) \\\\\n",
    "\\phantom{0}6 : \\quad \\text{if } \\|\\nabla f(p_{new})\\| < \\epsilon \\text{ then} \\\\\n",
    "\\phantom{0}7 : \\quad \\quad \\text{return } p_{new} \\\\\n",
    "\\phantom{0}8 : \\quad \\text {end if} \\\\\n",
    "\\phantom{0}9 : \\quad p \\gets p_{new} \\\\\n",
    "10: \\text{end for} \\\\\n",
    "11: \\text{return } p\n",
    "\\end{array}$\n",
    "\n",
    "> 🔑 Gradient descent is an iterative algorithm that uses the information contained in the gradient at a point to find the local minimum of a differentiable function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bivariate_gradient_descent(f, symbols, initial, steps, learning_rate):\n",
    "    x, y = symbols\n",
    "    nabla = sp.lambdify(\n",
    "        (x, y), sp.Matrix([sp.diff(f, x), sp.diff(f, y)]), \"numpy\"\n",
    "    )\n",
    "    p = np.zeros((steps, 2))\n",
    "    g = np.zeros((steps, 2))\n",
    "    step_vector = np.zeros((steps, 2))\n",
    "    p[0] = initial\n",
    "    g[0] = nabla(*p[0]).squeeze()\n",
    "    step_vector[0] = learning_rate * g[0]\n",
    "    for i in range(1, steps):\n",
    "        p[i] = p[i - 1] - step_vector[i - 1]\n",
    "        g[i] = nabla(*p[i]).squeeze()\n",
    "        step_vector[i] = learning_rate * g[i]\n",
    "        if np.linalg.norm(g[i]) < 1e-4:\n",
    "            break\n",
    "    return p[:i], g[:i], step_vector[:i]\n",
    "\n",
    "\n",
    "def fixup_animation_js(html_animation):\n",
    "    html_animation = html_animation.replace(\n",
    "        '<div class=\"anim-controls\">',\n",
    "        '<div class=\"anim-controls\" style=\"display:none\">',\n",
    "    )\n",
    "    animation_id = re.findall(r\"onclick=\\\"(.*)\\.\", html_animation)[0]\n",
    "    img_id = re.findall(r\"<img id=\\\"(.*)\\\"\", html_animation)[0]\n",
    "    html_animation += f\"\"\"\n",
    "<script language=\"javascript\">\n",
    "setupAnimationIntersectionObserver('{animation_id}', '{img_id}');\n",
    "</script>\n",
    "\"\"\"\n",
    "    return html_animation\n",
    "\n",
    "\n",
    "def gradient_descent_animation(f, symbols, initial, steps, learning_rate, lim, cmap):\n",
    "    def _update(frame):\n",
    "        global scat, quiv\n",
    "        scat = ax.scatter(p[:frame, 0], p[:frame, 1], color=\"k\")\n",
    "        quiv = ax.quiver(\n",
    "            p[:frame, 0],\n",
    "            p[:frame, 1],\n",
    "            -g[:frame, 0],\n",
    "            -g[:frame, 1],\n",
    "            angles=\"xy\",\n",
    "            scale_units=\"xy\",\n",
    "            scale=10,\n",
    "            color=\"k\",\n",
    "        )\n",
    "\n",
    "    x, y = symbols\n",
    "    p, g, _ = bivariate_gradient_descent(f, symbols, initial, steps, learning_rate)\n",
    "    fig, ax = plt.subplots()\n",
    "    xx, yy = np.meshgrid(np.linspace(0, lim, 100), np.linspace(0, lim, 100))\n",
    "    cs = ax.contourf(\n",
    "        xx,\n",
    "        yy,\n",
    "        sp.lambdify((x, y), f, \"numpy\")(xx, yy),\n",
    "        levels=20,\n",
    "        cmap=cmap,\n",
    "    )\n",
    "    scat = ax.scatter([], [])\n",
    "    quiv = ax.quiver([], [], [1e-6], [1e-6])\n",
    "    ax.set_xlabel(\"x\")\n",
    "    ax.set_ylabel(\"y\")\n",
    "    ax.set_aspect(\"equal\")\n",
    "    plt.colorbar(cs, label=\"z\")\n",
    "    plt.title(\n",
    "        rf\"Gradient descent from ${sp.latex(p[0])}$ with $\\alpha={learning_rate}$\"\n",
    "    )\n",
    "    ani = FuncAnimation(fig=fig, func=_update, frames=steps)\n",
    "    html_animation = ani.to_jshtml(default_mode=\"loop\")\n",
    "    if \"runtime\" not in get_ipython().config.IPKernelApp.connection_file:\n",
    "        html_animation = fixup_animation_js(html_animation)\n",
    "    display(HTML(html_animation))\n",
    "    plt.close()\n",
    "\n",
    "\n",
    "gradient_descent_animation(\n",
    "    temp,\n",
    "    (x, y),\n",
    "    initial=np.array([1, 3]),\n",
    "    steps=10,\n",
    "    learning_rate=0.1,\n",
    "    lim=5,\n",
    "    cmap=\"RdBu_r\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One of the major drawbacks of gradient descent is its sensitivity to the learning rate."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gradient_descent_animation(\n",
    "    temp,\n",
    "    (x, y),\n",
    "    initial=np.array([1, 3]),\n",
    "    steps=20,\n",
    "    learning_rate=0.3,\n",
    "    lim=5,\n",
    "    cmap=\"RdBu_r\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convergence is not necessarily guaranteed and the learning rate must be chosen carefully.\n",
    "\n",
    "Another drawback of gradient descent is that convergence doesn't guarantee we've reached the global minimum.\n",
    "\n",
    "Let's consider a function with multiple minima."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x, y = sp.symbols(\"x, y\")\n",
    "multi = (\n",
    "    -(\n",
    "        10 / (3 + 3 * (x - 0.5) ** 2 + 3 * (y - 0.5) ** 2)\n",
    "        + 2 / (1 + 2 * ((x - 3) ** 2) + 2 * (y - 1.5) ** 2)\n",
    "        + 3 / (1 + 0.5 * ((x - 3.5) ** 2) + 0.5 * (y - 4) ** 2)\n",
    "    )\n",
    "    + 10\n",
    ")\n",
    "\n",
    "xx, yy = np.meshgrid(np.linspace(0, 5, 100), np.linspace(0, 5, 100))\n",
    "\n",
    "surface = go.Surface(\n",
    "    z=sp.lambdify((x, y), multi, \"numpy\")(xx, yy),\n",
    "    x=xx,\n",
    "    y=yy,\n",
    "    colorscale=\"BrBg_r\",\n",
    "    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),\n",
    "    colorbar=dict(title=\"z\"),\n",
    "    name=\"multi function\",\n",
    ")\n",
    "\n",
    "fig = go.Figure(surface)\n",
    "fig.update_layout(\n",
    "    title=\"Function with multiple minima\",\n",
    "    autosize=False,\n",
    "    width=600,\n",
    "    height=600,\n",
    "    scene_aspectmode=\"cube\",\n",
    "    margin=dict(l=10, r=10, b=10, t=30),\n",
    "    scene_camera=dict(\n",
    "        eye=dict(x=-1.2, y=1.8, z=1.25),\n",
    "    ),\n",
    ")\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this case the choice of the initial values is very important."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gradient_descent_animation(\n",
    "    multi,\n",
    "    (x, y),\n",
    "    initial=np.array([1, 3]),\n",
    "    steps=50,\n",
    "    learning_rate=0.2,\n",
    "    lim=5,\n",
    "    cmap=\"BrBG_r\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's start just 0.2 to right along the x-axis and we get to a better minima, and yet not the global minimum."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gradient_descent_animation(\n",
    "    multi,\n",
    "    (x, y),\n",
    "    initial=np.array([1.2, 3]),\n",
    "    steps=50,\n",
    "    learning_rate=0.2,\n",
    "    lim=5,\n",
    "    cmap=\"BrBG_r\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we start a bit more to the left and we get to the global minimum."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gradient_descent_animation(\n",
    "    multi,\n",
    "    (x, y),\n",
    "    initial=np.array([0.6, 3]),\n",
    "    steps=50,\n",
    "    learning_rate=0.2,\n",
    "    lim=5,\n",
    "    cmap=\"BrBG_r\",\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}