{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Calculus - Week 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "from itertools import product\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import plotly.graph_objects as go\n", "import plotly.io as pio\n", "import sympy as sp\n", "from IPython.core.getipython import get_ipython\n", "from IPython.display import display, HTML\n", "from matplotlib.animation import FuncAnimation\n", "\n", "plt.style.use(\"seaborn-v0_8-whitegrid\")\n", "pio.renderers.default = \"plotly_mimetype+notebook\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Multivariate optimization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tangent plane" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The equation of the tangent line to $f(x)$ at point $x=a$ is\n", "\n", "> 📐 $y = \\cfrac{d}{dx}f(a)(x-a) + f(a)$\n", "\n", "This is derived from the point-slope form of a line\n", "\n", "$y-y_1 = m(x-x_1)$\n", "\n", "The equation of the tangent plane to $f(x, y)$ at point $x=a$ and $y=b$ is\n", "\n", "> 📐 $z = \\cfrac{\\partial}{\\partial x}f(a)(x-a) + \\cfrac{\\partial}{\\partial y}f(b)(y-b) + f(a, b)$\n", "\n", "which, similarly, is derived from the point-slope form of a plane\n", "\n", "$z-z_1 = m_1(x-x_1) + m_2(y-y_1)$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tangent_line(dfa, a, sym_a, a_range, b, sym_b, f):\n", " return (\n", " dfa.evalf(subs={sym_a: a, sym_b: b}) * (a_range - a)\n", " + f.evalf(subs={x: a, y: b})\n", " ).astype(\"float32\")\n", "\n", "\n", "def tangent_plane(dfa, dfb, a, a_range, b, b_range, f):\n", " return (\n", " dfa.evalf(subs={x: a, y: b}) * (a_range - a)\n", " + dfb.evalf(subs={x: a, y: b}) * (b_range - b)\n", " + f.evalf(subs={x: a, y: b})\n", " ).astype(\"float32\")\n", "\n", "\n", "x, y = sp.symbols(\"x, y\")\n", "parabloid = x**2 + y**2\n", "dfx = sp.diff(parabloid, x)\n", "dfy = sp.diff(parabloid, y)\n", "x0 = 2\n", "y0 = 4\n", "full_range = np.linspace(-8, 8, 100)\n", "y_cut_xx, y_cut_yy = np.meshgrid(full_range, np.linspace(-8, y0, 100))\n", "xcut_xx, xcut_yy = np.meshgrid(np.linspace(-8, x0, 100), full_range)\n", "full_xx, full_yy = np.meshgrid(full_range, full_range)\n", "tan_x = np.linspace(x0 - 4, x0 + 4, 100)\n", "tan_y = np.linspace(y0 - 4, y0 + 4, 100)\n", "tan_xx, tan_yy = np.meshgrid(tan_x, tan_y)\n", "const_x = np.full(100, x0)\n", "const_y = np.full(100, y0)\n", "\n", "ycut_parabloid_surface = go.Surface(\n", " z=sp.lambdify((x, y), parabloid, \"numpy\")(y_cut_xx, y_cut_yy),\n", " x=y_cut_xx,\n", " y=y_cut_yy,\n", " colorscale=\"Blues\",\n", " contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),\n", " colorbar=dict(orientation=\"h\", y=-0.2, title=dict(text=\"z\", side=\"top\")),\n", " showlegend=True,\n", " legendgrouptitle_text=\"Partial derivative wrt x\",\n", " legendgroup=\"x\",\n", " name=\"y-cut parabloid\",\n", ")\n", "xcut_parabloid_surface = go.Surface(\n", " z=sp.lambdify((x, y), parabloid, \"numpy\")(xcut_xx, xcut_yy),\n", " x=xcut_xx,\n", " y=xcut_yy,\n", " colorscale=\"Blues\",\n", " contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),\n", " colorbar=dict(orientation=\"h\", y=-0.2, title=dict(text=\"z\", side=\"top\")),\n", " showlegend=True,\n", " legendgrouptitle_text=\"Partial derivative wrt y\",\n", " legendgroup=\"y\",\n", " name=\"x-cut parabloid\",\n", ")\n", "full_parabloid_surface = go.Surface(\n", " z=sp.lambdify((x, y), parabloid, \"numpy\")(full_xx, full_yy),\n", " x=full_xx,\n", " y=full_yy,\n", " colorscale=\"Blues\",\n", " contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),\n", " colorbar=dict(orientation=\"h\", y=-0.2, title=dict(text=\"z\", side=\"top\")),\n", " showlegend=True,\n", " name=\"full parabloid\",\n", ")\n", "poi = go.Scatter3d(\n", " x=[x0],\n", " y=[y0],\n", " z=[sp.lambdify((x, y), parabloid, \"numpy\")(x0, y0)],\n", " marker=dict(color=\"#000000\"),\n", " showlegend=True,\n", " name=\"x=2 y=4\",\n", ")\n", "yparabola = go.Scatter3d(\n", " x=const_x,\n", " y=full_range,\n", " z=sp.lambdify((x, y), parabloid, \"numpy\")(const_x, full_range),\n", " mode=\"lines\",\n", " line=dict(color=\"#000000\", width=5),\n", " showlegend=True,\n", " legendgroup=\"y\",\n", " name=\"y parabola\",\n", ")\n", "ytangent = go.Scatter3d(\n", " x=const_x,\n", " y=tan_y,\n", " z=tangent_line(dfa=dfy, a=y0, sym_a=y, a_range=tan_y, b=x0, sym_b=x, f=parabloid),\n", " mode=\"lines\",\n", " line=dict(color=\"#000000\"),\n", " showlegend=True,\n", " legendgroup=\"y\",\n", " name=\"y tangent\",\n", ")\n", "xparabola = go.Scatter3d(\n", " x=full_range,\n", " y=const_y,\n", " z=sp.lambdify((x, y), parabloid, \"numpy\")(full_range, const_y),\n", " mode=\"lines\",\n", " line=dict(color=\"#000000\", width=5),\n", " showlegend=True,\n", " legendgroup=\"x\",\n", " name=\"x parabola\",\n", ")\n", "xtangent = go.Scatter3d(\n", " x=tan_x,\n", " y=const_y,\n", " z=tangent_line(dfa=dfx, a=x0, sym_a=x, a_range=tan_x, b=y0, sym_b=y, f=parabloid),\n", " mode=\"lines\",\n", " line=dict(color=\"#000000\"),\n", " showlegend=True,\n", " legendgroup=\"x\",\n", " name=\"x tangent\",\n", ")\n", "tangent_surface = go.Surface(\n", " z=tangent_plane(\n", " dfa=dfx, dfb=dfy, a=x0, a_range=tan_xx, b=y0, b_range=tan_yy, f=parabloid\n", " ),\n", " x=tan_xx,\n", " y=tan_yy,\n", " colorscale=[[0, \"#FF8920\"], [1, \"#FF8920\"]],\n", " showscale=False,\n", " name=\"tangent plane\",\n", " showlegend=True,\n", ")\n", "fig = go.Figure(\n", " data=[\n", " full_parabloid_surface,\n", " xcut_parabloid_surface,\n", " ycut_parabloid_surface,\n", " poi,\n", " xtangent,\n", " xparabola,\n", " ytangent,\n", " yparabola,\n", " tangent_surface,\n", " ]\n", ")\n", "fig.update_layout(\n", " title=\"Tangent plane of parabloid at x=2 and y=4\",\n", " autosize=False,\n", " width=600,\n", " height=600,\n", " margin=dict(l=10, r=10, b=10, t=30),\n", " legend=dict(groupclick=\"togglegroup\", itemclick=\"toggleothers\"),\n", " scene_camera=dict(\n", " eye=dict(x=1.5, y=1.5, z=0.5),\n", " ),\n", ")\n", "fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Partial derivatives" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If we look at the tangent plane on the previous plot from a certain angle we'll see two orthogonal lines, as if they were the axes of the plane.\n", "\n", "😉 click on `Reset camera to last save` on the plot's navbar to reset the eye to (x=1.5, y=1.5, z=.5)\n", "\n", "The two lines that form the tangent plane are the tangent lines to the point and their respective slopes are called partial derivatives." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To visualize (or at least get a sense of it) the partial derivative:\n", "\n", "$\\cfrac{\\partial}{\\partial x}f(x)$ at $x=2$ select the legend group `Partial derivative wrt x`.\n", "\n", "$\\cfrac{\\partial}{\\partial y}f(y)$ at $y=4$ select the legend group `Partial derivative wrt y`.\n", "\n", "In either case, we can see that the partial derivative is just the derivative of the imaginary 2D parabola that results from $f(x, y)$ while keeping one of the two variables constant.\n", "\n", "In fact, calculating partial derivatives is a 2-step process:\n", "\n", "1. treat all the other variables as constants\n", "2. apply the same rules of differentiations\n", "\n", "For example, let $f(x, y) = x^2+y^2$. To calculate $\\cfrac{\\partial}{\\partial x}x^2+y^2$ we\n", "\n", "1. treat $y$ as a constant, let's say 1, but we don't usually do this substitution in practice. We'll do it here to drive the point home.\n", "\n", "$\\cfrac{\\partial f(x,y)}{\\partial x} = x^2+1^2$\n", "\n", "2. apply the same rules of differentiations (in this case, power, constant and sum rules)\n", "\n", "$\\cfrac{\\partial f(x,y)}{\\partial x} = 2x + 0$\n", "\n", "Let's do another example. Let let $f(x, y) = x^2y^2$.\n", "\n", "We could do the same as before and replace $y$ with 1, but in this case and more complex cases it might create more confusion than be helpful, as we'll have to revert it back to $y$ if it didn't go away like it did in the previous example.\n", "\n", "Let's leave $y$ as is and just treat as a constant. For the power and multiple constant rules we have\n", "\n", "$\\cfrac{\\partial f(x,y)}{\\partial x} = 2xy^2$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Equaling partial derivatives to 0 to find the minima and maxima" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's imagine we are in a sauna 5 meters wide and 5 meters long. We want to find the coldest place in the room.\n", "\n", "Conveniently we know the function of the temperature in terms of the room coordinates." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x, y = sp.symbols(\"x, y\")\n", "temp = 50 - sp.Rational(1, 50) * x**2 * (x - 6) * y**2 * (y - 6)\n", "temp" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "room_size = 5\n", "xx, yy = np.meshgrid(np.linspace(0, room_size, 100), np.linspace(0, room_size, 100))\n", "\n", "surface = go.Surface(\n", " z=sp.lambdify((x, y), temp, \"numpy\")(xx, yy),\n", " x=xx,\n", " y=yy,\n", " colorscale=\"RdBu_r\",\n", " contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),\n", " colorbar=dict(title=\"Temperature\"),\n", " name=\"temperature function\",\n", ")\n", "\n", "fig = go.Figure(surface)\n", "fig.update_layout(\n", " title=\"Function of the sauna temperature\",\n", " autosize=False,\n", " width=600,\n", " height=600,\n", " scene_aspectmode=\"cube\",\n", " margin=dict(l=10, r=10, b=10, t=30),\n", " scene_camera=dict(\n", " eye=dict(x=2.1, y=0.1, z=0.7),\n", " ),\n", ")\n", "fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's calculate the partial derivative wrt x." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dfx = sp.diff(temp, x).factor()\n", "dfx" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's calculate the partial derivative wrt y." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dfy = sp.diff(temp, y).factor()\n", "dfy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's check where the partial derivatives are 0." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "solutions = {\"x\": set(), \"y\": set()}\n", "for s in sp.solve(dfx) + sp.solve(dfy):\n", " for k, v in s.items():\n", " if v <= room_size:\n", " solutions[str(k)].add(float(v))\n", "solutions = list(product(solutions[\"x\"], solutions[\"y\"]))\n", "\n", "zs = []\n", "for s in solutions:\n", " z = sp.lambdify((x, y), temp, \"numpy\")(s[0], s[1])\n", " zs.append(z)\n", " fig.add_scatter3d(\n", " x=[s[0]],\n", " y=[s[1]],\n", " z=[z],\n", " marker=dict(color=\"#67001F\" if z > 40 else \"#053061\"),\n", " name=\"maximum\" if z > 40 else \"minimum\",\n", " )\n", "fig.update_layout(\n", " title=\"Maxima and minima of the function\",\n", " showlegend=False,\n", " scene_camera=dict(\n", " eye=dict(x=2.0, y=-1.0, z=0.2),\n", " ),\n", ")\n", "fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's show the tangent plane at the minimum point." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x0, y0 = solutions[np.argmin(zs)]\n", "tan_x = np.linspace(x0 - 1, x0 + 1, 100)\n", "tan_y = np.linspace(y0 - 1, y0 + 1, 100)\n", "tan_xx, tan_yy = np.meshgrid(tan_x, tan_y)\n", "const_x = np.full(100, x0)\n", "const_y = np.full(100, y0)\n", "\n", "ytangent = go.Scatter3d(\n", " x=const_x,\n", " y=tan_y,\n", " z=tangent_line(dfa=dfy, a=y0, sym_a=y, a_range=tan_y, b=x0, sym_b=x, f=temp),\n", " mode=\"lines\",\n", " line=dict(color=\"#000000\"),\n", " showlegend=True,\n", " legendgroup=\"y\",\n", " name=\"y tangent\",\n", ")\n", "\n", "xtangent = go.Scatter3d(\n", " x=tan_x,\n", " y=const_y,\n", " z=tangent_line(dfa=dfx, a=x0, sym_a=x, a_range=tan_x, b=y0, sym_b=y, f=temp),\n", " mode=\"lines\",\n", " line=dict(color=\"#000000\"),\n", " showlegend=True,\n", " legendgroup=\"x\",\n", " name=\"x tangent\",\n", ")\n", "\n", "tangent_surface = go.Surface(\n", " z=tangent_plane(\n", " dfa=dfx, dfb=dfy, a=x0, a_range=tan_xx, b=y0, b_range=tan_yy, f=temp\n", " ),\n", " x=tan_xx,\n", " y=tan_yy,\n", " colorscale=[[0, \"#FF8920\"], [1, \"#FF8920\"]],\n", " showscale=False,\n", " name=\"tangent plane\",\n", " showlegend=True,\n", ")\n", "\n", "fig.add_traces([ytangent, xtangent, tangent_surface])\n", "fig.update_layout(\n", " title=\"Tangent plane at the minimum\",\n", ")\n", "fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Gradient" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> 🔑 The gradient of $f$ is a vector, each element of which is a partial derivative of $f$\n", "\n", "$\\nabla f(x_1, x_2, \\cdots, x_n) = \\langle \\cfrac{\\partial f}{\\partial x_1}, \\cfrac{\\partial f}{\\partial x_2}, \\cdots, \\cfrac{\\partial f}{\\partial x_n} \\rangle$\n", "\n", "Assume we are at $\\vec{p}$ in the domain space of $f$ within $\\mathbb{R}^3$:\n", "\n", "$\\vec{p} = \\begin{bmatrix}0.3\\\\0.2\\\\0.8\\\\\\end{bmatrix}$\n", "\n", "Upon calculating the partial derivatives of $f(\\vec{p})$, suppose we obtain the gradient:\n", "\n", "$\\nabla f(\\vec{p}) = \\begin{bmatrix}0.05\\\\-0.2\\\\-0.1\\end{bmatrix}$\n", "\n", "This implies that:\n", "\n", "- The function $f$ is increasing if we move to the **right** in the first dimension because the slope is positive\n", "\n", "- The function $f$ is increasing if we move to the **left** in the other two dimensions because the slope is negative\n", "\n", "Furthermore, the second dimension has the steepest ascent, while the first dimension has the flattest.\n", "\n", "> 🔑 $\\nabla f$ provides the direction and rate of fastest **increase** of $f$, while $-\\nabla f$ provides the direction and rate of fastest **decrease** of $f$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nabla = sp.lambdify(\n", " (x, y), sp.Matrix([sp.diff(temp, x), sp.diff(temp, y)]), \"numpy\"\n", ")\n", "p0 = np.array([1.0, 3.0])\n", "g0 = nabla(*p0)\n", "xx, yy = np.meshgrid(np.linspace(0, 5, 100), np.linspace(0, 5, 100))\n", "cs = plt.contourf(\n", " xx, yy, sp.lambdify((x, y), temp, \"numpy\")(xx, yy), levels=20, cmap=\"RdBu_r\"\n", ")\n", "plt.scatter(p0[0], p0[1], color=\"k\")\n", "plt.quiver(\n", " p0[0],\n", " p0[1],\n", " -g0[0],\n", " -g0[1],\n", " angles=\"xy\",\n", " scale_units=\"xy\",\n", " scale=10,\n", " color=\"k\",\n", ")\n", "plt.xlabel(\"x\")\n", "plt.ylabel(\"y\")\n", "plt.gca().set_aspect(\"equal\")\n", "plt.title(f\"Negative gradient stemming from ${sp.latex(p0)}$\")\n", "plt.colorbar(cs, label=\"z\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Continuing the previous example in $\\mathbb{R}^3$, if our goal is to find a minimum of the function $f$, one approach is to update our position by taking a step of size $S$ in the opposite direction of the gradient:\n", "\n", "$\\vec{p}_{next} = \\langle 0.3, 0.2, 0.8 \\rangle - S \\cdot \\text{sign}(\\langle 0.05, -0.2, -0.1 \\rangle)$\n", "\n", "This approach doesn't use all the information contained in the gradient, just the direction. It's important to use also the steepness of the slope, so that we can take a larger step size if it's very steep (far from the minimum) and a smaller step size if it's flat (near the minimum.)\n", "\n", "A better approach involves taking a step proportional to the value of the negative gradient.\n", "\n", "$\\vec{p}_{next} = \\langle 0.3, 0.2, 0.8 \\rangle - \\alpha \\cdot \\langle0.05, -0.2, -0.1 \\rangle$\n", "\n", "> 🔑 $\\alpha$ is called learning rate and controls the step size\n", "\n", "The above is the main idea of the gradient descent algorithm.\n", "\n", "Below the pseudocode of a typical implementation of the gradient descent algorithm for $T$ steps or until convergence, where convergence is define in terms of the norm of the gradient.\n", "\n", "$\\begin{array}{l}\n", "\\textbf{Algorithm: } \\text{Gradient Descent} \\\\\n", "\\textbf{Input: } \\text{initial parameters } p_0, \\text{ learning rate } \\alpha, \\text{ number of iterations } T, \\text{ convergence } \\epsilon \\\\\n", "\\textbf{Output: } \\text{final parameters } p\\\\\n", "\\phantom{0}1 : p \\gets p_0 \\\\\n", "\\phantom{0}2 : \\text{evaluate gradient } \\nabla f(p) \\\\\n", "\\phantom{0}3 : \\text{for i = 1 to T do} \\\\\n", "\\phantom{0}4 : \\quad p_{new} \\gets p - \\alpha \\nabla f(p) \\\\\n", "\\phantom{0}5 : \\quad \\text{evaluate gradient } \\nabla f(p_{new}) \\\\\n", "\\phantom{0}6 : \\quad \\text{if } \\|\\nabla f(p_{new})\\| < \\epsilon \\text{ then} \\\\\n", "\\phantom{0}7 : \\quad \\quad \\text{return } p_{new} \\\\\n", "\\phantom{0}8 : \\quad \\text {end if} \\\\\n", "\\phantom{0}9 : \\quad p \\gets p_{new} \\\\\n", "10: \\text{end for} \\\\\n", "11: \\text{return } p\n", "\\end{array}$\n", "\n", "> 🔑 Gradient descent is an iterative algorithm that uses the information contained in the gradient at a point to find the local minimum of a differentiable function" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def bivariate_gradient_descent(f, symbols, initial, steps, learning_rate):\n", " x, y = symbols\n", " nabla = sp.lambdify(\n", " (x, y), sp.Matrix([sp.diff(f, x), sp.diff(f, y)]), \"numpy\"\n", " )\n", " p = np.zeros((steps, 2))\n", " g = np.zeros((steps, 2))\n", " step_vector = np.zeros((steps, 2))\n", " p[0] = initial\n", " g[0] = nabla(*p[0]).squeeze()\n", " step_vector[0] = learning_rate * g[0]\n", " for i in range(1, steps):\n", " p[i] = p[i - 1] - step_vector[i - 1]\n", " g[i] = nabla(*p[i]).squeeze()\n", " step_vector[i] = learning_rate * g[i]\n", " if np.linalg.norm(g[i]) < 1e-4:\n", " break\n", " return p[:i], g[:i], step_vector[:i]\n", "\n", "\n", "def fixup_animation_js(html_animation):\n", " html_animation = html_animation.replace(\n", " '