',\n",
" )\n",
" animation_id = re.findall(r\"onclick=\\\"(.*)\\.\", html_animation)[0]\n",
" img_id = re.findall(r\"
\n",
"setupAnimationIntersectionObserver('{animation_id}', '{img_id}');\n",
"\n",
"\"\"\"\n",
" return html_animation\n",
"\n",
"\n",
"def _update(frame):\n",
" global convex_set, half_space, in_set, out_set\n",
" half_space.remove()\n",
" convex_set.remove()\n",
" in_set.remove()\n",
" out_set.remove()\n",
" ax.plot(x_, Y[:, frame], \"tab:blue\", lw=1)\n",
" line = LineString(list(zip(x_, Y[:, frame])))\n",
" poly = Polygon([line.coords[0], line.coords[-1], vertices[frame]])\n",
" (half_space,) = ax.fill(*poly.exterior.xy, color=\"tab:blue\", alpha=0.1)\n",
" if frame > 0:\n",
" poly_inter = poly.intersection(polygons[-1])\n",
" else:\n",
" poly_inter = poly\n",
" polygons.append(poly_inter)\n",
" (convex_set,) = ax.fill(*poly_inter.exterior.xy, color=\"tab:blue\", alpha=0.5)\n",
" in_set = ax.scatter(0, 1, color=\"tab:blue\", marker=\"o\", label=\"In set\")\n",
" ax.spines[[\"left\", \"bottom\"]].set_position((\"data\", 0))\n",
" out_set = ax.scatter(\n",
" 5 / 2,\n",
" 5,\n",
" color=\"tab:blue\" if frame < 4 else \"tab:orange\",\n",
" marker=\"d\",\n",
" label=\"In set\" if frame < 4 else \"Not in set\",\n",
" zorder=2,\n",
" )\n",
" ax.legend(loc=\"upper right\")\n",
"\n",
"\n",
"fig, ax = plt.subplots()\n",
"x_ = np.linspace(-30, 30, 250)\n",
"Y = np.zeros((250, 5))\n",
"Y[:, 0] = np.full_like(x_, 1)\n",
"m = np.array([1, -1.5, 0.5, -1.2])\n",
"b = np.array([9, 9, 6, 6])\n",
"Y[:, 1:] = m * x_.reshape(-1, 1) + b\n",
"vertices = [(-30, 30), (30, -30), (-30, -30), (30, -30), (-30, -30)]\n",
"polygons = []\n",
"(half_space,) = ax.fill([], [])\n",
"(convex_set,) = ax.fill([], [])\n",
"in_set = ax.scatter([], [])\n",
"out_set = ax.scatter([], [])\n",
"ax.spines[[\"left\", \"bottom\"]].set_position((\"data\", 0))\n",
"ax.text(1.02, 0, \"$x$\", transform=ax.get_yaxis_transform())\n",
"ax.text(0, 1.02, \"$y$\", transform=ax.get_xaxis_transform())\n",
"ax.set_xlim(-10, 10)\n",
"ax.set_ylim(-5, 10)\n",
"plt.title(\n",
" \"Multiple linear constraints define the intersection of convex sets\", pad=20.0\n",
")\n",
"\n",
"ani = FuncAnimation(fig, _update, frames=Y.shape[1], repeat=False, interval=1000)\n",
"html_animation = ani.to_jshtml(default_mode=\"loop\")\n",
"if \"runtime\" not in get_ipython().config.IPKernelApp.connection_file:\n",
" html_animation = fixup_animation_js(html_animation)\n",
"display(HTML(html_animation))\n",
"plt.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's see if $(0, 1)$ belongs to the feasible set\n",
"\n",
"$0 \\ge 0$\n",
"\n",
"$8 \\ge 0$\n",
"\n",
"$8 \\ge 0$\n",
"\n",
"$5 \\ge 0$\n",
"\n",
"$5\\ge 0$\n",
"\n",
"All constraints are satisfied.\n",
"\n",
"What about $(\\cfrac{5}{2}, 5)$?\n",
"\n",
"$4 \\ge 0$\n",
"\n",
"$\\cfrac{13}{2} \\ge 0$\n",
"\n",
"$\\cfrac{1}{4} \\ge 0$\n",
"\n",
"$\\cfrac{9}{4} \\ge 0$\n",
"\n",
"$-2 < 0$\n",
"\n",
"The last constraint is not satisfied."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Inequality constraints and the Karush–Kuhn–Tucker conditions\n",
"\n",
"Let $\\textbf{P}$ a problem with $m$ inequality constraints\n",
"\n",
"$\\begin{array}{l}\n",
"\\textbf{P} \\\\\n",
"\\min \\limits_{x} f(x) \\\\\n",
"\\text{subject to} \\quad g_i(x) \\ge 0 \\quad \\forall i=1,\\dots,m\n",
"\\end{array}$\n",
"\n",
"It turns out inequality constraints can be formulated as equality constraints by introducing a non-negative slack variable $s$.\n",
"\n",
"$\\begin{array}{l}\n",
"\\min \\limits_{x} f(x) \\\\\n",
"\\text{subject to} \\quad g_i(x) - s_i = 0\n",
"\\end{array}$\n",
"\n",
"- If $g_i(x)>0$: The slack variable $s_i$ takes the value $−g_i(x)$, so the sum is zero.\n",
"- If $g_i(x)=0$: The slack variable $s_i$ is zero.\n",
"- If $g_i(x)<0$: No non-negative value of $s_i$ can make the sum zero, which makes the equality unachievable, so it honours the original constraint.\n",
"\n",
"The Lagrangian of this problem is\n",
"\n",
"$\\mathcal{L}(x, \\lambda, s) = f(x) - \\sum \\limits_{i=1}^{m} \\lambda_i(g_i(x) - s_i)$\n",
"\n",
"This problem has an optimal solution only if the **complementary slackness condition** is met.\n",
"\n",
"> 🔑 The complementary slackness condition states that at the optimal solution, either the Lagrange multipliers $\\lambda_i$ are zero or the respective constraints are active (the equality constraint $g_i(x^{\\star}) = 0$ holds). \n",
"\n",
"The complementary slackness condition is expressed as $\\lambda_i g_i(x^{\\star}) = 0$ $\\forall i=1,\\dots,m$.\n",
"\n",
"While it was useful to see the slack variable, in practice we do not add the slack variables but rather we \"limit\" the Lagrange multipliers for inequality constraints as non-negative.\n",
"\n",
"$\\begin{array}{l}\n",
"\\textbf{P} \\\\\n",
"\\min \\limits_{x} f(x) \\\\\n",
"\\text{subject to} \\quad g_i(x) \\ge 0 \\quad \\forall i=1,\\dots,m\n",
"\\end{array}$\n",
"\n",
"The Lagrangian of $\\textbf{P}$ is simply as if $g_i(x)$ were equality constraints.\n",
"\n",
"$\\mathcal{L}(x, \\lambda) = f(x) - \\sum \\limits_{i=1}^{m} \\lambda_i g_i(x)$\n",
"\n",
"Note that we haven't explicitly added any constraints for the Lagrange multipliers to be non-negative.\n",
"\n",
"It's just an additional condition for the problem to have an optimal solution, called **dual feasibility**.\n",
"\n",
"> 🔑 The dual feasibility condition states that at the optimal solution, the Lagrange multipliers $\\lambda_i$ must be non-negative.\n",
"\n",
"The dual feasibility condition is expressed as $\\lambda_i \\ge 0$ $\\forall i=1,\\dots,m$.\n",
"\n",
"The complementary slackness and dual feasibility conditions are two of the four conditions known as Karush–Kuhn–Tucker (KKT) conditions.\n",
"\n",
"The other two, **stationarity** and **primal feasibility** conditions simply state that at the optimal solution the gradient of the Lagrangian must be zero and all the constraints are satisfied.\n",
"\n",
"As an example, let $\\textbf{Q}$ a convex optimization problem with a quadratic objective function and three linear inequality constraint defining a bounded convex set.\n",
"\n",
"$\\begin{array}{l}\n",
"\\textbf{Q} \\\\\n",
"\\min \\limits_{x, y} x^2+y^2 \\\\\n",
"\\text{subject to} \\quad y + 4 \\ge 0 \\\\\n",
"\\quad \\quad \\quad \\quad \\quad x - y -\\cfrac{3}{2} \\ge 0 \\\\\n",
"\\quad \\quad \\quad \\quad \\quad - \\cfrac{3}{2}x - y + 9 \\ge 0\n",
"\\end{array}$\n",
"\n",
"Let's build our intuition of this problem by looking at the contour plot."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x, y = sp.symbols(\"x, y\")\n",
"f = x**2 + y**2\n",
"\n",
"x_ = np.linspace(-30, 30, 250)\n",
"y_ = np.linspace(-30, 30, 250)\n",
"xx, yy = np.meshgrid(x_, y_)\n",
"\n",
"fig, ax = plt.subplots()\n",
"cs = ax.contour(\n",
" xx,\n",
" yy,\n",
" sp.lambdify((x, y), f, \"numpy\")(xx, yy),\n",
" levels=np.floor(np.linspace(1, 50, 11)),\n",
" cmap=\"RdBu_r\",\n",
")\n",
"polygons = []\n",
"Y = np.zeros((250, 3))\n",
"Y[:, 0] = np.full_like(x_, -4)\n",
"m = np.array([1, -1.5])\n",
"b = np.array([-1.5, 9])\n",
"Y[:, 1:] = m * x_.reshape(-1, 1) + b\n",
"vertices = [(-30, 30), (30, -30), (-30, -30)]\n",
"for i in range(Y.shape[1]):\n",
" line = LineString(list(zip(x_, Y[:, i])))\n",
" poly = Polygon([line.coords[0], line.coords[-1], vertices[i]])\n",
" if i > 0:\n",
" poly_inter = poly.intersection(polygons[-1])\n",
" else:\n",
" poly_inter = poly\n",
" polygons.append(poly_inter)\n",
"ax.fill(\n",
" *poly_inter.exterior.xy,\n",
" color=\"tab:blue\",\n",
" alpha=0.5,\n",
" edgecolor=\"k\",\n",
" linewidth=2,\n",
" zorder=2\n",
")\n",
"\n",
"ax.clabel(cs, inline=True, fontsize=10)\n",
"ax.spines[[\"left\", \"bottom\"]].set_position((\"data\", 0))\n",
"ax.text(1.02, 0, \"$x$\", transform=ax.get_yaxis_transform())\n",
"ax.text(0, 1.02, \"$y$\", transform=ax.get_xaxis_transform())\n",
"ax.set_xlim(-10, 10)\n",
"ax.set_ylim(-10, 10)\n",
"ax.set_aspect(\"equal\")\n",
"plt.title(\"The problem $\\\\text{Q}$\", pad=20.0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can visually see that the minimum should be around $(0.75, -0.75)$ where one of the sides of the triangle is tangent to the $f(x, y) = 1$ contour."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ax.scatter(\n",
" 0.75,\n",
" -0.75,\n",
" marker=\"*\",\n",
" color=\"tab:orange\",\n",
" s=100,\n",
" edgecolors=\"k\",\n",
" linewidth=0.5,\n",
" label=\"optimal solution\",\n",
" zorder=2,\n",
")\n",
"ax.legend()\n",
"fig"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's verify that $(0.75, -0.75)$ is indeed the optimal solution using the KKT conditions.\n",
"\n",
"Before we do that we need to find the optimal values for $\\lambda_1$, $\\lambda_2$ and $\\lambda_3$.\n",
"\n",
"Since $(0.75, -0.75)$ lies on the boundary of the second constraint, the second constraint is active. In fact,\n",
"\n",
"$\\cfrac{3}{4} - (-\\cfrac{3}{4}) -\\cfrac{3}{2} = 0$\n",
"\n",
"As a consequence, the others constraints must be inactive. Thus, as per the complementary slackness condition, their respective Lagrange multiplies must be 0.\n",
"\n",
"So for now we have that $\\lambda_1 = 0$ and $\\lambda_3 = 0$.\n",
"\n",
"Let's find $\\lambda_2$ by setting the partial derivatives of the Lagrangian to 0 and plugging in the optimal solution as well as $\\lambda_1 = 0$ and $\\lambda_3 = 0$.\n",
"\n",
"The Lagrangian of $\\textbf{Q}$ is\n",
"\n",
"$\\mathcal{L}(x, y, \\lambda) = x^2 + y^2 - \\lambda_{1} (y + 4) - \\lambda_{2} (x - y - \\cfrac{3}{2}) - \\lambda_{3} (-\\cfrac{3}{2} x - y + 9)$\n",
"\n",
"The partial derivatives wrt to $x$ and $y$ are\n",
"\n",
"$\\cfrac{\\partial\\mathcal{L}}{\\partial x} = 2x - \\lambda_{2} + \\cfrac{3}{2}\\lambda_{3}$\n",
"\n",
"$\\cfrac{\\partial\\mathcal{L}}{\\partial y} = 2y - \\lambda_{1} + \\lambda_{2} + \\lambda_{3}$\n",
"\n",
"At the optimal solution and for $\\lambda_1 = 0$ and $\\lambda_3 = 0$\n",
"\n",
"$\\cfrac{\\partial\\mathcal{L}}{\\partial x} = 0$ \n",
"\n",
"$2\\cfrac{3}{4} - \\lambda_{2} = 0$\n",
"\n",
"$\\lambda_{2} = \\cfrac{3}{2}$\n",
"\n",
"$\\cfrac{\\partial\\mathcal{L}}{\\partial y} = 0$\n",
"\n",
"$-2\\cfrac{3}{4} + \\lambda_{2} = 0$ \n",
"\n",
"$\\lambda_{2} = \\cfrac{3}{2}$\n",
"\n",
"We can now verify that we've satisfied all the KKT conditions for this problem, which are shown below.\n",
"\n",
"1. Stationarity\n",
"\n",
"$\\nabla \\mathcal{L}(x^{\\star},y^{\\star}) = 0$\n",
"\n",
"2. Primal feasibility\n",
"\n",
"$h_1(x^{\\star},y^{\\star}) \\ge 0$\n",
"\n",
"$h_2(x^{\\star},y^{\\star}) \\ge 0$\n",
"\n",
"$h_3(x^{\\star},y^{\\star}) \\ge 0$\n",
"\n",
"3. Dual feasibility\n",
"\n",
"$\\lambda_1^{\\star} \\ge 0$\n",
"\n",
"$\\lambda_2^{\\star} \\ge 0$\n",
"\n",
"$\\lambda_2^{\\star} \\ge 0$\n",
"\n",
"4. Complimentary slackness\n",
"\n",
"$\\lambda_1^{\\star}(h_1(x^{\\star},y^{\\star})) = 0$\n",
"\n",
"$\\lambda_2^{\\star}(h_2(x^{\\star},y^{\\star})) = 0$\n",
"\n",
"$\\lambda_2^{\\star}(h_3(x^{\\star},y^{\\star})) = 0$\n",
"\n",
"Let's verify them all."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x, y = sp.symbols(\"x, y\")\n",
"lam1, lam2, lam3 = sp.symbols(\"lambda_1, lambda_2, lambda_3\", nonnegative=True)\n",
"f = x**2 + y**2\n",
"g1 = y + 4\n",
"g2 = x - y - sp.Rational(3, 2)\n",
"g3 = -sp.Rational(3, 2) * x - y + 9\n",
"lagrangian = f - lam1 * (g1) - lam2 * (g2) - lam3 * (g3)\n",
"dLx = sp.diff(lagrangian, x)\n",
"dLy = sp.diff(lagrangian, y)\n",
"\n",
"sol = {x: 0.75, y: -0.75, lam1: 0, lam2: 1.5, lam3: 0}\n",
"# stationarity\n",
"assert dLx.subs(sol) == 0\n",
"assert dLy.subs(sol) == 0\n",
"# primal feasibility\n",
"assert g1.subs(sol) >= 0\n",
"assert g2.subs(sol) >= 0\n",
"assert g3.subs(sol) >= 0\n",
"# dual feasibility\n",
"assert sol[lam1] >= 0\n",
"assert sol[lam2] >= 0\n",
"assert sol[lam3] >= 0\n",
"# complementary slackness\n",
"assert sol[lam1] * g1.subs(sol) == 0\n",
"assert sol[lam2] * g2.subs(sol) == 0\n",
"assert sol[lam3] * g3.subs(sol) == 0"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's solve this problem analytically."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"solutions = sp.solve(\n",
" [dLx, dLy, lam1 * g1, lam2 * g2, lam3 * g3], [x, y, lam1, lam2, lam3], dict=True\n",
")\n",
"valid_solutions = []\n",
"for sol in solutions:\n",
" if g1.subs(sol) >= 0 and g2.subs(sol) >= 0 and g3.subs(sol) >= 0:\n",
" valid_solutions.append(sol)\n",
"\n",
"for sol in valid_solutions:\n",
" for k, v in sol.items():\n",
" display(Math(sp.latex(k) + \"=\" + sp.latex(v)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Wolfe dual problem\n",
"\n",
"> 🔑 The duality principle states that optimization problems may be viewed from either of two perspectives, the primal problem or the dual problem. If the primal is a minimization problem then the dual is a maximization problem (and vice versa).\n",
"\n",
"As before, let $\\textbf{P}$ a **convex** optimization problem with $m$ inequality constraints\n",
"\n",
"$\\begin{array}{l}\n",
"\\textbf{P} \\\\\n",
"\\min \\limits_{x} f(x) \\\\\n",
"\\text{subject to} \\quad g_i(x) \\ge 0 \\quad \\forall i=1,\\dots,m\n",
"\\end{array}$\n",
"\n",
"This is the **primal** problem and the Lagrangian of $\\textbf{P}$ is the usual\n",
"\n",
"$\\mathcal{L}(x, \\lambda) = f(x) - \\sum \\limits_{i=1}^{m} \\lambda_i g_i(x)$\n",
"\n",
"For a **convex** problem $\\textbf{P}$, its corresponding dual problem is termed as the Wolfe dual.\n",
"\n",
"$\\begin{array}{l}\n",
"\\textbf{WD}_P \\\\\n",
"\\max \\limits_{\\lambda} h(\\lambda)\\\\\n",
"\\text{subject to} \\quad \\lambda_i \\ge 0 \\quad \\forall i=1,\\dots,m\n",
"\\end{array}$\n",
"\n",
"where $h(\\lambda) = \\min \\limits_{x} \\mathcal{L}(x, \\lambda)$.\n",
"\n",
"Recall the KKT conditions and note how the dual feasibility condition $\\lambda \\ge 0$ refers to the satisfaction of the constraints of the dual problem, similarly to how the primal feasibility condition $g(x) \\ge 0$ refers to the constraints of the constraints of the primal problem.\n",
"\n",
"$h(\\lambda)$ is the Lagrangian uniquely expressed as a linear function of $\\lambda$ evaluated at the optimal solution $x^{\\star}$.\n",
"\n",
"It's obtained by setting the partial derivative of the Lagrangian wrt x to 0, solving it for $x^{\\star}$ and substituting its equation back into the Lagrangian so $x$ is removed.\n",
"\n",
"This is best explained with an example.\n",
"\n",
"Recall, the problem $\\textbf{Q}$\n",
"\n",
"$\\begin{array}{l}\n",
"\\textbf{Q} \\\\\n",
"\\min \\limits_{x, y} x^2+y^2 \\\\\n",
"\\text{subject to} \\quad y + 4 \\ge 0 \\\\\n",
"\\quad \\quad \\quad \\quad \\quad x - y -\\cfrac{3}{2} \\ge 0 \\\\\n",
"\\quad \\quad \\quad \\quad \\quad - \\cfrac{3}{2}x - y + 9 \\ge 0\n",
"\\end{array}$\n",
"\n",
"whose Lagrangian is\n",
"\n",
"$\\mathcal{L}(x, y, \\lambda) = x^2 + y^2 - \\lambda_{1} (y + 4) - \\lambda_{2} (x - y - \\cfrac{3}{2}) - \\lambda_{3} (-\\cfrac{3}{2} x - y + 9)$\n",
"\n",
"To obtain $h(\\lambda) = \\min \\limits_{x} \\mathcal{L}(x, \\lambda)$ we set the partial derivatives of the Lagrangian to 0, solve for $x^{\\star}$ and $y^{\\star}$ and substitute their respective linear functions of $\\lambda$ back into the Lagrangian.\n",
"\n",
"$\\cfrac{\\partial\\mathcal{L}}{\\partial x} = 0$\n",
"\n",
"$2x^{\\star} - \\lambda_{2} + \\cfrac{3}{2}\\lambda_{3} = 0$\n",
"\n",
"$x^{\\star} = \\cfrac{2\\lambda_{2}-3\\lambda_{3}}{4}$\n",
"\n",
"$\\cfrac{\\partial\\mathcal{L}}{\\partial y} = 0$\n",
"\n",
"$2y - \\lambda_{1} + \\lambda_{2} + \\lambda_{3} = 0$\n",
"\n",
"$y^{\\star} = \\cfrac{\\lambda_{1} - \\lambda_{2} - \\lambda_{3}}{2}$\n",
"\n",
"$h(\\lambda) = \\cfrac{(2 \\lambda_{2} - 3 \\lambda_{3})^{2}}{16} + \\cfrac{(\\lambda_{1} - \\lambda_{2} - \\lambda_{3})^{2}}{4} - \\lambda_{1} \\left(\\cfrac{\\lambda_{1} - \\lambda_{2} - \\lambda_{3}}{2} + 4\\right) - \\lambda_{2} \\left(\\cfrac{2 \\lambda_{2} - 3 \\lambda_{3}}{4} - \\cfrac{\\lambda_{1} - \\lambda_{2} - \\lambda_{3}}{2} - \\cfrac{3}{2}\\right) - \\lambda_{3} \\left(- \\cfrac{3 (2 \\lambda_{2} - 3 \\lambda_{3})}{8} - \\cfrac{\\lambda_{1} - \\lambda_{2} - \\lambda_{3}}{2} + 9\\right)$\n",
"\n",
"The Wolfe dual problem can equivalently be formulated as\n",
"\n",
"$\\begin{array}{l}\n",
"\\textbf{WD}_P \\\\\n",
"\\max \\limits_{x, \\lambda} \\mathcal{L}(x, \\lambda)\\\\\n",
"\\text{subject to} \\quad \\lambda_i \\ge 0 \\quad \\forall i=1,\\dots,m \\\\\n",
"\\quad \\quad \\quad \\quad \\quad \\cfrac{\\partial}{\\partial x}\\mathcal{L}(x, \\lambda) = 0\n",
"\\end{array}$\n",
"\n",
"To analytically solve a minimization problem using the Lagrangian dual we can follow these steps:\n",
"\n",
"1. Solve the Lagrangian dual for all $\\lambda_i$. We might need to solve it for different combinations of active constraints.\n",
"2. Evaluate the equations for $x^{\\star}$ and $y^{\\star}$ used to created $h(\\lambda)$\n",
"3. Check the KKT conditions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x, y = sp.symbols(\"x, y\")\n",
"lam1, lam2, lam3 = sp.symbols(\"lambda_1, lambda_2, lambda_3\", nonnegative=True)\n",
"f = x**2 + y**2\n",
"g1 = y + 4\n",
"g2 = x - y - sp.Rational(3, 2)\n",
"g3 = -sp.Rational(3, 2) * x - y + 9\n",
"lagrangian = f - lam1 * (g1) - lam2 * (g2) - lam3 * (g3)\n",
"h = {\n",
" x: sp.solve(sp.diff(lagrangian, x), x)[0],\n",
" y: sp.solve(sp.diff(lagrangian, y), y)[0],\n",
"}\n",
"lagrangian_dual = lagrangian.subs(h)\n",
"eval_set = [\n",
" dict(ChainMap(*t))\n",
" for t in [\n",
" c\n",
" for i in range(1, 4)\n",
" for c in combinations([{lam1: 0}, {lam2: 0}, {lam3: 0}], i)\n",
" ]\n",
"]\n",
"valid_solutions = []\n",
"for e in eval_set:\n",
" lambda_sol = sp.solve(\n",
" [\n",
" sp.diff(lagrangian_dual.subs(e), lam1),\n",
" sp.diff(lagrangian_dual.subs(e), lam2),\n",
" sp.diff(lagrangian_dual.subs(e), lam3),\n",
" ],\n",
" [lam1, lam2, lam3],\n",
" dict=True,\n",
" )\n",
" if lambda_sol:\n",
" for s in lambda_sol:\n",
" x_star = h[x].subs(e | s)\n",
" y_star = h[y].subs(e | s)\n",
" sol = dict(ChainMap(e, s, {x: x_star, y: y_star}))\n",
" assert (\n",
" g1.subs(sol) >= 0 and g2.subs(sol) >= 0 and g3.subs(sol) >= 0\n",
" ), \"Primal feasibility not satisfied\"\n",
" assert (\n",
" sol[lam1] * g1.subs(sol) == 0\n",
" and sol[lam2] * g2.subs(sol) == 0\n",
" and sol[lam3] * g3.subs(sol) == 0\n",
" ), \"Complementary slackness not satisfied\"\n",
" valid_solutions.append(sol)\n",
"for sol in valid_solutions:\n",
" for k, v in sol.items():\n",
" display(Math(sp.latex(k) + \"=\" + sp.latex(v)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This might look like an unnecessary convoluted way to get to the same result we got by solving the primal problem directly, but the Lagrangian dual has become a common approach to solving optimization problems. \n",
"\n",
"Many problems can be **efficiently** solved by constructing the Lagrangian function of the problem and solving the dual problem instead of the primal problem."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Optimizing SVM"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Recall in [Linear Algebra](../linear_algebra/la_extra.html#Support-Vector-Machines) we applied concepts of linear algebra to SVM.\n",
"\n",
"We derived the margin $\\gamma(w, b) = \\min\\cfrac{|wx_i+b|}{\\|w\\|}$ from the formula of the distance between a point a line $d = \\cfrac{|ax_0 + by_0 + b|}{\\sqrt{a^2 + b^2}}$.\n",
"\n",
"We then translated the objective in plain english \"find the decision boundary that maximizes the margin between two classes such that the points belonging to each class lie on the correct side of the boundary\" to the following constrained problem\n",
"\n",
"$\\begin{array}{l}\n",
"\\min \\limits_{w, b} \\cfrac{1}{2} w \\cdot w \\\\ \n",
"\\text{subject to } y_i(w \\cdot x_i + b) \\ge 1 \\text{ } \\forall i = 1, \\dots, m\n",
"\\end{array}$\n",
"\n",
"We used `scipy.minimize` to solve it but we now have all the tools (Lagrange multipliers, KKT conditions, Wolfe dual) to solve it ourselves.\n",
"\n",
"The Wolfe dual is\n",
"\n",
"$\\begin{array}{l}\n",
"\\max \\limits_{\\alpha} h(\\alpha)\\\\\n",
"\\text{subject to} \\quad \\alpha_i \\ge 0 \\quad \\forall i=1,\\dots,m\n",
"\\end{array}$\n",
"\n",
"where $h(\\alpha) = \\min \\limits_{w, b} \\mathcal{L}(w,b,\\alpha)$.\n",
"\n",
"Note we use $\\alpha$ instead of $\\lambda$ because it's the notation commonly used in the SVM literature.\n",
"\n",
"Let's create the Lagrangian dual.\n",
"\n",
"**Step 1**: Form the Lagrangian function of the primal\n",
"\n",
"$\\mathcal{L}(w, b, \\alpha) = \\cfrac{1}{2} w \\cdot w - \\sum \\limits_{i=1}^{m} \\alpha_i[y_i(w \\cdot x_i + b) - 1]$\n",
"\n",
"**Step 2a:** Calculate the partial derivative of $\\mathcal{L}(w, b, \\alpha)$ wrt $w$, set it to 0, solve for $w^{\\star}$ and substitute it into $\\mathcal{L}(w, b, \\alpha)$ to obtain $h(b, \\alpha)$\n",
"\n",
"$\\cfrac{\\partial}{\\partial w}\\mathcal{L} = 0$\n",
"\n",
"$w^{\\star} - \\sum \\limits_{i=1}^{m} \\alpha_iy_ix_i = 0$\n",
"\n",
"$w^{\\star} = \\left(\\sum \\limits_{i=1}^{m} \\alpha_iy_ix_i\\right)$\n",
"\n",
"$h(b, \\alpha) = \\cfrac{1}{2} \\left(\\sum \\limits_{i=1}^{m} \\alpha_iy_ix_i\\right) \\cdot \\left(\\sum \\limits_{j=1}^{m} \\alpha_jy_jx_j\\right) - \\sum \\limits_{i=1}^{m} \\alpha_i[y_i(\\left(\\sum \\limits_{j=1}^{m} \\alpha_jy_jx_j\\right) \\cdot x_i + b) - 1]$\n",
"\n",
"$= \\cfrac{1}{2} \\sum \\limits_{i=1}^{m} \\sum \\limits_{j=1}^{m} \\alpha_i\\alpha_jy_iy_jx_i \\cdot x_j - \\sum \\limits_{i=1}^{m} \\sum \\limits_{j=1}^{m} \\alpha_i\\alpha_jy_iy_jx_i \\cdot x_j - b \\sum \\limits_{i=1}^{m} \\alpha_iy_i + \\sum \\limits_{i=1}^{m} \\alpha_i$\n",
"\n",
"$= - \\cfrac{1}{2} \\sum \\limits_{i=1}^{m} \\sum \\limits_{j=1}^{m} \\alpha_i\\alpha_jy_iy_jx_i \\cdot x_j - b \\sum \\limits_{i=1}^{m} \\alpha_iy_i + \\sum \\limits_{i=1}^{m} \\alpha_i$\n",
"\n",
"**Step 2b:** Calculate the partial derivative of $\\mathcal{L}(w, b, \\alpha)$ wrt $b$, set it to 0, solve for $b^{\\star}$ and substitute it into $\\mathcal{L}(w, b, \\alpha)$ to obtain $h(\\alpha)$\n",
"\n",
"$\\cfrac{\\partial}{\\partial b}\\mathcal{L} = 0$\n",
"\n",
"$-\\sum \\limits_{i=1}^{m} \\alpha_iy_i = 0$\n",
"\n",
"To impose the partial derivative wrt to $b$ to be 0 we'll add a constraint to the dual problem.\n",
"\n",
"$\\begin{array}{l}\n",
"\\max \\limits_{\\alpha} h(\\alpha) \\\\\n",
"\\text{subject to} \\quad \\alpha_i \\ge 0 \\quad \\forall i=1,\\dots,m \\\\\n",
"\\quad \\quad \\quad \\quad \\quad \\sum \\limits_{i=1}^{m} \\alpha_iy_i = 0\n",
"\\end{array}$\n",
"\n",
"So we get that $h(\\alpha)$ is\n",
"\n",
"$h(\\alpha) = - \\cfrac{1}{2} \\sum \\limits_{i=1}^{m} \\sum \\limits_{j=1}^{m} \\alpha_i\\alpha_jy_iy_jx_i \\cdot x_j - b\\underbrace{\\sum \\limits_{i=1}^{m} \\alpha_iy_i}_{0} + \\sum \\limits_{i=1}^{m} \\alpha_i$\n",
"\n",
"$= - \\cfrac{1}{2} \\sum \\limits_{i=1}^{m} \\sum \\limits_{j=1}^{m} \\alpha_i\\alpha_jy_iy_jx_i \\cdot x_j + \\sum \\limits_{i=1}^{m} \\alpha_i$\n",
"\n",
"And here's the dual we need to solve.\n",
"\n",
"$\\begin{array}{l}\n",
"\\max \\limits_{\\alpha} \\sum \\limits_{i=1}^{m} \\alpha_i - \\cfrac{1}{2} \\sum \\limits_{i=1}^{m} \\sum \\limits_{j=1}^{m} \\alpha_i\\alpha_jy_iy_jx_i \\cdot x_j \\\\\n",
"\\text{subject to} \\quad \\alpha_i \\ge 0 \\quad \\forall i=1,\\dots,m \\\\\n",
"\\quad \\quad \\quad \\quad \\quad \\sum \\limits_{i=1}^{m} \\alpha_iy_i = 0\n",
"\\end{array}$\n",
"\n",
"Once we've solved the Lagrangian dual for the multipliers $\\alpha$'s, we get $w^{\\star}$ from\n",
"\n",
"$w^{\\star} = \\sum \\limits_{i=1}^{m} \\alpha_iy_ix_i$\n",
"\n",
"To get $b^{\\star}$, though, the process is a little bit more complicated.\n",
"\n",
"We can use the constraint $y_i(w \\cdot x_i + b) - 1 \\ge 0 \\quad \\forall i=1,\\dots,m$.\n",
"\n",
"Recall for the complementary slackness condition the constraint is equal to 0 when is active, so that $\\alpha_i$ may be greater than 0.\n",
"\n",
"$y_i(w \\cdot x_i^{SV} + b) - 1 = 0 \\quad \\forall i=1,\\dots,s$\n",
"\n",
"Note we put a $SV$ superscript on the $x_i$'s for which the constraint is active, and $s$ is the number of active constraints.\n",
"\n",
"$x_i^{SV}$ are the support vectors which the algorithm takes its name from.\n",
"\n",
"We replace $w$ with $w^{\\star}$ and we solve for $b^{\\star}$.\n",
"\n",
"$\\sum \\limits_{i=1}^{s} y_i(w^{\\star} \\cdot x_i^{SV} + b^{\\star}) - 1 = 0$\n",
"\n",
"We multiply both sides by $y_i$.\n",
"\n",
"$y_i \\left[\\sum \\limits_{i=1}^{s} y_i(w^{\\star} \\cdot x_i^{SV} + b^{\\star}) - 1 \\right] = 0y_i$\n",
"\n",
"$\\sum \\limits_{i=1}^{s} \\left[y_i^2(w^{\\star} \\cdot x_i^{SV} + b^{\\star}) - y_i \\right] = 0$\n",
"\n",
"Because $y_i \\in \\{-1, 1\\}$ then $y_i^2 = 1$\n",
"\n",
"$\\sum \\limits_{i=1}^{s} w^{\\star} \\cdot x_i^{SV} + \\sum \\limits_{i=1}^{s} b^{\\star} - \\sum \\limits_{i=1}^{s} y_i = 0$\n",
"\n",
"$\\sum \\limits_{i=1}^{s} b^{\\star} = \\sum \\limits_{i=1}^{s} y_i - \\sum \\limits_{i=1}^{s} w^{\\star} \\cdot x_i^{SV}$\n",
"\n",
"$b^{\\star} = \\cfrac{1}{s} \\sum \\limits_{i=1}^{s} (y_i - w^{\\star} \\cdot x_i^{SV})$\n",
"\n",
"Let's try to solve a small problem analytically."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"m = 6\n",
"k = 2\n",
"neg_centroid = [-1, -1]\n",
"pos_centroid = [1, 1]\n",
"\n",
"rng = np.random.default_rng(1)\n",
"X = np.r_[\n",
" rng.standard_normal((m // 2, k)) + neg_centroid,\n",
" rng.standard_normal((m // 2, k)) + pos_centroid,\n",
"].T\n",
"Y = np.array([[-1.0] * (m // 2) + [1.0] * (m // 2)])\n",
"\n",
"fig, ax = plt.subplots()\n",
"ax.scatter(X[0], X[1], c=np.where(Y.squeeze() == -1, \"tab:orange\", \"tab:blue\"))\n",
"for i in range(m):\n",
" ax.text(*X[:, i] + 0.2, i + 1, color=\"tab:orange\" if Y[0, i] == -1 else \"tab:blue\")\n",
"ax.set_aspect(\"equal\")\n",
"ax.set_xlabel(\"$x_1$\")\n",
"ax.set_ylabel(\"$x_2$\")\n",
"ax.set_xlim(-5, 5)\n",
"ax.set_ylim(-5, 5)\n",
"plt.title(\"Classification data\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It seems that the support vectors are 1 and 4.\n",
"\n",
"This helps us solve the Lagrangian dual analytically because we only need to solve it for $\\alpha_1$ and $\\alpha_2$. We'll set the others to 0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"i = sp.symbols(\"i\", cls=sp.Idx)\n",
"a = sp.IndexedBase(\"a\", nonnegative=True)\n",
"x1 = sp.IndexedBase(\"x1\")\n",
"x2 = sp.IndexedBase(\"x2\")\n",
"y = sp.IndexedBase(\"y\")\n",
"w1, w2, b = sp.symbols(\"w1, w2, b\")\n",
"w = sp.Matrix([w1, w2])\n",
"x = sp.Matrix([x1[i], x2[i]])\n",
"f = (0.5 * w.T * w)[0]\n",
"g = y[i] * ((w.T * x)[0] + b) - 1\n",
"lagrangian = f - sp.Sum(a[i] * g, (i, 1, m))\n",
"dLw1 = sp.diff(lagrangian, w)[0]\n",
"dLw2 = sp.diff(lagrangian, w)[1]\n",
"dLb = sp.diff(lagrangian, b)\n",
"w1_star = sp.solve(dLw1, w1)[0]\n",
"w2_star = sp.solve(dLw2, w2)[0]\n",
"lagrangian_dual = (\n",
" lagrangian.subs({w1: w1_star, w2: w2_star})\n",
" .expand()\n",
" .subs(sp.Sum(b * a[i] * y[i], (i, 1, m)), 0)\n",
" .simplify()\n",
")\n",
"\n",
"display(Math(\"\\\\mathcal{h}(\\\\alpha) =\" + sp.latex(lagrangian_dual)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It turns out the system of equations resulting from the partial derivatives of $h(\\alpha)$ is quite complicated to be solved symbolically. \n",
"\n",
"`sympy` conveniently features a numerical solver `sympy.nsolve`, although it's not designed for it. \n",
"\n",
"An alternative would be `scipy.optimize.fsolve` which can solve a system of non-linear equations. \n",
"\n",
"We'll stick with `sympy` for convenience, but later we'll take a look at a low-level package specifically designed to solve quadratic programming problems called `cvxopt`.\n",
"\n",
"`sympy.nsolve` requires an initial solution and we'll also add `verify=False` to prevent above-tolerance errors.\n",
"\n",
"We'll use the KTT conditions to verify the solution ourselves."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = dict(\n",
" ChainMap(\n",
" *[{x1[j + 1]: X[0, j], x2[j + 1]: X[1, j], y[j + 1]: Y[0, j]} for j in range(m)]\n",
" )\n",
")\n",
"a_fix = {a[2]: 0.0, a[3]: 0.0, a[5]: 0.0, a[6]: 0.0}\n",
"\n",
"for a_0 in np.linspace(0.1, 1, 50):\n",
" a_sol = sp.nsolve(\n",
" [sp.diff(lagrangian_dual.subs(data | a_fix), a[j + 1]) for j in range(m)]\n",
" + [sp.Sum(a[i] * y[i], (i, 1, m)).doit().subs(data | a_fix)],\n",
" [a[1], a[4]],\n",
" [a_0, a_0],\n",
" dict=True,\n",
" verify=False,\n",
" )\n",
" a_sol = a_fix | a_sol[0]\n",
" if sp.Sum(a[i] * y[i], (i, 1, m)).doit().subs(data | a_sol) == 0:\n",
" break\n",
"\n",
"a_sol = {\n",
" k: v\n",
" for k, v in sorted(zip(a_sol.keys(), a_sol.values()), key=lambda a: a[0].indices)\n",
"}\n",
"for k, v in a_sol.items():\n",
" display(Math(sp.latex(k) + \"=\" + str(v)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's verify the stationary condition for the dual."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for j in range(m):\n",
" assert sp.diff(lagrangian_dual.subs(data | a_sol), a[j + 1]) == 0\n",
"assert sp.Sum(a[i] * y[i], (i, 1, m)).doit().subs(data | a_sol) == 0"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's calculate $w^{\\star}$ and $b^{\\star}$.\n",
"\n",
"Recall\n",
"\n",
"$w^{\\star} = \\sum \\limits_{i=1}^{m} \\alpha_iy_ix_i$\n",
"\n",
"$b^{\\star} = \\cfrac{1}{s} \\sum \\limits_{i=1}^{s} (y_i - w^{\\star} \\cdot x_i^{SV})$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"alpha = np.array(list(a_sol.values()), dtype=\"float\")\n",
"sv = np.argwhere(alpha > 1e-5).squeeze()\n",
"w_star = (alpha * Y) @ X.T\n",
"b_star = np.mean(Y[:, sv] - w_star @ X[:, sv])\n",
"\n",
"display(Math(\"w^{\\\\star} =\" + sp.latex(sp.Matrix(w_star))))\n",
"display(Math(\"b^{\\\\star} =\" + str(b_star)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It turns out we could have calculated $w^{\\star}$ just using the support vectors too, because the others $\\alpha_i$ are 0's.\n",
"\n",
"Although in practice the $\\alpha_i$ are never exactly 0. But in this case we can verify that it would have produced the exact same result."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.array_equal(w_star, (alpha[sv] * Y[:, sv]) @ X[:, sv].T)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's verify the KKT conditions for the primal."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wb_sol = {w1: w_star[0][0], w2: w_star[0][1], b: b_star}\n",
"sol = data | a_sol | wb_sol\n",
"# stationarity\n",
"assert dLw1.doit().subs(sol) == 0\n",
"assert dLw2.doit().subs(sol) == 0\n",
"assert dLb.doit().subs(sol) == 0\n",
"# primal feasibility\n",
"for j in range(m):\n",
" try:\n",
" g_eval = g.subs({i: j + 1}).subs(sol)\n",
" assert g_eval >= 0, f\"constraint {j+1} not satisfied: {g_eval:.2f}\"\n",
" except AssertionError as e:\n",
" print(e)\n",
"# dual feasibility\n",
"for j in range(m):\n",
" assert a_sol[a[j + 1]] >= 0\n",
"# complementary slackness\n",
"for j in range(m):\n",
" try:\n",
" ag_eval = a_sol[a[j + 1]] * g.subs({i: j + 1}).subs(sol)\n",
" assert (\n",
" ag_eval == 0\n",
" ), f\"complementary slackness {j+1} not satisfied: {ag_eval:.2f}\"\n",
" except AssertionError as e:\n",
" print(e)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We're not at the optimal solution, but we're close.\n",
"\n",
"Let's plot it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.scatter(X[0], X[1], c=np.where(Y.squeeze() == -1, \"tab:orange\", \"tab:blue\"))\n",
"x1 = np.linspace(-4, 4, 100)\n",
"plt.plot(x1, (-w_star[0][0] * x1 - b_star) / w_star[0][1], color=\"k\", alpha=0.5)\n",
"plt.scatter(\n",
" X[0, sv],\n",
" X[1, sv],\n",
" s=80,\n",
" facecolors=\"none\",\n",
" edgecolors=\"y\",\n",
" color=\"y\",\n",
" label=\"support vectors\",\n",
")\n",
"plt.xlabel(\"$x_1$\")\n",
"plt.ylabel(\"$x_2$\")\n",
"plt.title(\"Solution of the dual using sympy.nsolve\")\n",
"plt.gca().set_aspect(\"equal\")\n",
"plt.xlim(-5, 5)\n",
"plt.ylim(-5, 5)\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As mentioned earlier, `sympy` is not designed to solve this problem although in this case we managed to get an almost optimal solution.\n",
"\n",
"We'll now see how to solve this quadratic programming problem with the `cvxopt` library.\n",
"\n",
"The `cvxopt.solvers.qp` function can solve the following primal quadratic programming problem\n",
"\n",
"$\\begin{array}{l}\n",
"\\min \\limits_{x} \\cfrac{1}{2}x^TPx + q^Tx \\\\\n",
"\\text{subject to} \\quad Gx \\preceq h \\\\\n",
"\\quad \\quad \\quad \\quad \\quad Ax = b\n",
"\\end{array}$\n",
"\n",
"From the docstring of `cvxopt.solvers.qp` we see that it takes these parameters:\n",
"\n",
"- $P$: $(m \\times m)$ `d` (double) `cvxopt.matrix`\n",
"- $q$: $(m \\times 1)$ `d` (double) `cvxopt.matrix`\n",
"- $G$: $(m \\times m)$ `d` (double) `cvxopt.matrix`\n",
"- $h$: $(m \\times 1)$ `d` (double) `cvxopt.matrix`\n",
"- $A$: $(1 \\times m)$ `d` (double) `cvxopt.matrix`\n",
"- $b$: $(1 \\times 1)$ `d` (double) `cvxopt.matrix`\n",
"\n",
"The inequality constraints $Gx \\preceq h$ are\n",
"\n",
"$\\begin{bmatrix}\n",
"G_1&&0&&\\dots&&0 \\\\\n",
"0&&G_2&&\\dots&&0 \\\\\n",
"\\vdots&&\\vdots&&\\ddots&&\\vdots \\\\\n",
"0&&0&&\\dots&&G_m\n",
"\\end{bmatrix}\n",
"\\begin{bmatrix}\n",
"x_1 \\\\\n",
"x_2 \\\\\n",
"\\vdots \\\\\n",
"x_m \\\\\n",
"\\end{bmatrix}\n",
"\\preceq\n",
"\\begin{bmatrix}\n",
"h_1 \\\\\n",
"h_2 \\\\\n",
"\\vdots \\\\\n",
"h_m \\\\\n",
"\\end{bmatrix} \\Longrightarrow \\begin{bmatrix}\n",
"G_1x_1 \\\\\n",
"G_2x_2 \\\\\n",
"\\vdots \\\\\n",
"G_mx_m \\\\\n",
"\\end{bmatrix}\n",
"\\preceq\n",
"\\begin{bmatrix}\n",
"h_1 \\\\\n",
"h_2 \\\\\n",
"\\vdots \\\\\n",
"h_m \\\\\n",
"\\end{bmatrix}$\n",
"\n",
"Note $\\preceq$ represent element-wise inequalities between two column vectors, in this case $Gx$ which is $(m \\times 1)$ and $h$ which is also $(m \\times 1)$.\n",
"\n",
"The equality constraints $Ax = b$ are\n",
"\n",
"$\\begin{bmatrix}\n",
"A_1&&A_2 \\dots A_m\n",
"\\end{bmatrix}\n",
"\\begin{bmatrix}\n",
"x_1 \\\\\n",
"x_2 \\\\\n",
"\\vdots \\\\\n",
"x_m \\\\\n",
"\\end{bmatrix}=\n",
"\\begin{bmatrix}\n",
"b\n",
"\\end{bmatrix} \\Longrightarrow A_1x_1 + A_2x_2 + \\dots + A_mx_m= b$\n",
"\n",
"We need to re-write our problem so it conforms with `cvxopt.solvers.qp` requirements.\n",
"\n",
"Recall the dual is\n",
"\n",
"$\\begin{array}{l}\n",
"\\max \\limits_{\\alpha} \\sum \\limits_{i=1}^{m} \\alpha_i - \\cfrac{1}{2} \\sum \\limits_{i=1}^{m} \\sum \\limits_{j=1}^{m} \\alpha_i\\alpha_jy_iy_jx_i \\cdot x_j \\\\\n",
"\\text{subject to} \\quad \\alpha_i \\ge 0 \\quad \\forall i=1,\\dots,m \\\\\n",
"\\quad \\quad \\quad \\quad \\quad \\sum \\limits_{i=1}^{m} \\alpha_iy_i = 0\n",
"\\end{array}$\n",
"\n",
"Writing as a minimization and with matrix notations becomes\n",
"\n",
"$\\begin{array}{l}\n",
"\\min \\limits_{\\alpha} \\cfrac{1}{2} \\alpha^T (yy^Tx_i \\cdot x_j) \\alpha - \\alpha \\\\\n",
"\\text{subject to} \\quad \\alpha \\succeq 0 \\\\\n",
"\\quad \\quad \\quad \\quad \\quad y \\alpha = 0\n",
"\\end{array}$\n",
"\n",
"Then we note that:\n",
"- $x$ is $\\alpha$ (what `cvxopt.solvers.qp` solve for) and it's a column vector\n",
"- $P$ corresponds to $(yy^Tx_i \\cdot x_j)$\n",
"- $q$ corresponds to a column vector of -1's to make our minus a plus ($q$ gets transposed)\n",
"- $G$ is a diagonal matrix of -1's to make our $\\succeq$ a $\\preceq$\n",
"- $h$ corresponds to a column vector of 0's\n",
"- $A$ corresponds $y$ which is a row vector\n",
"- $b$ corresponds to 0\n",
"\n",
"Let $P = yy^TK$, where $K = x_i \\cdot x_j$.\n",
"\n",
"Let $yy^T$ a square matrix resulting from the outer product of $y$\n",
"\n",
"$yy^T = \\begin{bmatrix}\n",
"y_1y_1&&\\dots&&y_my_1\\\\\n",
"\\vdots&&\\ddots&&\\vdots\\\\\n",
"y_my_1&&\\dots&&y_my_m\n",
"\\end{bmatrix}$\n",
"\n",
"$K = \\begin{bmatrix}\n",
"x_1 \\cdot x_1&&\\dots&&x_m \\cdot x_1\\\\\n",
"\\vdots&&\\ddots&&\\vdots\\\\\n",
"x_m \\cdot x_1&&\\dots&&x_m \\cdot x_m\n",
"\\end{bmatrix}$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"K = np.array([np.dot(X[:, i], X[:, j]) for j in range(m) for i in range(m)]).reshape(\n",
" (m, m)\n",
")\n",
"P_mat = cvxopt.matrix(np.outer(Y, Y.T) * K)\n",
"assert P_mat.size == (m, m)\n",
"assert P_mat.typecode == \"d\"\n",
"\n",
"Q_mat = cvxopt.matrix(np.full(m, -1.0))\n",
"assert Q_mat.size == (m, 1)\n",
"assert Q_mat.typecode == \"d\"\n",
"\n",
"G_mat = cvxopt.matrix(np.diag(np.full(m, -1.0)))\n",
"assert G_mat.size == (m, m)\n",
"assert G_mat.typecode == \"d\"\n",
"\n",
"h_mat = cvxopt.matrix(np.zeros(m))\n",
"assert h_mat.size == (m, 1)\n",
"assert h_mat.typecode == \"d\"\n",
"\n",
"A_mat = cvxopt.matrix(Y, (1, m))\n",
"assert A_mat.size == (1, m)\n",
"assert A_mat.typecode == \"d\"\n",
"\n",
"b_mat = cvxopt.matrix(0.0)\n",
"assert b_mat.size == (1, 1)\n",
"assert b_mat.typecode == \"d\"\n",
"\n",
"solution = cvxopt.solvers.qp(\n",
" P_mat, Q_mat, G_mat, h_mat, A_mat, b_mat, options=dict(show_progress=False)\n",
")\n",
"alpha = np.ravel(solution[\"x\"])\n",
"sv = np.argwhere(alpha > 1e-5).squeeze()\n",
"print(f\"The support vectors are {sv+1}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It seems our assumption about the support vectors earlier was indeed correct."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"w_star = (alpha * Y) @ X.T\n",
"b_star = np.mean(Y[:, sv] - w_star @ X[:, sv])\n",
"\n",
"display(Math(\"w^{\\\\star} =\" + sp.latex(sp.Matrix(w_star))))\n",
"display(Math(\"b^{\\\\star} =\" + str(b_star)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's verify the KKT conditions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"a_sol = {a[j + 1]: a_val for j, a_val in enumerate(alpha)}\n",
"wb_sol = {w1: w_star[0][0], w2: w_star[0][1], b: b_star}\n",
"sol = data | a_sol | wb_sol\n",
"# stationarity\n",
"assert np.isclose(float(dLw1.doit().subs(sol)), 0, atol=1e-6)\n",
"assert np.isclose(float(dLw2.doit().subs(sol)), 0, atol=1e-6)\n",
"assert np.isclose(float(dLb.doit().subs(sol)), 0, atol=1e-6)\n",
"# primal feasibility\n",
"for j in range(m):\n",
" try:\n",
" g_eval = g.subs({i: j + 1}).subs(sol)\n",
" assert g_eval >= 0, f\"constraint {j} not satisfied: {g_eval:.2f}\"\n",
" except AssertionError as e:\n",
" print(e)\n",
"# dual feasibility\n",
"for j in range(m):\n",
" assert a_sol[a[j + 1]] >= 0\n",
"# complementary slackness\n",
"for j in range(m):\n",
" try:\n",
" ag_eval = float(a_sol[a[j + 1]] * g.subs({i: j + 1}).subs(sol))\n",
" assert np.isclose(\n",
" ag_eval, 0, atol=1e-6\n",
" ), f\"complementary slackness {j} not satisfied: {ag_eval:.2f}\"\n",
" except AssertionError as e:\n",
" print(e)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This time they are all satisfied.\n",
"\n",
"Let's plot it again."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.scatter(X[0], X[1], c=np.where(Y.squeeze() == -1, \"tab:orange\", \"tab:blue\"))\n",
"x1 = np.linspace(-4, 4, 100)\n",
"plt.plot(x1, (-w_star[0][0] * x1 - b_star) / w_star[0][1], color=\"k\", alpha=0.5)\n",
"plt.scatter(\n",
" X[0, sv],\n",
" X[1, sv],\n",
" s=80,\n",
" facecolors=\"none\",\n",
" edgecolors=\"y\",\n",
" color=\"y\",\n",
" label=\"support vectors\",\n",
")\n",
"plt.xlabel(\"$x_1$\")\n",
"plt.ylabel(\"$x_2$\")\n",
"plt.title(\"Solution of the dual using cvxopt.qp\")\n",
"plt.gca().set_aspect(\"equal\")\n",
"plt.xlim(-5, 5)\n",
"plt.ylim(-5, 5)\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Quasi-newton methods"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"While Newton's method is a second-order optimization algorithm, Quasi-newton methods are a family of first-order optimization algorithms. The BFGS (Broyden–Fletcher–Goldfarb–Shanno) algorithm is one of the most popular.\n",
"\n",
"One of the weaknesses of Newton's method is its cost complexity.\n",
"\n",
"> 🔑 Newton's method has a cost complexity $O(n^3)$; specifically, $O(n^2)$ to calculate the Hessian, $O(n^3)$ to calculate it and invert it.\n",
"\n",
"Quasi-newton methods address exactly this issue.\n",
"\n",
"> 🔑 Quasi-newton methods calculate an approximation of the Hessian at each step.\n",
"\n",
"Quasi-newton optimization methods are a generalization of the secant method to find the root of the first derivative for multidimensional problems.\n",
"\n",
"The secant method looks a lot like Newton's method, but instead of one starting point, we have two starting points and the next point is the root of the secant joining these two points."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def secant(f, x, x0, x1):\n",
" return f(x0) + ((f(x1) - f(x0)) / (x1 - x0)) * (x - x0)\n",
"\n",
"\n",
"def _update(frame):\n",
" global x0_scat, x1_scat, x2_scat, min_scat, secant_line\n",
" x0_scat.remove()\n",
" x1_scat.remove()\n",
" x2_scat.remove()\n",
" secant_line.remove()\n",
" min_scat.remove()\n",
" x0_scat = ax.scatter(xs[frame - 1], df(xs[frame - 1]), color=\"tab:orange\")\n",
" x1_scat = ax.scatter(xs[frame], df(xs[frame]), color=\"tab:orange\")\n",
" (secant_line,) = ax.plot(\n",
" xx,\n",
" secant(df, xx, xs[frame - 1], xs[frame]),\n",
" linestyle=\"--\",\n",
" color=\"tab:orange\",\n",
" label=\"secant\",\n",
" )\n",
" x2_scat = ax.scatter(xs[frame + 1], 0, color=\"k\", zorder=3)\n",
" min_scat = ax.scatter(xs[frame + 1], f(xs[frame + 1]), color=\"tab:blue\")\n",
"\n",
"\n",
"x = sp.symbols(\"x\")\n",
"f = sp.exp(x) - sp.log(x)\n",
"df = sp.lambdify(x, sp.diff(f, x, 1))\n",
"f = sp.lambdify(x, f)\n",
"\n",
"xs = [1.75, 0.25]\n",
"for _ in range(4):\n",
" xs.append(xs[-1] - df(xs[-1]) * (xs[-1] - xs[-2]) / (df(xs[-1]) - df(xs[-2])))\n",
"\n",
"fig, ax = plt.subplots()\n",
"xx = np.linspace(1e-3, 2)\n",
"ax.plot(xx, f(xx), color=\"tab:blue\", label=\"$f(x)$\")\n",
"ax.plot(xx, df(xx), color=\"tab:orange\", label=\"$f'(x)$\")\n",
"x0_scat = ax.scatter([], [])\n",
"x1_scat = ax.scatter([], [])\n",
"(secant_line,) = ax.plot([], [], linestyle=\"--\", color=\"tab:orange\", label=\"secant\")\n",
"x2_scat = ax.scatter([], [])\n",
"min_scat = ax.scatter([], [])\n",
"ax.set_ylim(-5, 11)\n",
"ax.set_xlim(0, 2)\n",
"ax.spines[[\"left\", \"bottom\"]].set_position((\"data\", 0))\n",
"ax.text(1, 0, \"$x$\", transform=ax.get_yaxis_transform())\n",
"ax.text(0, 1.02, \"$f(x)$\", transform=ax.get_xaxis_transform())\n",
"plt.legend(loc=\"upper right\")\n",
"plt.title(\"Minimazation of $f(x)$ using the secant method\")\n",
"\n",
"ani = FuncAnimation(fig=fig, func=_update, frames=range(1, 5), interval=1000)\n",
"html_animation = ani.to_jshtml(default_mode=\"loop\")\n",
"if \"runtime\" not in get_ipython().config.IPKernelApp.connection_file:\n",
" html_animation = fixup_animation_js(html_animation)\n",
"display(HTML(html_animation))\n",
"plt.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Given two points $\\left(x_0, f(x_0)\\right)$ and $\\left(x_1, f(x_1)\\right)$, the slope of the line passing through these two points is the usual rise over run.\n",
"\n",
"$m = \\cfrac{f(x_1) - f(x_0)}{x_1 - x_0}$\n",
"\n",
"Recall the point-slope form of a line $y - y_1 = m(x - x_1)$. \n",
"\n",
"So the line passing through $\\left(x_1, f(x_1)\\right)$ is\n",
"\n",
"$y - f(x_1) = m(x - x_1)$\n",
"\n",
"Note we could have used $\\left(x_0, f(x_0)\\right)$ too, but it would make the parallelism in notation with Newton's method less obvious.\n",
"\n",
"Replace $m$ with the slope of the line passing through $\\left(x_0, f(x_0)\\right)$ and $\\left(x_1, f(x_1)\\right)$ and we get the equation of the secant line.\n",
"\n",
"> 📐 $y = \\cfrac{f(x_1) - f(x_0)}{x_1 - x_0}(x - x_1)+f(x_1)$\n",
"\n",
"The root of this line is at\n",
"\n",
"$\\cfrac{f(x_1) - f(x_0)}{x_1 - x_0}(x - x_1)+f(x_1) = 0$\n",
"\n",
"$x = x_1 - f(x_1)\\cfrac{x_1 - x_0}{f(x_1) - f(x_0)}$\n",
"\n",
"The general update formula of the secant optimization method is\n",
"\n",
"$x_{k+1} = x_{k} - f'(x_k)\\cfrac{x_k - x_{k-1}}{f'(x_k) - f'(x_{k-1})}$\n",
"\n",
"Recall, the update of Newton's method is\n",
"\n",
"$x_{k+1} = x_k - f'(x_k)\\cfrac{1}{f''(x_k)}$\n",
"\n",
"If the two methods are indeed similar, it must follow that\n",
"\n",
"$\\cfrac{x_k - x_{k-1}}{f'(x_k) - f'(x_{k-1})} \\approx \\cfrac{1}{f''(x_k)}$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> 🔑 The secant method is an approximation of Newton's method, for the same reason a secant line is an approximation of a tangent line\n",
"\n",
"Recall in [Week 1](ca_w1.html#Average-rate-of-change-of-a-function) we introduced derivatives by first talking about the average rate of change of function over interval $\\Delta x$ and then showing that as $\\Delta x$ approaches 0 we get to the instantaneous rate of change of the function, that is the derivative.\n",
"\n",
"$f'(x) = \\lim_{{\\Delta x}\\to{0}}\\cfrac{f(x + \\Delta x) - f(x)}{\\Delta x}$\n",
"\n",
"Let $\\Delta x = x_k - x_{k-1}$, and also $x_{k-1} = x_k - \\Delta x$ and we get\n",
"\n",
"$\\lim_{{\\Delta x}\\to{0}} \\cfrac{\\Delta x}{f'(x_k) - f'(x_k - \\Delta x)} = \\cfrac{1}{f''(x_k)}$\n",
"\n",
"$\\lim_{{\\Delta x}\\to{0}} \\cfrac{f'(x_k) - f'(x_k - \\Delta x)}{\\Delta x} = f''(x_k)$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> 🔑 The rate of change of the first derivative over two consecutive steps of the secant method approximates the second derivative of Newton's method\n",
"\n",
"Although this relationship holds in the univariate case, things get more complicated with more variables.\n",
"\n",
"Recall the update of Newton's method in the multivariate case\n",
"\n",
"$p_{k+1} = p_k - H_f^{-1}(p_k) \\cdot \\nabla_f(p_k)$\n",
"\n",
"Let's rewrite $\\lim_{{\\Delta x}\\to{0}} \\cfrac{f'(x_k) - f'(x_k - \\Delta x)}{\\Delta x} = f''(x_k)$ as an approximation and we get\n",
"\n",
"$\\cfrac{f'(x_k) - f'(x_k - \\Delta x)}{\\Delta x} \\approx f''(x_k)$\n",
"\n",
"Let's swap in the matrix notation and we get\n",
"\n",
"$\\cfrac{\\nabla_f(p_k) - \\nabla f(p_k - \\Delta p)}{\\Delta p} \\approx H_f(p_k)$\n",
"\n",
"$\\nabla_f(p_k) - \\nabla f(p_k - \\Delta p) \\approx H_f(p_k) \\Delta p$\n",
"\n",
"Earlier we let $\\Delta x = x_k - x_{k-1}$, so we can expand the notation to\n",
"\n",
"$\\nabla_f(p_k) - \\nabla f(p_k - p_k - p_{k-1}) \\approx H_f(p_k)(p_k - p_{k-1})$\n",
"\n",
"$\\nabla_f(p_k) - \\nabla_f(p_{k-1}) \\approx H_f(p_k)(p_k - p_{k-1})$\n",
"\n",
"For the sake of consistency with the BFGS notation and conventions that we'll introduce later, let's increase the indices by 1 (our focus is determining the approximation of the Hessian for the $k+1$ step), and let \n",
"\n",
"$y_k = \\nabla_f(p_{k+1}) - \\nabla_f(p_k)$\n",
"\n",
"$s_k = p_{k+1} - p_k$\n",
"\n",
"$H_{k+1} = H_f(p_{k+1})$\n",
"\n",
"So we get\n",
"\n",
"$y_k \\approx H_{k+1}s_k$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> 🔑 Quasi-newton methods finds an approximation $B_{k+1}$ of $H_{k+1}$ such that the condition $y_k = B_{k+1}s_k$ (secant equation) is satisfied\n",
"\n",
"It turns out we cannot solve $y_k = B_{k+1}s_k$ for $B_{k+1}$ because the system is underdetermined (more unknowns than equations in the system).\n",
"\n",
"An underdetermined system can be solved by adding more equations, that is constraints.\n",
"\n",
"The various Quasi-newton methods differ in terms of the constraints they specify."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### BFGS\n",
"\n",
"The BFGS algorithm solves the following constrained optimization problem at each update\n",
"\n",
"$\\begin{array}{l}\n",
"\\textbf{P}_{BFGS} \\\\\n",
"\\min \\limits_{B_{k+1}} \\|B_{k+1} - B_{k}\\|_F \\\\\n",
"\\text{subject to} \\quad B_{k+1}^T = B_{k+1} \\\\\n",
"\\quad \\quad \\quad \\quad \\quad y_k = B_{k+1}s_k\n",
"\\end{array}$\n",
"\n",
"In plain english, the BFGS algorithm minimizes the Frobenius norm (the equivalent of the Euclidean vector norm for a matrix) of the Hessian approximation between two consecutive updates such that the matrix is symmetric and the secant equation is satisfied.\n",
"\n",
"Note that $B_{k+1}^T = B_{k+1}$ is the symmetry constraint because a matrix is symmetric if it's equal to its transpose."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sym_mat = np.array([[10, 2, 3], [2, 20, 3], [3, 3, 30]])\n",
"display(\n",
" Math(\n",
" \"A_{sym} :=\"\n",
" + sp.latex(sp.Matrix(sym_mat))\n",
" + \"=\"\n",
" + sp.latex(sp.Matrix(sym_mat).T)\n",
" )\n",
")\n",
"assert np.array_equal(sym_mat, sym_mat.T)\n",
"not_sym_mat = np.array([[10, 3, 2], [2, 20, 3], [3, 3, 30]])\n",
"display(\n",
" Math(\n",
" \"A_{nonsym} :=\"\n",
" + sp.latex(sp.Matrix(not_sym_mat))\n",
" + \"\\\\neq\"\n",
" + sp.latex(sp.Matrix(not_sym_mat).T)\n",
" )\n",
")\n",
"assert not np.array_equal(not_sym_mat, not_sym_mat.T)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It turns out that the solution to $\\textbf{P}_{BFGS}$ ends up being equivalent to\n",
"\n",
"$\\arg \\min \\limits_{B_{k+1}} \\textbf{P}_{BFGS} = B_{k} + U_{k} + V_{k}$\n",
"\n",
"Let $U_{k} = \\alpha uu^T$ and $V_{k} = \\beta vv^T$, where:\n",
"\n",
"- $u$ and $v$ are two linearly independent $(n \\times 1)$ vectors, where $n$ is the number of parameters of $f(x_1, \\dots, x_n)$\n",
"\n",
"- $\\alpha$ and $\\beta$ are two scalar\n",
"\n",
"So we get\n",
"\n",
"$\\arg \\min \\limits_{B_{k+1}} \\textbf{P}_{BFGS} = B_{k} + \\alpha uu^T + \\beta vv^T$\n",
"\n",
"The outer products $uu^T$ and $vv^T$ mean that $U$ and $V$ are rank-one symmetric $(n \\times n)$ matrices and the sum $U_k + V_k$ is rank-two.\n",
"\n",
"From [Linear Algebra - Week 1](../linear_algebra/la_w1.html#Singularity-and-rank) recall, that the rank of a matrix is the number of linearly independent rows. \n",
"\n",
"For example, $uu^T$ yields a matrix where $n-1$ rows are linearly dependent because each row is a linear combination of $u$.\n",
"\n",
"These properties of $U$ and $V$ is what makes them part of the solution to $\\textbf{P}_{BFGS}$.\n",
"\n",
"But perhaps, most importantly, need to find $\\alpha$, $\\beta$, $u$ and $v$ such that the solution satisfies the secant equation condition $y_k = B_{k+1}s_k$.\n",
"\n",
"Let's impose the secant equation condition by multiplying each side by $s_k$\n",
"\n",
"$\\underbrace{B_{k+1}s_k}_{y_k} = B_ks_k + \\alpha uu^Ts_k + \\beta vv^Ts_k$\n",
"\n",
"Let $u = y_k$ and $v = B_ks_k$ and we obtain\n",
"\n",
"$y_k = B_ks_k + \\alpha y_ky_k^Ts_k + \\beta B_ks_ks_k^TB_k^Ts_k$\n",
"\n",
"Note that, in general, the transpose of $XY$ is $Y^TX^T$, so the transpose of $B_ks_k$ is $s_k^TB_k^T$ but $B_k^T=B_k$ by imposition of our problem, so we get\n",
"\n",
"$y_k - \\alpha y_ky_k^Ts_k = B_ks_k + \\beta B_ks_ks_k^TB_ks_k$\n",
"\n",
"$y_k(1 - \\alpha y_k^Ts_k) = B_ks_k(1 + \\beta s_k^TB_ks_k)$\n",
"\n",
"$\\begin{cases}\n",
"y_k(1 - \\alpha y_k^Ts_k) = B_{k}s_k(1 + \\beta s_k^TB_ks_k)\\\\\n",
"1 - \\alpha y_k^Ts_k = 0 \\\\\n",
"1 + \\beta s_k^TB_ks_k = 0\n",
"\\end{cases}$\n",
"\n",
"$\\begin{cases}\n",
"y_k(1 - \\alpha y_k^Ts_k) = B_{k}s_k(1 + \\beta s_k^TB_k^Ts_k)\\\\\n",
"\\alpha = \\cfrac{1}{y_k^Ts_k} \\\\\n",
"\\beta = -\\cfrac{1}{s_k^TB_ks_k}\n",
"\\end{cases}$\n",
"\n",
"Let's replace $\\alpha$ and $\\beta$ back in and we get\n",
"\n",
"$y_k = B_{k}s_k + \\left(\\cfrac{1}{y_k^Ts_k}\\right) y_ky_k^Ts_k + \\left(-\\cfrac{1}{s_k^TB_ks_k}\\right) B_ks_ks_k^TB_ks_k$\n",
"\n",
"$y_k = B_{k} + \\cfrac{y_ky_k^T }{y_k^Ts_k} -\\cfrac{B_ks_ks_k^TB_k}{s_k^TB_ks_k}$\n",
"\n",
"So we get that\n",
"\n",
"$B_{k+1} = B_{k} + \\cfrac{y_ky_k^T}{y_k^Ts_k} -\\cfrac{B_ks_ks_k^TB_k}{s_k^TB_ks_k}$\n",
"\n",
"Now, at this point we could invert $B_{k+1}$ and we would be done but it would be more efficient if we could calculate the inverse directly.\n",
"\n",
"$B_{k+1}$ is a sum of matrices so its inverse is can be seen as a [Woodbury matrix identity](https://en.wikipedia.org/wiki/Woodbury_matrix_identity).\n",
"\n",
"$(B+UV^T)^{-1}=B^{-1}-\\cfrac{B^{-1}UV^TB^{-1}}{I+V^TB^{-1}U}$\n",
"\n",
"or equivalently\n",
"\n",
"$(B+UV^T)^{-1}=B^{-1}-B^{-1}U(I+V^TB^{-1}U)^{-1}V^TB^{-1}$\n",
"\n",
"After some algebra which is shown in [Derivation of the direct inverse of the BFGS update](#derivation-of-the-direct-inverse-of-the-bfgs-update) we get\n",
"\n",
"$B^{-1}_{k+1} = \\left(I - \\cfrac{s_ky_k^T}{y_k^Ts_k} \\right)B^{-1}_k \\left(I - \\cfrac{y_ks_k^T}{y_k^Ts_k} \\right) + \\cfrac{s_ks_k^T}{y_k^Ts_k}$\n",
"\n",
"Let's verify we can calculate $B^{-1}_{k+1}$ directly using some synthetic data for $y_k$, $s_k$ and $B_k$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y = np.array([1, 2, 3])\n",
"s = np.array([3, 4, 5])\n",
"B = np.array([[1, 2, 3], [2, 2, 3], [3, 3, 3]])\n",
"\n",
"B1 = (\n",
" B\n",
" + np.outer(y, y.T) / np.dot(y.T, s)\n",
" - np.outer(B @ s, s.T @ B) / np.dot(np.dot(s.T, B), s)\n",
")\n",
"expected = np.linalg.inv(B1)\n",
"direct_inverse = (np.identity(3) - np.outer(s, y) / np.dot(y, s)) @ np.linalg.inv(B) @ (\n",
" np.identity(3) - np.outer(y, s) / np.dot(y, s)\n",
") + np.outer(s, s) / np.dot(y, s)\n",
"assert np.allclose(direct_inverse, expected)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's use the BFGS algorithm on a small problem we've seen before."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def bivariate_bfgs_algo(f, symbols, initial, steps):\n",
" x, y = symbols\n",
" nabla = sp.Matrix([sp.diff(f, x), sp.diff(f, y)])\n",
" nabla_eval = sp.lambdify((x, y), nabla, \"numpy\")\n",
" p = np.zeros((steps, 2))\n",
" g = np.zeros((steps, 2))\n",
" b_inv = np.zeros((steps, 2, 2))\n",
" step_vector = np.zeros((steps, 2))\n",
" p[0] = initial\n",
" g[0] = nabla_eval(*p[0]).squeeze()\n",
" b_inv[0] = np.identity(2)\n",
" step_vector[0] = b_inv[0] @ g[0]\n",
" for i in range(1, steps):\n",
" p[i] = p[i - 1] - step_vector[i - 1]\n",
" g[i] = nabla_eval(*p[i]).squeeze()\n",
" # calculate approximate inverse hessian\n",
" s_k = p[i] - p[i - 1]\n",
" y_k = g[i] - g[i - 1]\n",
" i_mat = np.identity(2)\n",
" out1 = np.outer(s_k, y_k)\n",
" out2 = np.outer(y_k, s_k)\n",
" out3 = np.outer(s_k, s_k)\n",
" dot1 = np.dot(y_k, s_k)\n",
" b_inv[i] = (i_mat - out1 / dot1) @ b_inv[i - 1] @ (\n",
" i_mat - out2 / dot1\n",
" ) + out3 / dot1\n",
" step_vector[i] = b_inv[i] @ g[i]\n",
" if np.linalg.norm(step_vector[i]) < 1e-4:\n",
" break\n",
" return p[: i + 1], g[: i + 1], b_inv[: i + 1], step_vector[: i + 1]\n",
"\n",
"\n",
"x, y = sp.symbols(\"x, y\")\n",
"g = x**4 + 0.8 * y**4 + 4 * x**2 + 2 * y**2 - x * y - 0.2 * x**2 * y\n",
"p, _, _, _ = bivariate_bfgs_algo(g, symbols=(x, y), initial=np.array([4, 4]), steps=100)\n",
"\n",
"display(\n",
" Math(\n",
" \"\\\\arg \\\\min \\\\limits_{x,y}\"\n",
" + sp.latex(g)\n",
" + \"=\"\n",
" + sp.latex(np.array2string(p[-1], precision=2, suppress_small=True))\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's re-train a logistic regression model (single-neuron with sigmoid activation) on this classification data that we used in [Week 3](ca_w3.html#Single-neuron-network-with-sigmoid-activation-and-Log-loss-function)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"m = 40\n",
"k = 2\n",
"neg_centroid = [-1, -1]\n",
"pos_centroid = [1, 1]\n",
"\n",
"rng = np.random.default_rng(1)\n",
"X = np.r_[\n",
" rng.standard_normal((m // 2, k)) + neg_centroid,\n",
" rng.standard_normal((m // 2, k)) + pos_centroid,\n",
"].T\n",
"Y = np.array([[0] * (m // 2) + [1] * (m // 2)])\n",
"\n",
"plt.scatter(X[0], X[1], c=np.where(Y.squeeze() == 0, \"tab:orange\", \"tab:blue\"))\n",
"plt.gca().set_aspect(\"equal\")\n",
"plt.xlabel(\"$x_1$\")\n",
"plt.ylabel(\"$x_2$\")\n",
"plt.xlim(-5, 5)\n",
"plt.ylim(-5, 5)\n",
"plt.title(\"Classification data\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"But this time we'll use the BFGS algorithm instead of gradient descent."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def sigmoid(x):\n",
" return 1 / (1 + np.exp(-x))\n",
"\n",
"\n",
"def init_neuron_params(k):\n",
" w = rng.uniform(size=(1, k)) * 0.5\n",
" b = np.zeros((1, 1))\n",
" return {\"w\": w, \"b\": b}\n",
"\n",
"\n",
"def neuron_output(X, params, activation, *args):\n",
" w = params[\"w\"]\n",
" b = params[\"b\"]\n",
" Z = w @ X + b\n",
" Y_hat = activation(Z, *args)\n",
" return Y_hat\n",
"\n",
"\n",
"def compute_log_loss(Y, Y_hat):\n",
" m = Y_hat.shape[1]\n",
" loss = (-1 / m) * (\n",
" np.dot(Y, np.log(Y_hat).T) + np.dot((1 - Y), np.log(1 - Y_hat).T)\n",
" )\n",
" return loss.squeeze()\n",
"\n",
"\n",
"def compute_grads(X, Y, Y_hat):\n",
" m = Y_hat.shape[1]\n",
" dw = -1 / m * np.dot(Y - Y_hat, X.T)\n",
" db = -1 / m * np.sum(Y - Y_hat, axis=1, keepdims=True)\n",
" return np.hstack([dw.squeeze(), db.squeeze()])\n",
"\n",
"\n",
"def compute_approx_inverse_hessian(p_new, p_old, g_new, g_old, b_inv_old):\n",
" s_k = p_new - p_old\n",
" y_k = g_new - g_old\n",
" i_mat = np.identity(b_inv_old.shape[0])\n",
" out1 = np.outer(s_k, y_k)\n",
" out2 = np.outer(y_k, s_k)\n",
" out3 = np.outer(s_k, s_k)\n",
" dot1 = np.dot(y_k, s_k)\n",
" return (i_mat - out1 / dot1) @ b_inv_old @ (i_mat - out2 / dot1) + out3 / dot1\n",
"\n",
"\n",
"steps = 50\n",
"\n",
"params = init_neuron_params(k)\n",
"Y_hat = neuron_output(X, params, sigmoid)\n",
"loss = compute_log_loss(Y, Y_hat)\n",
"print(f\"Iter 0 - Log loss={loss:.6f}\")\n",
"\n",
"p = np.zeros((steps + 1, k + 1))\n",
"g = np.zeros((steps + 1, k + 1))\n",
"b_inv = np.zeros((steps + 1, k + 1, k + 1))\n",
"step_vector = np.zeros((steps + 1, k + 1))\n",
"\n",
"p[0] = np.hstack([v.squeeze() for v in params.values()])\n",
"g[0] = compute_grads(X, Y, Y_hat)\n",
"b_inv[0] = np.identity(k + 1)\n",
"step_vector[0] = b_inv[0] @ g[0]\n",
"\n",
"for i in range(1, steps + 1):\n",
" p[i] = p[i - 1] - step_vector[i - 1]\n",
" params = {\"w\": p[i][None, :k], \"b\": p[i][None, -1:]}\n",
" Y_hat = neuron_output(X, params, sigmoid)\n",
" loss_new = compute_log_loss(Y, Y_hat)\n",
" if loss - loss_new <= 1e-4:\n",
" print(f\"Iter {i} - Log loss={loss:.6f}\")\n",
" print(\"The algorithm has converged\")\n",
" break\n",
" loss = loss_new\n",
" if i % 2 == 0:\n",
" print(f\"Iter {i} - Log loss={loss:.6f}\")\n",
" g[i] = compute_grads(X, Y, Y_hat)\n",
" b_inv[i] = compute_approx_inverse_hessian(\n",
" p_new=p[i], p_old=p[i - 1], g_new=g[i], g_old=g[i - 1], b_inv_old=b_inv[i - 1]\n",
" )\n",
" step_vector[i] = b_inv[i] @ g[i]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's compare it with the results obtained in [Week 3](ca_w3.html#Single-neuron-network-with-sigmoid-activation-and-Log-loss-function) using gradient descent on the same data.\n",
"\n",
"```\n",
"Iter 0 - Log loss=0.582076\n",
"Iter 100 - Log loss=0.182397\n",
"Iter 200 - Log loss=0.148754\n",
"Iter 300 - Log loss=0.134595\n",
"Iter 306 - Log loss=0.134088\n",
"The algorithm has converged\n",
"```\n",
"\n",
"The results might be slightly different due to the random initialization.\n",
"\n",
"Finally, let's plot it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"w, b = params.values()\n",
"xx1, xx2 = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))\n",
"\n",
"neg_scatter = go.Scatter3d(\n",
" z=np.full(int(m / 2), 0),\n",
" x=X[0, : int(m / 2)],\n",
" y=X[1, : int(m / 2)],\n",
" mode=\"markers\",\n",
" marker=dict(color=\"#ff7f0e\", size=5),\n",
" name=\"negative class\",\n",
")\n",
"pos_scatter = go.Scatter3d(\n",
" z=np.full(int(m / 2), 1),\n",
" x=X[0, int(m / 2) :],\n",
" y=X[1, int(m / 2) :],\n",
" mode=\"markers\",\n",
" marker=dict(color=\"#1f77b4\", size=5),\n",
" name=\"positive class\",\n",
")\n",
"final_model_plane = go.Surface(\n",
" z=sigmoid(w[0, 0] * xx1 + w[0, 1] * xx2 + b),\n",
" x=xx1,\n",
" y=xx2,\n",
" colorscale=[[0, \"#ff7f0e\"], [1, \"#1f77b4\"]],\n",
" showscale=False,\n",
" opacity=0.5,\n",
" name=\"final params\",\n",
")\n",
"\n",
"fig = go.Figure([pos_scatter, neg_scatter, final_model_plane])\n",
"fig.update_layout(\n",
" title=\"Final model\",\n",
" autosize=False,\n",
" width=600,\n",
" height=600,\n",
" margin=dict(l=10, r=10, b=10, t=30),\n",
" scene=dict(\n",
" xaxis=dict(title=\"x1\", range=[-5, 5]),\n",
" yaxis=dict(title=\"x2\", range=[-5, 5]),\n",
" zaxis_title=\"y\",\n",
" camera_eye=dict(x=0, y=0.3, z=2.5),\n",
" camera_up=dict(x=0, y=np.sin(np.pi), z=0),\n",
" ),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Derivation of the direct inverse of the BFGS update"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The objective is to derive\n",
"\n",
"$B^{-1}_{k+1} = \\left(I - \\cfrac{s_ky_k^T}{y_k^Ts_k} \\right)B^{-1}_k \\left(I - \\cfrac{y_ks_k^T}{y_k^Ts_k} \\right) + \\cfrac{s_ks_k^T}{y_k^Ts_k}$\n",
"\n",
"from\n",
"\n",
"$B^{-1}_{k+1} = \\left(B_{k} + \\cfrac{y_ky_k^T}{y_k^Ts_k} -\\cfrac{B_ks_ks_k^TB_k}{s_k^TB_ks_k}\\right)^{-1}$\n",
"\n",
"Using Woodbury matrix identity we get that the inverse of $B_{k+1}$ is\n",
"\n",
"$B_{k+1}^{-1} = \\left(\\underbrace{B_{k} + \\cfrac{y_ky_k^T}{y_k^Ts_k} -\\cfrac{B_ks_ks_k^TB_k}{s_k^TB_ks_k}}_{(1)}\\right)^{-1} = (B_k+UV^T)^{-1} = \\underbrace{B_k^{-1}-B_k^{-1}U(I+V^TB_k^{-1}U)^{-1}V^TB_k^{-1}}_{(2)}$\n",
"\n",
"For the sake of a cleaner notation, let's remove the $k$ indexing from $(1)$.\n",
"\n",
"$B + \\cfrac{yy^T}{y^Ts} -\\cfrac{Bss^TB}{s^TBs}$\n",
"\n",
"It turns out the above expression can be rewritten as\n",
"\n",
"$\\underbrace{B}_{n \\times n}+\n",
"\\underbrace{\\begin{bmatrix}\\overbrace{Bs}^{n \\times 1}&&\\overbrace{y}^{n \\times 1}\\end{bmatrix}}_{U \\text{ } (n \\times 2)}\n",
"\\underbrace{\\overbrace{\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}}^{V_1 \\text{ } (2 \\times 2)}\n",
"\\overbrace{\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}}^{V_2 \\text{ } (2 \\times n)}}_{V^T \\text{ } (2 \\times n)}$\n",
"\n",
"To verify this and all the next operations we'll be making, let's create a simple example with $n=3$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y = np.array([1, 2, 3])\n",
"s = np.array([3, 4, 5])\n",
"B = np.array([[1, 2, 3], [2, 2, 3], [3, 3, 3]])\n",
"B1 = (\n",
" B\n",
" + np.outer(y, y.T) / np.dot(y.T, s)\n",
" - np.outer(B @ s, s.T @ B) / np.dot(np.dot(s.T, B), s)\n",
")\n",
"\n",
"U = np.hstack([(B @ s)[:, None], y[:, None]])\n",
"V1 = np.array([[-1 / (s.T @ B @ s), 0], [0, 1 / (y.T @ s)]])\n",
"V2 = np.vstack([s.T @ B, y.T])\n",
"VT = V1 @ V2\n",
"\n",
"print(\"Shapes:\\n-------\")\n",
"print(\"y: \", y.shape)\n",
"print(\"s: \", s.shape)\n",
"print(\"B: \", B.shape)\n",
"print(\"U: \", U.shape)\n",
"print(\"V1:\", V1.shape)\n",
"print(\"V2:\", V2.shape)\n",
"print(\"VT:\", VT.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, let's verify that\n",
"\n",
"$B + \\cfrac{yy^T}{y^Ts} -\\cfrac{Bss^TB}{s^TBs} = B + UV^T$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(B1, B + U @ VT)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's plug the substitutions for $U$ and $V^T$ into $(2)$ and we obtain\n",
"\n",
"$B^{-1}-B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\n",
"\\left(I+\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\n",
"\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\\right)^{-1}\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}$\n",
"\n",
"Let's verify that we can simplify $B^{-1}U := B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}$ to $\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}$ and let's define it as $D$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"D = np.hstack([s[:, None], (np.linalg.inv(B) @ y)[:, None]])\n",
"assert np.allclose(np.linalg.inv(B) @ U, D)\n",
"print(\"Shapes:\\n-------\")\n",
"print(\"D:\", D.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's verify we can simplify $V_2B^{-1} := \\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}$ to $\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}$ and that is equivalent to $D^T$, and so it follows that $B^{-1}V_2^T = D$.\n",
"\n",
"Note that $(B^{-1})^T$ is the same as $B^{-1}$ because $B$ is symmetric."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(V2 @ np.linalg.inv(B), np.vstack([s.T, y.T @ np.linalg.inv(B)]))\n",
"assert np.allclose(D.T, np.vstack([s.T, y.T @ np.linalg.inv(B)]))\n",
"assert np.allclose(V2 @ np.linalg.inv(B), D.T)\n",
"assert np.allclose(np.linalg.inv(B) @ V2.T, D)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let $M$ the big bracketed expression in the middle.\n",
"\n",
"$B^{-1}-\\underbrace{\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}}_{D \\text{ } (n \\times 2)}\n",
"\\underbrace{\\left(\\overbrace{I}^{(2 \\times 2)}+\n",
"\\overbrace{\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}}^{2 \\times n}\n",
"\\overbrace{B^{-1}}^{n \\times n}\n",
"\\overbrace{\\begin{bmatrix}Bs&&y\\end{bmatrix}}^{(n \\times 2)}\\right)^{-1}\n",
"\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}}_{M^{-1} \\text{ } (2 \\times 2)}\n",
"\\underbrace{\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}}_{D^T \\text{ } (2 \\times n)}$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"M_inv = np.linalg.inv(np.identity(2) + VT @ np.linalg.inv(B) @ U) @ V1\n",
"print(\"Shapes:\\n-------\")\n",
"print(\"M_inv:\", M_inv.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's verify $B^{-1}-DM^{-1}D^T$ is equivalent to $\\left(B_{k} + \\cfrac{y_ky_k^T}{y_k^Ts_k} -\\cfrac{B_ks_ks_k^TB_k}{s_k^TB_ks_k}\\right)^{-1}$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(np.linalg.inv(B) - D @ M_inv @ D.T, np.linalg.inv(B1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's simplify $M^{-1}$.\n",
"\n",
"$M^{-1} = (I + V^T B^{-1} U)^{-1} V_1$\n",
"\n",
"$M^{-1} = [V_{1}^{-1} (I + V_1 V_2 B^{-1} U)]^{-1}$\n",
"\n",
"Earlier we simplified $B^{-1} U$ to $D$, so we get\n",
"\n",
"$M^{-1} = [V_{1}^{-1} (I + V_{1} V_{2} D)]^{-1}$\n",
"\n",
"$M^{-1} = (V_{1}^{-1} + V_2 D)^{-1}$\n",
"\n",
"Let's verify it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(M_inv, np.linalg.inv(np.linalg.inv(V1) + V2 @ np.linalg.inv(B) @ U))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So we have that $M^{-1}$ is\n",
"\n",
"$M^{-1} = \\left(\\begin{bmatrix}-s^TBs&&0\\\\0&&y^Ts\\end{bmatrix}+\n",
"\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\\right)^{-1}$\n",
"\n",
"It might not be obvious that the inverse of $V1$ is $\\begin{bmatrix}-s^TBs&&0\\\\0&&y^Ts\\end{bmatrix}$, so let's verify it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(np.linalg.inv(V1), np.array([[-s.T @ B @ s, 0], [0, y.T @ s]]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's verify $\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}$ becomes $\\begin{bmatrix}s^TBs&&s^Ty\\\\y^Ts&&y^TB^{-1}y\\end{bmatrix}$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(\n",
" V2 @ D, np.array([[s.T @ B @ s, s.T @ y], [y.T @ s, y.T @ np.linalg.inv(B) @ y]])\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So $M^{-1}$ now is\n",
"\n",
"$M^{-1} = \\left(\\begin{bmatrix}-s^TBs&&0\\\\0&&y^Ts\\end{bmatrix}+\n",
"\\begin{bmatrix}s^TBs&&s^Ty\\\\y^Ts&&y^TB^{-1}y\\end{bmatrix}\\right)^{-1}$\n",
"\n",
"And we can simplify it further to\n",
"\n",
"$M^{-1} = \\begin{bmatrix}0&&s^Ty\\\\y^Ts&&y^Ts+y^TB^{-1}y\\end{bmatrix}^{-1}$\n",
"\n",
"Let's verify it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(\n",
" M_inv,\n",
" np.linalg.inv(\n",
" np.array([[0, y.T @ s], [s.T @ y, y.T @ s + y.T @ np.linalg.inv(B) @ y]])\n",
" ),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To calculate the inverse of $M^{-1}$, recall\n",
"\n",
"$A^{-1} = \\begin{bmatrix}a&&b\\\\c&&d\\end{bmatrix}^{-1}$\n",
"\n",
"$A^{-1} = \\cfrac{1}{ab - cd}\\begin{bmatrix}d&&-b\\\\-c&&a\\end{bmatrix}$\n",
"\n",
"The determinant of $M$ is\n",
"\n",
"$\\text{det} M = -s^Tyy^Ts$\n",
"\n",
"So $M^{-1}$ becomes\n",
"\n",
"$M^{-1} = -\\cfrac{1}{s^Tyy^Ts} \\begin{bmatrix}y^Ts+y^TB^{-1}y&&-s^Ty\\\\-y^Ts&&0\\end{bmatrix}$\n",
"\n",
"$M^{-1} = \\begin{bmatrix}-\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}&&\\cfrac{s^Ty}{s^Tyy^Ts}\\\\\\cfrac{y^Ts}{s^Tyy^Ts}&&0\\end{bmatrix}$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"det = s.T @ y * y @ s.T\n",
"assert np.allclose(\n",
" M_inv,\n",
" np.array(\n",
" [\n",
" [-(y.T @ s + y.T @ np.linalg.inv(B) @ y) / det, s.T @ y / det],\n",
" [y.T @ s / det, 0],\n",
" ]\n",
" ),\n",
")\n",
"# replace M_inv with new equivalent\n",
"M_inv = np.array(\n",
" [\n",
" [-(y.T @ s + y.T @ np.linalg.inv(B) @ y) / det, s.T @ y / det],\n",
" [y.T @ s / det, 0],\n",
" ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's plug it back in into $B^{-1}-DM^{-1}D^T$ and we get\n",
"\n",
"$B^{-1}-\n",
"\\underbrace{\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}}_{D \\text{ } (n \\times 2)}\n",
"\\underbrace{\\begin{bmatrix}-\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}&&\\cfrac{s^Ty}{s^Tyy^Ts}\\\\\\cfrac{y^Ts}{s^Tyy^Ts}&&0\\end{bmatrix}}_{M^{-1} \\text{ } (2 \\times 2)}\n",
"\\underbrace{\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}}_{D^T \\text{ } (2 \\times n)}$\n",
"\n",
"Let's verify it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(np.linalg.inv(B) - D @ M_inv @ D.T, np.linalg.inv(B1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$B^{-1}-\n",
"\\underbrace{\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}}_{D \\text{ } (n \\times 2)}\n",
"\\underbrace{\\begin{bmatrix}-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\\\s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\end{bmatrix}}_{M^{-1}D^T \\text{ } (2 \\times n)}$\n",
"\n",
"Let's verify it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(\n",
" M_inv @ D.T,\n",
" np.vstack(\n",
" [\n",
" -s.T * (y.T @ s + y.T @ np.linalg.inv(B) @ y) / det\n",
" + (y.T @ np.linalg.inv(B)) * (s.T @ y / det),\n",
" s.T * (y.T @ s / det),\n",
" ]\n",
" ),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$B^{-1}-\\underbrace{\\left[s\\left(-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\right)+B^{-1}y\\left(s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right)\\right]}_{DM^{-1}D^T \\text{} (n \\times n)}$\n",
"\n",
"The elements of $D$ are column vectors.\n",
"\n",
"Recall, column vector by row vector is an outer product, whereas row vector by column vector is a dot product.\n",
"\n",
"So $s\\left(-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\right)$ is an outer product.\n",
"\n",
"And $B^{-1}y\\left(s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right)$ is another outer product.\n",
"\n",
"Let's verify it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(\n",
" D @ M_inv @ D.T,\n",
" np.outer(\n",
" s,\n",
" (\n",
" -s.T * (y.T @ s + y.T @ np.linalg.inv(B) @ y) / det\n",
" + (y.T @ np.linalg.inv(B)) * (s.T @ y / det)\n",
" ),\n",
" )\n",
" + np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s / det)),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's bring $s$ inside the bracket.\n",
"\n",
"$B^{-1}-\\left[-ss^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+sy^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}+B^{-1}ys^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right]$\n",
"\n",
"Let's verify."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"out1 = np.outer(-s, s.T * (y.T @ s + y.T @ np.linalg.inv(B) @ y) / det)\n",
"out2 = np.outer(s, (y.T @ np.linalg.inv(B)) * (s.T @ y / det))\n",
"out3 = np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s / det))\n",
"assert np.allclose(D @ M_inv @ D.T, out1 + out2 + out3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's bring $\\cfrac{1}{s^Tyy^Ts}$ outside.\n",
"\n",
"$B^{-1}-\\cfrac{1}{s^Tyy^Ts}\\left[-ss^Ty^Ts -ss^Ty^TB^{-1}y + sy^TB^{-1}s^Ty + B^{-1}ys^Ty^Ts\\right]$\n",
"\n",
"Let's verify."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"out1 = np.outer(-s, s.T * (y.T @ s))\n",
"out2 = np.outer(-s, s.T * (y.T @ np.linalg.inv(B) @ y))\n",
"out3 = np.outer(s, (y.T @ np.linalg.inv(B)) * (s.T @ y))\n",
"out4 = np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s))\n",
"assert np.allclose(D @ M_inv @ D.T, 1 / det * (out1 + out2 + out3 + out4))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's swap the signs.\n",
"\n",
"$B^{-1} + \\cfrac{1}{s^Tyy^Ts}\\left[ss^Ty^Ts + ss^Ty^TB^{-1}y - sy^TB^{-1}s^Ty - B^{-1}ys^Ty^Ts\\right]$\n",
"\n",
"Let's verify."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"out1 = np.outer(s, s.T * (y.T @ s))\n",
"out2 = np.outer(s, s.T * (y.T @ np.linalg.inv(B) @ y))\n",
"out3 = np.outer(s, (y.T @ np.linalg.inv(B)) * (s.T @ y))\n",
"out4 = np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s))\n",
"assert np.allclose(\n",
" np.linalg.inv(B) - D @ M_inv @ D.T,\n",
" np.linalg.inv(B) + 1 / det * (out1 + out2 - out3 - out4),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$B^{-1} + \\cfrac{ss^Ty^Ts}{s^Tyy^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{s^Tyy^Ts} - \\cfrac{sy^TB^{-1}s^Ty}{s^Tyy^Ts} - \\cfrac{B^{-1}ys^Ty^Ts}{s^Tyy^Ts}$\n",
"\n",
"Now, it can be really confusing to understand what's a dot product, an outer product or a simple multiplication.\n",
"\n",
"So let's take a look at the code.\n",
"\n",
"```\n",
"out1 = np.outer(s, s.T * (y.T @ s)) / (s.T @ y * y @ s.T)\n",
"out2 = np.outer(s, s.T * (y.T @ np.linalg.inv(B) @ y)) / (s.T @ y * y @ s.T)\n",
"out3 = np.outer(s, (y.T @ np.linalg.inv(B)) * (s.T @ y)) / (s.T @ y * y @ s.T)\n",
"out4 = np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s)) / (s.T @ y * y @ s.T)\n",
"```\n",
"\n",
"We can see that `(y.T @ s)`, `(s.T @ y)` and `(y.T @ s)` in `out1`, `out3` and `out4`, respectively can go away.\n",
"\n",
"$B^{-1} + \\cfrac{ss^T}{s^Ty} + \\cfrac{ss^Ty^TB^{-1}y}{s^Tyy^Ts} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{s^Ty}$\n",
"\n",
"Also note that $s^Ty = y^Ts$ and $s^Tyy^Ts = (y^Ts)^2$, but let's verify it.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert s.T @ y == y.T @ s\n",
"assert s.T @ y * y @ s.T == (y.T @ s)**2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Recall, our objective is to get to this form\n",
"\n",
"$\\left(I - \\cfrac{sy^T}{y^Ts} \\right)B^{-1} \\left(I - \\cfrac{ys^T}{y^Ts} \\right) + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"We're almost there.\n",
"\n",
"$B^{-1} + \\cfrac{ss^T}{y^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{y^Ts}$\n",
"\n",
"Let's verify it, though."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"out1 = np.outer(s, s.T) / (y.T @ s)\n",
"out2 = np.outer(s, s.T * (y.T @ np.linalg.inv(B) @ y)) / (y.T @ s)**2\n",
"out3 = np.outer(s, (y.T @ np.linalg.inv(B))) / (y.T @ s)\n",
"out4 = np.outer(np.linalg.inv(B) @ y, s.T) / (y.T @ s)\n",
"assert np.allclose(\n",
" np.linalg.inv(B) - D @ M_inv @ D.T,\n",
" np.linalg.inv(B) + out1 + out2 - out3 - out4,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the next simplifications as clear as possible, let's just change the order of the terms and put the ones that can be factored together.\n",
"\n",
"$\\left(B^{-1} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{y^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2}\\right) + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"Although it was a trivial operation, let's verify it again."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"out1 = np.outer(s, (y.T @ np.linalg.inv(B))) / (y.T @ s)\n",
"out2 = np.outer(np.linalg.inv(B) @ y, s.T) / (y.T @ s)\n",
"out3 = np.outer(s, s.T * (y.T @ np.linalg.inv(B) @ y)) / (y.T @ s) ** 2\n",
"out4 = np.outer(s, s.T) / (y.T @ s)\n",
"assert np.allclose(\n",
" np.linalg.inv(B) - D @ M_inv @ D.T,\n",
" np.linalg.inv(B) - out1 - out2 + out3 + out4,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It turns out that\n",
"\n",
"$\\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2} = \\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}$\n",
"\n",
"And also\n",
"\n",
"$\\cfrac{sy^TB^{-1}}{y^Ts} = \\cfrac{sy^T}{y^Ts}B^{-1}$\n",
"\n",
"$\\cfrac{B^{-1}ys^T}{y^Ts} = B^{-1}\\cfrac{ys^T}{y^Ts}$\n",
"\n",
"Let's verify it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(out1, (np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B))\n",
"assert np.allclose(out2, np.linalg.inv(B) @ (np.outer(y, s.T) / (y.T @ s)))\n",
"assert np.allclose(\n",
" out3,\n",
" (np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B) @ np.outer(y, s.T) / (y.T @ s),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1} - B^{-1}\\cfrac{ys^T}{y^Ts} + \\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"Let's verify that \n",
"\n",
"$B^{-1}\\cfrac{ys^T}{y^Ts} + \\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}$ \n",
"\n",
"can be rewritten as\n",
"\n",
"$\\left(B^{-1} + \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\cfrac{ys^T}{y^Ts}$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(\n",
" out2 + out3,\n",
" (np.linalg.inv(B) + (np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B))\n",
" @ np.outer(y, s.T)\n",
" / (y.T @ s),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1} - \\left(B^{-1} + \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\cfrac{ys^T}{y^Ts} + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"Let's verify that \n",
"\n",
"$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right) - \\left(B^{-1} + \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\cfrac{ys^T}{y^Ts}$\n",
"\n",
"can be rewritten as\n",
"\n",
"$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\left(I - \\cfrac{ys^T}{y^Ts}\\right)$\n",
"\n",
"Note the signs are indeed correct.\n",
"\n",
"$B^{-1}$ with $- \\cfrac{ys^T}{y^Ts}$ becomes $-B^{-1}\\cfrac{ys^T}{y^Ts}$\n",
"\n",
"$- \\cfrac{sy^T}{y^Ts}B^{-1}$ with $- \\cfrac{ys^T}{y^Ts}$ becomes $\\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(\n",
" np.linalg.inv(B) - out1 - out2 + out3,\n",
" (np.linalg.inv(B) - (np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B))\n",
" @ (np.identity(3) - np.outer(y, s.T) / (y.T @ s)),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\left(I - \\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"Now, let's verify that \n",
"\n",
"$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right)$\n",
"\n",
"can be rewritten as\n",
"\n",
"$\\left(I - \\cfrac{sy^T}{y^Ts}\\right)B^{-1}$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert np.allclose(\n",
" np.linalg.inv(B) - out1,\n",
" (np.identity(3) - np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So we finally have\n",
"\n",
"$\\left(I - \\cfrac{sy^T}{y^Ts}\\right)B^{-1}\\left(I - \\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"which is indeed what we want\n",
"\n",
"$B^{-1}_{k+1} = \\left(I - \\cfrac{s_ky_k^T}{y_k^Ts_k} \\right)B^{-1}_k \\left(I - \\cfrac{y_ks_k^T}{y_k^Ts_k} \\right) + \\cfrac{s_ks_k^T}{y_k^Ts_k}$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To appreciate how far we've come, here's the full derivation."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$B^{-1}-B^{-1}U(I+V^TB^{-1}U)^{-1}V^TB^{-1}$\n",
"\n",
"$B^{-1}-B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\n",
"\\left(I+\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\n",
"\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\\right)^{-1}\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}$\n",
"\n",
"$B^{-1}-\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}\n",
"\\left(I+\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}\n",
"\\begin{bmatrix}Bs&&y\\end{bmatrix}\\right)^{-1}\n",
"\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\n",
"\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}$\n",
"\n",
"$B^{-1}-\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}\n",
"\\begin{bmatrix}-\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}&&\\cfrac{s^Ty}{s^Tyy^Ts}\\\\\\cfrac{y^Ts}{s^Tyy^Ts}&&0\\end{bmatrix}\n",
"\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}$\n",
"\n",
"$B^{-1}-\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}\n",
"\\begin{bmatrix}-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\\\s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\end{bmatrix}$\n",
"\n",
"$B^{-1}-\\left[s\\left(-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\right)+B^{-1}y\\left(s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right)\\right]$\n",
"\n",
"$B^{-1}-\\left[-ss^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+sy^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}+B^{-1}ys^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right]$\n",
"\n",
"$B^{-1}-\\cfrac{1}{s^Tyy^Ts}\\left[-ss^Ty^Ts -ss^Ty^TB^{-1}y + sy^TB^{-1}s^Ty + B^{-1}ys^Ty^Ts\\right]$\n",
"\n",
"$B^{-1} + \\cfrac{1}{s^Tyy^Ts}\\left[ss^Ty^Ts + ss^Ty^TB^{-1}y - sy^TB^{-1}s^Ty - B^{-1}ys^Ty^Ts\\right]$\n",
"\n",
"$B^{-1} + \\cfrac{ss^Ty^Ts}{s^Tyy^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{s^Tyy^Ts} - \\cfrac{sy^TB^{-1}s^Ty}{s^Tyy^Ts} - \\cfrac{B^{-1}ys^Ty^Ts}{s^Tyy^Ts}$\n",
"\n",
"$B^{-1} + \\cfrac{ss^T}{s^Ty} + \\cfrac{ss^Ty^TB^{-1}y}{s^Tyy^Ts} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{s^Ty}$\n",
"\n",
"$B^{-1} + \\cfrac{ss^T}{y^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{y^Ts}$\n",
"\n",
"$\\left(B^{-1} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{y^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2}\\right) + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1} - B^{-1}\\cfrac{ys^T}{y^Ts} + \\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"$B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1} - \\left(B^{-1} + \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\cfrac{ys^T}{y^Ts} + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\left(I - \\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n",
"\n",
"$\\left(I - \\cfrac{sy^T}{y^Ts}\\right)B^{-1}\\left(I - \\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}