diff --git a/book_equations.ipynb b/book_equations.ipynb index 285288c..f2ffaee 100644 --- a/book_equations.ipynb +++ b/book_equations.ipynb @@ -306,8 +306,7 @@ "**Equation 4-22: Cross entropy cost function**\n", "\n", "$\n", - "J(\\mathbf{\\Theta}) =\n", - "- \\dfrac{1}{m}\\sum\\limits_{i=1}^{m}\\sum\\limits_{k=1}^{K}{y_k^{(i)}\\log\\left(\\hat{p}_k^{(i)}\\right)}\n", + "J(\\mathbf{\\Theta}) = - \\dfrac{1}{m}\\sum\\limits_{i=1}^{m}\\sum\\limits_{k=1}^{K}{y_k^{(i)}\\log\\left(\\hat{p}_k^{(i)}\\right)}\n", "$\n", "\n", "**Cross entropy between two discrete probability distributions $p$ and $q$ (page 141):**\n", @@ -761,12 +760,8 @@ "\n", "**Equation 11-4: Momentum algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\\\\n", - "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}\n", - "\\end{split}\n", - "$\n", + "1. $\\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "2. $\\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}$\n", "\n", "**In the text page 296:**\n", "\n", @@ -775,22 +770,13 @@ "\n", "**Equation 11-5: Nesterov Accelerated Gradient algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta} + \\beta \\mathbf{m}) \\\\\n", - "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}\n", - "\\end{split}\n", - "$\n", - "\n", + "1. $\\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta} + \\beta \\mathbf{m})$\n", + "2. $\\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}$\n", "\n", "**Equation 11-6: AdaGrad algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{s} \\gets \\mathbf{s} + \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", - "2. 
\\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}\n", - "\\end{split}\n", - "$\n", + "1. $\\mathbf{s} \\gets \\mathbf{s} + \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "2. $\\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}$\n", "\n", "**In the text page 298-299:**\n", "\n", @@ -803,30 +789,22 @@ "\n", "**Equation 11-7: RMSProp algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{s} \\gets \\beta \\mathbf{s} + (1 - \\beta ) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", - "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}\n", - "\\end{split}\n", - "$\n", + "1. $\\mathbf{s} \\gets \\beta \\mathbf{s} + (1 - \\beta ) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "2. $\\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}$\n", "\n", "\n", "**Equation 11-8: Adam algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{m} \\gets \\beta_1 \\mathbf{m} - (1 - \\beta_1) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", - "2. \\quad & \\mathbf{s} \\gets \\beta_2 \\mathbf{s} + (1 - \\beta_2) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", - "3. \\quad & \\mathbf{m} \\gets \\dfrac{\\mathbf{m}}{1 - {\\beta_1}^T}\\\\\n", - "4. \\quad & \\mathbf{s} \\gets \\dfrac{\\mathbf{s}}{1 - {\\beta_2}^T}\\\\\n", - "5. 
\\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\eta \\, \\mathbf{m} \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}\n", - "\\end{split}\n", - "$\n", + "1. $\\mathbf{m} \\gets \\beta_1 \\mathbf{m} - (1 - \\beta_1) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "2. $\\mathbf{s} \\gets \\beta_2 \\mathbf{s} + (1 - \\beta_2) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "3. $\\mathbf{m} \\gets \\left(\\dfrac{\\mathbf{m}}{1 - {\\beta_1}^T}\\right)$\n", + "4. $\\mathbf{s} \\gets \\left(\\dfrac{\\mathbf{s}}{1 - {\\beta_2}^T}\\right)$\n", + "5. $\\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\eta \\, \\mathbf{m} \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}$\n", "\n", "**In the text page 309:**\n", "\n", "We typically implement this constraint by computing $\\left\\| \\mathbf{w} \\right\\|_2$ after each training step\n", - "and clipping $\\mathbf{w}$ if needed ($ \\mathbf{w} \\gets \\mathbf{w} \\dfrac{r}{\\left\\| \\mathbf{w} \\right\\|_2} $).\n", + "and clipping $\\mathbf{w}$ if needed $\\left( \\mathbf{w} \\gets \\mathbf{w} \\dfrac{r}{\\left\\| \\mathbf{w} \\right\\|_2} \\right)$.\n", "\n", "\n" ]