5 changes: 4 additions & 1 deletion .gitignore
@@ -14,4 +14,7 @@ notes/2025/tikz/4
notes/2025/mvp/chapters/4-nn.tex
notes/2025/mvp/chapters/4-nn.pdf

0-ac*
0-ac*

logs/
movs/
10 changes: 4 additions & 6 deletions notes/2025/mvp/chapters/2-lr.tex
@@ -8,9 +8,7 @@ \chapter{Logistic Regression}
\item Sigmoid Regression
\item Maximum a posteriori
\end{introduction}
\section{Classification}

\subsection{Binary Classification Problem}
\section{Binary Classification}

Settings.
\begin{itemize}
@@ -58,7 +56,7 @@ \section{Classification}
\centering
\includegraphics{../../tikz/2/2.pdf}
\caption{Classification by a separating hyperplane.}
\label{2-lr}
\label{fig:2-hyperplane}
\end{figure}


@@ -182,7 +180,7 @@ \section{Classification}
If all points can be separated by a linear model without error, we say the dataset is \emph{linearly separable}.
\end{definition}

Example \ref{2-lr} is linearly separable, and the final state leads to $\|W\| \to \infty, \quad \|b\| \to \infty$. However, this situation is not desirable in practice, since it implies poor robustness. Hence a natural question arises: under the condition of linear separability, how can we find a well-chosen separating hyperplane that maximizes robustness? The answer will be presented in the next chapter, where we introduce the Support Vector Machine (SVM). The SVM optimizes $\hat w, \hat b$ by maximizing the margin (the distance between data points and the separating hyperplane), instead of simply minimizing the cross-entropy loss.
Figure~\ref{fig:2-hyperplane} illustrates a linearly separable setting, and the final state leads to $\|W\| \to \infty, \quad \|b\| \to \infty$. However, this situation is not desirable in practice, since it implies poor robustness. Hence a natural question arises: under the condition of linear separability, how can we find a well-chosen separating hyperplane that maximizes robustness? The answer will be presented in the next chapter, where we introduce the Support Vector Machine (SVM). The SVM optimizes $\hat w, \hat b$ by maximizing the margin (the distance between data points and the separating hyperplane), instead of simply minimizing the cross-entropy loss.
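
The divergence itself is easy to verify. As a brief sketch (written with labels $y_i \in \{-1,+1\}$ for compactness; this convention is assumed here and is equivalent to the $\{0,1\}$ sigmoid form), suppose $(W_0, b_0)$ strictly separates the data, i.e.\ $y_i(W_0^\top x_i + b_0) > 0$ for all $i$. Then scaling by $c > 0$ drives the cross-entropy loss to its infimum:
\begin{equation}
\sum_{i=1}^{n} \log\!\left(1 + e^{-c\,y_i(W_0^\top x_i + b_0)}\right) \longrightarrow 0
\qquad (c \to \infty),
\end{equation}
so the loss has infimum $0$ but no finite minimizer, and any minimizing sequence necessarily satisfies $\|W\| \to \infty$.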

Although logistic regression may suffer from divergence of parameters under separable data, it often achieves better performance than SVM in practice, due to the following reasons:
\begin{enumerate}
@@ -261,7 +259,7 @@ \section{Rethink of Linear Regression}
\centering
\includegraphics{../../tikz/2/3.pdf}
\caption{Normal distribution (95\%).}
\label{2-lr}
\label{fig:2-normal}
\end{figure}
While the Central Limit Theorem (CLT) does not imply that most datasets are normally distributed,
it motivates modeling additive noise as Gaussian. We assume:
5 changes: 3 additions & 2 deletions notes/2025/mvp/chapters/3-svm.tex
@@ -237,7 +237,8 @@ \subsection{Hard Margin}
\begin{figure}[H]
\centering
\includegraphics{../../tikz/3/1.pdf}
\label{2-lr}
\caption{Distance from a point to a hyperplane.}
\label{fig:3-hyperplane-distance}
\end{figure}
\begin{proof}
\begin{enumerate}
@@ -962,7 +963,7 @@ \section{Kernel}
\begin{equation}
K(x, z) = x^\top z;
\end{equation}
\item \textbf{Polynomial kernel}: maps $\mathbb{R}^n \to \mathbb{R}^{\scriptsize\begin{pmatrix}n+p\\p\end{pmatrix}}$,
\item \textbf{Polynomial kernel}: can be understood as mapping $\mathbb{R}^n$ into a feature space of dimension $\binom{n+p}{p}$,
\begin{equation}
K(x, z) = (x^\top z + 1)^p;
\end{equation}
Binary file modified notes/2025/mvp/chapters/7-gp.pdf
Binary file not shown.
83 changes: 80 additions & 3 deletions notes/2025/mvp/chapters/7-gp.tex
@@ -527,7 +527,7 @@ \section{Gaussian Process Regression (GPR)}
\Sigma_\star
\bigr),
\end{equation}
with mean
with posterior mean
\begin{equation}
\mu_\star
= k(x_\star,X)^\top (K + \sigma^2 I)^{-1} y,
@@ -564,7 +564,7 @@ \section{Gaussian Process Regression (GPR)}
Gram matrix $K_\ell$ converges to a diagonal matrix:
\begin{equation}
\lim_{\ell \to 0} K_\ell
= \sigma_f^2 I_n.
= \sigma_f^2 I.
\end{equation}

In the noise-free case (i.e.\ $\sigma^2 = 0$ in the observation
@@ -581,7 +581,7 @@ \section{Gaussian Process Regression (GPR)}
\qquad
K_\ell^{-1}
\;\longrightarrow\;
\frac{1}{\sigma_f^2} I_n,
\frac{1}{\sigma_f^2} I,
\end{equation}
where $e_j$ is the $j$-th standard basis vector in $\mathbb R^n$.
Hence
@@ -595,5 +595,82 @@ \section{Gaussian Process Regression (GPR)}
exactly at every training input: each training point is matched
perfectly by the predictive mean.
\end{remark}
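
As a quick numerical illustration of this interpolation property, the following is a minimal sketch assuming a scikit-learn GP surrogate (the library, the RBF length scale, and the toy data are assumptions for illustration only):
\begin{verbatim}
# Check that the (near) noise-free GP posterior mean reproduces the
# training targets at the training inputs; illustrative sketch only.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

X = np.array([[0.0], [1.0], [2.5], [4.0]])   # training inputs
y = np.sin(X).ravel()                        # training targets

# alpha ~ 0 plays the role of sigma^2 = 0 (noise-free observations).
gpr = GaussianProcessRegressor(kernel=RBF(length_scale=1.0), alpha=1e-10)
gpr.fit(X, y)

mu = gpr.predict(X)            # posterior mean at the training inputs
print(np.max(np.abs(mu - y)))  # prints a value near zero: every training
                               # point is matched by the predictive mean
\end{verbatim}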
%===================================
From the discussion above, we know that the \emph{posterior mean} of a Gaussian
Process provides the predictive value for a test input, while the associated
\emph{uncertainty} can be quantified through the \emph{predictive variance}.
The predictive standard deviation is simply the square root of this variance.

Gaussian Process Regression (GPR) serves as the mathematical foundation of
\textbf{Bayesian Optimization (BO)}, a framework for performing optimization
when the objective function is expensive, noisy, or lacks analytic structure.

\begin{definition}[Black-box Optimization]
An optimization problem is called \emph{black-box optimization} if and only if
the analytical form of the objective function is unknown and no gradient
information is available.
\end{definition}
Since the gradient of the objective function is unavailable, gradient-based
methods such as gradient descent cannot be applied in black-box settings.
The only feasible operation is \emph{point-wise evaluation}: we may query the
black box at a finite number of input locations and observe the corresponding
outputs.

Bayesian Optimization (BO) aims to construct a probabilistic surrogate model
of the black-box function and use it to locate the maximizer of $y$ with as
few evaluations as possible. This is particularly important when each
evaluation is expensive, for instance in hyperparameter tuning.

The typical BO procedure proceeds as follows:
\begin{enumerate}
\item Randomly or uniformly select a small set of initial points
$x_1,\ldots,x_n$ and obtain their evaluations $y_1,\ldots,y_n$.
\item Fit a Gaussian Process Regression (GPR) model using the collected
data.
\item Use an acquisition function $a(x)$ to select one or a batch of new
query points; evaluate them, augment the dataset, and refit the GPR model.
\item Repeat the process until the evaluation budget is exhausted, and
return the point achieving the maximum observed value of $y$.
\end{enumerate}
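
As a minimal sketch of this loop (assuming a scikit-learn GP surrogate, a finite candidate grid, and the UCB acquisition introduced just below; the objective \texttt{f}, the grid, and the budget are placeholders, not part of the notes):
\begin{verbatim}
# Minimal Bayesian Optimization loop; illustrative sketch only.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

def bayes_opt(f, candidates, n_init=5, budget=30, kappa=2.0, seed=0):
    rng = np.random.default_rng(seed)
    # Step 1: evaluate the black box at a few random initial points.
    idx = rng.choice(len(candidates), size=n_init, replace=False)
    X = candidates[idx]
    y = np.array([f(x) for x in X])
    for _ in range(budget - n_init):
        # Step 2: fit the GPR surrogate on the data collected so far.
        gpr = GaussianProcessRegressor(kernel=RBF(), alpha=1e-6).fit(X, y)
        mu, sigma = gpr.predict(candidates, return_std=True)
        # Step 3: pick the next query point by the UCB acquisition.
        x_next = candidates[np.argmax(mu + kappa * sigma)]
        X = np.vstack([X, x_next])
        y = np.append(y, f(x_next))
    # Step 4: return the best observed point and value.
    return X[np.argmax(y)], y.max()

# Toy usage: maximize a 1-D black box over a grid.
# grid = np.linspace(0.0, 1.0, 200).reshape(-1, 1)
# x_best, y_best = bayes_opt(lambda x: -(x[0] - 0.3) ** 2, grid)
\end{verbatim}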
Here are two commonly used acquisition functions in Bayesian Optimization:

\begin{enumerate}
\item \textbf{Expected Improvement (EI)}:
\begin{equation}
a(x) = \mathbb{E}\big[(y(x) - y_{\max})^{+}\big],
\end{equation}
where
\begin{equation}
(z)^{+} := \max(0, z).
\end{equation}
Under the GPR model, the predictive distribution is
\begin{equation}
y(x) \sim \mathcal{N}\big(\mu(x),\, \Sigma(x)\big),
\end{equation}
so EI can be written as an integral over the tail above $y_{\max}$
(its closed form is given after this list):
\begin{equation}
a(x)=\int_{y_{\max}}^{+\infty} \big(y(x)-y_{\max}\big)\,
\mathcal{N}\!\left(y(x)\,\middle|\,\mu(x),\,\Sigma(x)\right)\,
\mathrm{d}y(x).
\end{equation}

\item \textbf{Upper Confidence Bound (UCB)}:
\begin{equation}
a(x)=\mu(x) + \kappa\,\sigma(x),
\end{equation}
where $\sigma(x)$ denotes the predictive standard deviation and
$\kappa>0$ controls the exploration–exploitation balance.
In essence, UCB prefers points where the model is either promising
(large $\mu$) or highly uncertain (large $\sigma$).
\end{enumerate}
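
For reference, evaluating the Gaussian tail integral in the EI case yields the standard closed form (for $\sigma(x) = \sqrt{\Sigma(x)} > 0$):
\begin{equation}
a(x) = \bigl(\mu(x) - y_{\max}\bigr)\,\Phi(z) + \sigma(x)\,\varphi(z),
\qquad
z = \frac{\mu(x) - y_{\max}}{\sigma(x)},
\end{equation}
where $\Phi$ and $\varphi$ denote the standard normal CDF and PDF. Like UCB, EI grows with both $\mu(x)$ and $\sigma(x)$, so it also balances exploitation against exploration.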

\begin{remark}
\textbf{Optuna} is a widely used automatic hyperparameter tuning library,
recommended in class for practical Bayesian Optimization.
\end{remark}
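
A minimal usage sketch (assuming Optuna is installed; the search space and objective below are toy placeholders, not fixed by the notes):
\begin{verbatim}
# Toy Optuna example: tune one parameter x to maximize a black box.
import optuna

def objective(trial):
    x = trial.suggest_float("x", -10.0, 10.0)   # search space for x
    return -(x - 2.0) ** 2                      # value to maximize

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)          # evaluation budget
print(study.best_params, study.best_value)
\end{verbatim}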



\end{document}
Binary file added notes/2025/mvp/chapters/8-tel.pdf
Binary file not shown.