diff --git a/lecture_note.tex b/lecture_note.tex
index d260d79..96dbfcc 100644
--- a/lecture_note.tex
+++ b/lecture_note.tex
@@ -3555,7 +3555,7 @@ \section{$n$-Gram Language Model}
 limit the maximum length of phrases/sentences we estimate a probability on}.
 This idea is a foundation on which a so-called $n$-gram language model is
 based.
-In the $n$-gram language model, we first rewrite the probability of a given
+In the $n$-gram language model, using the chain rule, we first rewrite the probability of a given
 sentence $S$ from Eq.~\eqref{eq:sentence_prob} into
 \begin{align}
     \label{eq:unidir_sentence}
@@ -3568,11 +3568,11 @@ \section{$n$-Gram Language Model}
 conditional probability (Eq.~\eqref{eq:unidir_sentence}~(a)) is only
 conditioned on the $n-1$ preceding symbols only, meaning
 \begin{align*}
-    p(w_k | w_{ 0 \\
-    \gamma(w_{k-n+1}, \ldots, w_{k}) p^S(w_{k}|w_{k-n+1}, \ldots,
+    \gamma(w_{k-n+2}, \ldots, w_{k}) p^S(w_{k}|w_{k-n+2}, \ldots,
     w_{k-1}), \text{ otherwise}
     \end{array}
     \right.
@@ -3784,17 +3809,17 @@ \subsection{Smoothing and Back-Off}
 Also, let us define the following quantities describing the number of all
 possible words following a given $n$-gram with a specified frequency $l$:
 \begin{align*}
-    N_l(w_{k-n}, \ldots, w_{k-1}) = |\{ c(w_{k-n}, \ldots, w_{k-1}, w_k) = l \}|
+    N_l(w_{k-n+1}, \ldots, w_{k-1}) = |\{ c(w_{k-n+1}, \ldots, w_{k-1}, w_k) = l \}|
 \end{align*}
 
 The modified KN smoothing then defines $\alpha$ in
 Eq.~\eqref{eq:n_gram_smoothing_general} to be
 \begin{align*}
-    \alpha(w_k | w_{k-n}, \ldots, w_{k-1}) =
+    \alpha(w_k | w_{k-n+1}, \ldots, w_{k-1}) =
     \frac{
-    c(w_{k-n}, \ldots, w_{k-1}, w_k) - D(c(w_{k-n}, \ldots, w_{k-1}, w_k))
+    c(w_{k-n+1}, \ldots, w_{k-1}, w_k) - D(c(w_{k-n+1}, \ldots, w_{k-1}, w_k))
     }{
-    \sum_{w' \in V} c(w_{k-n}, \ldots, w_{k-1}, w')
+    \sum_{w' \in V} c(w_{k-n+1}, \ldots, w_{k-1}, w')
     },
 \end{align*}
 where $D$ is
@@ -3810,13 +3835,13 @@ \subsection{Smoothing and Back-Off}
 \end{align*}
 And, $\gamma$ is defined as
 \begin{align*}
-    \gamma(w_{k-n}, \ldots, w_{k-1}) =
+    \gamma(w_{k-n+1}, \ldots, w_{k-1}) =
     \frac{
-    D_1 N_1(w_{k-n}, \ldots, w_{k-1})
-    + D_2 N_2(w_{k-n}, \ldots, w_{k-1})
-    + D_{3+} N_{3+}(w_{k-n}, \ldots, w_{k-1})
+    D_1 N_1(w_{k-n+1}, \ldots, w_{k-1})
+    + D_2 N_2(w_{k-n+1}, \ldots, w_{k-1})
+    + D_{3+} N_{3+}(w_{k-n+1}, \ldots, w_{k-1})
     }{
-    \sum_{w' \in V} c(w_{k-n}, \ldots, w_{k-1}, w')
+    \sum_{w' \in V} c(w_{k-n+1}, \ldots, w_{k-1}, w')
     }.
 \end{align*}
 
@@ -3904,11 +3929,11 @@ \section{Neural Language Model}
 
 One thing we notice from $n$-gram language modelling is that this boils down
 to computing the conditional distribution of a next word $w_k$ given $n-1$
-preceding words $w_{k-n}, \ldots, w_{k-1}$. In other words, the goal of $n$-gram
+preceding words $w_{k-n+1}, \ldots, w_{k-1}$. In other words, the goal of $n$-gram
 language modeling is to find a function that takes as input $n-1$ words and
 returns a conditional probability of a next word:
 \begin{align*}
-    p(w_k | w_{k-n}, \ldots, w_{k-1}) = f_{\TT}^{w_k} (w_{k-n}, \ldots,
+    p(w_k | w_{k-n+1}, \ldots, w_{k-1}) = f_{\TT}^{w_k} (w_{k-n+1}, \ldots,
     w_{k-1}).
 \end{align*}
 This is almost exactly what we have learned in
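
As a quick sanity check on the corrected indexing, here is a minimal Python sketch of the count-based $n$-gram estimate. It is illustrative only: the helper names (`count_ngrams`, `cond_prob`, `sentence_prob`) and the `<s>`/`</s>` padding convention are assumptions of this sketch, not code from the lecture note. It conditions each word on exactly the $n-1$ preceding tokens $w_{k-n+1}, \ldots, w_{k-1}$ and scores a sentence with the truncated chain-rule factorisation of Eq.~\eqref{eq:unidir_sentence}.

    from collections import defaultdict

    def count_ngrams(corpus, n):
        """Count n-grams and their (n-1)-word contexts in a tokenised corpus."""
        ngram_counts = defaultdict(int)
        context_counts = defaultdict(int)
        for sentence in corpus:
            tokens = ["<s>"] * (n - 1) + sentence + ["</s>"]
            for k in range(n - 1, len(tokens)):
                context = tuple(tokens[k - n + 1:k])  # w_{k-n+1}, ..., w_{k-1}
                ngram_counts[context + (tokens[k],)] += 1
                context_counts[context] += 1
        return ngram_counts, context_counts

    def cond_prob(w, context, ngram_counts, context_counts):
        """Maximum-likelihood estimate of p(w_k | w_{k-n+1}, ..., w_{k-1})."""
        if context_counts[context] == 0:
            return 0.0
        return ngram_counts[context + (w,)] / context_counts[context]

    def sentence_prob(sentence, n, ngram_counts, context_counts):
        """p(S) via the chain rule, truncated to the n-1 preceding tokens."""
        tokens = ["<s>"] * (n - 1) + sentence + ["</s>"]
        p = 1.0
        for k in range(n - 1, len(tokens)):
            p *= cond_prob(tokens[k], tuple(tokens[k - n + 1:k]),
                           ngram_counts, context_counts)
        return p

For example, with `counts = count_ngrams([["a", "cat", "sat"], ["a", "dog", "sat"]], n=2)`, `sentence_prob(["a", "cat", "sat"], 2, *counts)` returns 0.5, while any unseen $n$-gram drives the whole product to zero, which is exactly the problem the smoothing hunks in this patch address.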
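
The smoothing hunks can be checked the same way: $\alpha$, $D$, $\gamma$, and $N_l$ are all functions of the same $(n-1)$-word context $w_{k-n+1}, \ldots, w_{k-1}$. The sketch below is a rough, assumption-laden rendering of modified Kneser-Ney with these quantities, not the note's reference implementation: the discount constants `D1`, `D2`, `D3PLUS` are fixed by hand rather than estimated from count-of-counts, the lower-order recursion reuses raw counts instead of continuation counts, and since the body of Eq.~\eqref{eq:n_gram_smoothing_general} is not shown in this patch, the higher- and lower-order terms are simply combined by interpolation.

    # Hypothetical discount constants; in practice D_1, D_2, D_{3+} are
    # estimated from how many n-grams occur once, twice, or three-plus times.
    D1, D2, D3PLUS = 0.5, 0.75, 0.9

    def discount(c):
        """D(c): 0 for unseen n-grams, otherwise D_1, D_2, or D_{3+} by count."""
        if c == 0:
            return 0.0
        if c == 1:
            return D1
        if c == 2:
            return D2
        return D3PLUS

    def kn_prob(w, context, counts, vocab):
        """Modified Kneser-Ney estimate of p(w | context), interpolated form.

        `counts` maps tuples (context + (word,)) of every order to corpus counts;
        `context` is the tuple (w_{k-n+1}, ..., w_{k-1}).
        """
        if not context:
            return 1.0 / len(vocab)                 # base case: uniform over V
        followers = {w2: counts.get(context + (w2,), 0) for w2 in vocab}
        total = sum(followers.values())             # sum_{w'} c(context, w')
        if total == 0:                              # context never seen: back off
            return kn_prob(w, context[1:], counts, vocab)
        c = followers.get(w, 0)
        alpha = (c - discount(c)) / total           # discounted higher-order term
        # N_l(context): number of words following `context` exactly l (or >=3) times
        n1 = sum(1 for f in followers.values() if f == 1)
        n2 = sum(1 for f in followers.values() if f == 2)
        n3p = sum(1 for f in followers.values() if f >= 3)
        gamma = (D1 * n1 + D2 * n2 + D3PLUS * n3p) / total
        return alpha + gamma * kn_prob(w, context[1:], counts, vocab)

Because $D(0) = 0$, the $\alpha$ term vanishes whenever $c(w_{k-n+1}, \ldots, w_{k-1}, w_k) = 0$ and the estimate reduces to $\gamma$ times the lower-order probability, so for unseen $n$-grams this interpolated form behaves like the back-off branch of the case split in the hunk above.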