Upload NLP examples: N-gram language model and LSTM sequence tagger

L1aoXingyu · L1aoXingyu · commit 14e0d1053519 · 2017-08-05T13:55:25.000+08:00
diff --git a/chapter5_RNN/NLP/N-Gram.py b/chapter5_RNN/NLP/N-Gram.py
@@ -0,0 +1,77 @@
+import torch
+import torch.nn.functional as F
+from torch import nn, optim
+from torch.autograd import Variable
+
+# Number of preceding words used as context when predicting the next word.
+CONTEXT_SIZE = 2
+# NOTE(review): nothing visible in this script reads EMBEDDING_DIM; the model
+# further down is constructed with a literal embedding size of 100 -- confirm
+# which value is intended.
+EMBEDDING_DIM = 10
+# We will use Shakespeare Sonnet 2
+test_sentence = """When forty winters shall besiege thy brow,
+And dig deep trenches in thy beauty's field,
+Thy youth's proud livery so gazed on now,
+Will be a totter'd weed of small worth held:
+Then being asked, where all thy beauty lies,
+Where all the treasure of thy lusty days;
+To say, within thine own deep sunken eyes,
+Were an all-eating shame, and thriftless praise.
+How much more praise deserv'd thy beauty's use,
+If thou couldst answer 'This fair child of mine
+Shall sum my count, and make my old excuse,'
+Proving his beauty by succession thine!
+This were to be new made when thou art old,
+And see thy blood warm when thou feel'st it cold.""".split()
+
+# Each sample is (tuple of CONTEXT_SIZE consecutive words, the word that
+# follows them). Deriving the window from CONTEXT_SIZE keeps the samples in
+# step with the context width handed to the model.
+trigram = [(tuple(test_sentence[i:i + CONTEXT_SIZE]),
+            test_sentence[i + CONTEXT_SIZE])
+           for i in range(len(test_sentence) - CONTEXT_SIZE)]
+
+vocb = set(test_sentence)
+# Enumerate the vocabulary in sorted order so every run assigns the same
+# index to the same word; the iteration order of a set of strings is not
+# stable between interpreter runs.
+word_to_idx = {word: i for i, word in enumerate(sorted(vocb))}
+# Reverse lookup, used to turn a predicted index back into a word.
+idx_to_word = {i: word for word, i in word_to_idx.items()}
+
+
+class NgramModel(nn.Module):
+    """Feed-forward n-gram language model.
+
+    Embeds every context word, concatenates the embeddings, passes them
+    through a 128-unit ReLU layer and projects to one score per vocabulary
+    word.
+
+    Args:
+        vocb_size: number of distinct words (embedding rows and output
+            classes).
+        context_size: number of context words supplied per prediction.
+        n_dim: size of each word embedding.
+    """
+
+    def __init__(self, vocb_size, context_size, n_dim):
+        super(NgramModel, self).__init__()
+        self.n_word = vocb_size
+        self.embedding = nn.Embedding(self.n_word, n_dim)
+        self.linear1 = nn.Linear(context_size * n_dim, 128)
+        self.linear2 = nn.Linear(128, self.n_word)
+
+    def forward(self, x):
+        """Return next-word log-probabilities for the context indices ``x``.
+
+        Args:
+            x: LongTensor of word indices, either one context of shape
+                ``(context_size,)`` or a batch of shape
+                ``(batch, context_size)``.
+
+        Returns:
+            Tensor of shape ``(batch, vocb_size)`` holding log-probabilities;
+            a single context yields one row.
+        """
+        emb = self.embedding(x)
+        # One row per context: flatten to linear1's input width rather than
+        # forcing a single row, so batched input works as well.
+        emb = emb.view(-1, self.linear1.in_features)
+        out = F.relu(self.linear1(emb))
+        out = self.linear2(out)
+        # Normalise over the vocabulary axis explicitly; leaving ``dim``
+        # implicit is deprecated.
+        log_prob = F.log_softmax(out, dim=1)
+        return log_prob
+
+
+# NOTE(review): the embedding size passed here is the literal 100, not the
+# EMBEDDING_DIM constant defined above -- confirm which is intended.
+ngrammodel = NgramModel(len(word_to_idx), CONTEXT_SIZE, 100)
+# The model returns log-probabilities, which is the input NLLLoss expects.
+criterion = nn.NLLLoss()
+optimizer = optim.SGD(ngrammodel.parameters(), lr=1e-3)
+
+for epoch in range(100):
+    print('epoch: {}'.format(epoch + 1))
+    print('*' * 10)
+    running_loss = 0
+    for context, target in trigram:
+        context_idx = torch.LongTensor([word_to_idx[w] for w in context])
+        target_idx = torch.LongTensor([word_to_idx[target]])
+        # forward
+        out = ngrammodel(context_idx)
+        loss = criterion(out, target_idx)
+        # The loss is a 0-dim tensor; .item() extracts the Python number
+        # (indexing it with [0] raises an error).
+        running_loss += loss.item()
+        # backward
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    # Report the mean loss per training sample: one loss value is
+    # accumulated for each entry of ``trigram``.
+    print('Loss: {:.6f}'.format(running_loss / len(trigram)))
+
+# Sanity check: feed one training context through the trained model and
+# compare the most likely next word with the true one.
+word, label = trigram[3]
+word = torch.LongTensor([word_to_idx[i] for i in word])
+out = ngrammodel(word)
+# torch.max over dim 1 returns (values, indices). ``out`` has a single row,
+# so ``predict_label`` holds exactly one index; .item() reads it as a plain
+# int regardless of whether the reduced dimension was kept.
+_, predict_label = torch.max(out, 1)
+predict_word = idx_to_word[predict_label.item()]
+print('real word is {}, predict word is {}'.format(label, predict_word))
diff --git a/chapter5_RNN/NLP/seq-lstm.py b/chapter5_RNN/NLP/seq-lstm.py
@@ -0,0 +1,117 @@
+__author__ = 'SherlockLiao'
+
+import torch
+import torch.nn.functional as F
+from torch import nn, optim
+from torch.autograd import Variable
+
+# Toy part-of-speech corpus: each entry is (list of words, list of tags).
+training_data = [
+    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
+    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
+]
+
+# Words and tags are numbered in order of first appearance; setdefault only
+# inserts when the key is new, so an existing index is never overwritten.
+word_to_idx = {}
+tag_to_idx = {}
+for sentence, tags in training_data:
+    for token in sentence:
+        word_to_idx.setdefault(token, len(word_to_idx))
+    for tag_name in tags:
+        tag_to_idx.setdefault(tag_name, len(tag_to_idx))
+# Character vocabulary: lowercase a-z mapped to 0-25.
+alphabet = 'abcdefghijklmnopqrstuvwxyz'
+character_to_idx = {letter: pos for pos, letter in enumerate(alphabet)}
+
+
+class CharLSTM(nn.Module):
+    """Character-level LSTM that summarises one word as a fixed-size vector.
+
+    Args:
+        n_char: number of distinct characters (embedding rows).
+        char_dim: size of each character embedding.
+        char_hidden: hidden size of the character LSTM.
+    """
+
+    def __init__(self, n_char, char_dim, char_hidden):
+        super(CharLSTM, self).__init__()
+        self.char_embedding = nn.Embedding(n_char, char_dim)
+        self.char_lstm = nn.LSTM(char_dim, char_hidden, batch_first=True)
+
+    def forward(self, x):
+        """Return the final hidden state of the LSTM run over ``x``.
+
+        Args:
+            x: LongTensor of character indices shaped ``(batch, seq_len)``
+                (the LSTM is built with ``batch_first=True``).
+
+        Returns:
+            Tensor of shape ``(1, batch, char_hidden)``: the hidden state
+            after the last character.
+        """
+        x = self.char_embedding(x)
+        # nn.LSTM returns (output, (h_n, c_n)). The word summary is the final
+        # hidden state h_n; index 1 of that tuple would be the cell state.
+        _, (h_n, _) = self.char_lstm(x)
+        return h_n
+
+
+class LSTMTagger(nn.Module):
+    """Sequence tagger combining word embeddings with character-LSTM features.
+
+    Each word is represented by its word embedding concatenated with the
+    vector produced by a character-level LSTM over its letters; a word-level
+    LSTM followed by a linear layer then scores every tag for every word.
+
+    Args:
+        n_word: number of distinct words.
+        n_char: number of distinct characters.
+        char_dim: size of each character embedding.
+        n_dim: size of each word embedding.
+        char_hidden: hidden size of the character LSTM.
+        n_hidden: hidden size of the word-level LSTM.
+        n_tag: number of output tags.
+    """
+
+    def __init__(self, n_word, n_char, char_dim, n_dim, char_hidden, n_hidden,
+                 n_tag):
+        super(LSTMTagger, self).__init__()
+        self.word_embedding = nn.Embedding(n_word, n_dim)
+        self.char_lstm = CharLSTM(n_char, char_dim, char_hidden)
+        self.lstm = nn.LSTM(n_dim + char_hidden, n_hidden, batch_first=True)
+        self.linear1 = nn.Linear(n_hidden, n_tag)
+
+    def forward(self, x, word):
+        """Return per-word tag log-probabilities for one sentence.
+
+        Args:
+            x: LongTensor of word indices, shape ``(n_words,)``.
+            word: the same sentence as a sequence of strings; every letter is
+                lower-cased and looked up in the module-level
+                ``character_to_idx`` table.
+
+        Returns:
+            Tensor of shape ``(n_words, n_tag)`` with log-probabilities.
+
+        Raises:
+            KeyError: if a word contains a character missing from
+                ``character_to_idx``.
+        """
+        # Build the character tensors on the same device as the word indices
+        # instead of guessing from torch.cuda.is_available().
+        device = x.device
+        char_feats = []
+        for each in word:
+            char_idx = torch.tensor(
+                [character_to_idx[letter.lower()] for letter in each],
+                dtype=torch.long, device=device)
+            # Add the batch dimension expected by the character LSTM.
+            char_idx = char_idx.unsqueeze(0)
+            # (1, 1, char_hidden) -> (1, char_hidden)
+            char_feats.append(self.char_lstm(char_idx).squeeze(0))
+        # Concatenate the live tensors so the character features stay in the
+        # autograd graph; copying out their raw data would stop gradients
+        # from reaching the character embedding and character LSTM.
+        char = torch.cat(char_feats, 0)
+        x = self.word_embedding(x)
+        x = torch.cat((x, char), 1)
+        # The word-level LSTM is batch_first: treat the sentence as a batch
+        # of one sequence.
+        x = x.unsqueeze(0)
+        x, _ = self.lstm(x)
+        x = x.squeeze(0)
+        x = self.linear1(x)
+        # Normalise over the tag axis explicitly; leaving ``dim`` implicit is
+        # deprecated.
+        y = F.log_softmax(x, dim=1)
+        return y
+
+
+# Tagger hyper-parameters are spelled out by keyword so each size is
+# identifiable at the call site.
+model = LSTMTagger(
+    n_word=len(word_to_idx),
+    n_char=len(character_to_idx),
+    char_dim=10,
+    n_dim=100,
+    char_hidden=50,
+    n_hidden=128,
+    n_tag=len(tag_to_idx))
+if torch.cuda.is_available():
+    model = model.cuda()
+# The tagger's forward() already returns log-probabilities, so NLLLoss is the
+# matching criterion; CrossEntropyLoss would apply log-softmax a second time.
+criterion = nn.NLLLoss()
+optimizer = optim.SGD(model.parameters(), lr=1e-2)
+
+
+def make_sequence(x, dic):
+    """Convert a sequence of keys into a LongTensor of their indices.
+
+    Args:
+        x: iterable of keys (for example the words or tags of a sentence).
+        dic: mapping from key to integer index.
+
+    Returns:
+        A ``Variable`` wrapping a 1-D LongTensor with one index per key.
+
+    Raises:
+        KeyError: if a key of ``x`` is not present in ``dic``.
+    """
+    return Variable(torch.LongTensor([dic[token] for token in x]))
+
+
+# Train the tagger one sentence at a time and report the mean loss per
+# sentence after every epoch.
+for epoch in range(300):
+    print('*' * 10)
+    print('epoch {}'.format(epoch + 1))
+    running_loss = 0
+    for word, tag in training_data:
+        word_list = make_sequence(word, word_to_idx)
+        tag = make_sequence(tag, tag_to_idx)
+        if torch.cuda.is_available():
+            word_list = word_list.cuda()
+            tag = tag.cuda()
+        # forward: the model takes both the word indices and the raw words
+        # (the latter feed its character-level features)
+        out = model(word_list, word)
+        loss = criterion(out, tag)
+        # The loss is a 0-dim tensor; .item() extracts the Python number
+        # (indexing it with [0] raises an error).
+        running_loss += loss.item()
+        # backward
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    # Average over the number of sentences. The length of a single
+    # (words, tags) pair is always 2 and matches the corpus size only by
+    # coincidence.
+    print('Loss: {}'.format(running_loss / len(training_data)))
+print()
+# Tag a sentence that is not in the training data but uses only known words.
+# The words are kept in one variable so the index tensor and the raw strings
+# passed to the model cannot drift apart, and the tensor is not named
+# ``input`` to avoid shadowing the builtin.
+test_words = "Everybody ate the apple".split()
+test_input = make_sequence(test_words, word_to_idx)
+if torch.cuda.is_available():
+    test_input = test_input.cuda()
+
+out = model(test_input, test_words)
+print(out)