{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# things we need for NLP\n",
    "import nltk\n",
    "from nltk.stem.lancaster import LancasterStemmer\n",
    "stemmer = LancasterStemmer()\n",
    "\n",
    "# things we need for TensorFlow\n",
    "import numpy as np\n",
    "import tflearn\n",
    "import tensorflow as tf\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# import our chat-bot intents file\n",
    "import json\n",
    "with open('intents_Rx.json') as json_data:\n",
    "    intents = json.load(json_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17 documents\n",
      "5 classes ['coupon', 'goodbye', 'greeting', 'med', 'thanks']\n",
      "40 unique stemmed words [\"'m\", \"'s\", 'a', 'anyon', 'ar', 'buy', 'bye', 'can', 'cheap', 'cheapest', 'coupon', 'day', 'deal', 'find', 'for', 'good', 'goodby', 'hello', 'help', 'hi', 'how', 'i', 'is', 'lat', 'less', 'look', 'me', 'med', 'money', 'see', 'send', 'thank', 'that', 'the', 'ther', 'to', 'want', 'what', 'wher', 'you']\n"
     ]
    }
   ],
   "source": [
    "words = []\n",
    "classes = []\n",
    "documents = []\n",
    "ignore_words = ['?']\n",
    "# loop through each sentence in our intents patterns\n",
    "for intent in intents['intents']:\n",
    "    for pattern in intent['patterns']:\n",
    "        # tokenize each word in the sentence\n",
    "        w = nltk.word_tokenize(pattern)\n",
    "        # add to our words list\n",
    "        words.extend(w)\n",
    "        # add to documents in our corpus\n",
    "        documents.append((w, intent['tag']))\n",
    "        # add to our classes list\n",
    "        if intent['tag'] not in classes:\n",
    "            classes.append(intent['tag'])\n",
    "\n",
    "# stem and lower each word and remove duplicates\n",
    "words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]\n",
    "words = sorted(list(set(words)))\n",
    "\n",
    "# remove duplicates\n",
    "classes = sorted(list(set(classes)))\n",
    "\n",
    "print (len(documents), \"documents\")\n",
    "print (len(classes), \"classes\", classes)\n",
    "print (len(words), \"unique stemmed words\", words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# create our training data\n",
    "training = []\n",
    "output = []\n",
    "# create an empty array for our output\n",
    "output_empty = [0] * len(classes)\n",
    "\n",
    "# training set, bag of words for each sentence\n",
    "for doc in documents:\n",
    "    # initialize our bag of words\n",
    "    bag = []\n",
    "    # list of tokenized words for the pattern\n",
    "    pattern_words = doc[0]\n",
    "    # stem each word\n",
    "    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]\n",
    "    # create our bag of words array\n",
    "    for w in words:\n",
    "        bag.append(1 if w in pattern_words else 0)\n",
    "\n",
    "    # output is a '0' for each tag and '1' for the current tag\n",
    "    output_row = list(output_empty)\n",
    "    output_row[classes.index(doc[1])] = 1\n",
    "\n",
    "    training.append([bag, output_row])\n",
    "\n",
    "# shuffle our features and turn into np.array\n",
    "random.shuffle(training)\n",
    "training = np.array(training)\n",
    "\n",
    "# create training lists\n",
    "train_x = list(training[:,0])\n",
    "train_y = list(training[:,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Step: 2999  | total loss: \u001b[1m\u001b[32m0.15651\u001b[0m\u001b[0m | time: 0.006s\n",
      "| Adam | epoch: 1000 | loss: 0.15651 - acc: 0.9794 -- iter: 16/17\n",
      "Training Step: 3000  | total loss: \u001b[1m\u001b[32m0.14101\u001b[0m\u001b[0m | time: 0.008s\n",
      "| Adam | epoch: 1000 | loss: 0.14101 - acc: 0.9815 -- iter: 17/17\n",
      "--\n",
      "INFO:tensorflow:/home/gk/gensim/notebooks/Rxmodel.tflearn is not in all_model_checkpoint_paths. Manually adding it.\n"
     ]
    }
   ],
   "source": [
    "# reset underlying graph data\n",
    "tf.reset_default_graph()\n",
    "# Build neural network: two hidden layers of 8 units, softmax output over the intent classes\n",
    "net = tflearn.input_data(shape=[None, len(train_x[0])])\n",
    "net = tflearn.fully_connected(net, 8)\n",
    "net = tflearn.fully_connected(net, 8)\n",
    "net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')\n",
    "net = tflearn.regression(net)\n",
    "\n",
    "# Define model and set up tensorboard logging\n",
    "model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')\n",
    "# Start training (tflearn's regression layer uses the Adam optimizer by default)\n",
    "model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)\n",
    "model.save('Rxmodel.tflearn')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clean_up_sentence(sentence):\n",
    "    # tokenize the pattern\n",
    "    sentence_words = nltk.word_tokenize(sentence)\n",
    "    # stem each word\n",
    "    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]\n",
    "    return sentence_words\n",
    "\n",
    "# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence\n",
    "def bow(sentence, words, show_details=False):\n",
    "    # tokenize the pattern\n",
    "    sentence_words = clean_up_sentence(sentence)\n",
    "    # bag of words\n",
    "    bag = [0]*len(words)\n",
    "    for s in sentence_words:\n",
    "        for i,w in enumerate(words):\n",
    "            if w == s:\n",
    "                bag[i] = 1\n",
    "                if show_details:\n",
    "                    print (\"found in bag: %s\" % w)\n",
    "\n",
    "    return np.array(bag)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0]\n",
      "['coupon', 'goodbye', 'greeting', 'med', 'thanks']\n"
     ]
    }
   ],
   "source": [
    "p = bow(\"hello\", words)\n",
    "print (p)\n",
    "print (classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[4.3407872851730644e-09, 0.009914605878293514, 0.9880092740058899, 0.0020757599268108606, 3.3042027780538774e-07]]\n"
     ]
    }
   ],
   "source": [
    "print(model.predict([p]))"
   ]
  },
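  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# A minimal follow-up sketch, not part of the original notebook: map the raw\n",
    "# probabilities returned by model.predict back to an intent tag, the step a\n",
    "# chat-bot response handler would take next.\n",
    "probs = model.predict([bow(\"hello\", words)])[0]\n",
    "print(classes[np.argmax(probs)])  # 'greeting', per the probabilities printed above"
   ]
  },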
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# save all of our data structures\n",
    "import pickle\n",
    "with open(\"Rx_training_data\", \"wb\") as f:\n",
    "    pickle.dump({'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}