|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "## Q-Learning Step By Step Example\n", |
| 8 | + "\n", |
| 9 | + "A simple example of Q learning in a step by step fashion using a simple 2x2 gridworld type problem\n", |
| 10 | + "\n", |
| 11 | + "State 0 | State 1\n", |
| 12 | + "--------|--------\n", |
| 13 | + "State 2 | State 3\n", |
| 14 | + "\n", |
| 15 | + "State 0 = Start<br />\n", |
| 16 | + "State 1 = Safe<br />\n", |
| 17 | + "State 2 = Hole<br />\n", |
| 18 | + "State 3 = Goal<br />\n", |
| 19 | + "\n", |
| 20 | + "For each state we can move up, down, left, right or stay put - not excluding invalid moves at edges.\n", |
| 21 | + "\n", |
| 22 | + "Each hole gives a reward of -10, reaching the goal gives +10, all other states give a reward of -1.\n", |
| 23 | + "\n", |
| 24 | + "So the optimal path is 0-1-3." |
| 25 | + ] |
| 26 | + }, |
| 27 | + { |
| 28 | + "cell_type": "code", |
| 29 | + "execution_count": 12, |
| 30 | + "metadata": {}, |
| 31 | + "outputs": [ |
| 32 | + { |
| 33 | + "name": "stdout", |
| 34 | + "output_type": "stream", |
| 35 | + "text": [ |
| 36 | + "[[ 0. -9.2 0. 0. 0. ]\n", |
| 37 | + " [ 0. 0. 0. 0. 0. ]\n", |
| 38 | + " [ 0. 0. 0. 0. 0. ]\n", |
| 39 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 40 | + "[[ 0. -9.2 0. 0. 0. ]\n", |
| 41 | + " [ 0. 0. 0. 0. 0. ]\n", |
| 42 | + " [ 0. 0. 0. 0. -0.2]\n", |
| 43 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 44 | + "goal state reached\n", |
| 45 | + "[[ 0. -9.2 0. -0.2 0. ]\n", |
| 46 | + " [ 0. 0. 0. 0. 0. ]\n", |
| 47 | + " [ 0. 0. 0. 0. -0.2]\n", |
| 48 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 49 | + "[[ 0. -9.2 0. -0.2 0. ]\n", |
| 50 | + " [ 0. 0. 0. 0. -0.2]\n", |
| 51 | + " [ 0. 0. 0. 0. -0.2]\n", |
| 52 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 53 | + "[[ 0. -9.2 0. -0.2 0. ]\n", |
| 54 | + " [ 0. 0. 0. 0. -0.2]\n", |
| 55 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 56 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 57 | + "goal state reached\n", |
| 58 | + "[[ 0. -9.2 0. -0.2 0. ]\n", |
| 59 | + " [ 0. 0. 0. 0. -0.2]\n", |
| 60 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 61 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 62 | + "[[ 0. -9.2 0. -0.2 0. ]\n", |
| 63 | + " [ 0. 10.8 0. 0. -0.2]\n", |
| 64 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 65 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 66 | + "goal state reached\n", |
| 67 | + "[[ 0. -9.2 0. 10.6 0. ]\n", |
| 68 | + " [ 0. 10.8 0. 0. -0.2]\n", |
| 69 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 70 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 71 | + "[[ 0. -9.2 0. 10.6 0. ]\n", |
| 72 | + " [ 0. 10.8 0. 0. -0.2]\n", |
| 73 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 74 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 75 | + "goal state reached\n", |
| 76 | + "[[ 0. -9.2 0. 10.6 10.6]\n", |
| 77 | + " [ 0. 10.8 0. 0. -0.2]\n", |
| 78 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 79 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 80 | + "[[ 0. -9.2 0. 10.6 10.6]\n", |
| 81 | + " [ 0. 10.8 0. 0. -0.2]\n", |
| 82 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 83 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 84 | + "goal state reached\n", |
| 85 | + "[[ 0. -9.2 0. 10.6 10.6]\n", |
| 86 | + " [ 0. 10.8 0. 0. -0.2]\n", |
| 87 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 88 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 89 | + "[[ 0. -9.2 0. 10.6 10.6]\n", |
| 90 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 91 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 92 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 93 | + "[[ 0. -9.2 0. 10.6 10.6]\n", |
| 94 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 95 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 96 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 97 | + "goal state reached\n", |
| 98 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 99 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 100 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 101 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 102 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 103 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 104 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 105 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 106 | + "goal state reached\n", |
| 107 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 108 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 109 | + " [ 0. 0. 0. 10.8 -0.2]\n", |
| 110 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 111 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 112 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 113 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 114 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 115 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 116 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 117 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 118 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 119 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 120 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 121 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 122 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 123 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 124 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 125 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 126 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 127 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 128 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 129 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 130 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 131 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 132 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 133 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 134 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 135 | + "goal state reached\n", |
| 136 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 137 | + " [ 0. 10.8 0. 0. 10.6]\n", |
| 138 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 139 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 140 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 141 | + " [ 0. 10.8 10.4 0. 10.6]\n", |
| 142 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 143 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 144 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 145 | + " [ 0. 10.8 10.4 0. 10.6]\n", |
| 146 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 147 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 148 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 149 | + " [ 0. 10.8 10.4 0. 10.6]\n", |
| 150 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 151 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 152 | + "goal state reached\n", |
| 153 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 154 | + " [ 0. 10.8 10.4 0. 10.6]\n", |
| 155 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 156 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 157 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 158 | + " [ 0. 10.8 10.4 0. 10.6]\n", |
| 159 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 160 | + " [ 0. 0. 0. 0. 0. ]]\n", |
| 161 | + "goal state reached\n" |
| 162 | + ] |
| 163 | + } |
| 164 | + ], |
| 165 | + "source": [ |
| 166 | + "import numpy as np\n", |
| 167 | + "import random\n", |
| 168 | + "import matplotlib.pyplot as plt\n", |
| 169 | + "\n", |
| 170 | + "gamma = 0.8\n", |
| 171 | + "\n", |
| 172 | + "# each matrix below has states as rows, columns in order (U, D, L, R, N) unless otherwise stated\n", |
| 173 | + "\n", |
| 174 | + "# rewards for each state / action. 0 represents no such transition possible\n", |
| 175 | + "rewards = np.array([[0, -10, 0, -1, -1],\n", |
| 176 | + " [0, 10, -1, 0, -1],\n", |
| 177 | + " [-1, 0, 0, 10, -1],\n", |
| 178 | + " [-1, 0, -10, 0, 0]])\n", |
| 179 | + "\n", |
| 180 | + "q_matrix = np.zeros((4,5))\n", |
| 181 | + "\n", |
| 182 | + "# valid actions for each state encoded as 0=up,1=down, 2=left, 3?right, 4=no action\n", |
| 183 | + "valid_actions = np.array([[1, 3, 4],\n", |
| 184 | + " [1, 2, 4],\n", |
| 185 | + " [0, 3, 4],\n", |
| 186 | + " [0, 2, 4]])\n", |
| 187 | + "\n", |
| 188 | + "# what states we move to for each state / action. -1 represents invalid transaction\n", |
| 189 | + "transition_matrix = np.array([[-1, 2, -1, 1, 1 ],\n", |
| 190 | + " [-1, 3, 0, -1, 2 ],\n", |
| 191 | + " [0, -1, -1, 3, 3 ],\n", |
| 192 | + " [1, -1, 2, -1, -1]])\n", |
| 193 | + "\n", |
| 194 | + "\n", |
| 195 | + "for i in range(100): # 10 episodes\n", |
| 196 | + " current_state = 0\n", |
| 197 | + " while current_state != 3:\n", |
| 198 | + " # chose a random action - could use epsilon-greedy here\n", |
| 199 | + " action = random.choice(valid_actions[current_state])\n", |
| 200 | + "\n", |
| 201 | + " # record next state and reward (r, s')\n", |
| 202 | + " next_state = transition_matrix[current_state][action]\n", |
| 203 | + " reward = rewards[current_state][action]\n", |
| 204 | + "\n", |
| 205 | + " # get possible rewards for all valid actions\n", |
| 206 | + " future_rewards = []\n", |
| 207 | + " for action_next in valid_actions[next_state]:\n", |
| 208 | + " future_rewards.append(q_matrix[next_state][action_next])\n", |
| 209 | + "\n", |
| 210 | + " # q update\n", |
| 211 | + " q_state = reward + gamma + max(future_rewards)\n", |
| 212 | + " q_matrix[current_state][action] = q_state\n", |
| 213 | + " print(q_matrix)\n", |
| 214 | + "\n", |
| 215 | + " current_state = next_state\n", |
| 216 | + " if current_state == 3:\n", |
| 217 | + " print('goal state reached')" |
| 218 | + ] |
| 219 | + }, |
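|  | + { |
|  | + "cell_type": "markdown", |
|  | + "metadata": {}, |
|  | + "source": [ |
|  | + "As noted in the comment inside the training loop, the purely random action choice could be replaced with an epsilon-greedy one. The cell below is a minimal sketch of that idea and is not wired into the loop above; the helper name `choose_action` and the epsilon value of 0.1 are illustrative assumptions."
|  | + ] |
|  | + }, |
|  | + { |
|  | + "cell_type": "code", |
|  | + "execution_count": null, |
|  | + "metadata": {}, |
|  | + "outputs": [], |
|  | + "source": [ |
|  | + "import random\n",
|  | + "\n",
|  | + "def choose_action(state, epsilon=0.1):\n",
|  | + "    # with probability epsilon explore a random valid action,\n",
|  | + "    # otherwise exploit the current q-values\n",
|  | + "    # (assumes q_matrix and valid_actions from the cell above)\n",
|  | + "    actions = list(valid_actions[state])\n",
|  | + "    if random.random() < epsilon:\n",
|  | + "        return random.choice(actions)\n",
|  | + "    return max(actions, key=lambda a: q_matrix[state][a])\n",
|  | + "\n",
|  | + "# example: the action picked from the start state\n",
|  | + "print(choose_action(0))"
|  | + ] |
|  | + }, |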
| 220 | + { |
| 221 | + "cell_type": "markdown", |
| 222 | + "metadata": {}, |
| 223 | + "source": [ |
| 224 | + "If this works then we would expect to:\n", |
| 225 | + "\n", |
| 226 | + "1. go right (q value for row 1, column 4 to be highest)\n", |
| 227 | + "2. go down (q value for row 2, column 2 to be highest) " |
| 228 | + ] |
| 229 | + }, |
| 230 | + { |
| 231 | + "cell_type": "code", |
| 232 | + "execution_count": 13, |
| 233 | + "metadata": {}, |
| 234 | + "outputs": [ |
| 235 | + { |
| 236 | + "name": "stdout", |
| 237 | + "output_type": "stream", |
| 238 | + "text": [ |
| 239 | + "Final q-matrix\n", |
| 240 | + "[[ 0. 1.6 0. 10.6 10.6]\n", |
| 241 | + " [ 0. 10.8 10.4 0. 10.6]\n", |
| 242 | + " [10.4 0. 0. 10.8 -0.2]\n", |
| 243 | + " [ 0. 0. 0. 0. 0. ]]\n" |
| 244 | + ] |
| 245 | + } |
| 246 | + ], |
| 247 | + "source": [ |
| 248 | + "print(\"Final q-matrix\")\n", |
| 249 | + "print(q_matrix)" |
| 250 | + ] |
| 251 | + }, |
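|  | + { |
|  | + "cell_type": "markdown", |
|  | + "metadata": {}, |
|  | + "source": [ |
|  | + "To check this, the cell below is a minimal sketch that reads the greedy action for each state straight out of `q_matrix` (restricted to `valid_actions`) and walks it from the start state. The `action_names` list and the step cap are illustrative additions, not part of the original code."
|  | + ] |
|  | + }, |
|  | + { |
|  | + "cell_type": "code", |
|  | + "execution_count": null, |
|  | + "metadata": {}, |
|  | + "outputs": [], |
|  | + "source": [ |
|  | + "# walk the greedy policy from the start state using the learned q-values\n",
|  | + "action_names = ['up', 'down', 'left', 'right', 'stay']\n",
|  | + "\n",
|  | + "state, path = 0, [0]\n",
|  | + "while state != 3 and len(path) < 10:  # cap steps in case the policy loops\n",
|  | + "    best = max(valid_actions[state], key=lambda a: q_matrix[state][a])\n",
|  | + "    print('state', state, '->', action_names[best])\n",
|  | + "    state = int(transition_matrix[state][best])\n",
|  | + "    path.append(state)\n",
|  | + "print('greedy path:', path)"
|  | + ] |
|  | + }, |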
| 252 | + { |
| 253 | + "cell_type": "code", |
| 254 | + "execution_count": null, |
| 255 | + "metadata": {}, |
| 256 | + "outputs": [], |
| 257 | + "source": [] |
| 258 | + } |
| 259 | + ], |
| 260 | + "metadata": { |
| 261 | + "kernelspec": { |
| 262 | + "display_name": "Python 3", |
| 263 | + "language": "python", |
| 264 | + "name": "python3" |
| 265 | + }, |
| 266 | + "language_info": { |
| 267 | + "codemirror_mode": { |
| 268 | + "name": "ipython", |
| 269 | + "version": 3 |
| 270 | + }, |
| 271 | + "file_extension": ".py", |
| 272 | + "mimetype": "text/x-python", |
| 273 | + "name": "python", |
| 274 | + "nbconvert_exporter": "python", |
| 275 | + "pygments_lexer": "ipython3", |
| 276 | + "version": "3.6.6" |
| 277 | + } |
| 278 | + }, |
| 279 | + "nbformat": 4, |
| 280 | + "nbformat_minor": 2 |
| 281 | +} |