diff --git a/Add Audio.ipynb b/Add Audio.ipynb new file mode 100644 index 000000000..f61681028 --- /dev/null +++ b/Add Audio.ipynb @@ -0,0 +1,68 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Generate a sound\n", + "import numpy as np\n", + "import IPython.display as ipd\n", + "from IPython.display import Audio\n", + "framerate = 44100\n", + "t = np.linspace(0,5,framerate*5)\n", + "data = np.sin(2*np.pi*220*t) + np.sin(2*np.pi*224*t)\n", + "Audio(data,rate=framerate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "celltoolbar": "Raw Cell Format", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/01.03-Magic-Commands.ipynb b/notebooks/01.03-Magic-Commands.ipynb index 4a47b373a..b2dd79cd4 100644 --- a/notebooks/01.03-Magic-Commands.ipynb +++ b/notebooks/01.03-Magic-Commands.ipynb @@ -30,7 +30,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The previous two sections showed how IPython lets you use and explore Python efficiently and interactively.\n", + "This is a test for the special character ñ in markdown cell. 
The previous two sections showed how IPython lets you use and explore Python efficiently and interactively.\n", "Here we'll begin discussing some of the enhancements that IPython adds on top of the normal Python syntax.\n", "These are known in IPython as *magic commands*, and are prefixed by the ``%`` character.\n", "These magic commands are designed to succinctly solve various common problems in standard data analysis.\n", diff --git a/notebooks/02.00-Introduction-to-NumPy.ipynb b/notebooks/02.00-Introduction-to-NumPy.ipynb index 813b85ab4..d2f4e92c9 100644 --- a/notebooks/02.00-Introduction-to-NumPy.ipynb +++ b/notebooks/02.00-Introduction-to-NumPy.ipynb @@ -157,7 +157,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/notebooks/04.14-Visualization-With-Seaborn.ipynb b/notebooks/04.14-Visualization-With-Seaborn.ipynb index 652627b2a..d10e5a99d 100644 --- a/notebooks/04.14-Visualization-With-Seaborn.ipynb +++ b/notebooks/04.14-Visualization-With-Seaborn.ipynb @@ -6,7 +6,18 @@ "source": [ "\n", "\n", - "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", + "*This notebook contains an excerpt from the [Python Data Science Handbook] (random addition) (http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", + "\n", + "**Changing Header and first row completely**\n", + "\n", + "| Change | header and | 1st | row | completely | for testing |\n", + "|----------|-------------|------|----------|-------------|------|\n", + "| the | colons | after | the header | row are | removed |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + 
"| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace\n", "\n", "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*" ] @@ -16,7 +27,18 @@ "metadata": {}, "source": [ "\n", - "< [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) | [Contents](Index.ipynb) | [Further Resources](04.15-Further-Resources.ipynb) >" + "< [Geographic Data with Basemap](04.13-Geographic-Data-With-Basemap.ipynb) | [Contents](Index.ipynb) | [Further Resources](04.15-Further-Resources.ipynb) >\n", + "\n", + "**Additional Column in the middle**\n", + "\n", + "| Tables | Are | Cool | NEW COLUMN | Let us | test them | properly, shall we? 
|\n", + "|:----------:|:-------------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | ADDING A | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | COLUMN | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | FOR THE | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | TESTING | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | PURPOSE | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | HERE ON OUT | Let's make | world a better place | peace" ] }, { @@ -33,6 +55,17 @@ "Matplotlib has proven to be an incredibly useful and popular visualization tool, but even avid users will admit it often leaves much to be desired.\n", "There are several valid complaints about Matplotlib that often come up:\n", "\n", + "**Small changes at multiple places**\n", + "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? |\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | right aligned | 1800 | Still some | work got to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| row 2 is | centered | 12 | but it is 2220 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world A better place | apple pie\n", + "\n", "- Prior to version 2.0, Matplotlib's defaults are not exactly the best choices. It was based off of MATLAB circa 1999, and this often shows.\n", "- Matplotlib's API is relatively low level. 
Doing sophisticated statistical visualization is possible, but often requires a *lot* of boilerplate code.\n", "- Matplotlib predated Pandas by more than a decade, and thus is not designed for use with Pandas ``DataFrame``s. In order to visualize data from a Pandas ``DataFrame``, you must extract each ``Series`` and often concatenate them together into the right format. It would be nicer to have a plotting library that can intelligently use the ``DataFrame`` labels in a plot.\n", @@ -51,7 +84,18 @@ "## Seaborn Versus Matplotlib\n", "\n", "Here is an example of a simple random-walk plot in Matplotlib, using its classic plot formatting and colors.\n", - "We start with the typical imports:" + "We start with the typical imports:\n", + "\n", + "**Remove 1st Column**\n", + "\n", + "Are | Cool | Let us | test them | properly, shall we? |\n", + "-------------:|:------:|:----------:|:-------------:|:------:|\n", + " left aligned | 1600 | Still some | work to | be done |\n", + " centered | 12 | to fix | all the changes | in the table properly |\n", + " right aligned | 1 | hence, we are testing | the table changes | here |\n", + " left aligned | 1600 | It's weird | to write random | text\n", + " centered | 12 | but it's 2020 | and I got | to do this | \n", + " right aligned | 1 | Let's make | world a better place | peace" ] }, { @@ -71,7 +115,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we create some random walk data:" + "**First and last row removed**\n", + "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? 
|\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | " ] }, { @@ -121,6 +172,18 @@ "source": [ "Although the result contains all the information we'd like it to convey, it does so in a way that is not all that aesthetically pleasing, and even looks a bit old-fashioned in the context of 21st-century data visualization.\n", "\n", + "**ADD ROW IN MIDDLE**\n", + "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? |\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| COL NEW | NEWLY ADDED | COLUMN | 2200 | TO THIS TABLE | IN THE MIDDLE |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace\n", + "\n", "Now let's take a look at how it works with Seaborn.\n", "As we will see, Seaborn has many of its own high-level plotting routines, but it can also overwrite Matplotlib's default parameters and in turn get even simple Matplotlib scripts to produce vastly superior output.\n", "We can set the style by calling Seaborn's ``set()`` method.\n", @@ -173,6 +236,18 @@ "Ah, much better!" 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**DELETE TOP 2 AND BOTTOM 2 ROWS**\n", + "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? |\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -181,6 +256,17 @@ "\n", "The main idea of Seaborn is that it provides high-level commands to create a variety of plot types useful for statistical data exploration, and even some statistical model fitting.\n", "\n", + "**DELETE FIRST 2 AND LAST 2 COLUMNS**\n", + "\n", + "| Cool | Let us |\n", + "|:------:|:----------:|\n", + "| 1600 | Still some |\n", + "| 12 | to fix |\n", + "| 1 | hence, we are testing |\n", + "| 1600 | It's weird |\n", + "| 12 | but it's 2020 |\n", + "| 1 | Let's make |\n", + "\n", "Let's take a look at a few of the datasets and plot types available in Seaborn. Note that all of the following *could* be done using raw Matplotlib commands (this is, in fact, what Seaborn does under the hood) but the Seaborn API is much more convenient." 
] }, @@ -191,7 +277,14 @@ "### Histograms, KDE, and densities\n", "\n", "Often in statistical data visualization, all you want is to plot histograms and joint distributions of variables.\n", - "We have seen that this is relatively straightforward in Matplotlib:" + "We have seen that this is relatively straightforward in Matplotlib:\n", + "\n", + "**REPLACE ENTIRE TABLE**\n", + "\n", + "Markdown | Less | Pretty\n", + "--- | --- | ---\n", + "*Still* | `renders` | **nicely**\n", + "1 | 2 | 3" ] }, { @@ -251,7 +344,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Histograms and KDE can be combined using ``distplot``:" + "Histograms and KDE can be combined using ``distplot``:\n", + "\n", + "**NEW ROW AT END**\n", + "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? |\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace\n", + "| NEWLY | ADDED | ROW | AT THE | END OF THE | TABLE" ] }, { @@ -279,6 +384,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "**REMOVE LAST COLUMN**\n", + "\n", + "| Tables | Are | Cool | Let us | test them |\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to |\n", + "| col 2 is | centered | 12 | to fix | all the changes |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random |\n", + "| col 2 is | 
centered | 12 | but it's 2020 | and I got |\n", + "| col 3 is | right aligned | 1 | Let's make | world a better place |\n", + "\n", + "\n", "If we pass the full two-dimensional dataset to ``kdeplot``, we will get a two-dimensional visualization of the data:" ] }, @@ -307,7 +424,16 @@ "metadata": {}, "source": [ "We can see the joint distribution and the marginal distributions together using ``sns.jointplot``.\n", - "For this plot, we'll set the style to a white background:" + "For this plot, we'll set the style to a white background:\n", + "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? |\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace" ] }, { @@ -335,7 +461,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There are other parameters that can be passed to ``jointplot``—for example, we can use a hexagonally based histogram instead:" + "There are other parameters that can be passed to ``jointplot``—for example, we can use a hexagonally based histogram instead:\n", + "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? 
|\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace" ] }, { @@ -374,7 +509,16 @@ { "cell_type": "markdown", "metadata": {}, - "source": [] + "source": [ + "| Tables | Are | Cool | Let us | test them | properly, shall we? |\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace" + ] }, { "cell_type": "code", @@ -389,6 +533,15 @@ "source": [ "### Faceted histograms\n", "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? 
|\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace\n", + "\n", "Emptied above cells Sometimes the best way to view data is via histograms of subsets. Seaborn's ``FacetGrid`` makes this extremely simple.\n", "We'll take a look at some data that shows the amount that restaurant staff receive in tips based on various indicator data:" ] @@ -518,7 +671,16 @@ "source": [ "### Factor plots\n", "\n", - "Factor plots can be useful for this kind of visualization as well. This allows you to view the distribution of a parameter within bins defined by any other parameter:" + "Factor plots can be useful for this kind of visualization as well. This allows you to view the distribution of a parameter within bins defined by any other parameter:\n", + "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? 
|\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace" ] }, { @@ -549,6 +711,15 @@ "source": [ "### Joint distributions\n", "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? |\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace\n", + "\n", "Similar to the pairplot we saw earlier, we can use ``sns.jointplot`` to show the joint distribution between different datasets, along with the associated marginal distributions:" ] }, @@ -606,7 +777,16 @@ "source": [ "### Bar plots\n", "\n", - "Time series can be plotted using ``sns.factorplot``. In the following example, we'll use the Planets data that we first saw in [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb):" + "Time series can be plotted using ``sns.factorplot``. 
In the following example, we'll use the Planets data that we first saw in [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb):\n", + "\n", + "| Tables | Are | Cool | Let us | test them | properly, shall we? |\n", + "|:----------:|:-------------:|:------:|:----------:|:-------------:|:------:|\n", + "| col 1 is | left aligned | 1600 | Still some | work to | be done |\n", + "| col 2 is | centered | 12 | to fix | all the changes | in the table properly |\n", + "| col 3 is | right aligned | 1 | hence, we are testing | the table changes | here |\n", + "| col 1 is | left aligned | 1600 | It's weird | to write random | text\n", + "| col 2 is | centered | 12 | but it's 2020 | and I got | to do this | \n", + "| col 3 is | right aligned | 1 | Let's make | world a better place | peace" ] }, { @@ -1611,7 +1791,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/notebooks/Lime - basic usage, two class case.ipynb b/notebooks/Lime - basic usage, two class case.ipynb new file mode 100644 index 000000000..04e2e635e --- /dev/null +++ b/notebooks/Lime - basic usage, two class case.ipynb @@ -0,0 +1,74665 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "import lime\n", + "import sklearn\n", + "import numpy as np\n", + "import sklearn\n", + "import sklearn.ensemble\n", + "import sklearn.metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetching data, training a classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For this tutorial, we'll be using the [20 newsgroups dataset](http://scikit-learn.org/stable/datasets/#the-20-newsgroups-text-dataset). In particular, for simplicity, we'll use a 2-class subset: atheism and christianity." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading 20news dataset. This may take a few minutes.\n", + "Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n" + ] + } + ], + "source": [ + "from sklearn.datasets import fetch_20newsgroups\n", + "categories = ['alt.atheism', 'soc.religion.christian']\n", + "newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)\n", + "newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)\n", + "class_names = ['atheism', 'christian']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use the tfidf vectorizer, commonly used for text." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)\n", + "train_vectors = vectorizer.fit_transform(newsgroups_train.data)\n", + "test_vectors = vectorizer.transform(newsgroups_test.data)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's say we want to use random forests for classification. It's usually hard to understand what random forests are doing, especially with many trees." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None, max_features='auto',\n", + " max_leaf_nodes=None, max_samples=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=500,\n", + " n_jobs=None, oob_score=False, random_state=None,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)\n", + "rf.fit(train_vectors, newsgroups_train.target)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9241540256709451" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred = rf.predict(test_vectors)\n", + "sklearn.metrics.f1_score(newsgroups_test.target, pred, average='binary')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that this classifier achieves a very high F score. [The sklearn guide to 20 newsgroups](http://scikit-learn.org/stable/datasets/#filtering-text-for-more-realistic-training) indicates that Multinomial Naive Bayes overfits this dataset by learning irrelevant stuff, such as headers. Let's see if random forests do the same." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explaining predictions using lime" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lime explainers assume that classifiers act on raw text, but sklearn classifiers act on vectorized representations of texts. For this purpose, we use sklearn's pipeline, which implements predict_proba on raw_text lists." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from lime import lime_text\n", + "from sklearn.pipeline import make_pipeline\n", + "c = make_pipeline(vectorizer, rf)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.252 0.748]]\n" + ] + } + ], + "source": [ + "print(c.predict_proba([newsgroups_test.data[0]]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we create an explainer object. We pass the class_names as an argument for prettier display." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from lime.lime_text import LimeTextExplainer\n", + "explainer = LimeTextExplainer(class_names=class_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then generate an explanation with at most 6 features for an arbitrary document in the test set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/amitrathi/.virtualenvs/loaded2/lib/python3.6/site-packages/lime/lime_text.py:116: FutureWarning: split() requires a non-empty pattern match.\n", + " self.as_list = [s for s in splitter.split(self.raw) if s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document id: 83\n", + "Probability(christian) = 0.492\n", + "True class: atheism\n" + ] + } + ], + "source": [ + "idx = 83\n", + "exp = explainer.explain_instance(newsgroups_test.data[idx], c.predict_proba, num_features=6)\n", + "print('Document id: %d' % idx)\n", + "print('Probability(christian) =', c.predict_proba([newsgroups_test.data[idx]])[0,1])\n", + "print('True class: %s' % class_names[newsgroups_test.target[idx]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The classifier got this example right (it predicted atheism). \n", + "The explanation is presented below as a list of weighted features. " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Host', -0.13158254377478837),\n", + " ('Posting', -0.12755833682065656),\n", + " ('NNTP', -0.1091239484180258),\n", + " ('edu', -0.02540690605359643),\n", + " ('post', -0.0130522416154676),\n", + " ('Thanks', 0.009196448838747762)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exp.as_list()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These weighted features are a linear model, which approximates the behaviour of the random forest classifier in the vicinity of the test example. Roughly, if we remove 'Posting' and 'Host' from the document, the prediction should move towards the opposite class (Christianity) by about 0.26 (the sum of the absolute weights for both features). 
Let's see if this is the case." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original prediction: 0.492\n", + "Prediction removing some features: 0.734\n", + "Difference: 0.242\n" + ] + } + ], + "source": [ + "print('Original prediction:', rf.predict_proba(test_vectors[idx])[0,1])\n", + "tmp = test_vectors[idx].copy()\n", + "tmp[0,vectorizer.vocabulary_['Posting']] = 0\n", + "tmp[0,vectorizer.vocabulary_['Host']] = 0\n", + "print('Prediction removing some features:', rf.predict_proba(tmp)[0,1])\n", + "print('Difference:', rf.predict_proba(tmp)[0,1] - rf.predict_proba(test_vectors[idx])[0,1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pretty close! \n", + "The words that explain the model around this document seem very arbitrary - not much to do with either Christianity or Atheism. \n", + "In fact, these are words that appear in the email headers (you will see this clearly soon), which make distinguishing between the classes much easier." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing explanations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The explanations can be returned as a matplotlib barplot:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "fig = exp.as_pyplot_figure()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The explanations can also be exported as an html page (which we can render here in this notebook), using D3.js to render graphs. \n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "exp.show_in_notebook(text=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, we can save the fully contained html page to a file:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "exp.save_to_file('/tmp/oi.html')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can also include a visualization of the original document, with the words in the explanations highlighted. Notice how the words that affect the classifier the most are all in the email header." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "exp.show_in_notebook(text=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it for this tutorial. Random forests were just an example, this explainer works for any classifier you may want to use, as long as it implements predict_proba." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/notebooks/Shap Trial.ipynb b/notebooks/Shap Trial.ipynb new file mode 100644 index 000000000..2d681f0ca --- /dev/null +++ b/notebooks/Shap Trial.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + " Visualization omitted, Javascript library not loaded!
\n", + " Have you run `initjs()` in this notebook? If this notebook was from another\n", + " user you must also trust this notebook (File -> Trust notebook). If you are viewing\n", + " this notebook on github the Javascript has been stripped for security. If you are using\n", + " JupyterLab this error is because a JupyterLab extension has not yet been written.\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import xgboost\n", + "import shap\n", + "\n", + "# load JS visualization code to notebook\n", + "shap.initjs()\n", + "\n", + "# train XGBoost model\n", + "X,y = shap.datasets.boston()\n", + "model = xgboost.train({\"learning_rate\": 0.02}, xgboost.DMatrix(X, label=y), 90)\n", + "\n", + "# explain the model's predictions using SHAP\n", + "# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models)\n", + "explainer = shap.TreeExplainer(model)\n", + "shap_values = explainer.shap_values(X)\n", + "\n", + "# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)\n", + "shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a\n" + ] + } + ], + "source": [ + "print('a')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + " Visualization omitted, Javascript library not loaded!
\n", + " Have you run `initjs()` in this notebook? If this notebook was from another\n", + " user you must also trust this notebook (File -> Trust notebook). If you are viewing\n", + " this notebook on github the Javascript has been stripped for security. If you are using\n", + " JupyterLab this error is because a JupyterLab extension has not yet been written.\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# visualize the training set predictions\n", + "shap.force_plot(explainer.expected_value, shap_values, X)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# create a dependence plot to show the effect of a single feature across the whole dataset\n", + "shap.dependence_plot(\"RM\", shap_values, X)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import shap\n", + "shap.initjs()\n", + "# summarize the effects of all the features\n", + "shap.summary_plot(shap_values, X)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# summarize the effects of all the features\n", + "shap.summary_plot(shap_values, X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "celltoolbar": "Edit Metadata", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}