{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Lesson15-Python For Data Science-CaseStudies.ipynb", "version": "0.3.2", "provenance": [], "collapsed_sections": [ "NvoiEwiAWrWy", "wR_L2OPkuqH4", "JJNSA3n0u3Zf", "qTL1K5SSXD1U", "SEIH6ESVXKOb", "ANeUczzxXOf2", "bRKFiJVvPyZi", "3b1-VTl8jubf", "wQ8dueD5jubm", "nxPSuxe3jubp", "wv5b_Nhljubx" ], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "metadata": { "id": "spdivf2TMnGC", "colab_type": "text" }, "cell_type": "markdown", "source": [ "# Lesson 16: Case Studies" ] }, { "metadata": { "id": "c_Id55m6Jsbu", "colab_type": "text" }, "cell_type": "markdown", "source": [ "## Pragmatic AI Labs\n", "\n" ] }, { "metadata": { "id": "e5p96AqpSDZa", "colab_type": "text" }, "cell_type": "markdown", "source": [ "![alt text](https://paiml.com/images/logo_with_slogan_white_background.png)\n", "\n", "This notebook was produced by [Pragmatic AI Labs](https://paiml.com/). 
You can continue learning about these topics by:\n", "\n", "* Buying a copy of [Pragmatic AI: An Introduction to Cloud-Based Machine Learning](http://www.informit.com/store/pragmatic-ai-an-introduction-to-cloud-based-machine-9780134863917)\n", "* Reading an online copy of [Pragmatic AI: An Introduction to Cloud-Based Machine Learning](https://www.safaribooksonline.com/library/view/pragmatic-ai-an/9780134863924/)\n", "* Watching video [Essential Machine Learning and AI with Python and Jupyter Notebook-Video-SafariOnline](https://www.safaribooksonline.com/videos/essential-machine-learning/9780135261118) on Safari Books Online.\n", "* Watching video [AWS Certified Machine Learning-Specialty](https://learning.oreilly.com/videos/aws-certified-machine/9780135556597)\n", "* Purchasing video [Essential Machine Learning and AI with Python and Jupyter Notebook- Purchase Video](http://www.informit.com/store/essential-machine-learning-and-ai-with-python-and-jupyter-9780135261095)\n", "* Viewing more content at [noahgift.com](https://noahgift.com/)\n" ] }, { "metadata": { "id": "pBTeTbnRKG_k", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "NvoiEwiAWrWy", "colab_type": "text" }, "cell_type": "markdown", "source": [ "## 16.4 Ludwig (Open Source AutoML)" ] }, { "metadata": { "id": "jbnbFKcTXNOn", "colab_type": "text" }, "cell_type": "markdown", "source": [ "**GitHub Project URL**: https://uber.github.io/ludwig/\n", "\n", "![alt text](https://user-images.githubusercontent.com/58792/52920000-f78d8c00-32bc-11e9-8e5e-adf53f1b8a37.png)" ] }, { "metadata": { "id": "Aa-KnZKcfvkV", "colab_type": "text" }, "cell_type": "markdown", "source": [ "### Install Ludwig" ] }, { "metadata": { "id": "Q3FrtesdfyV9", "colab_type": "code", "outputId": "1db2fd0d-8904-489e-ac1f-70bc70c9704a", "colab": { "base_uri": "https://localhost:8080/", "height": 925 } }, "cell_type": "code", "source": [ "!pip 
install --upgrade numpy #must restart colab runtime\n", "!pip install --upgrade scikit-image\n", "!pip install -q ludwig\n", "!python -m spacy download en " ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Requirement already up-to-date: numpy in /usr/local/lib/python3.6/dist-packages (1.16.1)\n", "Collecting scikit-image\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/24/06/d560630eb9e36d90d69fe57d9ff762d8f501664ce478b8a0ae132b3c3008/scikit_image-0.14.2-cp36-cp36m-manylinux1_x86_64.whl (25.3MB)\n", "\u001b[K 100% |████████████████████████████████| 25.3MB 1.9MB/s \n", "\u001b[?25hCollecting pillow>=4.3.0 (from scikit-image)\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/85/5e/e91792f198bbc5a0d7d3055ad552bc4062942d27eaf75c3e2783cf64eae5/Pillow-5.4.1-cp36-cp36m-manylinux1_x86_64.whl (2.0MB)\n", "\u001b[K 100% |████████████████████████████████| 2.0MB 18.3MB/s \n", "\u001b[?25hRequirement already satisfied, skipping upgrade: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image) (1.1.0)\n", "Requirement already satisfied, skipping upgrade: matplotlib>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image) (3.0.2)\n", "Requirement already satisfied, skipping upgrade: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: cloudpickle>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from scikit-image) (0.6.1)\n", "Requirement already satisfied, skipping upgrade: PyWavelets>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image) (1.0.1)\n", "Requirement already satisfied, skipping upgrade: networkx>=1.8 in /usr/local/lib/python3.6/dist-packages (from scikit-image) (2.2)\n", "Collecting dask[array]>=1.0.0 (from scikit-image)\n", "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/7c/2b/cf9e5477bec3bd3b4687719876ea38e9d8c9dc9d3526365c74e836e6a650/dask-1.1.1-py2.py3-none-any.whl (701kB)\n", "\u001b[K 100% |████████████████████████████████| 706kB 25.2MB/s \n", "\u001b[?25hRequirement already satisfied, skipping upgrade: numpy>=1.8.2 in /usr/local/lib/python3.6/dist-packages (from scipy>=0.17.0->scikit-image) (1.16.1)\n", "Requirement already satisfied, skipping upgrade: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=2.0.0->scikit-image) (0.10.0)\n", "Requirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=2.0.0->scikit-image) (2.3.1)\n", "Requirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=2.0.0->scikit-image) (1.0.1)\n", "Requirement already satisfied, skipping upgrade: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=2.0.0->scikit-image) (2.5.3)\n", "Requirement already satisfied, skipping upgrade: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx>=1.8->scikit-image) (4.3.2)\n", "Requirement already satisfied, skipping upgrade: toolz>=0.7.3; extra == \"array\" in /usr/local/lib/python3.6/dist-packages (from dask[array]>=1.0.0->scikit-image) (0.9.0)\n", "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib>=2.0.0->scikit-image) (40.8.0)\n", "\u001b[31mfeaturetools 0.4.1 has requirement pandas>=0.23.0, but you'll have pandas 0.22.0 which is incompatible.\u001b[0m\n", "\u001b[31malbumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.8 which is incompatible.\u001b[0m\n", "Installing collected packages: pillow, dask, scikit-image\n", " Found existing installation: Pillow 4.0.0\n", " Uninstalling Pillow-4.0.0:\n", " Successfully uninstalled 
Pillow-4.0.0\n", " Found existing installation: dask 0.20.2\n", " Uninstalling dask-0.20.2:\n", " Successfully uninstalled dask-0.20.2\n", " Found existing installation: scikit-image 0.13.1\n", " Uninstalling scikit-image-0.13.1:\n", " Successfully uninstalled scikit-image-0.13.1\n", "Successfully installed dask-1.1.1 pillow-5.4.1 scikit-image-0.14.2\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.colab-display-data+json": { "pip_warning": { "packages": [ "PIL" ] } } }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "Requirement already satisfied: en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0 in /usr/local/lib/python3.6/dist-packages (2.0.0)\n", "\n", "\u001b[93m Linking successful\u001b[0m\n", " /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->\n", " /usr/local/lib/python3.6/dist-packages/spacy/data/en\n", "\n", " You can now load the model via spacy.load('en')\n", "\n" ], "name": "stdout" } ] }, { "metadata": { "id": "yVLSSAUViyiX", "colab_type": "text" }, "cell_type": "markdown", "source": [ "### Basic Ideas" ] }, { "metadata": { "id": "qHaRqAN5i1iV", "colab_type": "text" }, "cell_type": "markdown", "source": [ "* **Training Models**\n", "* **Prediction (Inference)**\n", "* **Datatypes**\n", " - binary\n", " - numerical\n", " - category\n", " - set\n", " - bag\n", " - sequence\n", " - text\n", " - timeseries\n", " - image\n", "\n" ] }, { "metadata": { "id": "W3G5sZ3yo-yK", "colab_type": "text" }, "cell_type": "markdown", "source": [ "### Topic Modeling Example" ] }, { "metadata": { "id": "aIbXYrxU8ySd", "colab_type": "code", "outputId": "241c61f9-ad81-4c4d-82dd-42bef0502fdf", "colab": { "base_uri": "https://localhost:8080/", "height": 407 } }, "cell_type": "code", "source": [ "!wget https://raw.githubusercontent.com/uchidalab/book-dataset/master/Task1/book30-listing-train.csv\n", 
"!wget https://raw.githubusercontent.com/noahgift/recommendations/master/model_definition.yaml" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "--2019-02-18 02:44:21-- https://raw.githubusercontent.com/uchidalab/book-dataset/master/Task1/book30-listing-train.csv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 9728786 (9.3M) [text/plain]\n", "Saving to: ‘book30-listing-train.csv.3’\n", "\n", "book30-listing-trai 100%[===================>] 9.28M --.-KB/s in 0.1s \n", "\n", "2019-02-18 02:44:23 (64.4 MB/s) - ‘book30-listing-train.csv.3’ saved [9728786/9728786]\n", "\n", "--2019-02-18 02:44:24-- https://raw.githubusercontent.com/noahgift/recommendations/master/model_definition.yaml\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 180 [text/plain]\n", "Saving to: ‘model_definition.yaml.2’\n", "\n", "model_definition.ya 100%[===================>] 180 --.-KB/s in 0s \n", "\n", "2019-02-18 02:44:25 (34.7 MB/s) - ‘model_definition.yaml.2’ saved [180/180]\n", "\n" ], "name": "stdout" } ] }, { "metadata": { "id": "v-w5Zkzcumoi", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### Ingest" ] }, { "metadata": { "id": "Ef8dbaV4tHrz", "colab_type": "code", "outputId": "e7bbaff9-edcf-43df-f142-f8e5e916338f", "colab": { "base_uri": "https://localhost:8080/", "height": 197 } }, "cell_type": "code", "source": [ "import pandas as pd\n", "df = pd.read_csv(\"https://media.githubusercontent.com/media/noahgift/recommendations/master/data/book30-listing-train-with-headers.csv\")\n", "df = df.drop(\"Unnamed: 0\", axis=1)\n", "df.head()" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ASINFILENAMEIMAGE URLTITLEAUTHORCATEGORYIDCATEGORY
014048033351404803335.jpghttp://ecx.images-amazon.com/images/I/51UJnL3T...Magnets: Pulling Together, Pushing Apart (Amaz...Natalie M. Rosinsky4Children's Books
114462760821446276082.jpghttp://ecx.images-amazon.com/images/I/51MGUKhk...Energy Security (SAGE Library of International...NaN10Engineering & Transportation
214915226661491522666.jpghttp://ecx.images-amazon.com/images/I/51qKvjsi...An Amish Gathering: Life in Lancaster CountyBeth Wiseman9Christian Books & Bibles
39700964100970096410.jpghttp://ecx.images-amazon.com/images/I/51qoUENb...City of Rocks Idaho: A Climber's Guide (Region...Dave Bingham26Sports & Outdoors
484368080538436808053.jpghttp://ecx.images-amazon.com/images/I/41aDW5pz...Como vencer el insomnio. Tecnicas, reglas y co...Choliz Montanes11Health, Fitness & Dieting
\n", "
" ], "text/plain": [ " ASIN FILENAME \\\n", "0 1404803335 1404803335.jpg \n", "1 1446276082 1446276082.jpg \n", "2 1491522666 1491522666.jpg \n", "3 970096410 0970096410.jpg \n", "4 8436808053 8436808053.jpg \n", "\n", " IMAGE URL \\\n", "0 http://ecx.images-amazon.com/images/I/51UJnL3T... \n", "1 http://ecx.images-amazon.com/images/I/51MGUKhk... \n", "2 http://ecx.images-amazon.com/images/I/51qKvjsi... \n", "3 http://ecx.images-amazon.com/images/I/51qoUENb... \n", "4 http://ecx.images-amazon.com/images/I/41aDW5pz... \n", "\n", " TITLE AUTHOR \\\n", "0 Magnets: Pulling Together, Pushing Apart (Amaz... Natalie M. Rosinsky \n", "1 Energy Security (SAGE Library of International... NaN \n", "2 An Amish Gathering: Life in Lancaster County Beth Wiseman \n", "3 City of Rocks Idaho: A Climber's Guide (Region... Dave Bingham \n", "4 Como vencer el insomnio. Tecnicas, reglas y co... Choliz Montanes \n", "\n", " CATEGORYID CATEGORY \n", "0 4 Children's Books \n", "1 10 Engineering & Transportation \n", "2 9 Christian Books & Bibles \n", "3 26 Sports & Outdoors \n", "4 11 Health, Fitness & Dieting " ] }, "metadata": { "tags": [] }, "execution_count": 39 } ] }, { "metadata": { "id": "efw0icqJ_0Bq", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "df.to_csv(\"book30-listing-train-with-headers.csv\")" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "wR_L2OPkuqH4", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### EDA" ] }, { "metadata": { "id": "IUGz5U5duj-D", "colab_type": "text" }, "cell_type": "markdown", "source": [ "**Columns**" ] }, { "metadata": { "id": "KVYJIiwHuhiT", "colab_type": "code", "outputId": "c5a39f2f-e99f-4514-8bb8-ebe7c4eee1b1", "colab": { "base_uri": "https://localhost:8080/", "height": 70 } }, "cell_type": "code", "source": [ "df.columns" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['ASIN', 'FILENAME', 'IMAGE URL', 'TITLE', 'AUTHOR', 
'CATEGORYID',\n", " 'CATEGORY'],\n", " dtype='object')" ] }, "metadata": { "tags": [] }, "execution_count": 40 } ] }, { "metadata": { "id": "ir0Viy2zuyr1", "colab_type": "text" }, "cell_type": "markdown", "source": [ "**Shape**" ] }, { "metadata": { "id": "-kaJsKyruyAl", "colab_type": "code", "outputId": "f0e37663-1297-49c0-e000-ec0af6665d43", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "cell_type": "code", "source": [ "df.shape" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(51299, 7)" ] }, "metadata": { "tags": [] }, "execution_count": 41 } ] }, { "metadata": { "id": "JJNSA3n0u3Zf", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### Training w/Ludwig" ] }, { "metadata": { "id": "bldBWuL2Nwmh", "colab_type": "code", "outputId": "1c6b54c6-35ec-4fa8-b1a5-dd9bcd42b013", "colab": { "base_uri": "https://localhost:8080/", "height": 214 } }, "cell_type": "code", "source": [ "!head book30-listing-train-with-headers.csv" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ ",ASIN,FILENAME,IMAGE URL,TITLE,AUTHOR,CATEGORYID,CATEGORY\n", "0,1404803335,1404803335.jpg,http://ecx.images-amazon.com/images/I/51UJnL3Tx6L.jpg,\"Magnets: Pulling Together, Pushing Apart (Amazing Science)\",Natalie M. Rosinsky,4,Children's Books\n", "1,1446276082,1446276082.jpg,http://ecx.images-amazon.com/images/I/51MGUKhkyhL.jpg,Energy Security (SAGE Library of International Security),,10,Engineering & Transportation\n", "2,1491522666,1491522666.jpg,http://ecx.images-amazon.com/images/I/51qKvjsi3ML.jpg,An Amish Gathering: Life in Lancaster County,Beth Wiseman,9,Christian Books & Bibles\n", "3,970096410,0970096410.jpg,http://ecx.images-amazon.com/images/I/51qoUENb1CL.jpg,City of Rocks Idaho: A Climber's Guide (Regional Rock Climbing Series),Dave Bingham,26,Sports & Outdoors\n", "4,8436808053,8436808053.jpg,http://ecx.images-amazon.com/images/I/41aDW5pzZBL.jpg,\"Como vencer el insomnio. 
Tecnicas, reglas y consejos practicos para dormir mejor (BIBLIOTECA PRACTICA) (Spanish Edition)\",Choliz Montanes,11,\"Health, Fitness & Dieting\"\n", "5,1848291388,1848291388.jpg,http://ecx.images-amazon.com/images/I/51Lpg7xmrBL.jpg,John Martin Littlejohn: An Enigma of Osteopathy,John O'Brien,16,Medical Books\n", "6,73402656,0073402656.jpg,http://ecx.images-amazon.com/images/I/51WccSzFUrL.jpg,Chemistry: The Molecular Nature of Matter and Change,Martin Silberberg,23,Science & Math\n", "7,323045979,0323045979.jpg,http://ecx.images-amazon.com/images/I/51rJir5EpnL.jpg,\"Mosby's Oncology Nursing Advisor: A Comprehensive Guide to Clinical Practice, 1e\",Susan Newton MS RN AOCN AOCNS,16,Medical Books\n", "8,1847176968,1847176968.jpg,http://ecx.images-amazon.com/images/I/61KoC743OzL.jpg,Ireland's Wild Atlantic Way,Carsten Krieger,29,Travel\n" ], "name": "stdout" } ] }, { "metadata": { "id": "ous6EqC8Nocg", "colab_type": "code", "outputId": "398add9c-ef76-47e2-9fbb-219c8ff1af53", "colab": { "base_uri": "https://localhost:8080/", "height": 212 } }, "cell_type": "code", "source": [ "!cat model_definition.yaml" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "input_features:\n", " -\n", " name: TITLE\n", " type: text\n", " encoder: parallel_cnn\n", " level: word\n", "\n", "output_features:\n", " -\n", " name: CATEGORY\n", " type: category" ], "name": "stdout" } ] }, { "metadata": { "id": "WpVA2fyXLRoK", "colab_type": "code", "outputId": "abfc3b5b-6f59-469f-cb7b-aff5e28cb8ca", "colab": { "base_uri": "https://localhost:8080/", "height": 20338 } }, "cell_type": "code", "source": [ "!ludwig experiment --data_csv book30-listing-train-with-headers.csv --model_definition_file model_definition.yaml\n" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ " _ _ _ \n", "| |_ _ __| |_ __ _(_)__ _ \n", "| | || / _` \\ V V / / _` |\n", "|_|\\_,_\\__,_|\\_/\\_/|_\\__, |\n", " |___/ \n", "ludwig v0.1.0 - Experiment\n", "\n", "Experiment name: 
experiment\n", "Model name: run\n", "Output path: results/experiment_run_0\n", "\n", "ludwig_version: '0.1.0'\n", "command: ('/usr/local/bin/ludwig experiment --data_csv '\n", " 'book30-listing-train-with-headers.csv --model_definition_file '\n", " 'model_definition.yaml')\n", "dataset_type: 'book30-listing-train-with-headers.csv'\n", "model_definition: { 'combiner': {'type': 'concat'},\n", " 'input_features': [ { 'encoder': 'parallel_cnn',\n", " 'level': 'word',\n", " 'name': 'TITLE',\n", " 'tied_weights': None,\n", " 'type': 'text'}],\n", " 'output_features': [ { 'dependencies': [],\n", " 'loss': { 'class_distance_temperature': 0,\n", " 'class_weights': 1,\n", " 'confidence_penalty': 0,\n", " 'distortion': 1,\n", " 'labels_smoothing': 0,\n", " 'negative_samples': 0,\n", " 'robust_lambda': 0,\n", " 'sampler': None,\n", " 'type': 'softmax_cross_entropy',\n", " 'unique': False,\n", " 'weight': 1},\n", " 'name': 'CATEGORY',\n", " 'reduce_dependencies': 'sum',\n", " 'reduce_input': 'sum',\n", " 'top_k': 3,\n", " 'type': 'category'}],\n", " 'preprocessing': { 'bag': { 'fill_value': '',\n", " 'format': 'space',\n", " 'lowercase': 10000,\n", " 'missing_value_strategy': 'fill_with_const',\n", " 'most_common': False},\n", " 'binary': { 'fill_value': 0,\n", " 'missing_value_strategy': 'fill_with_const'},\n", " 'category': { 'fill_value': '',\n", " 'lowercase': False,\n", " 'missing_value_strategy': 'fill_with_const',\n", " 'most_common': 10000},\n", " 'force_split': False,\n", " 'image': {'missing_value_strategy': 'backfill'},\n", " 'numerical': { 'fill_value': 0,\n", " 'missing_value_strategy': 'fill_with_const'},\n", " 'sequence': { 'fill_value': '',\n", " 'format': 'space',\n", " 'lowercase': False,\n", " 'missing_value_strategy': 'fill_with_const',\n", " 'most_common': 20000,\n", " 'padding': 'right',\n", " 'padding_symbol': '',\n", " 'sequence_length_limit': 256,\n", " 'unknown_symbol': ''},\n", " 'set': { 'fill_value': '',\n", " 'format': 'space',\n", " 'lowercase': 
False,\n", " 'missing_value_strategy': 'fill_with_const',\n", " 'most_common': 10000},\n", " 'split_probabilities': (0.7, 0.1, 0.2),\n", " 'stratify': None,\n", " 'text': { 'char_format': 'characters',\n", " 'char_most_common': 70,\n", " 'char_sequence_length_limit': 1024,\n", " 'fill_value': '',\n", " 'lowercase': True,\n", " 'missing_value_strategy': 'fill_with_const',\n", " 'padding': 'right',\n", " 'padding_symbol': '',\n", " 'unknown_symbol': '',\n", " 'word_format': 'space_punct',\n", " 'word_most_common': 20000,\n", " 'word_sequence_length_limit': 256},\n", " 'timeseries': { 'fill_value': '',\n", " 'format': 'space',\n", " 'missing_value_strategy': 'fill_with_const',\n", " 'padding': 'right',\n", " 'padding_value': 0,\n", " 'timeseries_length_limit': 256}},\n", " 'training': { 'batch_size': 128,\n", " 'bucketing_field': None,\n", " 'decay': False,\n", " 'decay_rate': 0.96,\n", " 'decay_steps': 10000,\n", " 'dropout_rate': 0.0,\n", " 'early_stop': 3,\n", " 'epochs': 200,\n", " 'gradient_clipping': None,\n", " 'increase_batch_size_on_plateau': 0,\n", " 'increase_batch_size_on_plateau_max': 512,\n", " 'increase_batch_size_on_plateau_patience': 5,\n", " 'increase_batch_size_on_plateau_rate': 2,\n", " 'learning_rate': 0.001,\n", " 'learning_rate_warmup_epochs': 5,\n", " 'optimizer': { 'beta1': 0.9,\n", " 'beta2': 0.999,\n", " 'epsilon': 1e-08,\n", " 'type': 'adam'},\n", " 'reduce_learning_rate_on_plateau': 0,\n", " 'reduce_learning_rate_on_plateau_patience': 5,\n", " 'reduce_learning_rate_on_plateau_rate': 0.5,\n", " 'regularization_lambda': 0,\n", " 'regularizer': 'l2',\n", " 'staircase': False,\n", " 'validation_field': 'combined',\n", " 'validation_measure': 'loss'}}\n", "\n", "Using full raw csv, no hdf5 and json file with the same name have been found\n", "Building dataset (it may take a while)\n", "Loading NLP pipeline\n", "Writing dataset\n", "Writing train set metadata with vocabulary\n", "Training set: 36059\n", "Validation set: 5042\n", "Test set: 
10198\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Colocations handled automatically by placer.\n", "From /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Colocations handled automatically by placer.\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/losses/losses_impl.py:209: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.cast instead.\n", "From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/losses/losses_impl.py:209: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.cast instead.\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.cast instead.\n", "From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.cast instead.\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:102: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Deprecated in favor of operator or tf.math.divide.\n", "From 
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:102: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Deprecated in favor of operator or tf.math.divide.\n", "\n", "╒══════════╕\n", "│ TRAINING │\n", "╘══════════╛\n", "\n", "2019-02-18 01:21:33.899464: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2200000000 Hz\n", "2019-02-18 01:21:33.899801: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x318ac00 executing computations on platform Host. Devices:\n", "2019-02-18 01:21:33.899835: I tensorflow/compiler/xla/service/service.cc:158] StreamExecutor device (0): , \n", "2019-02-18 01:21:34.055715: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", "2019-02-18 01:21:34.056285: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x318a100 executing computations on platform CUDA. 
Devices:\n", "2019-02-18 01:21:34.056320: I tensorflow/compiler/xla/service/service.cc:158] StreamExecutor device (0): Tesla K80, Compute Capability 3.7\n", "2019-02-18 01:21:34.056733: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0 with properties: \n", "name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235\n", "pciBusID: 0000:00:04.0\n", "totalMemory: 11.17GiB freeMemory: 11.10GiB\n", "2019-02-18 01:21:34.056767: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0\n", "2019-02-18 01:21:43.842054: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix:\n", "2019-02-18 01:21:43.842116: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0 \n", "2019-02-18 01:21:43.842133: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0: N \n", "2019-02-18 01:21:43.842364: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. 
Original config value was 0.\n", "2019-02-18 01:21:43.842446: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10752 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7)\n", "\n", "Epoch 1\n", "Training: 0% 0/282 [00:00: { 'accuracy': 1.0,\n", " 'f1_score': 0,\n", " 'fall_out': 0.0,\n", " 'false_discovery_rate': 1.0,\n", " 'false_negative_rate': 1.0,\n", " 'false_negatives': 0,\n", " 'false_omission_rate': 0.0,\n", " 'false_positive_rate': 0.0,\n", " 'false_positives': 0,\n", " 'hit_rate': 0,\n", " 'informedness': 0.0,\n", " 'markedness': 0.0,\n", " 'matthews_correlation_coefficient': 0,\n", " 'miss_rate': 1.0,\n", " 'negative_predictive_value': 1.0,\n", " 'positive_predictive_value': 0,\n", " 'precision': 0,\n", " 'recall': 0,\n", " 'sensitivity': 0,\n", " 'specificity': 1.0,\n", " 'true_negative_rate': 1.0,\n", " 'true_negatives': 10198,\n", " 'true_positive_rate': 0,\n", " 'true_positives': 0},\n", " Children's Books: { 'accuracy': 0.9269464600902138,\n", " 'f1_score': 0.10991636798088411,\n", " 'fall_out': 0.02880446004542636,\n", " 'false_discovery_rate': 0.8584615384615385,\n", " 'false_negative_rate': 0.91015625,\n", " 'false_negatives': 466,\n", " 'false_omission_rate': 0.04719943279651573,\n", " 'false_positive_rate': 0.02880446004542636,\n", " 'false_positives': 279,\n", " 'hit_rate': 0.08984375,\n", " 'informedness': 0.06103928995457375,\n", " 'markedness': 0.09433902874194589,\n", " 'matthews_correlation_coefficient': 0.07588403869993006,\n", " 'miss_rate': 0.91015625,\n", " 'negative_predictive_value': 0.9528005672034843,\n", " 'positive_predictive_value': 0.14153846153846153,\n", " 'precision': 0.14153846153846153,\n", " 'recall': 0.08984375,\n", " 'sensitivity': 0.08984375,\n", " 'specificity': 0.9711955399545736,\n", " 'true_negative_rate': 0.9711955399545736,\n", " 'true_negatives': 9407,\n", " 
'true_positive_rate': 0.08984375,\n", " 'true_positives': 46},\n", " Engineering & Transportation: { 'accuracy': 0.963326142380859,\n", " 'f1_score': 0.04591836734693878,\n", " 'fall_out': 0.029946629768728972,\n", " 'false_discovery_rate': 0.9711538461538461,\n", " 'false_negative_rate': 0.8875,\n", " 'false_negatives': 71,\n", " 'false_omission_rate': 0.0071818733562614145,\n", " 'false_positive_rate': 0.029946629768728972,\n", " 'false_positives': 303,\n", " 'hit_rate': 0.1125,\n", " 'informedness': 0.08255337023127107,\n", " 'markedness': 0.02166428048989233,\n", " 'matthews_correlation_coefficient': 0.042290180516003875,\n", " 'miss_rate': 0.8875,\n", " 'negative_predictive_value': 0.9928181266437386,\n", " 'positive_predictive_value': 0.028846153846153848,\n", " 'precision': 0.028846153846153848,\n", " 'recall': 0.1125,\n", " 'sensitivity': 0.1125,\n", " 'specificity': 0.970053370231271,\n", " 'true_negative_rate': 0.970053370231271,\n", " 'true_negatives': 9815,\n", " 'true_positive_rate': 0.1125,\n", " 'true_positives': 9},\n", " Christian Books & Bibles: { 'accuracy': 0.9656795450088252,\n", " 'f1_score': 0.005681818181818181,\n", " 'fall_out': 0.034229109454688156,\n", " 'false_discovery_rate': 0.9971428571428571,\n", " 'false_negative_rate': 0.5,\n", " 'false_negatives': 1,\n", " 'false_omission_rate': 0.00010154346060109454,\n", " 'false_positive_rate': 0.034229109454688156,\n", " 'false_positives': 349,\n", " 'hit_rate': 0.5,\n", " 'informedness': 0.46577089054531173,\n", " 'markedness': 0.002755599396541797,\n", " 'matthews_correlation_coefficient': 0.035825660983621235,\n", " 'miss_rate': 0.5,\n", " 'negative_predictive_value': 0.9998984565393989,\n", " 'positive_predictive_value': 0.002857142857142857,\n", " 'precision': 0.002857142857142857,\n", " 'recall': 0.5,\n", " 'sensitivity': 0.5,\n", " 'specificity': 0.9657708905453118,\n", " 'true_negative_rate': 0.9657708905453118,\n", " 'true_negatives': 9847,\n", " 'true_positive_rate': 0.5,\n", " 
'true_positives': 1},\n", " Sports & Outdoors: { 'accuracy': 0.963424200823691,\n", " 'f1_score': 0,\n", " 'fall_out': 0.03297244094488194,\n", " 'false_discovery_rate': 1.0,\n", " 'false_negative_rate': 1.0,\n", " 'false_negatives': 38,\n", " 'false_omission_rate': 0.0038527831288655,\n", " 'false_positive_rate': 0.03297244094488194,\n", " 'false_positives': 335,\n", " 'hit_rate': 0.0,\n", " 'informedness': -0.03297244094488194,\n", " 'markedness': -0.0038527831288655,\n", " 'matthews_correlation_coefficient': -0.011271009901067143,\n", " 'miss_rate': 1.0,\n", " 'negative_predictive_value': 0.9961472168711345,\n", " 'positive_predictive_value': 0.0,\n", " 'precision': 0.0,\n", " 'recall': 0.0,\n", " 'sensitivity': 0.0,\n", " 'specificity': 0.9670275590551181,\n", " 'true_negative_rate': 0.9670275590551181,\n", " 'true_negatives': 9825,\n", " 'true_positive_rate': 0.0,\n", " 'true_positives': 0},\n", " Health, Fitness & Dieting: { 'accuracy': 0.9297901549323396,\n", " 'f1_score': 0.0427807486631016,\n", " 'fall_out': 0.03329248366013071,\n", " 'false_discovery_rate': 0.9532163742690059,\n", " 'false_negative_rate': 0.9605911330049262,\n", " 'false_negatives': 390,\n", " 'false_omission_rate': 0.03956980519480524,\n", " 'false_positive_rate': 0.03329248366013071,\n", " 'false_positives': 326,\n", " 'hit_rate': 0.03940886699507389,\n", " 'informedness': 0.006116383334943132,\n", " 'markedness': 0.0072138205361889085,\n", " 'matthews_correlation_coefficient': 0.0066424763235420695,\n", " 'miss_rate': 0.9605911330049262,\n", " 'negative_predictive_value': 0.9604301948051948,\n", " 'positive_predictive_value': 0.04678362573099415,\n", " 'precision': 0.04678362573099415,\n", " 'recall': 0.03940886699507389,\n", " 'sensitivity': 0.03940886699507389,\n", " 'specificity': 0.9667075163398693,\n", " 'true_negative_rate': 0.9667075163398693,\n", " 'true_negatives': 9466,\n", " 'true_positive_rate': 0.03940886699507389,\n", " 'true_positives': 16},\n", " Medical Books: { 
'accuracy': 0.9540105903118259,\n", " 'f1_score': 0.07495069033530573,\n", " 'fall_out': 0.0315180530620387,\n", " 'false_discovery_rate': 0.9432835820895522,\n", " 'false_negative_rate': 0.8895348837209303,\n", " 'false_negatives': 153,\n", " 'false_omission_rate': 0.0155125215451688,\n", " 'false_positive_rate': 0.0315180530620387,\n", " 'false_positives': 316,\n", " 'hit_rate': 0.11046511627906977,\n", " 'informedness': 0.07894706321703104,\n", " 'markedness': 0.04120389636527899,\n", " 'matthews_correlation_coefficient': 0.05703443355673547,\n", " 'miss_rate': 0.8895348837209303,\n", " 'negative_predictive_value': 0.9844874784548312,\n", " 'positive_predictive_value': 0.056716417910447764,\n", " 'precision': 0.056716417910447764,\n", " 'recall': 0.11046511627906977,\n", " 'sensitivity': 0.11046511627906977,\n", " 'specificity': 0.9684819469379613,\n", " 'true_negative_rate': 0.9684819469379613,\n", " 'true_negatives': 9710,\n", " 'true_positive_rate': 0.11046511627906977,\n", " 'true_positives': 19},\n", " Science & Math: { 'accuracy': 0.9558737007256325,\n", " 'f1_score': 0.030172413793103446,\n", " 'fall_out': 0.036212525972098564,\n", " 'false_discovery_rate': 0.9812332439678284,\n", " 'false_negative_rate': 0.9230769230769231,\n", " 'false_negatives': 84,\n", " 'false_omission_rate': 0.008549618320610741,\n", " 'false_positive_rate': 0.036212525972098564,\n", " 'false_positives': 366,\n", " 'hit_rate': 0.07692307692307693,\n", " 'informedness': 0.04071055095097842,\n", " 'markedness': 0.010217137711560742,\n", " 'matthews_correlation_coefficient': 0.020394737198102416,\n", " 'miss_rate': 0.9230769230769231,\n", " 'negative_predictive_value': 0.9914503816793893,\n", " 'positive_predictive_value': 0.01876675603217158,\n", " 'precision': 0.01876675603217158,\n", " 'recall': 0.07692307692307693,\n", " 'sensitivity': 0.07692307692307693,\n", " 'specificity': 0.9637874740279014,\n", " 'true_negative_rate': 0.9637874740279014,\n", " 'true_negatives': 9741,\n", " 
'true_positive_rate': 0.07692307692307693,\n", " 'true_positives': 7},\n", " Travel: { 'accuracy': 0.9540105903118259,\n", " 'f1_score': 0.016771488469601678,\n", " 'fall_out': 0.030505433157212658,\n", " 'false_discovery_rate': 0.9870967741935484,\n", " 'false_negative_rate': 0.9760479041916168,\n", " 'false_negatives': 163,\n", " 'false_omission_rate': 0.016484627831715226,\n", " 'false_positive_rate': 0.030505433157212658,\n", " 'false_positives': 306,\n", " 'hit_rate': 0.023952095808383235,\n", " 'informedness': -0.006553337348829458,\n", " 'markedness': -0.0035814020252635803,\n", " 'matthews_correlation_coefficient': -0.004844598606007852,\n", " 'miss_rate': 0.9760479041916168,\n", " 'negative_predictive_value': 0.9835153721682848,\n", " 'positive_predictive_value': 0.012903225806451613,\n", " 'precision': 0.012903225806451613,\n", " 'recall': 0.023952095808383235,\n", " 'sensitivity': 0.023952095808383235,\n", " 'specificity': 0.9694945668427873,\n", " 'true_negative_rate': 0.9694945668427873,\n", " 'true_negatives': 9725,\n", " 'true_positive_rate': 0.023952095808383235,\n", " 'true_positives': 4},\n", " Business & Money: { 'accuracy': 0.9681310060796234,\n", " 'f1_score': 0,\n", " 'fall_out': 0.030823598704230903,\n", " 'false_discovery_rate': 1.0,\n", " 'false_negative_rate': 1.0,\n", " 'false_negatives': 11,\n", " 'false_omission_rate': 0.0011129097531363819,\n", " 'false_positive_rate': 0.030823598704230903,\n", " 'false_positives': 314,\n", " 'hit_rate': 0.0,\n", " 'informedness': -0.030823598704230903,\n", " 'markedness': -0.0011129097531363819,\n", " 'matthews_correlation_coefficient': -0.00585695173487886,\n", " 'miss_rate': 1.0,\n", " 'negative_predictive_value': 0.9988870902468636,\n", " 'positive_predictive_value': 0.0,\n", " 'precision': 0.0,\n", " 'recall': 0.0,\n", " 'sensitivity': 0.0,\n", " 'specificity': 0.9691764012957691,\n", " 'true_negative_rate': 0.9691764012957691,\n", " 'true_negatives': 9873,\n", " 'true_positive_rate': 0.0,\n", " 
'true_positives': 0},\n", " Cookbooks, Food & Wine: { 'accuracy': 0.9595018631104139,\n", " 'f1_score': 0.019002375296912115,\n", " 'fall_out': 0.03492846571287622,\n", " 'false_discovery_rate': 0.9888268156424581,\n", " 'false_negative_rate': 0.9365079365079365,\n", " 'false_negatives': 59,\n", " 'false_omission_rate': 0.00599593495934958,\n", " 'false_positive_rate': 0.03492846571287622,\n", " 'false_positives': 354,\n", " 'hit_rate': 0.06349206349206349,\n", " 'informedness': 0.028563597779187155,\n", " 'markedness': 0.005177249398192307,\n", " 'matthews_correlation_coefficient': 0.012160627837924515,\n", " 'miss_rate': 0.9365079365079365,\n", " 'negative_predictive_value': 0.9940040650406504,\n", " 'positive_predictive_value': 0.0111731843575419,\n", " 'precision': 0.0111731843575419,\n", " 'recall': 0.06349206349206349,\n", " 'sensitivity': 0.06349206349206349,\n", " 'specificity': 0.9650715342871238,\n", " 'true_negative_rate': 0.9650715342871238,\n", " 'true_negatives': 9781,\n", " 'true_positive_rate': 0.06349206349206349,\n", " 'true_positives': 4},\n", " Politics & Social Sciences: { 'accuracy': 0.928025102961365,\n", " 'f1_score': 0.0516795865633075,\n", " 'fall_out': 0.035834609494640124,\n", " 'false_discovery_rate': 0.9460916442048517,\n", " 'false_negative_rate': 0.9503722084367245,\n", " 'false_negatives': 383,\n", " 'false_omission_rate': 0.03897425460466064,\n", " 'false_positive_rate': 0.035834609494640124,\n", " 'false_positives': 351,\n", " 'hit_rate': 0.04962779156327544,\n", " 'informedness': 0.013793182068635224,\n", " 'markedness': 0.014934101190487548,\n", " 'matthews_correlation_coefficient': 0.01435230910870509,\n", " 'miss_rate': 0.9503722084367245,\n", " 'negative_predictive_value': 0.9610257453953394,\n", " 'positive_predictive_value': 0.05390835579514825,\n", " 'precision': 0.05390835579514825,\n", " 'recall': 0.04962779156327544,\n", " 'sensitivity': 0.04962779156327544,\n", " 'specificity': 0.9641653905053599,\n", " 
'true_negative_rate': 0.9641653905053599,\n", " 'true_negatives': 9444,\n", " 'true_positive_rate': 0.04962779156327544,\n", " 'true_positives': 20},\n", " Crafts, Hobbies & Home: { 'accuracy': 0.9681310060796234,\n", " 'f1_score': 0,\n", " 'fall_out': 0.0312990580847724,\n", " 'false_discovery_rate': 1.0,\n", " 'false_negative_rate': 1.0,\n", " 'false_negatives': 6,\n", " 'false_omission_rate': 0.000607348921955686,\n", " 'false_positive_rate': 0.0312990580847724,\n", " 'false_positives': 319,\n", " 'hit_rate': 0.0,\n", " 'informedness': -0.0312990580847724,\n", " 'markedness': -0.000607348921955686,\n", " 'matthews_correlation_coefficient': -0.004359982704783838,\n", " 'miss_rate': 1.0,\n", " 'negative_predictive_value': 0.9993926510780443,\n", " 'positive_predictive_value': 0.0,\n", " 'precision': 0.0,\n", " 'recall': 0.0,\n", " 'sensitivity': 0.0,\n", " 'specificity': 0.9687009419152276,\n", " 'true_negative_rate': 0.9687009419152276,\n", " 'true_negatives': 9873,\n", " 'true_positive_rate': 0.0,\n", " 'true_positives': 0},\n", " Religion & Spirituality: { 'accuracy': 0.957834869582271,\n", " 'f1_score': 0.009216589861751152,\n", " 'fall_out': 0.03517091483896462,\n", " 'false_discovery_rate': 0.994413407821229,\n", " 'false_negative_rate': 0.9736842105263158,\n", " 'false_negatives': 74,\n", " 'false_omission_rate': 0.007520325203252076,\n", " 'false_positive_rate': 0.03517091483896462,\n", " 'false_positives': 356,\n", " 'hit_rate': 0.02631578947368421,\n", " 'informedness': -0.008855125365280436,\n", " 'markedness': -0.0019337330244810769,\n", " 'matthews_correlation_coefficient': -0.004138048858431092,\n", " 'miss_rate': 0.9736842105263158,\n", " 'negative_predictive_value': 0.9924796747967479,\n", " 'positive_predictive_value': 0.00558659217877095,\n", " 'precision': 0.00558659217877095,\n", " 'recall': 0.02631578947368421,\n", " 'sensitivity': 0.02631578947368421,\n", " 'specificity': 0.9648290851610354,\n", " 'true_negative_rate': 0.9648290851610354,\n", 
" 'true_negatives': 9766,\n", " 'true_positive_rate': 0.02631578947368421,\n", " 'true_positives': 2},\n", " Literature & Fiction: { 'accuracy': 0.9111590507942734,\n", " 'f1_score': 0.12884615384615386,\n", " 'fall_out': 0.03088559722659945,\n", " 'false_discovery_rate': 0.814404432132964,\n", " 'false_negative_rate': 0.9013254786450663,\n", " 'false_negatives': 612,\n", " 'false_omission_rate': 0.06221408966148212,\n", " 'false_positive_rate': 0.03088559722659945,\n", " 'false_positives': 294,\n", " 'hit_rate': 0.09867452135493372,\n", " 'informedness': 0.06778892412833426,\n", " 'markedness': 0.12338147820555401,\n", " 'matthews_correlation_coefficient': 0.09145434743585469,\n", " 'miss_rate': 0.9013254786450663,\n", " 'negative_predictive_value': 0.9377859103385179,\n", " 'positive_predictive_value': 0.18559556786703602,\n", " 'precision': 0.18559556786703602,\n", " 'recall': 0.09867452135493372,\n", " 'sensitivity': 0.09867452135493372,\n", " 'specificity': 0.9691144027734006,\n", " 'true_negative_rate': 0.9691144027734006,\n", " 'true_negatives': 9225,\n", " 'true_positive_rate': 0.09867452135493372,\n", " 'true_positives': 67},\n", " Humor & Entertainment: { 'accuracy': 0.9680329476367915,\n", " 'f1_score': 0,\n", " 'fall_out': 0.031492200529775305,\n", " 'false_discovery_rate': 1.0,\n", " 'false_negative_rate': 1.0,\n", " 'false_negatives': 5,\n", " 'false_omission_rate': 0.0005062265870203753,\n", " 'false_positive_rate': 0.031492200529775305,\n", " 'false_positives': 321,\n", " 'hit_rate': 0.0,\n", " 'informedness': -0.031492200529775305,\n", " 'markedness': -0.0005062265870203753,\n", " 'matthews_correlation_coefficient': -0.003992767109655738,\n", " 'miss_rate': 1.0,\n", " 'negative_predictive_value': 0.9994937734129796,\n", " 'positive_predictive_value': 0.0,\n", " 'precision': 0.0,\n", " 'recall': 0.0,\n", " 'sensitivity': 0.0,\n", " 'specificity': 0.9685077994702247,\n", " 'true_negative_rate': 0.9685077994702247,\n", " 'true_negatives': 9872,\n", " 
'true_positive_rate': 0.0,\n", " 'true_positives': 0},\n", " Law: { 'accuracy': 0.9264561678760541,\n", " 'f1_score': 0.05778894472361809,\n", " 'fall_out': 0.03343246846477288,\n", " 'false_discovery_rate': 0.9340974212034384,\n", " 'false_negative_rate': 0.9485458612975392,\n", " 'false_negatives': 424,\n", " 'false_omission_rate': 0.04305005584323285,\n", " 'false_positive_rate': 0.03343246846477288,\n", " 'false_positives': 326,\n", " 'hit_rate': 0.05145413870246085,\n", " 'informedness': 0.01802167023768808,\n", " 'markedness': 0.022852522953328736,\n", " 'matthews_correlation_coefficient': 0.020293857020391354,\n", " 'miss_rate': 0.9485458612975392,\n", " 'negative_predictive_value': 0.9569499441567672,\n", " 'positive_predictive_value': 0.0659025787965616,\n", " 'precision': 0.0659025787965616,\n", " 'recall': 0.05145413870246085,\n", " 'sensitivity': 0.05145413870246085,\n", " 'specificity': 0.9665675315352271,\n", " 'true_negative_rate': 0.9665675315352271,\n", " 'true_negatives': 9425,\n", " 'true_positive_rate': 0.05145413870246085,\n", " 'true_positives': 23},\n", " Computers & Technology: { 'accuracy': 0.9531280643263385,\n", " 'f1_score': 0.047808764940239036,\n", " 'fall_out': 0.03508597554915016,\n", " 'false_discovery_rate': 0.9671232876712329,\n", " 'false_negative_rate': 0.9124087591240876,\n", " 'false_negatives': 125,\n", " 'false_omission_rate': 0.01271229533204521,\n", " 'false_positive_rate': 0.03508597554915016,\n", " 'false_positives': 353,\n", " 'hit_rate': 0.08759124087591241,\n", " 'informedness': 0.05250526532676236,\n", " 'markedness': 0.02016441699672189,\n", " 'matthews_correlation_coefficient': 0.03253825540148643,\n", " 'miss_rate': 0.9124087591240876,\n", " 'negative_predictive_value': 0.9872877046679548,\n", " 'positive_predictive_value': 0.03287671232876712,\n", " 'precision': 0.03287671232876712,\n", " 'recall': 0.08759124087591241,\n", " 'sensitivity': 0.08759124087591241,\n", " 'specificity': 0.9649140244508498,\n", " 
'true_negative_rate': 0.9649140244508498,\n", " 'true_negatives': 9708,\n", " 'true_positive_rate': 0.08759124087591241,\n", " 'true_positives': 12},\n", " Test Preparation: { 'accuracy': 0.9327319082172975,\n", " 'f1_score': 0.15099009900990099,\n", " 'fall_out': 0.027274598600247058,\n", " 'false_discovery_rate': 0.8128834355828221,\n", " 'false_negative_rate': 0.8734439834024896,\n", " 'false_negatives': 421,\n", " 'false_omission_rate': 0.04264586709886553,\n", " 'false_positive_rate': 0.027274598600247058,\n", " 'false_positives': 265,\n", " 'hit_rate': 0.12655601659751037,\n", " 'informedness': 0.09928141799726342,\n", " 'markedness': 0.1444706973183123,\n", " 'matthews_correlation_coefficient': 0.11976333198778118,\n", " 'miss_rate': 0.8734439834024896,\n", " 'negative_predictive_value': 0.9573541329011345,\n", " 'positive_predictive_value': 0.18711656441717792,\n", " 'precision': 0.18711656441717792,\n", " 'recall': 0.12655601659751037,\n", " 'sensitivity': 0.12655601659751037,\n", " 'specificity': 0.9727254013997529,\n", " 'true_negative_rate': 0.9727254013997529,\n", " 'true_negatives': 9451,\n", " 'true_positive_rate': 0.12655601659751037,\n", " 'true_positives': 61},\n", " Arts & Photography: { 'accuracy': 0.941557168072171,\n", " 'f1_score': 0.04487179487179488,\n", " 'fall_out': 0.03171076550191876,\n", " 'false_discovery_rate': 0.9573170731707317,\n", " 'false_negative_rate': 0.9527027027027027,\n", " 'false_negatives': 282,\n", " 'false_omission_rate': 0.02857142857142858,\n", " 'false_positive_rate': 0.03171076550191876,\n", " 'false_positives': 314,\n", " 'hit_rate': 0.0472972972972973,\n", " 'informedness': 0.0155865317953785,\n", " 'markedness': 0.014111498257839639,\n", " 'matthews_correlation_coefficient': 0.01483068832779676,\n", " 'miss_rate': 0.9527027027027027,\n", " 'negative_predictive_value': 0.9714285714285714,\n", " 'positive_predictive_value': 0.042682926829268296,\n", " 'precision': 0.042682926829268296,\n", " 'recall': 
0.0472972972972973,\n", " 'sensitivity': 0.0472972972972973,\n", " 'specificity': 0.9682892344980812,\n", " 'true_negative_rate': 0.9682892344980812,\n", " 'true_negatives': 9588,\n", " 'true_positive_rate': 0.0472972972972973,\n", " 'true_positives': 14},\n", " Parenting & Relationships: { 'accuracy': 0.9494018434987253,\n", " 'f1_score': 0.0851063829787234,\n", " 'fall_out': 0.035068438405435054,\n", " 'false_discovery_rate': 0.9359999999999999,\n", " 'false_negative_rate': 0.873015873015873,\n", " 'false_negatives': 165,\n", " 'false_omission_rate': 0.016797312430011146,\n", " 'false_positive_rate': 0.035068438405435054,\n", " 'false_positives': 351,\n", " 'hit_rate': 0.12698412698412698,\n", " 'informedness': 0.09191568857869203,\n", " 'markedness': 0.04720268756998891,\n", " 'matthews_correlation_coefficient': 0.0658685625375291,\n", " 'miss_rate': 0.873015873015873,\n", " 'negative_predictive_value': 0.9832026875699889,\n", " 'positive_predictive_value': 0.064,\n", " 'precision': 0.064,\n", " 'recall': 0.12698412698412698,\n", " 'sensitivity': 0.12698412698412698,\n", " 'specificity': 0.964931561594565,\n", " 'true_negative_rate': 0.964931561594565,\n", " 'true_negatives': 9658,\n", " 'true_positive_rate': 0.12698412698412698,\n", " 'true_positives': 24},\n", " Romance: { 'accuracy': 0.9173367326926848,\n", " 'f1_score': 0.11542497376705142,\n", " 'fall_out': 0.030138700594431134,\n", " 'false_discovery_rate': 0.8401162790697674,\n", " 'false_negative_rate': 0.909688013136289,\n", " 'false_negatives': 554,\n", " 'false_omission_rate': 0.05622082403085038,\n", " 'false_positive_rate': 0.030138700594431134,\n", " 'false_positives': 289,\n", " 'hit_rate': 0.090311986863711,\n", " 'informedness': 0.06017328626927987,\n", " 'markedness': 0.10366289689938224,\n", " 'matthews_correlation_coefficient': 0.07897934648140213,\n", " 'miss_rate': 0.909688013136289,\n", " 'negative_predictive_value': 0.9437791759691496,\n", " 'positive_predictive_value': 
0.15988372093023256,\n", " 'precision': 0.15988372093023256,\n", " 'recall': 0.090311986863711,\n", " 'sensitivity': 0.090311986863711,\n", " 'specificity': 0.9698612994055689,\n", " 'true_negative_rate': 0.9698612994055689,\n", " 'true_negatives': 9300,\n", " 'true_positive_rate': 0.090311986863711,\n", " 'true_positives': 55},\n", " History: { 'accuracy': 0.9323396744459698,\n", " 'f1_score': 0.07999999999999999,\n", " 'fall_out': 0.030978427563643773,\n", " 'false_discovery_rate': 0.9099099099099099,\n", " 'false_negative_rate': 0.9280575539568345,\n", " 'false_negatives': 387,\n", " 'false_omission_rate': 0.0392295995945261,\n", " 'false_positive_rate': 0.030978427563643773,\n", " 'false_positives': 303,\n", " 'hit_rate': 0.07194244604316546,\n", " 'informedness': 0.04096401847952169,\n", " 'markedness': 0.05086049049556407,\n", " 'matthews_correlation_coefficient': 0.04564482525476266,\n", " 'miss_rate': 0.9280575539568345,\n", " 'negative_predictive_value': 0.9607704004054739,\n", " 'positive_predictive_value': 0.09009009009009009,\n", " 'precision': 0.09009009009009009,\n", " 'recall': 0.07194244604316546,\n", " 'sensitivity': 0.07194244604316546,\n", " 'specificity': 0.9690215724363562,\n", " 'true_negative_rate': 0.9690215724363562,\n", " 'true_negatives': 9478,\n", " 'true_positive_rate': 0.07194244604316546,\n", " 'true_positives': 30},\n", " Comics & Graphic Novels: { 'accuracy': 0.9580309864679349,\n", " 'f1_score': 0.218978102189781,\n", " 'fall_out': 0.028708612583775106,\n", " 'false_discovery_rate': 0.8270893371757925,\n", " 'false_negative_rate': 0.7014925373134329,\n", " 'false_negatives': 141,\n", " 'false_omission_rate': 0.014313267688559561,\n", " 'false_positive_rate': 0.028708612583775106,\n", " 'false_positives': 287,\n", " 'hit_rate': 0.29850746268656714,\n", " 'informedness': 0.2697988501027919,\n", " 'markedness': 0.15859739513564786,\n", " 'matthews_correlation_coefficient': 0.20685597607247405,\n", " 'miss_rate': 
0.7014925373134329,\n", " 'negative_predictive_value': 0.9856867323114404,\n", " 'positive_predictive_value': 0.1729106628242075,\n", " 'precision': 0.1729106628242075,\n", " 'recall': 0.29850746268656714,\n", " 'sensitivity': 0.29850746268656714,\n", " 'specificity': 0.9712913874162249,\n", " 'true_negative_rate': 0.9712913874162249,\n", " 'true_negatives': 9710,\n", " 'true_positive_rate': 0.29850746268656714,\n", " 'true_positives': 60},\n", " Reference: { 'accuracy': 0.9581290449107668,\n", " 'f1_score': 0.027334851936218676,\n", " 'fall_out': 0.034220156265453494,\n", " 'false_discovery_rate': 0.9829545454545454,\n", " 'false_negative_rate': 0.9310344827586207,\n", " 'false_negatives': 81,\n", " 'false_omission_rate': 0.00822669104204754,\n", " 'false_positive_rate': 0.034220156265453494,\n", " 'false_positives': 346,\n", " 'hit_rate': 0.06896551724137931,\n", " 'informedness': 0.03474536097592584,\n", " 'markedness': 0.008818763503406934,\n", " 'matthews_correlation_coefficient': 0.01750460286002505,\n", " 'miss_rate': 0.9310344827586207,\n", " 'negative_predictive_value': 0.9917733089579525,\n", " 'positive_predictive_value': 0.017045454545454544,\n", " 'precision': 0.017045454545454544,\n", " 'recall': 0.06896551724137931,\n", " 'sensitivity': 0.06896551724137931,\n", " 'specificity': 0.9657798437345465,\n", " 'true_negative_rate': 0.9657798437345465,\n", " 'true_negatives': 9765,\n", " 'true_positive_rate': 0.06896551724137931,\n", " 'true_positives': 6},\n", " Teen & Young Adult: { 'accuracy': 0.9515591292410277,\n", " 'f1_score': 0.04263565891472868,\n", " 'fall_out': 0.034176962933439636,\n", " 'false_discovery_rate': 0.9689265536723164,\n", " 'false_negative_rate': 0.9320987654320988,\n", " 'false_negatives': 151,\n", " 'false_omission_rate': 0.015339292970337315,\n", " 'false_positive_rate': 0.034176962933439636,\n", " 'false_positives': 343,\n", " 'hit_rate': 0.06790123456790123,\n", " 'informedness': 0.03372427163446168,\n", " 'markedness': 
0.015734153357346292,\n", " 'matthews_correlation_coefficient': 0.023035252587315484,\n", " 'miss_rate': 0.9320987654320988,\n", " 'negative_predictive_value': 0.9846607070296627,\n", " 'positive_predictive_value': 0.031073446327683617,\n", " 'precision': 0.031073446327683617,\n", " 'recall': 0.06790123456790123,\n", " 'sensitivity': 0.06790123456790123,\n", " 'specificity': 0.9658230370665604,\n", " 'true_negative_rate': 0.9658230370665604,\n", " 'true_negatives': 9693,\n", " 'true_positive_rate': 0.06790123456790123,\n", " 'true_positives': 11},\n", " Self-Help: { 'accuracy': 0.8456560109825456,\n", " 'f1_score': 0.11173814898419863,\n", " 'fall_out': 0.025380130330398987,\n", " 'false_discovery_rate': 0.691588785046729,\n", " 'false_negative_rate': 0.9317711922811854,\n", " 'false_negatives': 1352,\n", " 'false_omission_rate': 0.1368836691303027,\n", " 'false_positive_rate': 0.025380130330398987,\n", " 'false_positives': 222,\n", " 'hit_rate': 0.06822880771881461,\n", " 'informedness': 0.04284867738841558,\n", " 'markedness': 0.17152754582296836,\n", " 'matthews_correlation_coefficient': 0.08573055741213308,\n", " 'miss_rate': 0.9317711922811854,\n", " 'negative_predictive_value': 0.8631163308696973,\n", " 'positive_predictive_value': 0.308411214953271,\n", " 'precision': 0.308411214953271,\n", " 'recall': 0.06822880771881461,\n", " 'sensitivity': 0.06822880771881461,\n", " 'specificity': 0.974619869669601,\n", " 'true_negative_rate': 0.974619869669601,\n", " 'true_negatives': 8525,\n", " 'true_positive_rate': 0.06822880771881461,\n", " 'true_positives': 99},\n", " Calendars: { 'accuracy': 0.8434006667974112,\n", " 'f1_score': 0.19627579265223954,\n", " 'fall_out': 0.014421385860007074,\n", " 'false_discovery_rate': 0.3867924528301887,\n", " 'false_negative_rate': 0.8831635710005992,\n", " 'false_negatives': 1474,\n", " 'false_omission_rate': 0.14919028340080975,\n", " 'false_positive_rate': 0.014421385860007074,\n", " 'false_positives': 123,\n", " 'hit_rate': 
0.11683642899940083,\n", " 'informedness': 0.10241504313939376,\n", " 'markedness': 0.46401726376900143,\n", " 'matthews_correlation_coefficient': 0.21799621117424445,\n", " 'miss_rate': 0.8831635710005992,\n", " 'negative_predictive_value': 0.8508097165991902,\n", " 'positive_predictive_value': 0.6132075471698113,\n", " 'precision': 0.6132075471698113,\n", " 'recall': 0.11683642899940083,\n", " 'sensitivity': 0.11683642899940083,\n", " 'specificity': 0.9855786141399929,\n", " 'true_negative_rate': 0.9855786141399929,\n", " 'true_negatives': 8406,\n", " 'true_positive_rate': 0.11683642899940083,\n", " 'true_positives': 195},\n", " Science Fiction & Fantasy: { 'accuracy': 0.9561678760541282,\n", " 'f1_score': 0.11485148514851486,\n", " 'fall_out': 0.027994401119776025,\n", " 'false_discovery_rate': 0.9061488673139159,\n", " 'false_negative_rate': 0.8520408163265306,\n", " 'false_negatives': 167,\n", " 'false_omission_rate': 0.016887450702801066,\n", " 'false_positive_rate': 0.027994401119776025,\n", " 'false_positives': 280,\n", " 'hit_rate': 0.14795918367346939,\n", " 'informedness': 0.11996478255369336,\n", " 'markedness': 0.07696368198328307,\n", " 'matthews_correlation_coefficient': 0.09608814377255998,\n", " 'miss_rate': 0.8520408163265306,\n", " 'negative_predictive_value': 0.9831125492971989,\n", " 'positive_predictive_value': 0.09385113268608414,\n", " 'precision': 0.09385113268608414,\n", " 'recall': 0.14795918367346939,\n", " 'sensitivity': 0.14795918367346939,\n", " 'specificity': 0.972005598880224,\n", " 'true_negative_rate': 0.972005598880224,\n", " 'true_negatives': 9722,\n", " 'true_positive_rate': 0.14795918367346939,\n", " 'true_positives': 29},\n", " Mystery, Thriller & Suspense: { 'accuracy': 0.9433222200431457,\n", " 'f1_score': 0.12158054711246201,\n", " 'fall_out': 0.030069859269008847,\n", " 'false_discovery_rate': 0.8813056379821959,\n", " 'false_negative_rate': 0.8753894080996885,\n", " 'false_negatives': 281,\n", " 'false_omission_rate': 
0.028496095730656146,\n", " 'false_positive_rate': 0.030069859269008847,\n", " 'false_positives': 297,\n", " 'hit_rate': 0.12461059190031153,\n", " 'informedness': 0.09454073263130258,\n", " 'markedness': 0.09019826628714811,\n", " 'matthews_correlation_coefficient': 0.09234397748018172,\n", " 'miss_rate': 0.8753894080996885,\n", " 'negative_predictive_value': 0.9715039042693439,\n", " 'positive_predictive_value': 0.11869436201780416,\n", " 'precision': 0.11869436201780416,\n", " 'recall': 0.12461059190031153,\n", " 'sensitivity': 0.12461059190031153,\n", " 'specificity': 0.9699301407309912,\n", " 'true_negative_rate': 0.9699301407309912,\n", " 'true_negatives': 9580,\n", " 'true_positive_rate': 0.12461059190031153,\n", " 'true_positives': 40},\n", " Biographies & Memoirs: { 'accuracy': 0.8951755246126691,\n", " 'f1_score': 0.09329940627650551,\n", " 'fall_out': 0.03210666666666662,\n", " 'false_discovery_rate': 0.8455056179775281,\n", " 'false_negative_rate': 0.9331713244228432,\n", " 'false_negatives': 768,\n", " 'false_omission_rate': 0.0780329201381833,\n", " 'false_positive_rate': 0.03210666666666662,\n", " 'false_positives': 301,\n", " 'hit_rate': 0.06682867557715674,\n", " 'informedness': 0.03472200891049004,\n", " 'markedness': 0.0764614618842887,\n", " 'matthews_correlation_coefficient': 0.05152567865497131,\n", " 'miss_rate': 0.9331713244228432,\n", " 'negative_predictive_value': 0.9219670798618167,\n", " 'positive_predictive_value': 0.1544943820224719,\n", " 'precision': 0.1544943820224719,\n", " 'recall': 0.06682867557715674,\n", " 'sensitivity': 0.06682867557715674,\n", " 'specificity': 0.9678933333333334,\n", " 'true_negative_rate': 0.9678933333333334,\n", " 'true_negatives': 9074,\n", " 'true_positive_rate': 0.06682867557715674,\n", " 'true_positives': 55}}\n", "\n", "Finished: experiment_run\n", "Saved to: results/experiment_run_0\n" ], "name": "stdout" } ] }, { "metadata": { "id": "qTL1K5SSXD1U", "colab_type": "text" }, "cell_type": "markdown", 
"source": [ "## 16.5 Sklearn Algorithm Cheatsheet" ] }, { "metadata": { "id": "T3d3iRE6XN1M", "colab_type": "text" }, "cell_type": "markdown", "source": [ "### Sklearn model selection" ] }, { "metadata": { "id": "A8oBgYpGTR4S", "colab_type": "text" }, "cell_type": "markdown", "source": [ "url: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html\n", "\n", "![cheat-sheet](https://scikit-learn.org/stable/_static/ml_map.png)" ] }, { "metadata": { "id": "-EDmZcGjCBP1", "colab_type": "text" }, "cell_type": "markdown", "source": [ "### Model Explainability with SHAP\n" ] }, { "metadata": { "id": "tc7GQk3NCF-b", "colab_type": "text" }, "cell_type": "markdown", "source": [ "[https://github.com/slundberg/shap](https://github.com/slundberg/shap): *A unified approach to explain the output of any machine learning model.*" ] }, { "metadata": { "id": "WqFiPokws8nN", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### Install SHAP" ] }, { "metadata": { "id": "PiPf95catAHk", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "!pip install -q shap" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "-Sokh2GCsra1", "colab_type": "code", "outputId": "7a8354ee-752b-4dad-eb5a-bdb3daa0e768", "colab": { "base_uri": "https://localhost:8080/", "height": 42 } }, "cell_type": "code", "source": [ "import sklearn\n", "import shap\n", "\n", "shap.initjs()" ], "execution_count": 0, "outputs": [ { "output_type": "display_data", "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] } } ] }, { "metadata": { "id": "TYUXsHf65HUU", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### Load Census Data" ] }, { "metadata": { "id": "NO1iYErvBluS", "colab_type": "text" }, "cell_type": "markdown", "source": [ "[Adult datasets](https://archive.ics.uci.edu/ml/datasets/Adult)\n", "\n", "* *Predict whether income exceeds $50K/yr based on census data. Also known as \"Census Income\" dataset.*" ] }, { "metadata": { "id": "hD_xS-tV5Gp_", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "X,y = shap.datasets.adult()\n", "X_display,y_display = shap.datasets.adult(display=True)\n", "X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=7)" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "rKBTIdWgS7Vx", "colab_type": "code", "outputId": "73a92dc0-1557-47fd-c546-6f7ff38854af", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "cell_type": "code", "source": [ "X_train.shape, y_train.shape" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "((26048, 12), (26048,))" ] }, "metadata": { "tags": [] }, "execution_count": 47 } ] }, { "metadata": { "id": "tfuP7W1N9t1D", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### Train a k-nearest neighbor Classifier" ] }, { "metadata": { "id": "slogcMRd9yX2", "colab_type": "code", "outputId": "fa3fd6f8-cf88-452b-c488-5df0ef827b52", "colab": { "base_uri": "https://localhost:8080/", "height": 70 } }, "cell_type": "code", "source": [ "knn = sklearn.neighbors.KNeighborsClassifier()\n", "knn.fit(X_train, y_train)" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", " weights='uniform')" ] }, "metadata": { "tags": [] 
}, "execution_count": 48 } ] }, { "metadata": { "id": "wtq8kfB192KS", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### Explain predictions" ] }, { "metadata": { "id": "kLuiobE595g3", "colab_type": "code", "outputId": "809d39ec-3bdd-4976-c558-2709aff46cf5", "colab": { "base_uri": "https://localhost:8080/", "height": 247 } }, "cell_type": "code", "source": [ "f = lambda x: knn.predict_proba(x)[:,1]\n", "med = X_train.median().values.reshape((1,X_train.shape[1]))\n", "explainer = shap.KernelExplainer(f, med)\n", "shap_values_single = explainer.shap_values(X.iloc[0,:], nsamples=1000)\n", "\n", "#Plot\n", "shap.initjs()\n", "shap.force_plot(explainer.expected_value, shap_values_single, X_display.iloc[0,:])" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/shap/explainers/kernel.py:535: UserWarning: l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply \"num_features(10)\"!\n", " \"l1_reg=\\\"auto\\\" is deprecated and in the next version (v0.29) the behavior will change from a \" \\\n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] } }, { "output_type": "execute_result", "data": { "text/html": [ "\n", "
\n", "
\n", " Visualization omitted, Javascript library not loaded!
\n", " Have you run `initjs()` in this notebook? If this notebook was from another\n", " user you must also trust this notebook (File -> Trust notebook). If you are viewing\n", " this notebook on github the Javascript has been stripped for security. If you are using\n", " JupyterLab this error is because a JupyterLab extension has not yet been written.\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 49 } ] }, { "metadata": { "id": "SEIH6ESVXKOb", "colab_type": "text" }, "cell_type": "markdown", "source": [ "## 16.6 Recommendations" ] }, { "metadata": { "id": "ANeUczzxXOf2", "colab_type": "text" }, "cell_type": "markdown", "source": [ "### Install Surprise" ] }, { "metadata": { "id": "0R_rvBACYbp9", "colab_type": "code", "outputId": "6ff83a68-19f5-4bcc-89cf-ad0fde5f058e", "colab": { "base_uri": "https://localhost:8080/", "height": 52 } }, "cell_type": "code", "source": [ " !pip install -q scikit-surprise" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "\u001b[K 100% |████████████████████████████████| 3.3MB 10.9MB/s \n", "\u001b[?25h Building wheel for scikit-surprise (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h" ], "name": "stdout" } ] }, { "metadata": { "id": "48NhoRpWYmc_", "colab_type": "code", "outputId": "1c1c121f-12af-4612-caae-5550bab8e28a", "colab": { "base_uri": "https://localhost:8080/", "height": 354 } }, "cell_type": "code", "source": [ "from surprise import SVD\n", "from surprise import Dataset\n", "from surprise.model_selection import cross_validate\n", "\n", "# Load the movielens-100k dataset (download it if needed).\n", "data = Dataset.load_builtin('ml-100k')\n", "\n", "# Use the famous SVD algorithm.\n", "algo = SVD()\n", "\n", "# Run 5-fold cross-validation and print results.\n", "cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Evaluating RMSE, MAE of algorithm SVD on 5 split(s).\n", "\n", " Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std \n", "RMSE (testset) 0.9460 0.9371 0.9344 0.9300 0.9354 0.9366 0.0053 \n", "MAE (testset) 0.7441 0.7390 0.7338 0.7312 0.7397 0.7376 0.0045 \n", "Fit time 5.30 5.22 5.25 5.23 5.23 5.24 0.03 \n", "Test time 0.16 0.26 0.16 0.15 0.16 0.18 0.04 \n" ], "name": "stdout" }, { "output_type": "execute_result", 
"data": { "text/plain": [ "{'fit_time': (5.302802085876465,\n", " 5.2162816524505615,\n", " 5.2515764236450195,\n", " 5.2256152629852295,\n", " 5.226689577102661),\n", " 'test_mae': array([0.74405382, 0.73902602, 0.73379062, 0.73123877, 0.73968219]),\n", " 'test_rmse': array([0.94601002, 0.93705768, 0.93435584, 0.93001856, 0.93540059]),\n", " 'test_time': (0.16068744659423828,\n", " 0.26168084144592285,\n", " 0.1584162712097168,\n", " 0.1538381576538086,\n", " 0.16183090209960938)}" ] }, "metadata": { "tags": [] }, "execution_count": 33 } ] }, { "metadata": { "id": "bRKFiJVvPyZi", "colab_type": "text" }, "cell_type": "markdown", "source": [ "### Handcoded Similarity Engine" ] }, { "metadata": { "id": "aUwh2jQMP2UD", "colab_type": "text" }, "cell_type": "markdown", "source": [ "\n", "\n", "```python\n", "\"\"\"Data Science Algorithms\"\"\"\n", "\n", "\n", "def tanimoto(list1, list2):\n", " \"\"\"tanimoto coefficient\n", " In [2]: list2=['39229', '31995', '32015']\n", " In [3]: list1=['31936', '35989', '27489', '39229', '15468', '31993', '26478']\n", " In [4]: tanimoto(list1,list2)\n", " Out[4]: 0.1111111111111111\n", " Uses intersection of two sets to determine numerical score\n", " \"\"\"\n", "\n", " intersection = set(list1).intersection(set(list2))\n", " return float(len(intersection))/(len(list1) + len(list2) - len(intersection))\n", "```\n", "\n" ] }, { "metadata": { "id": "3b1-VTl8jubf", "colab_type": "text" }, "cell_type": "markdown", "source": [ "### Collaborative Filtering Recommendation Exploration\n" ] }, { "metadata": { "id": "xrWJwi5pQLPa", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### Knn Exploration of MovieLens with Surprise" ] }, { "metadata": { "id": "-GaS8y05jubj", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "import io # needed because of weird encoding of u.item file\n", "from surprise import KNNBaseline\n", "from surprise import Dataset\n", "from surprise import get_dataset_dir" ], 
"execution_count": 0, "outputs": [] }, { "metadata": { "id": "wQ8dueD5jubm", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### Helper Function to Convert IDS to Names" ] }, { "metadata": { "id": "ReVmJK2Tjubn", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "def read_item_names():\n", " \"\"\"Read the u.item file from MovieLens 100-k dataset and return two\n", " mappings to convert raw ids into movie names and movie names into raw ids.\n", " \"\"\"\n", "\n", " file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'\n", " rid_to_name = {}\n", " name_to_rid = {}\n", " with io.open(file_name, 'r', encoding='ISO-8859-1') as f:\n", " for line in f:\n", " line = line.split('|')\n", " rid_to_name[line[0]] = line[1]\n", " name_to_rid[line[1]] = line[0]\n", "\n", " return rid_to_name, name_to_rid" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "nxPSuxe3jubp", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### Train KNN based model" ] }, { "metadata": { "id": "tAyBaE_Cjubq", "colab_type": "code", "outputId": "1e219365-5b00-42d1-8561-a9d8ac0a4f1e", "colab": { "base_uri": "https://localhost:8080/", "height": 87 } }, "cell_type": "code", "source": [ "# First, train the algorithm to compute the similarities between items\n", "data = Dataset.load_builtin('ml-100k')\n", "trainset = data.build_full_trainset()\n", "sim_options = {'name': 'pearson_baseline', 'user_based': False}\n", "algo = KNNBaseline(sim_options=sim_options)\n", "algo.fit(trainset)\n", "\n" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Estimating biases using als...\n", "Computing the pearson_baseline similarity matrix...\n", "Done computing similarity matrix.\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 51 } ] }, { "metadata": { "id": "wv5b_Nhljubx", "colab_type": "text" }, "cell_type": "markdown", "source": [ "#### 
Recommendations" ] }, { "metadata": { "id": "1QXutzAkjubx", "colab_type": "code", "outputId": "57f0cbe3-2e1e-4760-f7ed-3ff1471b9401", "colab": { "base_uri": "https://localhost:8080/", "height": 194 } }, "cell_type": "code", "source": [ "# Read the mappings raw id <-> movie name\n", "rid_to_name, name_to_rid = read_item_names()\n", "\n", "# The model works with inner ids, so translate: movie name -> raw id -> inner id\n", "toy_story_raw_id = name_to_rid['Toy Story (1995)']\n", "toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)\n", "\n", "# Retrieve inner ids of the nearest neighbors of Toy Story.\n", "neighbor_inner_ids = algo.get_neighbors(toy_story_inner_id, k=10)\n", "\n", "# Translate back: inner id -> raw id -> movie name. Each stage has its own\n", "# name and the result is a list, so it can be inspected again after printing.\n", "toy_story_neighbors = [rid_to_name[algo.trainset.to_raw_iid(inner_id)]\n", "                       for inner_id in neighbor_inner_ids]\n", "\n", "for movie in toy_story_neighbors:\n", "    print(movie)\n" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Beauty and the Beast (1991)\n", "Raiders of the Lost Ark (1981)\n", "That Thing You Do! (1996)\n", "Lion King, The (1994)\n", "Craft, The (1996)\n", "Liar Liar (1997)\n", "Aladdin (1992)\n", "Cool Hand Luke (1967)\n", "Winnie the Pooh and the Blustery Day (1968)\n", "Indiana Jones and the Last Crusade (1989)\n" ], "name": "stdout" } ] }, { "metadata": { "id": "Igt__Lo_ZF4k", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "" ], "execution_count": 0, "outputs": [] } ] }