Lesson 9 Data Science Libraries

Pragmatic AI Labs

This notebook was produced by Pragmatic AI Labs.

Setup Tasks

Install Latest Plotly

import plotly
plotly.__version__
'3.6.1'
!pip uninstall -q -y plotly
!pip install plotly==3.6.0
!pip install -q --upgrade cufflinks
Collecting plotly==3.6.0
  Downloading plotly-3.6.0.tar.gz (31.1MB)
Collecting retrying>=1.3.3 (from plotly==3.6.0)
  Downloading retrying-1.3.3.tar.gz
Building wheels for collected packages: plotly, retrying
  Building wheel for plotly (setup.py) ... done
  Building wheel for retrying (setup.py) ... done
Successfully built plotly retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-3.6.0 retrying-1.3.3

import plotly
plotly.__version__
'3.6.0'
def enable_plotly_in_cell():
  import IPython
  from plotly.offline import init_notebook_mode
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
  '''))
  init_notebook_mode(connected=False)

9.1 Learn numpy

What is numpy?

  • Low-level multi-dimensional array library
  • A programmer's Excel
  • The building blocks for many key Python libraries:
    • Pandas
    • Sklearn
    • Tensorflow

Hello World Numpy Workflow

import numpy as np

Make an array

a = np.arange(6).reshape(2, 3)
a.shape
(2, 3)
a.size
6
a.dtype.name
'int64'
a
array([[0, 1, 2],
       [3, 4, 5]])

Create an Array

One Dimensional Array

a = np.array([2,4,6,8])
print(f"Shape {a.shape}")
print(f"Content: {a}")
Shape (4,)
Content: [2 4 6 8]

Two Dimensional Array

a = np.array([(2,4,6,8),(20,40,60,80)]) 
print(f"Shape: {a.shape}")
print(f"Content: {a}")
Shape: (2, 4)
Content: [[ 2  4  6  8]
 [20 40 60 80]]

Create Sequence of Numbers

a = np.arange(1,20)
print(f"Shape: {a.shape}")
print(f"Content: {a}")

Shape: (19,)
Content: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]

Create a zero-filled multi-dimensional array

a = np.zeros( (2,3) )
print(f"Shape: {a.shape}")
print(f"Content: {a}")
Shape: (2, 3)
Content: [[0. 0. 0.]
 [0. 0. 0.]]
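
numpy has several other constructors worth knowing; a quick sketch (note that np.empty allocates without initializing, so its contents are arbitrary):

import numpy as np

ones = np.ones((2, 3))         # all ones
eye = np.eye(3)                # 3x3 identity matrix
pts = np.linspace(0, 1, 5)     # 5 evenly spaced points in [0, 1]
raw = np.empty((2, 2))         # uninitialized memory; values are arbitrary
print(ones.shape, eye.shape, pts, raw.shape)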

Applied Numpy

Numpy Non-GPU Mandelbrot

import numpy as np
from pylab import imshow, show
from timeit import default_timer as timer
def mandel(x, y, max_iters):
  """
    Given the real and imaginary parts of a complex number,
    determine if it is a candidate for membership in the Mandelbrot
    set given a fixed number of iterations.
  """
  c = complex(x, y)
  z = 0.0j
  for i in range(max_iters):
    z = z*z + c
    if (z.real*z.real + z.imag*z.imag) >= 4:
      return i

  return max_iters
def create_fractal(min_x, max_x, min_y, max_y, image, iters):
  height = image.shape[0]
  width = image.shape[1]

  pixel_size_x = (max_x - min_x) / width
  pixel_size_y = (max_y - min_y) / height
    
  for x in range(width):
    real = min_x + x * pixel_size_x
    for y in range(height):
      imag = min_y + y * pixel_size_y
      color = mandel(real, imag, iters)
      image[y, x] = color
image = np.zeros((1024, 1536), dtype = np.uint8)
start = timer()
create_fractal(-2.0, 1.0, -1.0, 1.0, image, 20) 
dt = timer() - start

print("Mandelbrot created in %f s" % dt)
imshow(image)
show()
Mandelbrot created in 5.650793 s

png
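
The version above loops over every pixel in pure Python, using numpy only for storage. A vectorized sketch that computes all pixels with whole-array operations (an illustration of numpy style, not the lesson's benchmark):

import numpy as np

def mandel_vectorized(min_x, max_x, min_y, max_y, shape, iters):
    """Escape-time Mandelbrot using whole-array operations."""
    height, width = shape
    real = np.linspace(min_x, max_x, width)
    imag = np.linspace(min_y, max_y, height)
    c = real[np.newaxis, :] + 1j * imag[:, np.newaxis]  # complex grid
    z = np.zeros_like(c)
    image = np.full(shape, iters, dtype=np.uint8)
    for i in range(iters):
        alive = np.abs(z) < 2                  # points that have not escaped
        z[alive] = z[alive] ** 2 + c[alive]
        image[alive & (np.abs(z) >= 2)] = i    # record the escape iteration
    return image

image = mandel_vectorized(-2.0, 1.0, -1.0, 1.0, (1024, 1536), 20)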

Numba Mandelbrot

Numba is an LLVM-based just-in-time (JIT) compiler for Python.

from numba import jit  # autojit was removed from Numba; jit is the current API
import numpy as np
from timeit import default_timer as timer

@jit(nopython=True)
def mandel(x, y, max_iters):
  """
    Given the real and imaginary parts of a complex number,
    determine if it is a candidate for membership in the Mandelbrot
    set given a fixed number of iterations.
  """
  c = complex(x, y)
  z = 0.0j
  for i in range(max_iters):
    z = z*z + c
    if (z.real*z.real + z.imag*z.imag) >= 4:
      return i

  return max_iters

@jit(nopython=True)
def create_fractal(min_x, max_x, min_y, max_y, image, iters):
  height = image.shape[0]
  width = image.shape[1]

  pixel_size_x = (max_x - min_x) / width
  pixel_size_y = (max_y - min_y) / height
    
  for x in range(width):
    real = min_x + x * pixel_size_x
    for y in range(height):
      imag = min_y + y * pixel_size_y
      color = mandel(real, imag, iters)
      image[y, x] = color
image = np.zeros((1024, 1536), dtype = np.uint8)
start = timer()
create_fractal(-2.0, 1.0, -1.0, 1.0, image, 20) 
dt = timer() - start

print("Mandelbrot created in %f s" % dt)
imshow(image)
show()
Mandelbrot created in 0.361222 s

png

Cuda-Numpy

CUDA Install

!/usr/local/cuda/bin/nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130

CUDA GPU Mandelbrot

from numba import cuda
from numba import *

mandel_gpu = cuda.jit(restype=uint32, argtypes=[f8, f8, uint32], device=True)(mandel)
@cuda.jit(argtypes=[f8, f8, f8, f8, uint8[:,:], uint32])
def mandel_kernel(min_x, max_x, min_y, max_y, image, iters):
  height = image.shape[0]
  width = image.shape[1]

  pixel_size_x = (max_x - min_x) / width
  pixel_size_y = (max_y - min_y) / height

  startX, startY = cuda.grid(2)
  gridX = cuda.gridDim.x * cuda.blockDim.x;
  gridY = cuda.gridDim.y * cuda.blockDim.y;

  for x in range(startX, width, gridX):
    real = min_x + x * pixel_size_x
    for y in range(startY, height, gridY):
      imag = min_y + y * pixel_size_y 
      image[y, x] = mandel_gpu(real, imag, iters)

    ---------------------------------------------------------------------------

    OSError                                   Traceback (most recent call last)

    ...

    OSError: library nvvm not found

    During handling of the above exception, another exception occurred:

    NvvmSupportError                          Traceback (most recent call last)

    <ipython-input-34-dff031d8ec9d> in <module>()
    ----> 4 @cuda.jit(argtypes=[f8, f8, f8, f8, uint8[:,:], uint32])

    ... (numba internals elided) ...

    NvvmSupportError: libNVVM cannot be found. Do `conda install cudatoolkit`:
    library nvvm not found

Without libNVVM the CUDA kernel never compiles. The timing below therefore re-runs the Numba-compiled CPU create_fractal; it is faster than the first Numba run mostly because the JIT compilation cost was already paid.

image = np.zeros((1024, 1536), dtype = np.uint8)
start = timer()
create_fractal(-2.0, 1.0, -1.0, 1.0, image, 20) 
dt = timer() - start

print("Mandelbrot created in %f s" % dt)
imshow(image)
show()
Mandelbrot created in 0.067448 s

png

Cuda Vectorize with Numpy

from numba import (cuda, vectorize)
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

from functools import wraps
from time import time

def real_estate_df():
    """30 Years of Housing Prices"""

    df = pd.read_csv("https://raw.githubusercontent.com/noahgift/real_estate_ml/master/data/Zip_Zhvi_SingleFamilyResidence.csv")
    df.rename(columns={"RegionName":"ZipCode"}, inplace=True)
    df["ZipCode"]=df["ZipCode"].map(lambda x: "{:.0f}".format(x))
    df["RegionID"]=df["RegionID"].map(lambda x: "{:.0f}".format(x))
    return df

def numerical_real_estate_array(df):
    """Converts df to numpy numerical array"""

    columns_to_drop = ['RegionID', 'ZipCode', 'City', 'State', 'Metro', 'CountyName']
    df_numerical = df.dropna()
    df_numerical = df_numerical.drop(columns_to_drop, axis=1)
    return df_numerical.values

def real_estate_array():
    """Returns Real Estate Array"""

    df = real_estate_df()
    rea = numerical_real_estate_array(df)
    return np.float32(rea)


@vectorize(['float32(float32, float32)'], target='cuda')
def add_ufunc(x, y):
    return x + y

def cuda_operation():
    """Performs Vectorized Operations on GPU"""

    x = real_estate_array()
    y = real_estate_array()

    print("Moving calculations to GPU memory")
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)
    out_device = cuda.device_array(
        shape=(x_device.shape[0],x_device.shape[1]), dtype=np.float32)
    print(x_device)
    print(x_device.shape)
    print(x_device.dtype)

    print("Calculating on GPU")
    add_ufunc(x_device,y_device, out=out_device)

    out_host = out_device.copy_to_host()
    print(f"Calculations from GPU {out_host}")

cuda_operation()
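
The functools.wraps and time imports above are never used; they hint at a timing decorator. A minimal sketch of one (the name timing is an assumption, not from the original):

from functools import wraps
from time import time

def timing(f):
    """Report how long the wrapped function took."""
    @wraps(f)
    def wrap(*args, **kwargs):
        start = time()
        result = f(*args, **kwargs)
        print(f"{f.__name__} took {time() - start:.4f}s")
        return result
    return wrap

Decorating cuda_operation with @timing would then print the GPU round-trip time.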

9.2 Learn sklearn

Supervised Machine Learning: Classification Modeling Workflow

Key Evaluation Terms

  • Amazon ML Key Classification Metrics
  • sklearn classification metrics

  • Precision: Measures the fraction of examples predicted as positive that are actually positive. The range is 0 to 1; a larger value indicates better predictive accuracy.

  • Recall: Measures the fraction of actual positives that are predicted as positive. The range is 0 to 1; a larger value indicates better predictive accuracy.

  • F1-score: The harmonic mean of precision and recall.

  • AUC: Measures the ability of the model to assign a higher score to positive examples than to negative examples.

  • False Positive Rate: The false positive rate (FPR) measures the false alarm rate: the fraction of actual negatives that are predicted as positive. The range is 0 to 1; a smaller value indicates better predictive accuracy.
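
A minimal sketch computing each of these metrics with sklearn on toy binary labels (the values below are illustrative, not from the lesson's data):

from sklearn.metrics import (precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix)

y_true = [1, 0, 1, 1, 0, 1, 0, 0]                    # actual labels
y_pred = [1, 0, 1, 0, 0, 1, 1, 0]                    # hard predictions
y_score = [0.9, 0.2, 0.8, 0.4, 0.3, 0.7, 0.6, 0.1]   # predicted probabilities

print(f"Precision: {precision_score(y_true, y_pred):.2f}")
print(f"Recall:    {recall_score(y_true, y_pred):.2f}")
print(f"F1-score:  {f1_score(y_true, y_pred):.2f}")
print(f"AUC:       {roc_auc_score(y_true, y_score):.2f}")

# FPR = FP / (FP + TN), read off the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(f"FPR:       {fp / (fp + tn):.2f}")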

Digits Dataset

sklearn modeling

https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html

# Standard scientific Python imports
import matplotlib.pyplot as plt

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics

# The digits dataset
digits = datasets.load_digits()

# The data that we are interested in is made of 8x8 images of digits, let's
# have a look at the first 4 images, stored in the `images` attribute of the
# dataset.  If we were working from image files, we could load them using
# matplotlib.pyplot.imread.  Note that each image must have the same size. For these
# images, we know which digit they represent: it is given in the 'target' of
# the dataset.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)

plt.show()
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        88
           1       0.99      0.97      0.98        91
           2       0.99      0.99      0.99        86
           3       0.98      0.87      0.92        91
           4       0.99      0.96      0.97        92
           5       0.95      0.97      0.96        91
           6       0.99      0.99      0.99        91
           7       0.96      0.99      0.97        89
           8       0.94      1.00      0.97        88
           9       0.93      0.98      0.95        92

   micro avg       0.97      0.97      0.97       899
   macro avg       0.97      0.97      0.97       899
weighted avg       0.97      0.97      0.97       899


Confusion matrix:
[[87  0  0  0  1  0  0  0  0  0]
 [ 0 88  1  0  0  0  0  0  1  1]
 [ 0  0 85  1  0  0  0  0  0  0]
 [ 0  0  0 79  0  3  0  4  5  0]
 [ 0  0  0  0 88  0  0  0  0  4]
 [ 0  0  0  0  0 88  1  0  0  2]
 [ 0  1  0  0  0  0 90  0  0  0]
 [ 0  0  0  0  0  1  0 88  0  0]
 [ 0  0  0  0  0  0  0  0 88  0]
 [ 0  0  0  1  0  1  0  0  0 90]]

png
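
Raw counts are hard to compare when class supports differ; a small sketch (assuming the expected and predicted arrays from the cell above) normalizes each row into per-class rates:

import numpy as np
from sklearn import metrics

cm = metrics.confusion_matrix(expected, predicted)
# Divide each row by its class total so entries become rates in [0, 1]
print(np.round(cm.astype(float) / cm.sum(axis=1, keepdims=True), 2))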

Yellowbrick Confusion Matrix

http://www.scikit-yb.org/en/latest/api/classifier/confusion_matrix.html

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from yellowbrick.classifier import ConfusionMatrix

# We'll use the handwritten digits data set from scikit-learn.
# Each feature of this dataset is an 8x8 pixel image of a handwritten number.
# Digits.data converts these 64 pixels into a single array of features
digits = load_digits()
X = digits.data
y = digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state=1)

model = LogisticRegression()

# The ConfusionMatrix visualizer takes a model
cm = ConfusionMatrix(model, classes=[0,1,2,3,4,5,6,7,8,9])

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(X_test, y_test)

# How did we do?
cm.poof()

png

ROCAUC

http://www.scikit-yb.org/en/latest/api/classifier/rocauc.html

from yellowbrick.classifier import ROCAUC
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
classes=[0,1,2,3,4,5,6,7,8,9]

# Instantiate the visualizer with the classification model
visualizer = ROCAUC(model, classes=classes)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
g = visualizer.poof()             # Draw/show/poof the data

png

Supervised Machine Learning: Regression Modeling Workflow

Ingest

Source: http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_MLB_HeightsWeights

import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/noahgift/functional_intro_to_python/master/data/mlb_weight_ht.csv")
df.head()

Name Team Position Height(inches) Weight(pounds) Age
0 Adam_Donachie BAL Catcher 74 180.0 22.99
1 Paul_Bako BAL Catcher 74 215.0 34.69
2 Ramon_Hernandez BAL Catcher 72 210.0 30.78
3 Kevin_Millar BAL First_Baseman 72 210.0 35.43
4 Chris_Gomez BAL First_Baseman 73 188.0 35.71

Find N/A

df.shape
(1034, 6)
df.isnull().values.any()
True
df = df.dropna()
df.isnull().values.any()
False
df.shape
(1033, 6)
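
df.isnull().values.any() only reports whether any value is missing; running df.isnull().sum() before dropna() shows which columns hold the NaNs (a small addition, not in the original):

df.isnull().sum()  # per-column count of missing values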

Clean

df.rename(index=str, 
             columns={"Height(inches)": "Height", "Weight(pounds)": "Weight"},
             inplace=True)
df.head()

Name Team Position Height Weight Age
0 Adam_Donachie BAL Catcher 74 180.0 22.99
1 Paul_Bako BAL Catcher 74 215.0 34.69
2 Ramon_Hernandez BAL Catcher 72 210.0 30.78
3 Kevin_Millar BAL First_Baseman 72 210.0 35.43
4 Chris_Gomez BAL First_Baseman 73 188.0 35.71

EDA

df.describe()
Height Weight Age
count 1033.000000 1033.000000 1033.000000
mean 73.698935 201.689255 28.737648
std 2.306330 20.991491 4.322298
min 67.000000 150.000000 20.900000
25% 72.000000 187.000000 25.440000
50% 74.000000 200.000000 27.930000
75% 75.000000 215.000000 31.240000
max 83.000000 290.000000 48.520000

Model

from sklearn import linear_model
from sklearn.model_selection import train_test_split

Create Features

var = df['Weight'].values
var.shape
(1033,)
y = df['Weight'].values  # Target
y = y.reshape(-1, 1)     # sklearn expects 2D input: (n_samples, n_features)
X = df['Height'].values  # Feature(s)
X = X.reshape(-1, 1)
#X = df[['Height', 'Age']].values
y.shape
(1033, 1)

Split data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(826, 1) (826, 1)
(207, 1) (207, 1)

Fit the model

lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)
lm.predict?

Returns Numpy Array

type(predictions)
numpy.ndarray

Plot Predictions

from matplotlib import pyplot as plt
plt.scatter(y_test, predictions)
plt.xlabel("Actual Weight")
plt.ylabel("Predicted Weight")
Text(0, 0.5, 'Predicted Weight')

png

model.score(X_test, y_test)
0.19307840892345418
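
R² alone is hard to interpret in physical terms; root mean squared error expresses the typical miss in pounds (a quick check using the variables above):

import numpy as np
from sklearn import metrics

rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse:.2f} pounds")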

Use Cross-Validation

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
scores = cross_val_score(model, X, y, cv=6)
scores
array([0.29670427, 0.22459508, 0.29543549, 0.30012566, 0.19191046,
       0.34579806])
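
Averaging the six folds gives a single cross-validated estimate:

print(f"Mean R2: {scores.mean():.3f} +/- {scores.std():.3f}")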

Plot Cross-validation Predictions

predictions = cross_val_predict(model, X, y, cv=6)
plt.scatter(y, predictions)
<matplotlib.collections.PathCollection at 0x7f6e32e91518>

png

accuracy = metrics.r2_score(y, predictions)
accuracy
0.280770222008195

Conclusion

  • Cross-validation raised R² from 0.19 on the single split to 0.28
  • Adding more data or more features could improve the model
  • Major League Baseball players may be an unusual population for predicting weight from height
  • Bigger data set here: http://socr.ucla.edu/docs/resources/SOCR_Data/SOCR_Data_Dinov_020108_HeightsWeights.html

Unsupervised Machine Learning: Clustering

Ingest

import pandas as pd

df = pd.read_csv(
    "https://raw.githubusercontent.com/noahgift/food/master/data/features.en.openfoodfacts.org.products.csv")
df.drop(["Unnamed: 0", "exceeded", "g_sum", "energy_100g"], axis=1, inplace=True) # drop columns we don't need
df = df.drop(df.index[[1,11877]]) # drop outlier rows
df.rename(index=str, columns={"reconstructed_energy": "energy_100g"}, inplace=True)
df.head()
fat_100g carbohydrates_100g sugars_100g proteins_100g salt_100g energy_100g product
0 28.57 64.29 14.29 3.57 0.00000 2267.85 Banana Chips Sweetened (Whole)
2 57.14 17.86 3.57 17.86 1.22428 2835.70 Organic Salted Nut Mix
3 18.75 57.81 15.62 14.06 0.13970 1953.04 Organic Muesli
4 36.67 36.67 3.33 16.67 1.60782 2336.91 Zen Party Mix
5 18.18 60.00 21.82 14.55 0.02286 1976.37 Cinnamon Nut Granola
df.columns
Index(['fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g',
       'salt_100g', 'energy_100g', 'product'],
      dtype='object')

Create Features to Cluster

df_cluster_features = df.drop("product", axis=1)

Scale the data

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler.fit(df_cluster_features))
print(scaler.transform(df_cluster_features))
MinMaxScaler(copy=True, feature_range=(0, 1))
[[2.85700000e-01 6.42900000e-01 1.53063241e-01 6.89388819e-02
  0.00000000e+00 5.06782123e-01]
 [5.71400000e-01 1.78600000e-01 4.71343874e-02 2.06913199e-01
  6.02500000e-04 6.33675978e-01]
 [1.87500000e-01 5.78100000e-01 1.66205534e-01 1.70223038e-01
  6.87500000e-05 4.36433520e-01]
 ...
 [0.00000000e+00 1.33300000e-01 1.43577075e-01 3.44694410e-02
  1.87500000e-05 5.06391061e-02]
 [0.00000000e+00 1.62500000e-01 1.72430830e-01 3.44694410e-02
  1.87500000e-05 6.17318436e-02]
 [0.00000000e+00 0.00000000e+00 1.18577075e-02 3.44694410e-02
  0.00000000e+00 0.00000000e+00]]

Add Cluster Labels

from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=3)
kmeans = k_means.fit(scaler.transform(df_cluster_features))
df['cluster'] = kmeans.labels_
df.head()
fat_100g carbohydrates_100g sugars_100g proteins_100g salt_100g energy_100g product cluster
0 28.57 64.29 14.29 3.57 0.00000 2267.85 Banana Chips Sweetened (Whole) 2
2 57.14 17.86 3.57 17.86 1.22428 2835.70 Organic Salted Nut Mix 2
3 18.75 57.81 15.62 14.06 0.13970 1953.04 Organic Muesli 2
4 36.67 36.67 3.33 16.67 1.60782 2336.91 Zen Party Mix 2
5 18.18 60.00 21.82 14.55 0.02286 1976.37 Cinnamon Nut Granola 2
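
To interpret the three clusters, the centers can be mapped back to the original units with the scaler (a short sketch using the scaler, kmeans, and df_cluster_features objects defined above):

import pandas as pd

centers = scaler.inverse_transform(kmeans.cluster_centers_)
pd.DataFrame(centers, columns=df_cluster_features.columns)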

9.3 Learn pandas

Time Series Workflow

Ingest Zillow

import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from sklearn.cluster import KMeans
color = sns.color_palette()
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
df = pd.read_csv("https://raw.githubusercontent.com/noahgift/real_estate_ml/master/data/Zip_Zhvi_SingleFamilyResidence_2018.csv")
df.head()

RegionID RegionName City State Metro CountyName SizeRank 1996-04 1996-05 1996-06 ... 2018-03 2018-04 2018-05 2018-06 2018-07 2018-08 2018-09 2018-10 2018-11 2018-12
0 84654 60657 Chicago IL Chicago-Naperville-Elgin Cook County 1 334200.000 335400.000 336500.000 ... 1037400 1038700 1041500 1042800 1042900 1044400 1047800 1049700 1048300 1047900
1 91982 77494 Katy TX Houston-The Woodlands-Sugar Land Harris County 2 210400.000 212200.000 212200.000 ... 330400 332700 334500 335900 337000 338300 338400 336900 336000 336500
2 84616 60614 Chicago IL Chicago-Naperville-Elgin Cook County 3 498100.000 500900.000 503100.000 ... 1317900 1321100 1325300 1323800 1321200 1320700 1319500 1318800 1319700 1323300
3 93144 79936 El Paso TX El Paso El Paso County 4 77300.000 77300.000 77300.000 ... 120800 121300 122200 123000 123600 124500 125600 126300 126800 127400
4 91940 77449 Katy TX Houston-The Woodlands-Sugar Land Harris County 5 95400.000 95600.000 95800.000 ... 175500 176400 176900 176900 177300 178000 178500 179300 180200 180700

5 rows × 280 columns

EDA

df.describe()
RegionID RegionName SizeRank 1996-04 1996-05 1996-06 1996-07 1996-08 1996-09 1996-10 ... 2018-03 2018-04 2018-05 2018-06 2018-07 2018-08 2018-09 2018-10 2018-11 2018-12
count 15508.000 15508.000 15508.000 14338.000 14338.000 14338.000 14338.000 14338.000 14338.000 14338.000 ... 15508.000 15508.000 15508.000 15508.000 15508.000 15508.000 15508.000 15508.000 15508.000 15508.000
mean 80789.618 47683.566 7754.500 115889.866 116007.379 116123.051 116235.493 116358.920 116501.681 116689.315 ... 279359.582 280672.685 282148.749 283446.447 284466.282 285500.200 286717.307 288029.320 289187.510 290106.635
std 31521.485 29008.034 4476.918 85115.825 85264.209 85413.118 85566.676 85744.243 85958.867 86230.630 ... 361868.364 361360.576 363102.089 365301.815 366277.876 367095.613 366772.521 364624.171 361143.146 359132.687
min 58196.000 1001.000 1.000 11300.000 11500.000 11600.000 11800.000 11800.000 12000.000 12100.000 ... 21700.000 21700.000 22100.000 22200.000 22000.000 21800.000 21700.000 21500.000 21600.000 21900.000
25% 67215.000 22199.000 3877.750 66700.000 66800.000 66925.000 67100.000 67200.000 67300.000 67500.000 ... 128300.000 128800.000 129675.000 130300.000 131100.000 131900.000 132900.000 134000.000 135100.000 135600.000
50% 77886.500 45792.500 7754.500 96500.000 96700.000 96750.000 96900.000 96900.000 97000.000 97150.000 ... 191100.000 192150.000 193400.000 194600.000 195700.000 196900.000 198100.000 199600.000 201100.000 202150.000
75% 90314.250 74010.250 11631.250 140500.000 140600.000 140600.000 140800.000 141000.000 141100.000 141300.000 ... 310750.000 312300.000 314325.000 316100.000 317425.000 318325.000 319800.000 321200.000 322425.000 323900.000
max 753844.000 99901.000 15508.000 3676700.000 3704200.000 3729600.000 3754600.000 3781800.000 3813500.000 3849600.000 ... 17724700.000 17408900.000 17450500.000 17722800.000 18006700.000 18273800.000 18331900.000 18131900.000 17594900.000 17119600.000

8 rows × 276 columns

Clean Up DataFrame

Rename RegionName to ZipCode and Change Zip Code to String

df.rename(columns={"RegionName":"ZipCode"}, inplace=True)
df["ZipCode"]=df["ZipCode"].map(lambda x: "{:.0f}".format(x))
df["RegionID"]=df["RegionID"].map(lambda x: "{:.0f}".format(x))
df.head()
RegionID ZipCode City State Metro CountyName SizeRank 1996-04 1996-05 1996-06 ... 2018-03 2018-04 2018-05 2018-06 2018-07 2018-08 2018-09 2018-10 2018-11 2018-12
0 84654 60657 Chicago IL Chicago-Naperville-Elgin Cook County 1 334200.000 335400.000 336500.000 ... 1037400 1038700 1041500 1042800 1042900 1044400 1047800 1049700 1048300 1047900
1 91982 77494 Katy TX Houston-The Woodlands-Sugar Land Harris County 2 210400.000 212200.000 212200.000 ... 330400 332700 334500 335900 337000 338300 338400 336900 336000 336500
2 84616 60614 Chicago IL Chicago-Naperville-Elgin Cook County 3 498100.000 500900.000 503100.000 ... 1317900 1321100 1325300 1323800 1321200 1320700 1319500 1318800 1319700 1323300
3 93144 79936 El Paso TX El Paso El Paso County 4 77300.000 77300.000 77300.000 ... 120800 121300 122200 123000 123600 124500 125600 126300 126800 127400
4 91940 77449 Katy TX Houston-The Woodlands-Sugar Land Harris County 5 95400.000 95600.000 95800.000 ... 175500 176400 176900 176900 177300 178000 178500 179300 180200 180700

5 rows × 280 columns

median_prices = df.median()

Median USA Prices, December 2018

median_prices.tail()
2018-08   196900.000
2018-09   198100.000
2018-10   199600.000
2018-11   201100.000
2018-12   202150.000
dtype: float64
sf_df = df[df["City"] == "San Francisco"].median()
df_comparison = pd.concat([sf_df,median_prices], axis=1)
df_comparison.columns = ["San Francisco","Median USA"]
df_comparison.tail()

San Francisco Median USA
2018-08 1828600.000 196900.000
2018-09 1823200.000 198100.000
2018-10 1823700.000 199600.000
2018-11 1813400.000 201100.000
2018-12 1806000.000 202150.000

Transpose

df_transposed = df.transpose()
df_transposed.head(15)

0 1 2 3 4 5 6 7 8 9 ... 15498 15499 15500 15501 15502 15503 15504 15505 15506 15507
RegionID 84654 91982 84616 93144 91940 91733 61807 84640 62037 97564 ... 94711 62556 99032 62697 99074 58333 59107 75672 93733 95851
ZipCode 60657 77494 60614 79936 77449 77084 10467 60640 11226 94109 ... 84781 12429 97028 12720 97102 1338 3293 40404 81225 89155
City Chicago Katy Chicago El Paso Katy Houston New York Chicago New York San Francisco ... Pine Valley Esopus Rhododendron Bethel Arch Cape Buckland Woodstock Berea Mount Crested Butte Mesquite
State IL TX IL TX TX TX NY IL NY CA ... UT NY OR NY OR MA NH KY CO NV
Metro Chicago-Naperville-Elgin Houston-The Woodlands-Sugar Land Chicago-Naperville-Elgin El Paso Houston-The Woodlands-Sugar Land Houston-The Woodlands-Sugar Land New York-Newark-Jersey City Chicago-Naperville-Elgin New York-Newark-Jersey City San Francisco-Oakland-Hayward ... St. George Kingston Portland-Vancouver-Hillsboro NaN Astoria Greenfield Town Claremont-Lebanon Richmond-Berea NaN Las Vegas-Henderson-Paradise
CountyName Cook County Harris County Cook County El Paso County Harris County Harris County Bronx County Cook County Kings County San Francisco County ... Washington County Ulster County Clackamas County Sullivan County Clatsop County Franklin County Grafton County Madison County Gunnison County Clark County
SizeRank 1 2 3 4 5 6 7 8 9 10 ... 15499 15500 15501 15502 15503 15504 15505 15506 15507 15508
1996-04 334200.000 210400.000 498100.000 77300.000 95400.000 95000.000 152900.000 216500.000 162000.000 766000.000 ... 135900.000 78300.000 136200.000 62500.000 182600.000 94600.000 92700.000 57100.000 191100.000 176400.000
1996-05 335400.000 212200.000 500900.000 77300.000 95600.000 95200.000 152700.000 216700.000 162300.000 771100.000 ... 136300.000 78300.000 136600.000 62600.000 183700.000 94300.000 92500.000 57300.000 192400.000 176300.000
1996-06 336500.000 212200.000 503100.000 77300.000 95800.000 95400.000 152600.000 216900.000 162600.000 776500.000 ... 136600.000 78200.000 136800.000 62700.000 184800.000 94000.000 92400.000 57500.000 193700.000 176100.000
1996-07 337600.000 210700.000 504600.000 77300.000 96100.000 95700.000 152400.000 217000.000 163000.000 781900.000 ... 136900.000 78200.000 136800.000 62700.000 185800.000 93700.000 92200.000 57700.000 195000.000 176000.000
1996-08 338500.000 208300.000 505500.000 77400.000 96400.000 95900.000 152300.000 217100.000 163400.000 787300.000 ... 137100.000 78100.000 136700.000 62700.000 186700.000 93400.000 92100.000 58000.000 196300.000 175900.000
1996-09 339500.000 205500.000 505700.000 77500.000 96700.000 96100.000 152000.000 217200.000 164000.000 793000.000 ... 137400.000 78000.000 136600.000 62600.000 187700.000 93200.000 91900.000 58200.000 197700.000 175800.000
1996-10 340400.000 202500.000 505300.000 77600.000 96800.000 96200.000 151800.000 217500.000 164700.000 799100.000 ... 137700.000 78000.000 136400.000 62500.000 188700.000 93000.000 91700.000 58400.000 199100.000 175800.000
1996-11 341300.000 199800.000 504200.000 77700.000 96800.000 96100.000 151600.000 217900.000 165700.000 805800.000 ... 137900.000 78000.000 136000.000 62400.000 189800.000 92900.000 91300.000 58700.000 200700.000 176000.000

15 rows × 15508 columns

Create Cities DataFrame

cities = df_transposed.iloc[2].values
cities_df = df_transposed.drop(df_transposed.index[:7])
cities_df.columns = cities
cities_df.head()
Chicago Katy Chicago El Paso Katy Houston New York Chicago New York San Francisco ... Pine Valley Esopus Rhododendron Bethel Arch Cape Buckland Woodstock Berea Mount Crested Butte Mesquite
1996-04 334200.000 210400.000 498100.000 77300.000 95400.000 95000.000 152900.000 216500.000 162000.000 766000.000 ... 135900.000 78300.000 136200.000 62500.000 182600.000 94600.000 92700.000 57100.000 191100.000 176400.000
1996-05 335400.000 212200.000 500900.000 77300.000 95600.000 95200.000 152700.000 216700.000 162300.000 771100.000 ... 136300.000 78300.000 136600.000 62600.000 183700.000 94300.000 92500.000 57300.000 192400.000 176300.000
1996-06 336500.000 212200.000 503100.000 77300.000 95800.000 95400.000 152600.000 216900.000 162600.000 776500.000 ... 136600.000 78200.000 136800.000 62700.000 184800.000 94000.000 92400.000 57500.000 193700.000 176100.000
1996-07 337600.000 210700.000 504600.000 77300.000 96100.000 95700.000 152400.000 217000.000 163000.000 781900.000 ... 136900.000 78200.000 136800.000 62700.000 185800.000 93700.000 92200.000 57700.000 195000.000 176000.000
1996-08 338500.000 208300.000 505500.000 77400.000 96400.000 95900.000 152300.000 217100.000 163400.000 787300.000 ... 137100.000 78100.000 136700.000 62700.000 186700.000 93400.000 92100.000 58000.000 196300.000 175900.000

5 rows × 15508 columns

Create time series

from pandas.plotting import autocorrelation_plot

sf_values = cities_df.iloc[:, 9].values
index = pd.DatetimeIndex(cities_df.index.values)
sf_data = pd.Series(sf_values, index=index)

Autocorrelation Plot

Reference: https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-autocorrelation

autocorrelation_plot(sf_data)
<matplotlib.axes._subplots.AxesSubplot at 0x7f6e2fc637b8>

png

sf_data.tail()
2018-08-01    3993000
2018-09-01    3999000
2018-10-01    4014600
2018-11-01    4009500
2018-12-01    4016600
dtype: object
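
Note the Series carries dtype object because the transposed DataFrame mixed strings with numbers; casting to float keeps plotting and arithmetic honest:

sf_data = sf_data.astype(float)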

Simple Plot

sf_data.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f6e32f078d0>

png

DataFrame Workflow

Ingest

import pandas as pd
df = pd.read_csv(
    "https://raw.githubusercontent.com/noahgift/food/master/data/features.en.openfoodfacts.org.products.csv")
df.drop(["Unnamed: 0", "exceeded", "g_sum", "energy_100g"], axis=1, inplace=True) # drop columns we don't need
df = df.drop(df.index[[1,11877]]) # drop outlier rows
df.rename(index=str, columns={"reconstructed_energy": "energy_100g"}, inplace=True)
df.head()
fat_100g carbohydrates_100g sugars_100g proteins_100g salt_100g energy_100g product
0 28.570 64.290 14.290 3.570 0.000 2267.850 Banana Chips Sweetened (Whole)
2 57.140 17.860 3.570 17.860 1.224 2835.700 Organic Salted Nut Mix
3 18.750 57.810 15.620 14.060 0.140 1953.040 Organic Muesli
4 36.670 36.670 3.330 16.670 1.608 2336.910 Zen Party Mix
5 18.180 60.000 21.820 14.550 0.023 1976.370 Cinnamon Nut Granola

EDA

df.columns
Index(['fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g',
       'salt_100g', 'energy_100g', 'product'],
      dtype='object')

Rows and Attributes

df.shape
(45026, 7)

First Five Columns

df.head()
fat_100g carbohydrates_100g sugars_100g proteins_100g salt_100g energy_100g product
0 28.570 64.290 14.290 3.570 0.000 2267.850 Banana Chips Sweetened (Whole)
2 57.140 17.860 3.570 17.860 1.224 2835.700 Organic Salted Nut Mix
3 18.750 57.810 15.620 14.060 0.140 1953.040 Organic Muesli
4 36.670 36.670 3.330 16.670 1.608 2336.910 Zen Party Mix
5 18.180 60.000 21.820 14.550 0.023 1976.370 Cinnamon Nut Granola

Descriptive Statistics

df.describe()
fat_100g carbohydrates_100g sugars_100g proteins_100g salt_100g energy_100g
count 45026.000 45026.000 45026.000 45026.000 45026.000 45026.000
mean 10.766 34.054 16.005 6.617 1.470 1111.286
std 14.930 29.557 21.496 7.926 12.795 791.609
min 0.000 0.000 -1.200 -3.570 0.000 0.000
25% 0.000 7.440 1.570 0.000 0.064 334.520
50% 3.170 22.390 5.880 4.000 0.635 1121.540
75% 17.860 61.540 23.080 9.520 1.440 1678.460
max 100.000 100.000 100.000 100.000 2032.000 4475.000

Correlations

df.corr()
fat_100g carbohydrates_100g sugars_100g proteins_100g salt_100g energy_100g
fat_100g 1.000 -0.051 -0.068 0.383 -0.003 0.768
carbohydrates_100g -0.051 1.000 0.669 -0.097 -0.011 0.580
sugars_100g -0.068 0.669 1.000 -0.277 -0.030 0.327
proteins_100g 0.383 -0.097 -0.277 1.000 0.016 0.391
salt_100g -0.003 -0.011 -0.030 0.016 1.000 -0.007
energy_100g 0.768 0.580 0.327 0.391 -0.007 1.000

Filtering by Quantiles

Find fatty foods in the 98th percentile:

high_fat_df = df[df.fat_100g > df.fat_100g.quantile(.98)]
high_fat_text = high_fat_df['product'].values
len(high_fat_text)
878
high_fat_text[0]
'Organic Salted Nut Mix'
Find protein foods in the 98th percentile:

high_protein_df = df[df.proteins_100g > df.proteins_100g.quantile(.98)]
high_protein_text = high_protein_df['product'].values
len(high_protein_text)
896
high_protein_text[0]
'Organic Yellow Split Peas'

9.4 Learn tensorflow

TensorFlow Hello World

import tensorflow as tf
input1 = tf.ones((2, 3))
input2 = tf.reshape(tf.range(1, 7, dtype=tf.float32), (2, 3))

print("Two Tensor Flow Matrices with shape:")
print(input1.shape)
print(input2.shape)
Two Tensor Flow Matrices with shape:
(2, 3)
(2, 3)

output = input1 + input2
with tf.Session():
  result = output.eval()

print("Result of addition of two Matrics")
result
Result of addition of two Matrics

array([[2., 3., 4.],
       [5., 6., 7.]], dtype=float32)
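
The session-based code above is TensorFlow 1.x style. In TensorFlow 2.x eager execution is the default, so the same addition needs no Session (a hedged equivalent, not part of the original lesson):

import tensorflow as tf

input1 = tf.ones((2, 3))
input2 = tf.reshape(tf.range(1, 7, dtype=tf.float32), (2, 3))
result = (input1 + input2).numpy()  # eager execution: no Session required
print(result)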

9.5 Use seaborn for 2D plots

Faceted Distribution Plots

Generate distributions based on energy type

import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import matplotlib.cbook
warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
sns.set(style="white", palette="muted", color_codes=True)


# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True)
sns.despine(left=True)

# Plot each distribution on the 4 points
sns.distplot(df.proteins_100g, color="b", ax=axes[0, 0])
sns.distplot(df.sugars_100g, color="g", ax=axes[0, 1])
sns.distplot(df.fat_100g, color="r", ax=axes[1, 1])
sns.distplot(df.carbohydrates_100g, color="m", ax=axes[1, 0])
<matplotlib.axes._subplots.AxesSubplot at 0x7f59f80c4550>

png

Pairplot

import seaborn as sns
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x7fd6031d4630>

png

lmplot

import seaborn as sns
sns.lmplot(x="fat_100g", y="proteins_100g",  data=df.sample(100))
<seaborn.axisgrid.FacetGrid at 0x7f6e135ab048>

png

heatmap

sns.heatmap(df.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x7f6e13550588>

png

9.6 Use Plotly for interactive plots

2D Plots

def enable_plotly_in_cell():
  import IPython
  from plotly.offline import init_notebook_mode
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
  '''))
  init_notebook_mode(connected=False)
enable_plotly_in_cell()

import cufflinks as cf
cf.go_offline()
df.sample(1000).iplot(kind='bubble', 
                      size='energy_100g', 
                      mode='markers', 
                      x='fat_100g', 
                      y='proteins_100g',
                      xTitle='Fat', 
                      yTitle='Protein',
                      text="product")

3D ClusterPlot

Protein-Fat-Carb 3D Plot

import plotly.offline as py
import plotly.graph_objs as go

from plotly.offline import init_notebook_mode
enable_plotly_in_cell()


trace1 = go.Scatter3d(
    x=df["fat_100g"],
    y=df["carbohydrates_100g"],
    z=df["proteins_100g"],
    mode='markers',
    text=df["product"],
    marker=dict(
        size=12,
        color=df["cluster"],                # set color to an array/list of desired values
        colorscale='Viridis',   # choose a colorscale
        opacity=0.8
    )
)
#print(trace1)
data = [trace1]
layout = go.Layout(
    showlegend=False,
    title="Protein-Fat-Carb:  Food Energy Types",
    scene = dict(
        xaxis = dict(title='X: Fat Content-100g'),
        yaxis = dict(title="Y:  Carbohydrate Content-100g"),
        zaxis = dict(title="Z:  Protein Content-100g"),
    ),
    width=1000,
    height=900,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='3d-scatter-colorscale')

Sugar-Salt-Carb-3D Plot

import plotly.offline as py
import plotly.graph_objs as go

from plotly.offline import init_notebook_mode
enable_plotly_in_cell()


trace1 = go.Scatter3d(
    x=df["sugars_100g"],
    y=df["carbohydrates_100g"],
    z=df["salt_100g"],
    mode='markers',
    text=df["product"],
    marker=dict(
        size=12,
        color=df["cluster"],                # set color to an array/list of desired values
        colorscale='Viridis',   # choose a colorscale
        opacity=0.8
    )
)
#print(trace1)
data = [trace1]
layout = go.Layout(
    showlegend=False,
    title="Sugar, Carb, Salt:  Food Energy Types",
    scene = dict(
        xaxis = dict(title='X: Sugar Content-100g'),
        yaxis = dict(title="Y: Carbohydrate Content-100g"),
        zaxis = dict(title="Z: Salt Content-100g"),
    ),
    width=1000,
    height=900,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='3d-scatter-colorscale')

9.7 Specialized Visualization Libraries

Yellowbrick

Visualize Lasso Regression Model Accuracy with Yellowbrick

Note: this uses Lasso regression rather than plain linear regression.

from yellowbrick.regressor import PredictionError
from sklearn.linear_model import Lasso

lasso = Lasso()
visualizer = PredictionError(lasso)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
g = visualizer.poof()            # Draw/show/poof the data

png

Visualize cross-validated scores for a linear regression model

See this: http://www.scikit-yb.org/en/latest/api/model_selection/cross_validation.html

from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from yellowbrick.model_selection import CVScores

# Create a new figure and axes
_, ax = plt.subplots()
cv = KFold(12)

oz = CVScores(
    linear_model.LinearRegression(), ax=ax, cv=cv, scoring='r2'
)

oz.fit(X, y)
oz.poof()

png

Word Cloud

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

High protein foods

Find protein foods in the 98th percentile

high_protein_df = df[df.proteins_100g > df.proteins_100g.quantile(.98)]
high_protein_text = high_protein_df['product'].values
len(high_protein_text)
896

Word Cloud High Protein

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(high_protein_text))

fig = plt.figure(
    figsize = (10, 7),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

png

High fat foods

Find fatty foods in the 98th percentile

high_fat_df = df[df.fat_100g > df.fat_100g.quantile(.98)]
high_fat_text = high_fat_df['product'].values
len(high_fat_text)
878

Word Cloud High Fat

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(high_fat_text))

fig = plt.figure(
    figsize = (10, 7),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

png

High sugar foods

Find sugary foods in the 98th percentile

high_sugar_df = df[df.sugars_100g > df.sugars_100g.quantile(.98)]
high_sugar_text = high_sugar_df['product'].values
len(high_sugar_text)
893

Word Cloud High Sugar

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(high_sugar_text))

fig = plt.figure(
    figsize = (10, 7),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

png

9.8 Learn Natural Language Processing Libraries

NLTK Stopword Processing

Setup Stop Words

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

Preprocess Text

dataset = df['product'].fillna("").values
raw_text_data = [d.split() for d in dataset]

Remove stop words

# Keep only tokens that are not English stop words
text_data = [[word for word in doc if word.lower() not in stop] for doc in raw_text_data]

Gensim Topic Modeling

from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)
(0, '0.039*"Chocolate" + 0.033*"Cream" + 0.030*"Milk" + 0.027*"Juice"')
(1, '0.030*"Fruit" + 0.023*"&" + 0.022*"Light" + 0.021*"Premium"')
(2, '0.087*"Cheese" + 0.028*"&" + 0.026*"Candy" + 0.023*"Cheddar"')
(3, '0.034*"In" + 0.032*"Sauce" + 0.029*"Sweet" + 0.025*"&"')
(4, '0.045*"Beans" + 0.037*"Mix" + 0.022*"Green" + 0.022*"&"')
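
Once trained, the model can score an unseen product name against the topics (the example string below is illustrative, not from the dataset):

bow = dictionary.doc2bow("Organic Dark Chocolate".split())
print(ldamodel.get_document_topics(bow))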