# Support vector machines

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing, svm, model_selection as ms

from sklearn.pipeline import Pipeline

In [2]:
WHITES_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

Read in the Wine Quality dataset.

In [3]:
whites = pd.read_csv(WHITES_URL, sep=';')

Define a new variable 'good_quality' for whites with quality >= 7.

In [4]:
whites['good_quality'] = whites.quality >= 7

Prepare the data.

In [5]:
X = whites.drop(['quality', 'good_quality'], axis=1).get_values()
y = whites.good_quality.astype('int').get_values()

Create a pipeline that scales the data and trains a support vector classifier.

In [6]:
ssvc = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('svc', svm.SVC())
])

Train a support vector classifier with linear (= no) kernel.

In [7]:
ssvc.set_params(
    svc__kernel='linear'
)
ssvc.fit(X, y)

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

Coefficients defining the separating hyperplane (**not** regression coefficients).

In [8]:
ssvc.named_steps['svc'].coef_

array([[  1.11539825e-04,  -3.77434399e-05,  -1.19437625e-05,
          3.98253379e-04,  -3.96662015e-05,   3.45835505e-05,
         -3.62934401e-07,  -5.51454369e-04,   1.28281300e-04,
          9.24959290e-05,  -9.06366853e-06]])

Support vectors:

In [9]:
ssvc.named_steps['svc'].n_support_

array([1191, 1060], dtype=int32)

In [10]:
ssvc.named_steps['svc'].support_

array([  10,   12,   14, ..., 4886, 4887, 4896], dtype=int32)

In [11]:
ssvc.named_steps['svc'].support_vectors_

array([[ 1.4757511 , -0.0817699 ,  0.6264779 , ..., -1.31315295,
         0.61476253,  1.20742712],
       [ 1.23872307, -0.9747665 ,  0.29591974, ..., -0.05475133,
         1.22818321,  0.23220977],
       [ 1.71277913,  1.40655776,  2.36190825, ..., -1.37938461,
         1.57870931, -0.66173945],
       ..., 
       [-0.77601514, -0.67710097, -0.44783612, ...,  0.14394366,
         5.17160188,  1.32932928],
       [-0.77601514,  1.30733592, -0.94367336, ..., -0.98199462,
         2.63028763,  2.02010823],
       [-1.60561323,  0.11667379, -0.28255704, ...,  1.0049553 ,
        -0.96260494,  1.85757201]])

Define stratified folds for cross-validation.

In [12]:
ten_fold_cv = ms.StratifiedKFold(n_splits=10, shuffle=True)

Compute average AUC across folds.

In [13]:
aucs = ms.cross_val_score(ssvc, X, y, scoring='roc_auc', cv=ten_fold_cv)
np.mean(aucs)

0.76356761981296295

Train using the Radial Basis Function (RBF) kernel.

In [14]:
ssvc.set_params(
    svc__kernel='rbf'
)
ssvc.fit(X, y)

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

Compute average AUC across folds.

In [15]:
aucs = ms.cross_val_score(ssvc, X, y, scoring='roc_auc', cv=ten_fold_cv)
np.mean(aucs)

0.84152212829654993

Determine 'optimal' kernel and value of `C` by cross-validation using AUC scoring.

In [16]:
gs = ms.GridSearchCV(
    estimator=ssvc,
    param_grid={
        'svc__C': [1e-15, 0.0001, 0.001, 0.01, 0.1, 1, 10],
        'svc__kernel': ['linear', 'rbf']
    },
    scoring='roc_auc',
    cv=ten_fold_cv
)
gs.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svc__C': [1e-15, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'svc__kernel': ['linear', 'rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [17]:
gs.best_score_

0.85757257756367744

In [18]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])