# Linear regression using `scikit-learn`

In [1]:
import numpy as np
import pandas as pd

from sklearn import linear_model as lm, model_selection as ms

In [2]:
# Location of the white-wine CSV of the Wine Quality dataset (UCI archive).
WHITES_URL = (
    'http://archive.ics.uci.edu/ml/machine-learning-databases/'
    'wine-quality/winequality-white.csv'
)

Read in the white-wine subset of the Wine Quality dataset (a semicolon-separated CSV file).

In [3]:
# The file is ';'-delimited rather than ','-delimited, so the separator is passed explicitly.
whites = pd.read_csv(WHITES_URL, sep=';')

Fit a linear regression model for 'quality' using 'density' and 'sulphates' as predictors.

In [4]:
# Predictor frame (two columns) and response series, both taken from the wine table.
X = whites.loc[:, ['density', 'sulphates']]
y = whites.loc[:, 'quality']

# Least-squares linear fit of quality on the two predictors.
model1 = lm.LinearRegression().fit(X=X, y=y)

Print variable names and regression coefficients.

In [5]:
# Pair each predictor name with its fitted coefficient.
# Relies on X still being the frame model1 was fitted on; X is rebound in later cells.
list(zip(X.columns, model1.coef_))

[('density', -92.640520464253058), ('sulphates', 0.59740810883095863)]

Define 5 folds for cross-validation.

In [6]:
# Shuffled 5-fold splitter shared by every cross-validation call in this notebook.
# The seed is fixed on purpose: with shuffle=True and no random_state, each call that
# uses this splitter draws a different partition, so the models would be scored on
# different folds and the reported MSEs would change from run to run.
five_fold_cv = ms.KFold(n_splits=5, shuffle=True, random_state=42)

Compute the average MSE across folds.

In [7]:
# The 'neg_mean_squared_error' scorer reports MSE with its sign flipped,
# so the per-fold scores are negated before averaging.
mses = ms.cross_val_score(
    estimator=lm.LinearRegression(),
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=five_fold_cv,
)
np.mean(-mses)

0.70687671369786353

Add 'alcohol' as predictor and compute the average MSE across folds.

In [8]:
# Rebind X to a three-predictor frame ('alcohol' added); the response y is reused as-is.
X = whites[['density', 'sulphates', 'alcohol']]

In [9]:
# Cross-validated MSE for the current predictor frame X.
# The scorer returns negated MSE per fold, hence the sign flip before averaging.
mses = ms.cross_val_score(
    estimator=lm.LinearRegression(),
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=five_fold_cv,
)
np.mean(-mses)

0.63292327434294804

Add 'free sulfur dioxide' as predictor and compute the average MSE across folds.

In [10]:
# Rebind X to a four-predictor frame ('free sulfur dioxide' added); y is reused as-is.
X = whites[['density', 'sulphates', 'alcohol', 'free sulfur dioxide']]

In [11]:
# Cross-validated MSE for the current predictor frame X.
# The scorer returns negated MSE per fold, hence the sign flip before averaging.
mses = ms.cross_val_score(
    estimator=lm.LinearRegression(),
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=five_fold_cv,
)
np.mean(-mses)

0.62366264514099579

Use all predictors and compute the average MSE across folds.

In [12]:
# Rebind X to every column except the response 'quality', i.e. all available predictors.
X = whites.drop('quality', axis=1)

In [13]:
# Cross-validated MSE for the current predictor frame X.
# The scorer returns negated MSE per fold, hence the sign flip before averaging.
mses = ms.cross_val_score(
    estimator=lm.LinearRegression(),
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=five_fold_cv,
)
np.mean(-mses)

0.5696689499697728

Using `GridSearchCV`, tune a linear regression model with LASSO regularisation (include all predictors).

In [14]:
# Grid search over the Lasso penalty strength: 21 log-spaced candidates,
# one per decade from 1e-10 to 1e10, each scored by cross-validated
# (negated) MSE on the shared splitter.
# NOTE(review): the predictors reach the L1 penalty on their raw scales; no
# standardisation step is visible here -- confirm that this is intended.
gs = ms.GridSearchCV(
    lm.Lasso(),
    {'alpha': np.logspace(start=-10, stop=10, num=21)},
    scoring='neg_mean_squared_error',
    cv=five_fold_cv,
)
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-10,   1.00000e-09,   1.00000e-08,   1.00000e-07,
         1.00000e-06,   1.00000e-05,   1.00000e-04,   1.00000e-03,
         1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02,   1.00000e+03,   1.00000e+04,   1.00000e+05,
         1.00000e+06,   1.00000e+07,   1.00000e+08,   1.00000e+09,
         1.00000e+10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

Print the 'best' MSE.

In [15]:
# The search was scored with 'neg_mean_squared_error', so the best score is negated
# to report it as a conventional (positive) MSE.
-gs.best_score_

0.56748029089705854

Print the 'best' value of $\alpha$; what can you conclude?

In [16]:
# Penalty strength selected by the grid search.
gs.best_params_['alpha']  # If small, the L1 penalty barely shrinks the coefficients: regularisation has little effect

1.0000000000000001e-05

Print variable names and regression coefficients for the 'best' model.

In [17]:
# Pair each predictor name with the coefficient of the best Lasso model found by the search.
# X must still be the all-predictor frame that gs was fitted on for the names to line up.
list(zip(X.columns, gs.best_estimator_.coef_))

[('fixed acidity', 0.041217421557521658),
 ('volatile acidity', -1.8829036599756834),
 ('citric acid', 0.010251673228966944),
 ('residual sugar', 0.069763341208336285),
 ('chlorides', -0.37079513343515752),
 ('free sulfur dioxide', 0.0039510396449355068),
 ('total sulfur dioxide', -0.00040902513609630332),
 ('density', -118.72007809579399),
 ('pH', 0.57666769975317833),
 ('sulphates', 0.58626699442362074),
 ('alcohol', 0.22917112757094324)]