# Decision trees and random forests

In [1]:
import numpy as np
import pandas as pd

from sklearn import model_selection as ms, tree, ensemble

%matplotlib inline

In [2]:
WHITES_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

Read in the Wine Quality dataset.

In [3]:
whites = pd.read_csv(WHITES_URL, sep=';')

Train a decision tree for 'quality' limiting the depth to 3, and the minimum number of samples per leaf to 50.

In [4]:
X = whites.drop('quality', axis=1)
y = whites.quality
tree1 = tree.DecisionTreeRegressor(max_depth=2, min_samples_leaf=50)
tree1.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=50,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

Export the tree for plotting.

In [5]:
tree.export_graphviz(tree1, 'tree1.dot', feature_names=X.columns)

Define folds for cross-validation.

In [6]:
ten_fold_cv = ms.KFold(n_splits=10, shuffle=True)

Compute average MSE across folds.

In [7]:
mses = ms.cross_val_score(tree.DecisionTreeRegressor(max_depth=2, min_samples_leaf=50),
                          X, y, scoring='neg_mean_squared_error', cv=ten_fold_cv)
np.mean(-mses)

0.60703913964421019

Train a random forest with 20 decision trees.

In [8]:
rf1 = ensemble.RandomForestRegressor(n_estimators=20)
rf1.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

Investigate importances of predictors.

In [9]:
rf1.feature_importances_

array([ 0.0579232 ,  0.12103474,  0.05663566,  0.07376525,  0.06398742,
        0.11916611,  0.06976268,  0.06224071,  0.06852126,  0.06194336,
        0.24501961])

Evaluate performance through cross-validation.

In [10]:
mses = ms.cross_val_score(ensemble.RandomForestRegressor(n_estimators=20),
                          X, y, scoring='neg_mean_squared_error', cv=ten_fold_cv)
np.mean(-mses)

0.37689210696548558

What happens when you increase the number of trees to 50?

In [11]:
mses = ms.cross_val_score(ensemble.RandomForestRegressor(n_estimators=50),
                          X, y, scoring='neg_mean_squared_error', cv=ten_fold_cv)
np.mean(-mses)

0.3697803963106715