PSL: Regression Tree

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import plot_tree, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold
In [28]:
url = "https://liangfgithub.github.io/Data/HousingData.csv"
Housing = pd.read_csv(url)
Housing.head()
Out[28]:
Y chas lon lat crim zn indus nox rm age dis rad tax ptratio lstat
0 3.178054 0 -70.955 42.2550 -5.064036 1.8 0.837248 -0.619897 1.883275 3.432567 1.408545 0.000000 5.690359 0.454865 2.231591
1 3.072693 0 -70.950 42.2875 -3.600502 0.0 1.955860 -0.757153 1.859574 5.529585 1.602836 0.693147 5.488938 1.236450 3.023243
2 3.546740 0 -70.936 42.2830 -3.601235 0.0 1.955860 -0.757153 1.971996 2.918119 1.602836 0.693147 5.488938 1.236450 2.007486
3 3.508556 0 -70.928 42.2930 -3.430523 0.0 0.779325 -0.780886 1.945624 1.419592 1.802073 1.098612 5.402677 1.772241 1.714643
4 3.589059 0 -70.922 42.2980 -2.672924 0.0 0.779325 -0.780886 1.966693 2.162710 1.802073 1.098612 5.402677 1.772241 2.308679

Fit a Regression Tree

In [67]:
X = Housing.drop("Y", axis=1)
y = Housing["Y"]
tr1 = DecisionTreeRegressor(max_leaf_nodes=10)
tr1.fit(X, y)
Out[67]:
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=10,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')
In [68]:
fig, ax = plt.subplots(figsize=(12, 6))
plot_tree(tr1, feature_names=X.columns, filled=True,
          impurity=False, ax=ax, fontsize=10)
fig.tight_layout()
  • Understanding the Fitted Tree:

    Starting at the root node, which covers all 506 observations, the average response is 3.035. The first split is on the variable lstat: the right node contains 178 samples with an average of 2.657, while the left node contains the remaining 328 samples with an average of 3.239. Leaf nodes appear at the bottom, and their color shading indicates the magnitude of their predictions. (The sketch below reads the root-node values directly from the fitted tree object.)
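
The root-node numbers can be read directly from the fitted estimator. A minimal sketch, assuming tr1 and X from the cells above; the arrays under tr1.tree_ are scikit-learn's low-level tree structure.

In [ ]:
# Inspect the root node of the fitted tree (node 0 is always the root)
tree = tr1.tree_
print("samples at root   :", tree.n_node_samples[0])      # 506
print("prediction at root:", tree.value[0][0][0])         # ~3.035, the mean of y
print("first split       : %s <= %.3f"
      % (X.columns[tree.feature[0]], tree.threshold[0]))  # splits on lstat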

Complexity Pruning

Prune the decision tree with cost-complexity pruning, which selects the subtree $T$ that minimizes

$$ \frac{1}{n}RSS(T) + \alpha |T| $$

where $RSS(T)$ is the residual sum of squares of tree $T$, $|T|$ is its number of leaf nodes, and $\alpha \ge 0$ is the complexity parameter.

ref: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html
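
To make the criterion concrete, here is a minimal sketch, assuming tr1, X, and y from the cells above, that evaluates it for the fitted 10-leaf tree; the value alpha = 0.01 is purely illustrative and not taken from this notebook.

In [ ]:
# Evaluate (1/n) * RSS(T) + alpha * |T| for the fitted tree
alpha = 0.01                                  # illustrative value only
n = len(y)
rss = np.sum((y - tr1.predict(X)) ** 2)       # RSS(T) on the training data
cost = rss / n + alpha * tr1.get_n_leaves()   # |T| = number of leaf nodes
print(cost)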

In [69]:
path = tr1.cost_complexity_pruning_path(X, y)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
print(np.column_stack((ccp_alphas, impurities)))
[[0.         0.0290964 ]
 [0.00198492 0.03108132]
 [0.00236513 0.03344645]
 [0.00343786 0.03688431]
 [0.00354684 0.04043114]
 [0.00372911 0.04416025]
 [0.0057464  0.04990666]
 [0.01291553 0.06282219]
 [0.02622148 0.08904367]
 [0.07731515 0.16635882]]

Understand the relationship between ccp_alphas and impurities: here impurities is the total impurity of the leaves at each step of the pruning path, i.e., the RSS divided by n (the averaged RSS).

The difference between adjacent impurity values equals the corresponding alpha. Each difference measures the improvement in averaged RSS gained by one additional split, which sets the effective “cost” of that split: once alpha exceeds this cost, the split is pruned away.

In [70]:
np.diff(impurities)
Out[70]:
array([0.00198492, 0.00236513, 0.00343786, 0.00354684, 0.00372911,
       0.0057464 , 0.01291553, 0.02622148, 0.07731515])
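
A quick check of this claim using the arrays computed above; for this tree each pruning step removes a single split, so each increment equals the corresponding alpha.

In [ ]:
# Each impurity increment should match the next effective alpha
print(np.allclose(np.diff(impurities), ccp_alphas[1:]))   # expect True for this tree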
In [71]:
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
Out[71]:
Text(0.5, 1.0, 'Total Impurity vs effective alpha for training set')

Cross-validation

In [72]:
# find the beta sequence: the geometric mean of each pair of adjacent alphas,
# with a large value (100) appended to cover the region beyond the largest alpha
ccp_betas = np.sqrt(ccp_alphas[1:] * ccp_alphas[:-1])
ccp_betas = np.append(ccp_betas, 100)
n_beta = len(ccp_betas)
ccp_betas
Out[72]:
array([0.00000000e+00, 2.16669876e-03, 2.85148685e-03, 3.49192050e-03,
       3.63683151e-03, 4.62914467e-03, 8.61497909e-03, 1.84028372e-02,
       4.50257479e-02, 1.00000000e+02])
In [73]:
# use cross-validation to find the best alpha
folds = 10
cv = KFold(n_splits=folds, shuffle=True)
scores_mean = []
scores_se = []
for beta in ccp_betas:
    clf = DecisionTreeRegressor(ccp_alpha=beta)
    score = cross_val_score(clf, X, y, cv=cv)
    scores_mean.append(score.mean())
    scores_se.append(score.std())

# cross_val_score returns R^2 for regressors, so 1 - R^2 is the relative error;
# flip so the entries run from the largest beta (smallest tree) to the smallest
scores = 1.0 - np.flip(np.array(scores_mean))
err = np.flip(scores_se) / np.sqrt(folds)          # standard error of the CV score
score_1se = err[scores.argmin()] + scores.min()    # 1-SE threshold above the minimum
In [79]:
# plot the CV relative error 
fig, ax = plt.subplots()
ax.errorbar(range(n_beta), y=scores, yerr=err, marker='o', mfc='none')
ax.axhline(score_1se, linestyle='--')
ax.set_xlabel(r"$\alpha$")
ax.set_ylabel("Cross-validation Relative Error")

# manually relabel the x-axis with the (decreasing) beta values;
# the largest beta (100) is shown as inf
xticks = ax.get_xticks()
ticks = np.flip(ccp_betas)[xticks[1:-1].astype(int)]
ticks = np.round(ticks, 4)
ticks[0] = float('inf')
ax.set_xticks(xticks[1:-1])
ax.set_xticklabels(ticks)
fig.set_figwidth(10)
fig.set_figheight(5)

If you’re uncertain whether the cross-validation error has reached its minimum, consider starting with a larger tree.

In [80]:
tr2 = DecisionTreeRegressor(max_leaf_nodes=40)
tr2.fit(X, y)

path = tr2.cost_complexity_pruning_path(X, y)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
In [81]:
ccp_betas = np.sqrt(ccp_alphas[1:] * ccp_alphas[:-1])
ccp_betas = np.append(ccp_betas, 100)
n_beta = len(ccp_betas)

folds = 10
cv = KFold(folds, shuffle=True)
scores_mean = []
scores_se = []
for beta in ccp_betas:
    clf = DecisionTreeRegressor(ccp_alpha=beta)
    score = cross_val_score(clf, X, y, cv=cv)
    scores_mean.append(score.mean())
    scores_se.append(score.std())

scores = 1.0 - np.flip(np.array(scores_mean))
err = np.flip(scores_se) / np.sqrt(folds)
score_1se = err[scores.argmin()] + scores.min()
In [82]:
ccp_betas
Out[82]:
array([0.00000000e+00, 2.52207514e-04, 2.67714077e-04, 2.79825464e-04,
       2.87259370e-04, 3.33334164e-04, 3.91901413e-04, 4.11664644e-04,
       4.18021594e-04, 4.52829824e-04, 4.96394908e-04, 5.15692921e-04,
       5.38361948e-04, 6.14878246e-04, 6.93167099e-04, 6.99253037e-04,
       7.49597381e-04, 8.14772738e-04, 8.39888905e-04, 9.30109923e-04,
       1.02322225e-03, 1.05097426e-03, 1.11418226e-03, 1.24401104e-03,
       1.38822957e-03, 1.47051577e-03, 1.72335068e-03, 2.16669876e-03,
       2.85148685e-03, 3.49192050e-03, 3.63683151e-03, 4.62914467e-03,
       8.61497909e-03, 1.84028372e-02, 4.50257479e-02, 1.00000000e+02])
In [83]:
fig, ax = plt.subplots()
ax.errorbar(range(len(ccp_betas)), y=scores, yerr=err, marker='o', mfc='none')
ax.axhline(score_1se, linestyle='--')
ax.set_xlabel(r"$\alpha$")
ax.set_ylabel("Cross-validation Relative Error")

# manually set xticks
xticks = ax.get_xticks()
ticks = np.flip(ccp_betas)[xticks[1:-1].astype(int)]
ticks = np.round(ticks, 4)
ticks[0] = float('inf')
ax.set_xticks(xticks[1:-1])
ax.set_xticklabels(ticks)
fig.set_figwidth(10)
fig.set_figheight(5)

Optimal CP Value

We can use either the minimum cross-validation error or the One Standard Error (1-SE) rule to choose the optimal CP value for pruning.

In [86]:
# Get alpha.min
new_betas = np.flip(ccp_betas)
print(scores.argmin())
new_betas[scores.argmin()]
28
Out[86]:
0.00041166464374243035
In [87]:
# Get alpha.1se
np.max(new_betas[scores < score_1se])
Out[87]:
0.0013882295686940722
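
As a final step, not shown in the cells above, a minimal sketch that refits the tree pruned at the alpha selected by the 1-SE rule; alpha_1se and tr2_pruned are names introduced here for illustration.

In [ ]:
# Refit the tree, pruning at the alpha selected by the 1-SE rule
alpha_1se = np.max(new_betas[scores < score_1se])
tr2_pruned = DecisionTreeRegressor(ccp_alpha=alpha_1se)
tr2_pruned.fit(X, y)
print(tr2_pruned.get_n_leaves())   # number of leaves in the pruned tree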