import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold
url = "https://liangfgithub.github.io/Data/HousingData.csv"
Housing = pd.read_csv(url)
Housing.head()
X = Housing.drop("Y", axis=1)
y = Housing["Y"]
tr1 = DecisionTreeRegressor(max_leaf_nodes=10)
tr1.fit(X, y)
fig, ax = plt.subplots(figsize=(12, 6))
plot_tree(tr1, feature_names=X.columns, filled=True,
          impurity=False, ax=ax, fontsize=10)
fig.tight_layout()
Understanding the Fitted Tree:
Starting at the root node, the average response over all 506 observations is 3.035. The first split is on the variable lstat: the right node contains 178 samples with an average of 2.657, while the left node contains the remaining 328 samples with an average of 3.239. Leaf nodes appear at the bottom of the plot, and their color shading indicates the magnitude of their predictions.
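The same numbers can also be read off programmatically: the fitted estimator exposes its node arrays through tr1.tree_. A minimal sketch (the printed values depend on the fit above):
# inspect the root split of the fitted tree (values depend on the fit)
t = tr1.tree_
left, right = t.children_left[0], t.children_right[0]
print("root mean:", t.value[0].ravel()[0], " n =", t.n_node_samples[0])
print("split on:", X.columns[t.feature[0]], "<=", t.threshold[0])
print("left mean:", t.value[left].ravel()[0], " n =", t.n_node_samples[left])
print("right mean:", t.value[right].ravel()[0], " n =", t.n_node_samples[right])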
Pruning Decision Trees with Cost Complexity Pruning
The cost-complexity criterion penalizes the training error of a subtree T by its size:
$$ \frac{1}{n}RSS(T) + \alpha |T| $$
where RSS(T) is the residual sum of squares of T, |T| is its number of leaf nodes, and alpha controls how heavily each additional leaf is penalized.
Ref: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html
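Before working with the pruning path, it helps to see the penalty in action: larger values of ccp_alpha force smaller trees. A quick sketch (the alpha values below are arbitrary, chosen only for illustration):
# larger ccp_alpha -> heavier penalty per leaf -> fewer leaves (illustrative values)
for a in [0.0, 0.005, 0.02, 0.1]:
    t = DecisionTreeRegressor(max_leaf_nodes=10, ccp_alpha=a).fit(X, y)
    print(f"alpha={a:<6} leaves={t.get_n_leaves()}")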
path = tr1.cost_complexity_pruning_path(X, y)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
print(np.column_stack((ccp_alphas, impurities)))
Understand the relationship between ccp_alphas and impurities, where impurities is the averaged RSS of each pruned subtree. The difference between adjacent impurities equals the corresponding alpha value (or a multiple of it, if a pruning step collapses more than one split at once). This difference is the improvement in RSS gained by keeping one additional split, and it effectively sets the “cost” of that split.
np.diff(impurities)
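To see this correspondence directly, print each effective alpha next to the impurity increase it produces; the two columns match whenever a pruning step collapses a single split.
# each effective alpha next to the impurity increase from that pruning step
print(np.column_stack((ccp_alphas[1:], np.diff(impurities))))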
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
# find the beta sequence (the geometric mean of adjacent alphas):
# any alpha strictly between two adjacent effective alphas selects the same
# pruned subtree, so each geometric mean represents its whole interval
ccp_betas = np.sqrt(ccp_alphas[1:] * ccp_alphas[:-1])
# append a large value to represent pruning all the way back to the root
ccp_betas = np.append(ccp_betas, 100)
n_beta = len(ccp_betas)
ccp_betas
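As a quick sanity check (a sketch, using the first interior interval of the path purely for illustration), any alpha strictly between two adjacent effective alphas should prune back to the same subtree:
# alphas strictly inside (ccp_alphas[1], ccp_alphas[2]) should all give the same tree
a_lo, a_hi = ccp_alphas[1], ccp_alphas[2]
for a in np.linspace(a_lo, a_hi, 5)[1:-1]:
    t = DecisionTreeRegressor(max_leaf_nodes=10, ccp_alpha=a).fit(X, y)
    print(f"alpha={a:.5f}  leaves={t.get_n_leaves()}")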
# use cross-validation to find the best alpha
folds = 10
cv = KFold(n_splits=folds, shuffle=True)
scores_mean = []
scores_se = []
for beta in ccp_betas:
    clf = DecisionTreeRegressor(ccp_alpha=beta)
    # default scoring for a regressor is R^2
    score = cross_val_score(clf, X, y, cv=cv)
    scores_mean.append(score.mean())
    scores_se.append(score.std())
# 1 - R^2 serves as the relative error; flip so the largest beta comes first
scores = 1.0 - np.flip(np.array(scores_mean))
err = np.flip(scores_se) / np.sqrt(folds)  # standard error over the folds
score_1se = err[scores.argmin()] + scores.min()  # 1-SE threshold above the minimum
# plot the CV relative error
fig, ax = plt.subplots()
ax.errorbar(range(n_beta), y=scores, yerr=err, marker='o', mfc='none')
ax.axhline(score_1se, linestyle='--')
ax.set_xlabel(r"$\alpha$")
ax.set_ylabel("Cross-validation Relative Error")
# manually set xticks
xticks = ax.get_xticks()
ticks = np.flip(ccp_betas)[xticks[1:-1].astype(int)]
ticks = np.round(ticks, 4)
ticks[0] = float('inf')
ax.set_xticks(xticks[1:-1])
ax.set_xticklabels(ticks)
fig.set_figwidth(10)
fig.set_figheight(5)
If you’re uncertain whether the cross-validation error has reached its minimum, consider starting with a larger tree so that the pruning path extends to smaller alpha values.
tr2 = DecisionTreeRegressor(max_leaf_nodes=40)
tr2.fit(X, y)
path = tr2.cost_complexity_pruning_path(X, y)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
ccp_betas = np.sqrt(ccp_alphas[1:] * ccp_alphas[:-1])
ccp_betas = np.append(ccp_betas, 100)
n_beta = len(ccp_betas)
folds = 10
cv = KFold(n_splits=folds, shuffle=True)
scores_mean = []
scores_se = []
for beta in ccp_betas:
    clf = DecisionTreeRegressor(ccp_alpha=beta)
    score = cross_val_score(clf, X, y, cv=cv)
    scores_mean.append(score.mean())
    scores_se.append(score.std())
scores = 1.0 - np.flip(np.array(scores_mean))
err = np.flip(scores_se) / np.sqrt(folds)
score_1se = err[scores.argmin()] + scores.min()
ccp_betas
fig, ax = plt.subplots()
ax.errorbar(range(len(ccp_betas)), y=scores, yerr=err, marker='o', mfc='none')
ax.axhline(score_1se, linestyle='--')
ax.set_xlabel(r"$\alpha$")
ax.set_ylabel("X-val Relative Error")
# manually set xticks
xticks = ax.get_xticks()
ticks = np.flip(ccp_betas)[xticks[1:-1].astype(int)]
ticks = np.round(ticks, 4)
ticks[0] = float('inf')
ax.set_xticks(xticks[1:-1])
ax.set_xticklabels(ticks)
fig.set_figwidth(10)
fig.set_figheight(5)
We can use either the minimum cross-validation error or the one-standard-error (1-SE) rule to choose the alpha value for pruning.
# Get alpha.min: the beta with the smallest cross-validation error
new_betas = np.flip(ccp_betas)  # flip so the ordering matches scores
print(scores.argmin())
new_betas[scores.argmin()]
# Get alpha.1se: the largest beta whose CV error is below the 1-SE threshold
np.max(new_betas[scores < score_1se])
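Whichever rule you choose, pass the selected value back to ccp_alpha to refit the final pruned tree. A minimal sketch, assuming best_beta (a name introduced here, not in the code above) holds the chosen value:
# refit the pruned tree at the selected alpha (here: the 1-SE choice)
best_beta = np.max(new_betas[scores < score_1se])
final_tree = DecisionTreeRegressor(ccp_alpha=best_beta)
final_tree.fit(X, y)
print("number of leaves:", final_tree.get_n_leaves())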