import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as lm
from sklearn.metrics import mean_squared_error, r2_score
import itertools
The Prostate data can be found at https://hastie.su.domains/ElemStatLearn/datasets/prostate.data. The data come from a study examining the correlation between the level of prostate-specific antigen (PSA) and a number of clinical measures in men who were about to receive a radical prostatectomy.
The first 8 columns are predictors; column 9 is the outcome/response. We do not use column 10, which indicates which 67 observations were used as the "training set" and which 30 as the test set, as described on page 48 of the ESL book. For convenience, we refer to the response by the generic name Y.
myData = pd.read_csv("https://hastie.su.domains/ElemStatLearn/datasets/prostate.data", sep='\t')
myData.head()
# Drop the first column (row index) and the last column (train/test indicator)
myData = myData.drop(columns=[myData.columns[0], myData.columns[10]])
myData.head()
myData.shape
X = myData.iloc[:, :-1]
Y = myData.iloc[:, -1]
X.shape, len(Y)
First, for each model size (i.e., the number of non-intercept features), use the function bestsubset below to find the subset of variables with the smallest RSS among all models of that size. For this dataset with 8 non-intercept features, this gives 8 candidate models.
def bestsubset(X, Y):
    RSS_list, numb_features, feature_list = [], [], []
    for m in range(1, len(X.columns) + 1):
        best_RSS = np.inf
        # Loop over all possible combinations: choose m out of the p columns
        for combo in itertools.combinations(X.columns, m):
            tmp_X = X[list(combo)]
            tmp_model = lm()
            tmp_model.fit(tmp_X, Y)
            tmp_RSS = mean_squared_error(Y, tmp_model.predict(tmp_X)) * len(Y)
            if tmp_RSS < best_RSS:
                best_RSS = tmp_RSS
                best_varset = combo
        RSS_list.append(best_RSS)
        feature_list.append(best_varset)
        numb_features.append(len(best_varset))
    return pd.DataFrame({'msize': numb_features,
                         'RSS': RSS_list,
                         'features': feature_list})
rs = bestsubset(X, Y)
rs
Second, compute the model selection score (AIC/BIC) for those p models and find the one that achieves the smallest score.
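Up to an additive constant (assuming Gaussian errors), both criteria have the form n*log(RSS/n) + k*p, where p is the number of non-intercept features: k = 2 gives AIC and k = log(n) gives BIC. This is the formula implemented below.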
n = len(Y)
Aic = n*np.log(rs.RSS/n) + 2*rs.msize
Bic = n*np.log(rs.RSS/n) + rs.msize*np.log(n)
fig = plt.figure(figsize = (16,6))
ax = fig.add_subplot(1, 2, 1)
ax.scatter(rs.msize, Aic, alpha = 1, color = 'blue' )
ax.set_xlabel('No. of Features')
ax.set_ylabel('AIC')
ax.set_title('AIC - Best subset selection')
ax = fig.add_subplot(1, 2, 2)
ax.scatter(rs.msize, Bic, alpha = .5, color = 'red' )
ax.set_xlabel('No. of Features')
ax.set_ylabel('BIC')
ax.set_title('BIC - Best subset selection')
plt.show()
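To see exactly which feature set each criterion selects, we can pull out the row of rs that minimizes each score. A minimal sketch using the Aic and Bic series computed above:
# Model (size and feature set) minimizing each criterion
rs.loc[Aic.idxmin(), ['msize', 'features']], rs.loc[Bic.idxmin(), ['msize', 'features']]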
For this particular data set, AIC and BIC end up selecting different models. The model selected by AIC is larger than the one selected by BIC, which is common. (AIC favors larger models while BIC favors smaller models.) Although the model selected by BIC does not have the smallest AIC score, its AIC score is very close to the smallest one.
What are the 2nd and 3rd best models in terms of AIC/BIC?
This cannot be answered by looking at the AIC/BIC plots shown above. Instead, we need to modify the bestsubset
function to return the top 3 models for each size.
def bestsubset_3(X, Y):
    RSS_dict, numb_features_dict, feature_list_dict = {}, {}, {}
    for m in range(1, len(X.columns) + 1):
        best_RSS = []
        # Loop over all possible combinations: choose m out of the p columns
        for combo in itertools.combinations(X.columns, m):
            tmp_X = X[list(combo)]
            tmp_model = lm()
            tmp_model.fit(tmp_X, Y)
            tmp_RSS = mean_squared_error(Y, tmp_model.predict(tmp_X)) * len(Y)
            best_RSS.append((tmp_RSS, combo))
        # Keep only the top 3 models (smallest RSS) for this size
        best_RSS = sorted(best_RSS)[:3]
        # Store the best RSS values and corresponding features for each m
        RSS_dict[m] = [x[0] for x in best_RSS]
        feature_list_dict[m] = [x[1] for x in best_RSS]
        numb_features_dict[m] = [len(x[1]) for x in best_RSS]
    return pd.DataFrame({'msize': numb_features_dict,
                         'RSS': RSS_dict,
                         'features': feature_list_dict})
rs_3 = bestsubset_3(X, Y)
n = len(Y)
# Unpack the lists into individual data points
msize_flat = []
Aic_flat = []
Bic_flat = []
features_flat = []
for index, row in rs_3.iterrows():
    msize_list = row['msize']
    RSS_list = row['RSS']
    features_list = row['features']
    for msize, RSS, feat in zip(msize_list, RSS_list, features_list):
        Aic_value = n * np.log(RSS / n) + 2 * msize
        Bic_value = n * np.log(RSS / n) + msize * np.log(n)
        msize_flat.append(msize)
        Aic_flat.append(Aic_value)
        Bic_flat.append(Bic_value)
        features_flat.append(feat)
fig = plt.figure(figsize=(12, 6))
# Plot for AIC
ax1 = fig.add_subplot(1, 2, 1)
ax1.scatter(msize_flat, Aic_flat, alpha=1, facecolors='none', edgecolors='black')
ax1.set_xlabel('No. of Features')
ax1.set_ylabel('AIC')
ax1.set_title('AIC - Best subset selection')
# Plot for BIC
ax2 = fig.add_subplot(1, 2, 2)
ax2.scatter(msize_flat, Bic_flat, alpha=1, facecolors='none', edgecolors='black')
ax2.set_xlabel('No. of Features')
ax2.set_ylabel('BIC')
ax2.set_title('BIC - Best subset selection')
plt.show()
To compare the near-best models more closely, we drop the small model sizes (which have much larger scores) and zoom in on the remaining points.
msize_filtered = [m for i, m in enumerate(msize_flat) if m > 2]
Aic_filtered = [a for i, a in enumerate(Aic_flat) if msize_flat[i] > 2]
Bic_filtered = [b for i, b in enumerate(Bic_flat) if msize_flat[i] > 2]
fig = plt.figure(figsize=(12, 6))
# Plot for AIC
ax1 = fig.add_subplot(1, 2, 1)
ax1.scatter(msize_filtered, Aic_filtered, alpha=1, facecolors='none', edgecolors='black')
ax1.set_xlabel('No. of Features')
ax1.set_ylabel('AIC')
ax1.set_title('AIC - Best subset selection')
ax1.set_ylim(-67, -62)
# Plot for BIC
ax2 = fig.add_subplot(1, 2, 2)
ax2.scatter(msize_filtered, Bic_filtered, alpha=1, facecolors='none', edgecolors='black')
ax2.set_xlabel('No. of Features')
ax2.set_ylabel('BIC')
ax2.set_title('BIC - Best subset selection')
ax2.set_ylim(-60, -50)
plt.show()
Create a DataFrame to easily manage and sort the data. Then find the top three models by AIC or BIC.
df = pd.DataFrame({
'msize': msize_flat,
'Aic': Aic_flat,
'Bic': Bic_flat,
'features': features_flat
})
# Sort the DataFrame by AIC (or BIC) to get the top 3 models
top3_Aic = df.sort_values(by='Aic').head(3)
print(top3_Aic[['msize','features']].to_string(index=False))
top3_Bic = df.sort_values(by='Bic').head(3)
print(top3_Bic[['msize','features']].to_string(index=False))
The function computeAIC computes AIC or BIC for a linear regression model with design matrix X and response vector Y; we can provide different "k" values to select between AIC (k=2, the default value) and BIC (k = np.log(n)).
The function stepAIC runs stepwise subset selection based on AIC or BIC. The algorithm starts with the variable set specified by features (default is the full model, i.e., all variables); at each step it checks whether dropping a variable from features or adding a new variable to features improves AIC/BIC. If not (i.e., the current model already has the minimal AIC/BIC), the algorithm halts.
The stepAIC function can be easily modified to carry out forward and backward variable selection; a sketch of the forward version is given at the end of this section.
Note that stepAIC is written with the argument X being a pandas DataFrame and the argument features being a pandas Index.
type(X), type(X.columns)
def computeAIC(X, Y, k=2):
    n = len(Y)
    model = lm()
    model.fit(X, Y)
    RSS = mean_squared_error(Y, model.predict(X)) * n
    return n*np.log(RSS/n) + k*X.shape[1]
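As a quick check, calling computeAIC on the full model should reproduce the size-8 AIC and BIC values from the best-subset table above:
computeAIC(X, Y, k=2), computeAIC(X, Y, k=np.log(n))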
def stepAIC(X, Y, features=None, AIC=True):
    # By default, start from the full model (all columns of X)
    if features is None:
        features = X.columns
    AIC_list, action_list, feature_list = [], [], []
    best_AIC = np.inf
    best_action = ' '
    n = len(Y)
    if AIC:
        k = 2
    else:
        k = np.log(n)
    AIC = computeAIC(X[features], Y, k)
    while AIC < best_AIC:
        AIC_list.append(AIC)
        feature_list.append(list(features))
        action_list.append(best_action)
        best_AIC = AIC
        tmp_AIC_list, tmp_action_list, tmp_feature_list = [], [], []
        # Try dropping each variable currently in the model
        for p in features:
            tmp_features = features.drop(p)
            tmp_AIC = computeAIC(X[tmp_features], Y, k)
            tmp_AIC_list.append(tmp_AIC)
            tmp_feature_list.append(tmp_features)
            tmp_action_list.append('- ' + p)
        # Try adding each variable not currently in the model
        remaining_features = [p for p in X.columns if p not in features]
        for p in remaining_features:
            tmp_features = pd.Index(list(features) + [p])
            tmp_AIC = computeAIC(X[tmp_features], Y, k)
            tmp_AIC_list.append(tmp_AIC)
            tmp_feature_list.append(tmp_features)
            tmp_action_list.append('+ ' + p)
        # Move to the candidate model with the smallest AIC/BIC
        best_model = np.array(tmp_AIC_list).argmin()
        AIC = tmp_AIC_list[best_model]
        features = tmp_feature_list[best_model]
        best_action = tmp_action_list[best_model]
    return pd.DataFrame({'AIC': AIC_list,
                         'action': action_list,
                         'features': feature_list})
myout = stepAIC(X, Y)
myout
The output from the AIC stepwise procedure can be summarized as follows: start with the full model, then at each step apply the action shown (dropping or adding a single variable) until no such move lowers the AIC.
myout = stepAIC(X, Y, AIC=False)
myout
The output from the BIC stepwise procedure can be summarized in the same way: start with the full model, then drop (or add) one variable at a time until no move lowers the BIC.
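As noted earlier, the same machinery can be restricted to forward selection: start from the empty model and only consider additions at each step. Below is a minimal sketch along those lines; the helper name forwardAIC is ours (not part of the code above), and it simply reuses computeAIC to illustrate the idea.
def forwardAIC(X, Y, AIC=True):
    n = len(Y)
    k = 2 if AIC else np.log(n)
    selected, best_score = [], np.inf
    # Note: the intercept-only model is not scored, so at least one variable is always added
    while True:
        candidates = [p for p in X.columns if p not in selected]
        if not candidates:
            break
        # Score every one-variable addition to the current model
        scores = [(computeAIC(X[selected + [p]], Y, k), p) for p in candidates]
        score, best_p = min(scores)
        if score < best_score:
            best_score = score
            selected.append(best_p)
        else:
            break
    return selected, best_score

forwardAIC(X, Y)             # forward selection by AIC
forwardAIC(X, Y, AIC=False)  # forward selection by BIC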