import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as lm
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
The Prostate data can be found [here]. The data come from a study examining the correlation between the level of prostate-specific antigen (PSA) and a number of clinical measures in men who were about to receive a radical prostatectomy.
The first 8 columns are predictors; column 9 is the outcome/response. We do not use column 10, which indicates which 67 observations were used as the training set and which 30 as the test set, as described on page 48 of the ESL book. For convenience, we refer to the response column by the generic name Y.
myData = pd.read_csv("https://hastie.su.domains/ElemStatLearn/datasets/prostate.data", sep='\t')
myData = myData.drop(columns=[myData.columns[0], myData.columns[10]])
myData.head()
myData.shape
X = myData.iloc[:, :-1]
Y = myData.iloc[:, -1]
X.shape, len(Y)
Split the data into two parts: 80% for training and 20% for testing. Throughout, we will use the training data to select variables and estimate coefficients, and report the average MSE on the test data.
X_train, X_test , Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
Fit an ordinary linear regression model using all variables.
full_model = lm()
full_model.fit(X_train, Y_train)
# averaged MSE on the test set
mean_squared_error(Y_test, full_model.predict(X_test))
Plot the ridge coefficients against a range of alpha values. Note that the alpha parameter in scikit-learn's Ridge plays the role of lambda in R's glmnet; glmnet's own alpha argument is the elastic-net mixing parameter, which is something different.
alphas = np.logspace(-3, 3, 100)
ridge = Ridge(normalize = True)
coefs = []
for a in alphas:
    ridge.set_params(alpha = a)
    ridge.fit(X_train, Y_train)
    coefs.append(ridge.coef_)
np.shape(coefs)
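A version note: the normalize argument used above was deprecated in scikit-learn 1.0 and removed in 1.2, so this code assumes an older scikit-learn. On a recent version, a common substitute is to standardize the predictors inside a Pipeline; the sketch below shows the idea, but it is not numerically identical to normalize=True, which scaled the (centered) regressors by their L2 norm rather than their standard deviation.
# Sketch for scikit-learn >= 1.2, where Ridge/Lasso no longer accept `normalize`.
# Standardization happens inside the pipeline, fit on the training data only.
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

coefs_std = []
for a in alphas:
    pipe = make_pipeline(StandardScaler(), Ridge(alpha=a))
    pipe.fit(X_train, Y_train)
    coefs_std.append(pipe.named_steps['ridge'].coef_)  # coefficients on the standardized scale
np.shape(coefs_std)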
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Coefficients')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
ridgecv = RidgeCV(alphas = alphas, cv = 10,
                  scoring = 'neg_mean_squared_error',
                  normalize = True)
ridgecv.fit(X_train, Y_train)
ridgecv.alpha_
ridge_model = Ridge(alpha = ridgecv.alpha_, normalize = True)
ridge_model.fit(X_train, Y_train)
mean_squared_error(Y_test, ridge_model.predict(X_test))
Note that none of the ridge coefficient estimates is exactly zero.
pd.Series(ridge_model.coef_, index = X.columns)
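To make the shrinkage visible, here is a quick side-by-side comparison with the OLS coefficients from the full_model fit earlier (a small optional check, not part of the original analysis).
# Compare OLS and ridge coefficient estimates side by side
pd.DataFrame({'OLS': full_model.coef_,
              'Ridge': ridge_model.coef_}, index = X.columns)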
Generate a cross-validation path plot similar to the one from R's glmnet. In order to generate this plot, we need to manually implement cross-validation for Ridge regression.
from sklearn.model_selection import cross_val_score
mean_mse = []
std_mse = []
# Loop through each alpha and perform cross-validation
for alpha in alphas:
    ridge = Ridge(alpha=alpha, normalize = True)
    mse_scores = -cross_val_score(ridge, X_train, Y_train, cv=10,
                                  scoring='neg_mean_squared_error')  # 10-fold CV
    mean_mse.append(np.mean(mse_scores))
    std_mse.append(np.std(mse_scores) / np.sqrt(10))
# Convert to arrays for vectorized comparisons
mean_mse = np.array(mean_mse)
std_mse = np.array(std_mse)
# Identify the best alpha (smallest mean CV error)
min_idx = np.argmin(mean_mse)
alpha_min = alphas[min_idx]
# Threshold: minimum mean CV error plus one standard error
threshold = mean_mse[min_idx] + std_mse[min_idx]
# One-SE rule: the largest alpha whose CV error is within one SE of the minimum
one_se_rule_idx = np.where(mean_mse <= threshold)[0][-1]
alpha_1se = alphas[one_se_rule_idx]
alpha_min, alpha_1se
plt.figure(figsize=(10, 6))
plt.semilogx(alphas, mean_mse, label='Mean MSE', linewidth=2)
plt.fill_between(alphas, mean_mse - std_mse, mean_mse + std_mse, alpha=0.2)
plt.axvline(alpha_min, linestyle='--', color='r', label=f'Best alpha: {alpha_min:.2e}')
plt.axvline(alpha_1se, linestyle='--', color='g', label=f'alpha (1 SE): {alpha_1se:.2e}')
plt.xlabel('Alpha (Log Scale)')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.title('Ridge Cross-Validation Path')
plt.show()
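For comparison with the CV-selected alpha, we can also refit ridge at the one-standard-error alpha and report its test error (a short optional step, mirroring what we do for lasso below).
# Refit ridge at the 1-SE alpha and evaluate on the test set
ridge_1se_model = Ridge(alpha = alpha_1se, normalize = True)
ridge_1se_model.fit(X_train, Y_train)
mean_squared_error(Y_test, ridge_1se_model.predict(X_test))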
alphas = np.logspace(-4, -1, 100)
lasso = Lasso(normalize = True)
coefs = []
for a in alphas:
    lasso.set_params(alpha = a)
    lasso.fit(X_train, Y_train)
    coefs.append(lasso.coef_)
np.shape(coefs)
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Coefficients')
plt.title('Lasso coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
lassocv = LassoCV(alphas = alphas, cv = 10, normalize = True)
lassocv.fit(X_train, Y_train)
lassocv.alpha_
lasso_model = Lasso(alpha = lassocv.alpha_, normalize = True)
lasso_model.fit(X_train, Y_train)
mean_squared_error(Y_test, lasso_model.predict(X_test))
Note that some of the lasso coefficient estimates are exactly zero.
pd.Series(lasso_model.coef_, index = X.columns)
Variables that are not selected.
X.columns[np.where(lasso_model.coef_ == 0)]
In some runs, the optimal alpha value lands on the boundary of the grid (for example, at alphas[0]). In that case we should expand the range of alphas and re-run LassoCV.
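For example, a wider grid could look like the sketch below; the exact range is arbitrary and would be adjusted until the selected alpha is interior to the grid.
# Expand the alpha grid and let LassoCV search again
# (drop `normalize=True` on scikit-learn >= 1.2; see the note in the ridge section)
wider_alphas = np.logspace(-5, 0, 200)
lassocv_wide = LassoCV(alphas = wider_alphas, cv = 10, normalize = True)
lassocv_wide.fit(X_train, Y_train)
lassocv_wide.alpha_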
Generate a cross-validation path plot similar to the one from R's glmnet.
lassocv = LassoCV(alphas=alphas, cv=10, normalize = True)
lassocv.fit(X_train, Y_train)
lassocv.alpha_
# Extract the mean CV MSE and its standard error across the 10 folds
mean_mse = np.mean(lassocv.mse_path_, axis=1)
std_mse = np.std(lassocv.mse_path_, axis=1) / np.sqrt(10)
# Extract alphas from CV output
# Note: they may not be in the same order as the alpha sequence you provide to LassoCV
cv_alphas = lassocv.alphas_
# Identify best alpha and one standard error rule alpha
min_idx = np.argmin(mean_mse)
alpha_min = cv_alphas[min_idx]
# Find the threshold within one standard error of the best alpha
threshold = mean_mse[min_idx] + std_mse[min_idx]
# Find the largest alpha within one standard error of the best alpha
alpha_1se = max(cv_alphas[np.where(mean_mse <= threshold)])
alpha_min, alpha_1se #alpha_min = lassocv.alpha_
mean_mse = np.array(mean_mse)
std_mse = np.array(std_mse)
plt.figure(figsize=(10, 6))
plt.semilogx(cv_alphas, mean_mse, label='Mean MSE', linewidth=2)
plt.fill_between(cv_alphas, mean_mse - std_mse, mean_mse + std_mse, alpha=0.2)
plt.axvline(alpha_min, linestyle='--', color='r', label=f'Best alpha: {alpha_min:.2e}')
plt.axvline(alpha_1se, linestyle='--', color='g', label=f'alpha (1 SE): {alpha_1se:.2e}')
plt.xlabel('Alpha (Log Scale)')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.title('Lasso Cross-Validation Path')
plt.show()
lasso_1se_model = Lasso(alpha = alpha_1se, normalize = True)
lasso_1se_model.fit(X_train, Y_train)
mean_squared_error(Y_test, lasso_1se_model.predict(X_test))
We refit an ordinary linear regression model using the variables selected by alpha_1se and then use it for prediction.
lasso_1se_model.coef_
nonzero_indices = np.where(lasso_1se_model.coef_ != 0)[0]
lm_refit = lm()
lm_refit.fit(X_train.iloc[:, nonzero_indices], Y_train)
mean_squared_error(Y_test, lm_refit.predict(X_test.iloc[:, nonzero_indices]))
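To wrap up, we can collect the test-set MSEs of the models fit above into one table (a small summary using only objects already defined in this section).
# Collect test-set MSEs of all fitted models for a side-by-side comparison
pd.Series({
    'Full OLS': mean_squared_error(Y_test, full_model.predict(X_test)),
    'Ridge (CV alpha)': mean_squared_error(Y_test, ridge_model.predict(X_test)),
    'Lasso (CV alpha)': mean_squared_error(Y_test, lasso_model.predict(X_test)),
    'Lasso (1-SE alpha)': mean_squared_error(Y_test, lasso_1se_model.predict(X_test)),
    'OLS refit on lasso-1SE variables': mean_squared_error(Y_test, lm_refit.predict(X_test.iloc[:, nonzero_indices]))
})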