import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as lm
# Load the prostate data (tab-separated); the first column is just a row index
myData = pd.read_table("https://hastie.su.domains/ElemStatLearn/datasets/prostate.data")
myData.drop(myData.columns[0], axis=1, inplace=True)      # drop the row-index column
myData.drop('train', axis=1, inplace=True)                # drop the train/test indicator
myData = myData.rename(columns={myData.columns[8]: 'Y'})  # rename the response (lpsa) to 'Y'
myData.head()
In linear regression models, as we introduce more variables, training error (measured by RSS or MSE) consistently decreases. However, this reduction in training error does not guarantee a corresponding decrease in test error, which measures prediction accuracy on independent test data.
To demonstrate this, we randomly split the data into 60% training and 40% test sets, then fit a sequence of models, adding one predictor at a time. This highlights how each additional predictor lowers the training error, while the test error need not follow the same trend.
n, p = myData.shape
p_range = np.arange(1, p)   # number of predictors used: 1 through p-1 (the last column is the response Y)
np.random.seed(20)          # fix the seed so the random train/test split is reproducible
from sklearn.model_selection import train_test_split
mytrain_size = 0.6
train_MSE = []
test_MSE = []
xTrain, xTest, yTrain, yTest = train_test_split(myData.iloc[:, :-1], myData['Y'],
                                                train_size = mytrain_size)
for i in p_range:
    # Fit a linear model using only the first i predictors
    myfit = lm()
    myfit.fit(xTrain.iloc[:, :i], yTrain)
    train_Y_pred = myfit.predict(xTrain.iloc[:, :i])
    train_MSE.append(np.mean((yTrain - train_Y_pred)**2))
    test_Y_pred = myfit.predict(xTest.iloc[:, :i])
    test_MSE.append(np.mean((yTest - test_Y_pred)**2))
plt.scatter(p_range, train_MSE, c = "blue", label="Train")
plt.plot(p_range, train_MSE, c = "blue")
plt.scatter(p_range, test_MSE, c = "red", label="Test")
plt.plot(p_range, test_MSE, c = "red")
plt.xlabel('# of variables')
plt.ylabel('MSE')
plt.legend()
plt.show()
You can remove np.random.seed(20) and run the code above multiple times. You'll observe the following patterns:
In most runs, the blue line (training error) lies below the red line (test error), i.e., the training error is generally smaller than the test error. Because of the randomness in the split, however, there may be runs in which the red line dips below the blue line, meaning the test error occasionally comes out smaller than the training error.
In every run, the blue line is monotonically decreasing: the training error keeps shrinking as more predictors are added. The red line, on the other hand, need not decrease at every step. If you examine the differences between adjacent values (computed below), they are always negative for the blue line, while for the red line some of them may be positive.
# First column: successive differences of the test MSE; second column: of the train MSE
np.vstack((np.diff(test_MSE), np.diff(train_MSE))).transpose()
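As a quick numerical check (this snippet is our addition, not part of the original code), we can confirm that all successive differences of the training MSE are negative, while those of the test MSE need not be:

# Should print True: training MSE never increases as predictors are added
# (a small tolerance guards against floating-point noise)
print(np.all(np.diff(train_MSE) < 1e-12))
# May print True or False depending on the random split
print(np.all(np.diff(test_MSE) < 1e-12))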
How to interpret LS coefficients? LS coefficients express how changes in predictors affect the response while holding all other predictors constant. For instance, the coefficient for age quantifies the average change in the response variable for each year increase in age, holding all other predictors constant.
It’s noteworthy that results from Simple Linear Regression (SLR), which involves only one non-intercept predictor, might differ from results in Multiple Linear Regression (MLR), which incorporates multiple predictors. This divergence can arise due to correlations among predictors.
For instance, suppose age is positively correlated with some predictors that have a positive effect on the response. In a joint model (MLR), the coefficient for age may turn out negative, seemingly contrary to its positive effect in an isolated model (SLR). The negative sign compensates for the positive contribution already carried by the correlated predictors, so that the overall model still represents the relationships between the predictors and the response accurately.
import statsmodels.formula.api as smf
# SLR: regress Y on age alone
tmpfit = smf.ols(formula = "Y ~ age", data = myData).fit()
tmpfit.summary()
# MLR: regress Y on all eight predictors
all_covariates = list(myData.columns)
all_covariates.remove('Y')
my_formula = "Y ~ " + "+".join(all_covariates)
lmfit2 = smf.ols(formula = my_formula, data = myData).fit()
lmfit2.summary()
np.round(myData.corr(), decimals = 2)
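To see the contrast directly on the prostate data, we can print the age coefficient from the SLR fit (tmpfit) next to the one from the MLR fit (lmfit2); this side-by-side check is our addition:

# age coefficient in SLR vs. MLR; as discussed above, the two can differ, even in sign
print(tmpfit.params['age'], lmfit2.params['age'])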
Check how to retrieve the LS coefficient for age using Algorithm 3.1 (regression by successive orthogonalization):
partial_lm = lm()
# Step 1: regress Y on all predictors except age; keep the residuals
X = myData.drop(['Y', 'age'], axis = 1).copy()
y = myData['Y']
partial_lm.fit(X, y)
y_star = y - partial_lm.predict(X)
y_star = y_star.values.reshape(-1, 1)
# Step 2: regress age on the same predictors; keep the residuals
partial_lm.fit(X, myData['age'])
age_star = myData['age'] - partial_lm.predict(X)
age_star = age_star.values.reshape(-1, 1)
# Step 3: regress the Y-residuals on the age-residuals;
# the resulting slope equals the MLR coefficient for age
partial_lm.fit(age_star, y_star)
print(partial_lm.coef_)
The LS coefficient for age (from this partial_lm) is the same as the one from lmfit2. The residuals from the two LS models are also the same.
res1 = y - lmfit2.predict()
res2 = y_star - partial_lm.predict(age_star)
print(np.sum(res1 ** 2))
print(np.sum(res2 ** 2))
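As one more check (our addition), the slope from partial_lm should match the age coefficient reported by lmfit2:

# The two estimates of the age coefficient should agree
print(partial_lm.coef_.ravel()[0], lmfit2.params['age'])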
Test a single predictor: drop age from the full model and compare the two fits. When testing a single coefficient, the F-test is equivalent to the t-test (F = t^2).
X = myData.iloc[:, :-1]
Y = myData['Y']
# Full model: all eight predictors
lmfit1 = lm()
lmfit1.fit(X, Y)
# Reduced model: all predictors except age
X0 = X.drop(['age'], axis=1)
lmfit0 = lm()
lmfit0.fit(X0, Y)
# Note: sklearn's .score() returns R-squared, not RSS
R2_0 = lmfit0.score(X0, Y)   # R-squared of the reduced model
R2_1 = lmfit1.score(X, Y)    # R-squared of the full model
import scipy.stats as stats
n, p = myData.shape
p = p - 1 # p = 8, the number of non-intercept predictors
f_value = (R2_1 - R2_0) / ((1 - R2_1)/(n - p - 1))   # equals (RSS0 - RSS1) / (RSS1/(n - p - 1))
print(f_value, np.sqrt(f_value))                     # F-stat and t-stat for age
1 - stats.f.cdf(f_value, 1, n - p - 1)               # p-value for the F-test for age
2*(1 - stats.t.cdf(np.sqrt(f_value), n - p - 1))     # p-value for the t-test for age
lmfit2.summary2().tables[1].loc['age',:] # compare the T-stat and p-value for age from lmfit2
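As a cross-check (not part of the original code), statsmodels can carry out the same nested-model F-test via anova_lm; the names reduced_formula and lmfit0_sm below are ours:

import statsmodels.api as sm
# Refit the reduced model (without age) with statsmodels so anova_lm can compare it to lmfit2
reduced_formula = "Y ~ " + "+".join([c for c in all_covariates if c != 'age'])
lmfit0_sm = smf.ols(formula = reduced_formula, data = myData).fit()
sm.stats.anova_lm(lmfit0_sm, lmfit2)   # the F statistic and p-value should match the calculation above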
Check the Car Seat Position Data.
Car drivers like to adjust the seat position for their own comfort. Car designers would find it helpful to know how different drivers will position the seat depending on their size and age. Researchers at the HuMoSim laboratory at the University of Michigan collected data on 38 drivers.
The variables are:

Age: age in years
Weight: weight in lbs
HtShoes: height with shoes in cm
Ht: height without shoes in cm
Seated: seated height in cm
Arm: lower arm length in cm
Thigh: thigh length in cm
Leg: lower leg length in cm
hipcenter: horizontal distance of the midpoint of the hips from a fixed location in the car in mm

The researchers were interested in determining whether a relationship exists between hipcenter and the other variables. Because of the high correlations among the predictors, we see a high R-squared and a significant overall F-test, yet no individual variable is significant.
seatpos = pd.read_csv("https://liangfgithub.github.io/Data/SeatPos.csv")
seatpos.shape
seatpos.head()
sns.pairplot(seatpos)
all_covariates = list(seatpos.columns)
all_covariates.remove('hipcenter')
my_formula = "hipcenter ~ " + "+".join(all_covariates)
tmpfit = smf.ols(formula = my_formula, data = seatpos).fit()
tmpfit.summary()
np.round(seatpos.corr(), decimals = 2)
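One standard way to quantify this collinearity (our addition, not in the original analysis) is the variance inflation factor, VIF_j = 1/(1 - R_j^2), where R_j^2 is the R-squared from regressing predictor j on the remaining predictors:

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

X_seat = sm.add_constant(seatpos.drop('hipcenter', axis=1))   # design matrix with an intercept column
vif = pd.Series([variance_inflation_factor(X_seat.values, i) for i in range(1, X_seat.shape[1])],
                index = X_seat.columns[1:])
print(vif)   # values far above 10 indicate severe collinearity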
# A smaller model using a subset of the predictors
tmpfit = smf.ols('hipcenter ~ Age + Weight + Ht + Seated', data = seatpos).fit()
tmpfit.summary()
# An even simpler model: Ht alone
tmpfit = smf.ols('hipcenter ~ Ht', data = seatpos).fit()
tmpfit.summary()
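To compare the three fits at a glance, we can also print their R-squared values side by side (the fits dictionary and its labels below are our own scaffolding):

# Refit the three models and report R-squared for each
fits = {
    'all predictors': smf.ols(formula = my_formula, data = seatpos).fit(),
    'Age + Weight + Ht + Seated': smf.ols('hipcenter ~ Age + Weight + Ht + Seated', data = seatpos).fit(),
    'Ht only': smf.ols('hipcenter ~ Ht', data = seatpos).fit(),
}
for name, f in fits.items():
    print(name, round(f.rsquared, 3))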