import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
Consider a simple example: $y = 1$ if $x_1 + x_2 > 1$, and $y = 0$ otherwise, where $x_1$ and $x_2$ are drawn uniformly from $[0, 1]$.
np.random.seed(0)
n = 10
x = np.random.uniform(0,1,(n,2))
y = (x.sum(1) > 1).astype(int)
mpl.rcParams['figure.dpi'] = 250
mpl.rcParams['figure.figsize'] = (8, 5)
sns.set()
sns.scatterplot(x = x[:,0], y = x[:,1], hue = y);
x = np.hstack([x, np.ones((n,1))])
myfit = sm.GLM(y, x, family = sm.families.Binomial()).fit();
/Users/feng_macpro/anaconda3/lib/python3.11/site-packages/statsmodels/genmod/generalized_linear_model.py:1257: PerfectSeparationWarning: Perfect separation or prediction detected, parameter may not be identified warnings.warn(msg, category=PerfectSeparationWarning)
print(myfit.summary());
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                      y   No. Observations:                   10
Model:                            GLM   Df Residuals:                        7
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.8898e-09
Date:                Mon, 23 Oct 2023   Deviance:                   3.7795e-09
Time:                        16:39:08   Pearson chi2:                 1.89e-09
No. Iterations:                    22   Pseudo R-squ. (CS):             0.6324
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1           108.2192   7.63e+04      0.001      0.999   -1.49e+05     1.5e+05
x2             9.0769   7.89e+04      0.000      1.000   -1.55e+05    1.55e+05
const        -15.6047   3.03e+04     -0.001      1.000   -5.93e+04    5.93e+04
==============================================================================
When the data are perfectly separable, the maximum likelihood estimate (MLE) for the logistic model does not exist: the likelihood can always be increased by scaling the coefficients up, so the fitting algorithm does not converge and the estimated $\beta$ drifts toward positive or negative infinity (hence the warnings and the huge standard errors above).
However, despite this issue, the separating line remains well-defined, allowing us to use the fitted model for predictions. This is because the separating condition, such as $3 x_1 + x_2 > 2,$ remains consistent even if we scale the coefficients. For instance, $300 x_1 + 100 x_2 > 200$ expresses the same line.
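As a quick sanity check (a small sketch using the myfit object above), the fitted probabilities are pushed to essentially 0 or 1, yet thresholding them at 0.5 still reproduces the labels:
# Fitted probabilities collapse toward 0/1 under perfect separation,
# but the implied classification matches the true labels.
print(np.round(myfit.fittedvalues, 3))
print(((myfit.fittedvalues > 0.5).astype(int) == y).all())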
Next we discuss fitting a logistic regression model and interpreting the coefficients and other outcomes. For illustration, we use the Coronary Risk‐Factor Study data, which consists of 462 males between the ages of 15 and 64 from a heart-disease high-risk region of the Western Cape, South Africa.
The response variable, `chd`, is binary (1 for disease presence and 0 for absence), and there are nine covariates, mostly numerical, except for "family history," which is binary.
url = "https://web.stanford.edu/~hastie/ElemStatLearn//datasets/SAheart.data"
df = pd.read_csv(url, index_col=0)
df;
X = df.iloc[:,0:9]
y = df['chd']
X = pd.get_dummies(X, drop_first=True).astype(float)
X['(intercept)'] = 1
We use the `statsmodels` package [Link], which can handle various generalized linear models, including Poisson regression; in this lecture we focus on logistic regression. The syntax for fitting a logistic regression model is similar to that of a linear model, except that we must pass the argument `family=sm.families.Binomial()` to tell `statsmodels` to run a logistic regression rather than some other type of generalized linear model.
heartfull = sm.GLM(y, X, family = sm.families.Binomial()).fit()
After fitting the logistic regression model, you can run a summary.
The first column of the coefficient matrix contains the estimated coefficients, including the intercept and the coefficient associated with each feature. Their interpretation is similar to that in the regular linear model: for instance, increasing `sbp` by one unit increases the response by 0.0065.
However, in logistic regression the "response" being modeled is the log-odds, $\log (p / (1 - p))$, so interpreting a coefficient involves exponentiation: increasing $X_1$ by one unit multiplies the odds by $\exp(\beta_1)$, since
$$ \log \frac{p}{1-p} = \beta_0 + (X_1 + 1) \beta_1 + \cdots + X_p \beta_p = (\beta_0 + X_1 \beta_1 + \cdots + X_p \beta_p) + \beta_1. $$
If $\beta_1$ is positive, $\exp(\beta_1) > 1$, indicating an increase in the odds; if $\beta_1$ is negative, $\exp(\beta_1) < 1$, indicating a decrease in the odds.
print(heartfull.summary())
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                    chd   No. Observations:                  462
Model:                            GLM   Df Residuals:                      452
Model Family:                Binomial   Df Model:                            9
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -236.07
Date:                Mon, 23 Oct 2023   Deviance:                       472.14
Time:                        17:37:45   Pearson chi2:                     452.
No. Iterations:                     5   Pseudo R-squ. (CS):             0.2353
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
sbp                 0.0065      0.006      1.135      0.256      -0.005       0.018
tobacco             0.0794      0.027      2.984      0.003       0.027       0.132
ldl                 0.1739      0.060      2.915      0.004       0.057       0.291
adiposity           0.0186      0.029      0.635      0.526      -0.039       0.076
typea               0.0396      0.012      3.214      0.001       0.015       0.064
obesity            -0.0629      0.044     -1.422      0.155      -0.150       0.024
alcohol             0.0001      0.004      0.027      0.978      -0.009       0.009
age                 0.0452      0.012      3.728      0.000       0.021       0.069
famhist_Present     0.9254      0.228      4.061      0.000       0.479       1.372
(intercept)        -6.1507      1.308     -4.701      0.000      -8.715      -3.587
===================================================================================
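For example, exponentiating the estimated coefficients above gives the multiplicative change in the odds for a one-unit increase in each feature (a quick sketch using the fitted heartfull object):
# Odds multipliers exp(beta_j) for each coefficient in the summary above
np.exp(heartfull.params).round(3)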
The remaining columns of the summary table provide information similar to that in linear regression: standard errors, z-values, and p-values. Logistic regression is fit by iteratively reweighted least squares; from the final weighted least-squares fit, standard errors for the coefficients are calculated, leading to z-values, and two-sided p-values based on the normal distribution are used to assess the significance of each coefficient.
Looking at the heart disease data, most coefficients are positive, meaning an increase in those features leads to an increased likelihood of heart disease. Interestingly, the coefficient for `obesity` is negative, implying that an increase in body fat (obesity) reduces the chances of heart disease. This might seem counter-intuitive, but it's due to correlations between features.
pd.get_dummies(df, drop_first=True).corr().round(2)
|                 | sbp   | tobacco | ldl   | adiposity | typea | obesity | alcohol | age   | chd   | famhist_Present |
|-----------------|-------|---------|-------|-----------|-------|---------|---------|-------|-------|-----------------|
| sbp             | 1.00  | 0.21    | 0.16  | 0.36      | -0.06 | 0.24    | 0.14    | 0.39  | 0.19  | 0.09            |
| tobacco         | 0.21  | 1.00    | 0.16  | 0.29      | -0.01 | 0.12    | 0.20    | 0.45  | 0.30  | 0.09            |
| ldl             | 0.16  | 0.16    | 1.00  | 0.44      | 0.04  | 0.33    | -0.03   | 0.31  | 0.26  | 0.16            |
| adiposity       | 0.36  | 0.29    | 0.44  | 1.00      | -0.04 | 0.72    | 0.10    | 0.63  | 0.25  | 0.18            |
| typea           | -0.06 | -0.01   | 0.04  | -0.04     | 1.00  | 0.07    | 0.04    | -0.10 | 0.10  | 0.04            |
| obesity         | 0.24  | 0.12    | 0.33  | 0.72      | 0.07  | 1.00    | 0.05    | 0.29  | 0.10  | 0.12            |
| alcohol         | 0.14  | 0.20    | -0.03 | 0.10      | 0.04  | 0.05    | 1.00    | 0.10  | 0.06  | 0.08            |
| age             | 0.39  | 0.45    | 0.31  | 0.63      | -0.10 | 0.29    | 0.10    | 1.00  | 0.37  | 0.24            |
| chd             | 0.19  | 0.30    | 0.26  | 0.25      | 0.10  | 0.10    | 0.06    | 0.37  | 1.00  | 0.27            |
| famhist_Present | 0.09  | 0.09    | 0.16  | 0.18      | 0.04  | 0.12    | 0.08    | 0.24  | 0.27  | 1.00            |
In logistic regression, we don't have a residual sum of squares or an estimate of $\sigma$. Instead, we focus on metrics like the deviance (reported in the summary above). These metrics help assess how well the model fits the data and can be used for model comparison and evaluation.
Deviance measures the goodness of fit of the model, based on likelihood comparisons. When all the $x_i$ are unique, the deviance equals $(-2)$ times the log-likelihood of the fitted model:
$$ \text{Deviance} = -2 \cdot \Big( \sum_{i: y_i = 1} \log \hat{p}_i + \sum_{i: y_i = 0} \log (1-\hat{p}_i) \Big). $$
Here $\hat{p}_i$ denotes the estimated probability of $P(Y = 1 \mid X = x_i)$.
Below, we illustrate how to compute the deviance. The computed value should match the deviance reported by `heartfull.summary()`.
def log_likelihood(phat, y):
    # Returns the NEGATIVE log-likelihood of the Bernoulli model with fitted probabilities phat
    return -(np.log(phat[y == 1]).sum() + np.log(1 - phat[y == 0]).sum())
phat = heartfull.fittedvalues
print("Residual Deviance: " + str(2*log_likelihood(phat, y)))
Residual Deviance: 472.1400323724979
Next we look at the estimated probabilities of being 1 on the training samples. Thresholding them at 0.5 gives predicted labels, which we compare with the observed labels in a confusion matrix.
sns.set_theme(style="white", palette=None)
ConfusionMatrixDisplay(confusion_matrix(y, heartfull.fittedvalues > 0.5)).plot()
plt.show()
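From the same thresholded predictions, we can also report the training misclassification rate (a quick sketch, not part of the original output):
# Training error: fraction of samples whose 0.5-thresholded prediction disagrees with the label
yhat = (heartfull.fittedvalues > 0.5).astype(int)
print("Training misclassification rate:", np.mean(yhat != y))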
Consider three males with the same values on all other predictors but with age set to the minimum, median, and maximum observed values. What is the estimated chance of getting heart disease for each of them?
testsamples = X.iloc[[0,0,0],:].copy()
testsamples;
testsamples.loc[:,'age'] = [X['age'].min(), X['age'].median(), X['age'].max()]
testsamples;
heartfull.predict(testsamples, which = 'linear')
row.names
1   -0.767328
1    0.589432
1    1.448714
dtype: float64
heartfull.predict(testsamples, which = 'mean')
row.names
1    0.317057
1    0.643235
1    0.809800
dtype: float64
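The 'linear' predictions are on the log-odds scale, while the 'mean' predictions are probabilities; applying the inverse logit to the former reproduces the latter (a quick check using the objects above):
# Inverse logit of the linear predictor recovers the predicted probabilities
eta = heartfull.predict(testsamples, which = 'linear')
print(1 / (1 + np.exp(-eta)))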
After fitting a logistic regression model, we often encounter a situation similar to what we've experienced with linear models: there may be numerous features, but only a subset of them is truly relevant. Therefore, we need to perform variable selection.
We can employ criteria such as AIC and BIC because the logistic regression model has a likelihood. We can perform stepwise model selection using the `stepAIC` function implemented in Python_W3_VarSel_SubsetSelection. Since we're performing logistic regression, we need to modify the `computeAIC` function.
def computeAIC(X, Y, k=2):
    # AIC = 2 * (negative log-likelihood) + k * (number of parameters);
    # k = 2 gives AIC, k = log(n) gives BIC
    model = sm.GLM(Y, X, family = sm.families.Binomial()).fit()
    return 2*log_likelihood(model.fittedvalues, Y) + k*X.shape[1]
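As a quick sanity check (an extra step, not in the original notes), the value for the full model should agree with the AIC that statsmodels reports for heartfull:
# Both should equal deviance + 2 * (number of parameters) = 472.14 + 2 * 10
print(computeAIC(X, y), heartfull.aic)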
How do we perform this stepwise? We start with the full model and consider a set of possible actions: removing one of the current covariates, or taking no action. We calculate the corresponding AIC for each scenario, rank them from smallest to largest, and select the action with the smallest AIC. For instance, we may decide to remove the `alcohol` variable in the first step.
In the next stage, we repeat this process, either removing one of the remaining covariates, taking no action, or adding back a previously removed one. We again rank these actions by their AIC values and continue until taking no action yields the smallest AIC. At that point we stop; the selected model typically includes a subset of the original features.
def stepAIC(X, Y, features = X.columns, AIC = True):
    # Greedy stepwise selection: repeatedly add or remove one feature to minimize AIC (k=2) or BIC (k=log(n)).
    AIC_list, action_list, feature_list = [], [], []
    best_AIC = np.inf
    best_action = ' '
    n = len(Y)
    k = 2 if AIC else np.log(n)
    features = list(features)
    AIC = computeAIC(X[features], Y, k)
    while AIC < best_AIC:
        # Record the current model, then propose all one-step modifications.
        AIC_list.append(AIC)
        feature_list.append(list(features))
        action_list.append(best_action)
        best_AIC = AIC
        tmp_AIC_list, tmp_action_list, tmp_feature_list = [], [], []
        # Candidate deletions: drop each feature currently in the model.
        for p in features:
            tmp_features = [q for q in features if q != p]
            tmp_AIC = computeAIC(X[tmp_features], Y, k)
            tmp_AIC_list.append(tmp_AIC)
            tmp_feature_list.append(tmp_features)
            tmp_action_list.append('- ' + p)
        # Candidate additions: add back each feature not currently in the model.
        remaining_features = [p for p in X.columns if p not in features]
        for p in remaining_features:
            tmp_features = list(features) + [p]
            tmp_AIC = computeAIC(X[tmp_features], Y, k)
            tmp_AIC_list.append(tmp_AIC)
            tmp_feature_list.append(tmp_features)
            tmp_action_list.append('+ ' + p)
        # Take the single best action; the loop stops when no action improves the criterion.
        best_model = np.array(tmp_AIC_list).argmin()
        AIC = tmp_AIC_list[best_model]
        features = tmp_feature_list[best_model]
        best_action = tmp_action_list[best_model]
    return pd.DataFrame({'AIC': AIC_list,
                         'action': action_list,
                         'features': feature_list})
myout = stepAIC(X, y)
pd.options.display.max_colwidth = 100
myout
|   | AIC        | action      | features                                                                                   |
|---|------------|-------------|--------------------------------------------------------------------------------------------|
| 0 | 492.140032 |             | [sbp, tobacco, ldl, adiposity, typea, obesity, alcohol, age, famhist_Present, (intercept)]  |
| 1 | 490.140769 | - alcohol   | [sbp, tobacco, ldl, adiposity, typea, obesity, age, famhist_Present, (intercept)]           |
| 2 | 488.548965 | - adiposity | [sbp, tobacco, ldl, typea, obesity, age, famhist_Present, (intercept)]                      |
| 3 | 487.979894 | - sbp       | [tobacco, ldl, typea, obesity, age, famhist_Present, (intercept)]                           |
| 4 | 487.685578 | - obesity   | [tobacco, ldl, typea, age, famhist_Present, (intercept)]                                    |
You can apply a similar process using BIC. Just note that for BIC you should set the parameter `AIC = False`, so that `k` is set to `log(n)` instead of `2`.
myout = stepAIC(X, y, AIC = False)
myout
|   | AIC        | action      | features                                                                                   |
|---|------------|-------------|--------------------------------------------------------------------------------------------|
| 0 | 533.495681 |             | [sbp, tobacco, ldl, adiposity, typea, obesity, alcohol, age, famhist_Present, (intercept)]  |
| 1 | 527.360853 | - alcohol   | [sbp, tobacco, ldl, adiposity, typea, obesity, age, famhist_Present, (intercept)]           |
| 2 | 521.633484 | - adiposity | [sbp, tobacco, ldl, typea, obesity, age, famhist_Present, (intercept)]                      |
| 3 | 516.928848 | - sbp       | [tobacco, ldl, typea, obesity, age, famhist_Present, (intercept)]                           |
| 4 | 512.498967 | - obesity   | [tobacco, ldl, typea, age, famhist_Present, (intercept)]                                    |
It's worth mentioning that here the AIC and BIC procedures select the same model, one with five covariates plus the intercept.
features = myout['features'][len(myout) - 1]
heartAIC = sm.GLM(y, X[features], family=sm.families.Binomial()).fit()
print(heartAIC.summary())
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                    chd   No. Observations:                  462
Model:                            GLM   Df Residuals:                      456
Model Family:                Binomial   Df Model:                            5
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -237.84
Date:                Thu, 03 Aug 2023   Deviance:                       475.69
Time:                        16:52:23   Pearson chi2:                     458.
No. Iterations:                     5   Pseudo R-squ. (CS):             0.2295
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
tobacco             0.0804      0.026      3.106      0.002       0.030       0.131
ldl                 0.1620      0.055      2.947      0.003       0.054       0.270
typea               0.0371      0.012      3.051      0.002       0.013       0.061
age                 0.0505      0.010      4.944      0.000       0.030       0.070
famhist_Present     0.9082      0.226      4.023      0.000       0.466       1.351
(intercept)        -6.4464      0.921     -7.000      0.000      -8.251      -4.642
===================================================================================
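As mentioned earlier, the deviance can be used for model comparison. As a quick check (an extra step, not in the original output), dropping four covariates increases the deviance only slightly:
# Deviance of the full model vs. the AIC/BIC-selected model
print("Full model deviance:    ", heartfull.deviance)
print("Selected model deviance:", heartAIC.deviance)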
We can also use Lasso or Ridge, or even a combination of both. As I mentioned earlier, logistic regression is essentially a weighted least squares problem. Therefore, we can extend the existing algorithms for linear models with Lasso to logistic regression with Lasso/Ridge penalty.
There are different Python ports of glmnet. For example:

  * `glmnet_python`: https://github.com/bbalasub1/glmnet_python/blob/master/test/example_binomial.py
  * `LogitNet` from `python-glmnet`: https://github.com/civisanalytics/python-glmnet
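For a quick illustration (a minimal sketch, not using the glmnet ports above), scikit-learn's LogisticRegression also supports an L1 penalty; the parameter C is the inverse penalty strength and would normally be tuned by cross-validation rather than fixed as below:
from sklearn.linear_model import LogisticRegression

# L1-penalized logistic regression on the heart data; sklearn adds its own intercept,
# so we drop the manually added intercept column. C = 0.1 is an arbitrary choice here.
lasso_logit = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
lasso_logit.fit(X.drop(columns='(intercept)'), y)
print(pd.Series(lasso_logit.coef_[0], index=X.drop(columns='(intercept)').columns))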
The Challenger O-Ring Data is a historical dataset related to the Space Shuttle Challenger disaster, which occurred on January 28, 1986. The disaster resulted in the loss of seven crew members and the destruction of the spacecraft. It was a pivotal moment in the history of space exploration and had significant implications for engineering, safety, and decision-making processes.
On the night preceding the launch, a critical decision had to be made regarding the safety of proceeding with the launch.
One of the critical factors under consideration was the ambient temperature on the day of the launch. It was known that the O-rings were less flexible at lower temperatures, potentially affecting their ability to seal the joints properly.
The dataset included records of previous shuttle flights that documented both the temperature at the time of launch and the number of O-ring failures observed.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/space-shuttle/o-ring-erosion-or-blowby.data"
orings = pd.read_csv(url, delim_whitespace=True, header = None)
orings
|    | 0 | 1 | 2  | 3   | 4  |
|----|---|---|----|-----|----|
| 0  | 6 | 0 | 66 | 50  | 1  |
| 1  | 6 | 1 | 70 | 50  | 2  |
| 2  | 6 | 0 | 69 | 50  | 3  |
| 3  | 6 | 0 | 68 | 50  | 4  |
| 4  | 6 | 0 | 67 | 50  | 5  |
| 5  | 6 | 0 | 72 | 50  | 6  |
| 6  | 6 | 0 | 73 | 100 | 7  |
| 7  | 6 | 0 | 70 | 100 | 8  |
| 8  | 6 | 1 | 57 | 200 | 9  |
| 9  | 6 | 1 | 63 | 200 | 10 |
| 10 | 6 | 1 | 70 | 200 | 11 |
| 11 | 6 | 0 | 78 | 200 | 12 |
| 12 | 6 | 0 | 67 | 200 | 13 |
| 13 | 6 | 2 | 53 | 200 | 14 |
| 14 | 6 | 0 | 67 | 200 | 15 |
| 15 | 6 | 0 | 75 | 200 | 16 |
| 16 | 6 | 0 | 70 | 200 | 17 |
| 17 | 6 | 0 | 81 | 200 | 18 |
| 18 | 6 | 0 | 76 | 200 | 19 |
| 19 | 6 | 0 | 79 | 200 | 20 |
| 20 | 6 | 2 | 75 | 200 | 21 |
| 21 | 6 | 0 | 76 | 200 | 22 |
| 22 | 6 | 1 | 58 | 200 | 23 |
It's worth noting that no previous liftoff had occurred at temperatures below 53 degrees Fahrenheit, so assessing the risk at a launch temperature as low as 31 degrees Fahrenheit requires significant extrapolation from the available data. Nevertheless, the fitted curve in the plot below shows that the estimated risk of O-ring failure at 31 degrees Fahrenheit is notably high.
orings = orings.drop(axis = 1, columns = [0, 3, 4])
orings.columns = ['damage', 'temp']
orings = orings.sort_values(['temp'])
orings['(intercept)'] = 1
binomial_response = pd.concat([orings['damage'], 6 - orings['damage']], axis = 1)
binomial_response.columns = ['N_damaged', 'N_undamaged']
logitmod = sm.GLM(binomial_response, orings[['temp', '(intercept)']], family = sm.families.Binomial()).fit()
print(logitmod.summary())
                     Generalized Linear Model Regression Results
========================================================================================
Dep. Variable:     ['N_damaged', 'N_undamaged']   No. Observations:                   23
Model:                                      GLM   Df Residuals:                       21
Model Family:                          Binomial   Df Model:                            1
Link Function:                            Logit   Scale:                          1.0000
Method:                                    IRLS   Log-Likelihood:                -15.823
Date:                          Mon, 23 Oct 2023   Deviance:                       18.086
Time:                                  18:52:33   Pearson chi2:                     30.0
No. Iterations:                               6   Pseudo R-squ. (CS):             0.2344
Covariance Type:                      nonrobust
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
temp           -0.1156      0.047     -2.458      0.014      -0.208      -0.023
(intercept)     5.0850      3.052      1.666      0.096      -0.898      11.068
===============================================================================
The primary task is to predict the likelihood of an O-ring failure when the launch temperature falls below freezing, specifically at 31 degrees Fahrenheit. This prediction serves as a crucial component in evaluating the safety of future launches under similar conditions.
logitmod.predict([31, 1])[0]
0.8177744062821898
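Equivalently, we can compute this probability by hand from the fitted coefficients via the inverse logit (a small verification sketch):
# p(31) = 1 / (1 + exp(-(intercept + temp_coef * 31)))
eta31 = logitmod.params['(intercept)'] + logitmod.params['temp'] * 31
print(1 / (1 + np.exp(-eta31)))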
plt.scatter(orings['temp'], orings['damage']/6, s = 10);
plt.xlim([31, orings['temp'].max() + 2]);
plt.ylim([-0.1,1]);
temps = np.linspace(31, orings['temp'].max()+1, 100).reshape(-1,1)
phat = logitmod.predict(np.hstack([temps, np.ones((len(temps),1))]))
plt.plot(temps, phat, linewidth = 1);
plt.xlabel("Temp");
plt.ylabel("Chance of Damage");
Despite the concerns raised by some engineers, it was ultimately decided to proceed with the launch. Tragically, just 73 seconds into the flight, the O-rings failed due to the cold temperatures, leading to the explosion of the shuttle.