(From ESL 5.2.3 Example: Phoneme Recognition)
In this example we use splines to reduce flexibility rather than increase it; the application comes under the general heading of functional modeling.
The Phoneme data were extracted from the TIMIT database (TIMIT Acoustic-Phonetic Continuous Speech Corpus, NTIS, US Dept of Commerce), a widely used resource for research in speech recognition. A dataset was formed by selecting five phonemes for classification based on digitized speech from this database. The phonemes are transcribed as follows: "sh" as in "she", "dcl" as in "dark", "iy" as the vowel in "she", "aa" as the vowel in "dark", and "ao" as the first vowel in "water".
From continuous speech of 50 male speakers, 4509 speech frames of 32 msec duration were selected. Each frame is represented by 512 samples at a 16 kHz sampling rate and corresponds to one of the five phonemes above.
From each speech frame, a log-periodogram is computed, which is one of several widely used methods for casting speech data in a form suitable for speech recognition. Thus the data used in what follows consist of 4509 log-periodograms of length 256, with known class (phoneme) memberships.
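The periodograms in the dataset were computed upstream, but the transform itself is simple: take the squared magnitude of the discrete Fourier transform of a frame and move to the log scale. A minimal sketch under these assumptions (the simulated `frame` is a stand-in, not actual TIMIT data):

```python
import numpy as np

# Hypothetical illustration: a 512-sample frame (32 ms at 16 kHz) yields
# 256 usable frequency bins, matching the 256 features in the dataset.
frame = np.random.randn(512)                     # stand-in for a real speech frame
spectrum = np.fft.fft(frame)[:256]               # first half of the DFT
log_periodogram = np.log(np.abs(spectrum) ** 2)  # log of the squared magnitude
```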
```python
import pandas as pd
import numpy as np

# Load the phoneme data and keep only the two classes "aa" and "ao"
url = "https://hastie.su.domains/ElemStatLearn/datasets/phoneme.data"
df = pd.read_csv(url, index_col=0)
mydata = df.loc[(df['g'] == "aa") | (df['g'] == "ao")]

# Peek at the first row: a few leading and trailing frequencies, plus class and speaker
mydata.iloc[0, np.concatenate([np.arange(5), np.arange(249, 258)])]
```
```
x.1                   12.96705
x.2                   13.69454
x.3                   14.91182
x.4                   18.22292
x.5                    18.4539
x.250                  7.58624
x.251                  6.65202
x.252                  7.69109
x.253                  6.93683
x.254                    7.036
x.255                  7.01278
x.256                  8.52197
g                           aa
speaker    train.dr1.mcpm0.sa1
Name: 5, dtype: object
```
```python
X = mydata.drop(['g', 'speaker'], axis=1).astype(float)
Y = mydata['g'].copy()
np.stack(np.unique(Y, return_counts=True))
```
```
array([['aa', 'ao'],
       [695, 1022]], dtype=object)
```
```python
# Recode the labels: "ao" -> 1, "aa" -> 0
Y[Y == 'ao'] = 1
Y[Y == 'aa'] = 0
Y = Y.astype(float)
np.stack(np.unique(Y, return_counts=True))
```
```
array([[0.000e+00, 1.000e+00],
       [6.950e+02, 1.022e+03]])
```
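As a rough baseline (an aside, not part of the original example), always predicting the majority class "ao" would misclassify the 695 "aa" frames:

```python
# Baseline error rate if we always predict the majority class "ao"
baseline_err = 695 / (695 + 1022)
print(round(baseline_err, 3))   # 0.405
```

Both models fit below should beat this handily.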
```python
import matplotlib.pyplot as plt

# Plot 15 randomly chosen log-periodograms from each class
id0 = np.random.choice(np.where(Y == 0)[0], size=15, replace=False)
id1 = np.random.choice(np.where(Y == 1)[0], size=15, replace=False)

plt.figure(figsize=(8, 6))
plt.xlabel("Frequency")
plt.ylabel("Log-periodogram")
plt.plot([], color='#00FF00', linewidth=2, label='aa')   # empty line, legend entry only
for i in id0:
    plt.plot(X.iloc[i], linewidth=0.5, color='#00FF00')
plt.plot([], color='#FF8C00', linewidth=2, label='ao')   # empty line, legend entry only
for i in id1:
    plt.plot(X.iloc[i], linewidth=0.5, color='#FF8C00')
plt.legend(loc="upper right")
custom_ticks = [0, 50, 100, 150, 200, 250]
plt.xticks(custom_ticks, custom_ticks)
plt.show()
```
```python
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Regular fit: logistic regression on all 256 raw frequencies
logfit1 = sm.GLM(Y, add_constant(X), family=sm.families.Binomial()).fit()
coef1 = logfit1.params[1:]   # drop the intercept: one coefficient per frequency
```
Applications such as this permit a natural regularization. The 256 measurements for each sample are not measurements collected from 256 independent predictors; they are observations (at discretized frequencies) of a continuous log-periodogram function. It is therefore natural to expect the 256 coefficients $\beta_j$ to vary smoothly with frequency as well, so we model the coefficient function $\beta(v)$ with natural cubic splines, where the $h_m(v)$ are natural-cubic-spline basis functions defined on the set of frequencies:
$$ \beta(v) = \sum_{m=1}^M h_m(v) \theta_m, \quad v = 1, 2, \dots, 256. $$

In practice, this means $\beta_{p \times 1} = H \theta_{M \times 1}$, where $p = 256$ and $H$ is a $p \times M$ matrix whose $m$-th column is $h_m(v)$ evaluated at $v = 1, 2, \dots, 256$.
$$ X_{n \times p} \color{blue}{\beta}_{p \times 1} = X_{n \times p} \color{blue}{H}_{p \times M} \color{blue}{\theta}_{M \times 1} = B_{n \times M} \color{blue}{\theta}_{M \times 1}. $$

So we can simply fit a logistic regression model with design matrix $B = XH$.
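Equivalently, at the level of a single input $x \in \mathbb{R}^{256}$, the smoothness assumption simply reparametrizes the log-odds:

$$ \log \frac{P(\text{ao} \mid x)}{P(\text{aa} \mid x)} = x^\top \beta = x^\top H \theta = (H^\top x)^\top \theta, $$

so each raw input is filtered down to the $M$-dimensional vector $H^\top x$ before the logistic regression is fit.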
Here we used $M = 12$ basis functions, with knots uniformly placed over the integers $1, 2, \dots, 256$ representing the frequencies. The helper below constructs this basis by mimicking R's `splines::ns()`.
```python
from scipy.interpolate import splev

def ns(x, df=None, knots=None, boundary_knots=None, include_intercept=False):
    """Natural cubic spline basis, mimicking R's splines::ns()."""
    degree = 3
    if boundary_knots is None:
        boundary_knots = [np.min(x), np.max(x)]
    else:
        boundary_knots = np.sort(boundary_knots).tolist()

    oleft = x < boundary_knots[0]
    oright = x > boundary_knots[1]
    outside = oleft | oright
    inside = ~outside

    if df is not None:
        # df - 1 - intercept interior knots, placed at quantiles of x
        nIknots = df - 1 - include_intercept
        if nIknots < 0:
            nIknots = 0
        if nIknots > 0:
            knots = np.linspace(0, 1, num=nIknots + 2)[1:-1]
            knots = np.quantile(x[~outside], knots)

    if knots is None:
        knots = []
    # boundary knots repeated 4 times (order = degree + 1)
    Aknots = np.sort(np.concatenate((boundary_knots * 4, knots)))
    n_bases = len(Aknots) - (degree + 1)

    if any(outside):
        basis = np.empty((x.shape[0], n_bases), dtype=float)
        e = 1 / 4  # in theory anything in (0, 1); was (implicitly) 0 in R <= 3.2.2 (unused in this port)
        if any(oleft):
            # linear extrapolation to the left of the boundary via a first-order
            # Taylor expansion at the boundary knot
            k_pivot = boundary_knots[0]
            xl = x[oleft] - k_pivot
            xl = np.c_[np.ones(xl.shape[0]), xl]
            # equivalent to R's splineDesign(Aknots, rep(k.pivot, ord), ord, derivs)
            tt = np.empty((xl.shape[1], n_bases), dtype=float)
            for j in range(xl.shape[1]):
                for i in range(n_bases):
                    coefs = np.zeros((n_bases,))
                    coefs[i] = 1
                    tt[j, i] = splev(k_pivot, (Aknots, coefs, degree), der=j)
            basis[oleft, :] = xl @ tt
        if any(oright):
            # linear extrapolation to the right of the boundary
            k_pivot = boundary_knots[1]
            xr = x[oright] - k_pivot
            xr = np.c_[np.ones(xr.shape[0]), xr]
            tt = np.empty((xr.shape[1], n_bases), dtype=float)
            for j in range(xr.shape[1]):
                for i in range(n_bases):
                    coefs = np.zeros((n_bases,))
                    coefs[i] = 1
                    tt[j, i] = splev(k_pivot, (Aknots, coefs, degree), der=j)
            basis[oright, :] = xr @ tt
        if any(inside):
            xi = x[inside]
            tt = np.empty((len(xi), n_bases), dtype=float)
            for i in range(n_bases):
                coefs = np.zeros((n_bases,))
                coefs[i] = 1
                tt[:, i] = splev(xi, (Aknots, coefs, degree))
            basis[inside, :] = tt
    else:
        basis = np.empty((x.shape[0], n_bases), dtype=float)
        for i in range(n_bases):
            coefs = np.zeros((n_bases,))
            coefs[i] = 1
            basis[:, i] = splev(x, (Aknots, coefs, degree))

    # second derivatives of each B-spline at the boundary knots; the
    # natural-spline constraint requires these to vanish
    const = np.empty((2, n_bases), dtype=float)
    for i in range(n_bases):
        coefs = np.zeros((n_bases,))
        coefs[i] = 1
        const[:, i] = splev(boundary_knots, (Aknots, coefs, degree), der=2)

    if not include_intercept:
        basis = basis[:, 1:]
        const = const[:, 1:]

    # project the B-spline basis onto the null space of the boundary constraints
    qr_const = np.linalg.qr(const.T, mode='complete')[0]
    basis = (qr_const.T @ basis.T).T[:, 2:]
    return basis
```
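As a quick sanity check (an addition here, not part of the original example): the returned basis should have `df` columns, and beyond the boundary knots a natural cubic spline is linear, so second differences of extrapolated values should vanish:

```python
# Evaluate the basis on 1..260 with boundary knots at [1, 256]; the last
# four points are extrapolated and must lie on a straight line per column.
Hchk = ns(np.arange(1, 261), df=12, boundary_knots=[1, 256])
print(Hchk.shape)                                      # (260, 12)
print(np.abs(np.diff(Hchk[255:], n=2, axis=0)).max())  # ~0 up to float error
```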
```python
# 256 x 12 spline basis H, and the reduced n x 12 design matrix B = XH
H = ns(np.arange(1, 257), df=12)
B = X @ H

# Fit on the 12 spline coordinates, then map theta back to the 256
# frequency coefficients: beta = H theta
logfit2 = sm.GLM(Y, add_constant(B), family=sm.families.Binomial()).fit()
coef2 = H @ logfit2.params.to_numpy()[1:]
```
```python
import matplotlib as mpl
import seaborn as sns

mpl.rcParams['figure.dpi'] = 250
sns.set()

# Overlay the raw and spline-smoothed coefficient curves
plt.plot(np.arange(1, 257), coef1, color='gray', label='Regular')
plt.plot(np.arange(1, 257), coef2, color='red', label='Spline Smoothed')
plt.legend()
plt.xlabel("Frequency")
plt.ylabel("Logistic Regression Coefficients")
```
Next we conduct a simulation study to evaluate the classification accuracy of the two models:
```python
T = 5                     # number of random train/test splits
n = len(X)
Err1 = np.zeros((T, 2))   # regular model: (train error, test error) per split
Err2 = np.zeros((T, 2))   # spline model: (train error, test error) per split
np.random.seed(0)
for t in range(T):
    # hold out 20% of the samples as a test set
    mask = np.isin(np.arange(n), np.random.choice(n, int(n * 0.2), replace=False))

    # regular logistic regression on all 256 frequencies
    logfit1 = sm.GLM(Y[~mask], add_constant(X)[~mask], family=sm.families.Binomial()).fit()
    yhat1 = (logfit1.fittedvalues > 0.5).astype(float)
    Err1[t, 0] = (yhat1 != Y[~mask]).mean()
    yhat1 = (logfit1.predict(add_constant(X)[mask]) > 0.5).astype(float)
    Err1[t, 1] = (yhat1 != Y[mask]).mean()

    # spline-smoothed logistic regression on the 12 basis coordinates
    logfit2 = sm.GLM(Y[~mask], add_constant(B)[~mask], family=sm.families.Binomial()).fit()
    yhat2 = (logfit2.fittedvalues > 0.5).astype(float)
    Err2[t, 0] = (yhat2 != Y[~mask]).mean()
    yhat2 = (logfit2.predict(add_constant(B)[mask]) > 0.5).astype(float)
    Err2[t, 1] = (yhat2 != Y[mask]).mean()

pd.DataFrame(np.hstack([Err1, Err2]),
             columns=['Regular Train Error', 'Regular Test Error',
                      'Spline Train Error', 'Spline Test Error'])
```
|   | Regular Train Error | Regular Test Error | Spline Train Error | Spline Test Error |
|---|---|---|---|---|
| 0 | 0.101068 | 0.256410 | 0.168683 | 0.211538 |
| 1 | 0.110163 | 0.254839 | 0.180526 | 0.174194 |
| 2 | 0.120397 | 0.249180 | 0.178470 | 0.186885 |
| 3 | 0.114731 | 0.186885 | 0.177054 | 0.177049 |
| 4 | 0.100000 | 0.250814 | 0.176596 | 0.182410 |
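As the table shows, the regular model attains lower training error (about 0.10–0.12 versus 0.17–0.18) but generally higher test error (about 0.19–0.26 versus 0.17–0.21). Restricting $\beta(v)$ to a smooth 12-parameter family trades a little bias for a substantial reduction in variance, which is exactly the regularization this example set out to demonstrate.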