Install two packages
pip install scikit-fda
pip install scikit-misc
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from skfda.misc.hat_matrix import NadarayaWatsonHatMatrix
from skfda.ml.regression import KernelRegression
from skmisc.loess import loess
To illustrate local regression techniques, I've chosen three distinct data sets.
The first two are simulated.
In the first set, exa, the true curve is smooth. Notably, on the left side it remains relatively flat, while on the right there is noticeable fluctuation.
In the second set, exb, the true curve is a simple straight line, but two outliers may distort the estimated curve.
The third data set is derived from observations of the Old Faithful Geyser. Familiar from many statistics courses, each data point denotes the duration of a specific eruption (x-axis), while the y-axis represents the waiting time between eruptions. There is an evident positive correlation between these two variables. While it might be tempting to fit a linear model, in this session we will explore non-linear modeling to uncover deeper insights the data might offer.
url = "https://liangfgithub.github.io/Data/Example_A.csv"
exa = pd.read_csv(url)
url = "https://liangfgithub.github.io/Data/Example_B.csv"
exb = pd.read_csv(url)
fig, axes = plt.subplots(nrows=1, ncols=2)
axes[0].plot(exa['x'], exa['m'], c='k')
axes[0].scatter(exa['x'], exa['y'], marker='.', alpha=0.5, c='tab:orange')
axes[1].plot(exb['x'], exb['m'], c='k')
axes[1].scatter(exb['x'], exb['y'], marker='.', alpha=0.5, c='tab:orange')
fig.set_figwidth(10)
fig.tight_layout()
url = "https://liangfgithub.github.io/Data/faithful.dat"
data = pd.read_table(url, sep=r"\s+", index_col=0)
x = data['eruptions'].to_numpy().reshape(-1, 1)
y = data['waiting']
plt.scatter(x, y, 12, alpha=0.5, c='tab:orange')
plt.xlabel('Eruptions')
plt.ylabel('Waiting')
Using the Old Faithful data as an example, we apply kernel smoothing with varying bandwidths. Increasing the bandwidth is similar to enlarging the kernel neighborhood: the fitted function becomes progressively flatter, since we are averaging over a broader set of data points.
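For reference, the Nadaraya-Watson estimator underlying this smoother takes a kernel-weighted average of the responses around the target point $x$ (this is the standard formulation; the default kernel and normalization used by skfda's NadarayaWatsonHatMatrix may differ in details):

$$\hat{m}(x) = \frac{\sum_{i=1}^{n} K\big((x - x_i)/h\big)\, y_i}{\sum_{i=1}^{n} K\big((x - x_i)/h\big)},$$

where $h$ is the bandwidth and $K$ is a smoothing kernel such as the Gaussian.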
bws = 0.25 * np.array([0.1, 0.5, 2])
x_sort = np.sort(x, axis=0)
fig, axes = plt.subplots(nrows=1, ncols=3)
for i, bw in enumerate(bws):
    tmp_ks = KernelRegression(
        kernel_estimator=NadarayaWatsonHatMatrix(bandwidth=bw))
    tmp_ks.fit(x, y)
    y_smooth = tmp_ks.predict(x_sort)
    axes[i].plot(x_sort, y_smooth)
    axes[i].scatter(x, y, marker='.', alpha=0.5, c='tab:orange')
    axes[i].set_xlabel("Eruptions")
    axes[i].set_ylabel("Waiting")
fig.set_figwidth(10)
fig.tight_layout()
# ref: https://fda.readthedocs.io/en/latest/auto_examples/plot_kernel_regression.html
bws = np.logspace(-2, 0, num=100)
nw = GridSearchCV(
    KernelRegression(kernel_estimator=NadarayaWatsonHatMatrix()),
    param_grid={'kernel_estimator__bandwidth': bws},
)
nw.fit(x, y)
print(
    'Nadaraya-Watson bandwidth:',
    nw.best_params_['kernel_estimator__bandwidth'],
)
Nadaraya-Watson bandwidth: 0.27185882427329416
fig, axes = plt.subplots(nrows=1, ncols=2)
axes[0].plot(bws, nw.cv_results_['mean_test_score'])
axes[0].set_xlabel("Bandwidth")
axes[0].set_ylabel("$R^2$ Score")
ks = nw.best_estimator_
y_smooth = ks.predict(x_sort)
axes[1].plot(x_sort, y_smooth)
axes[1].scatter(x, y, marker='.', alpha=0.5, c='tab:orange')
axes[1].set_xlabel("Eruptions")
axes[1].set_ylabel("Waiting")
fig.set_figwidth(10)
fig.tight_layout()
However, a word of caution: if you're crafting your own algorithm for such smoothing procedures using cross-validation, I recommend considering the one-standard-error principle, which is inherently more stable. If you strictly select the bandwidth that minimizes the CV error, you might find that the CV curve is relatively flat near its minimum, which means different runs of cross-validation could yield quite different bandwidth choices within that flat region.
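As a minimal sketch of the one-standard-error rule applied to the GridSearchCV results above (illustrative only; since the score here is $R^2$, higher is better, and one_se_bandwidth is just a made-up name):

# One-standard-error rule on the GridSearchCV results from above
results = nw.cv_results_
bw_grid = np.asarray(results['param_kernel_estimator__bandwidth'], dtype=float)
mean_scores = results['mean_test_score']                       # mean R^2 across folds
se_scores = results['std_test_score'] / np.sqrt(nw.n_splits_)  # rough standard error

best_idx = np.argmax(mean_scores)
threshold = mean_scores[best_idx] - se_scores[best_idx]

# Among all bandwidths whose mean score is within one SE of the best,
# prefer the largest bandwidth, i.e., the smoothest (most conservative) fit.
one_se_bandwidth = bw_grid[mean_scores >= threshold].max()
print('One-SE bandwidth:', one_se_bandwidth)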
Try conducting cross-validation on the simulated data exa and exb.
For exb, the example with a straight line and two outliers, it may be difficult to find an optimal bandwidth. The challenge with Example B is the two outliers. For data like Example B, local regression and kernel smoothing may not be the most suitable analytical approaches.
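As a starting point, here is a minimal sketch of such a cross-validation run on exa, mirroring the GridSearchCV setup used above (the bandwidth grid here is an arbitrary choice):

x_a = exa['x'].to_numpy().reshape(-1, 1)
y_a = exa['y'].to_numpy()
nw_a = GridSearchCV(
    KernelRegression(kernel_estimator=NadarayaWatsonHatMatrix()),
    param_grid={'kernel_estimator__bandwidth': np.logspace(-2, 0, num=50)},
)
nw_a.fit(x_a, y_a)
print('Selected bandwidth for exa:',
      nw_a.best_params_['kernel_estimator__bandwidth'])

Below, we fit loess curves to all three data sets for comparison.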
fig, axes = plt.subplots(nrows=1, ncols=3)

# Old Faithful: loess with the default span (0.75)
lo = loess(x, y)
lo.fit()
smoothed = lo.predict(x_sort).values
axes[0].plot(x_sort, smoothed)
axes[0].scatter(x, y, marker='.', alpha=0.5, c='tab:orange')

# Example A: default span vs. a smaller span (0.22) that tracks the fluctuation
lo = loess(exa['x'], exa['y'])
lo.fit()
smoothed = lo.predict(exa['x']).values
lo2 = loess(exa['x'], exa['y'], span=0.22)
lo2.fit()
smoothed2 = lo2.predict(exa['x']).values
axes[1].plot(exa['x'], exa['m'], c='k')
axes[1].plot(exa['x'], smoothed, '--')
axes[1].plot(exa['x'], smoothed2, '--', c='r')
axes[1].scatter(exa['x'], exa['y'], marker='.', alpha=0.5, c='tab:orange')

# Example B: default span vs. span=1 (smoother, less affected by the outliers)
lo = loess(exb['x'], exb['y'])
lo.fit()
smoothed = lo.predict(exb['x']).values
lo2 = loess(exb['x'], exb['y'], span=1)
lo2.fit()
smoothed2 = lo2.predict(exb['x']).values
axes[2].plot(exb['x'], exb['m'], c='k')
axes[2].plot(exb['x'], smoothed, '--')
axes[2].plot(exb['x'], smoothed2, '--', c='r')
axes[2].scatter(exb['x'], exb['y'], marker='.', alpha=0.5, c='tab:orange')

fig.set_figwidth(10)
fig.tight_layout()