About HW2

For this HW, we mainly aim to understand the KNN method in both classification and regression settings and apply it to several real-data examples. Tuning the model will help us understand the bias-variance trade-off. A slightly more challenging task is to code a KNN method yourself; for that question, you cannot use any additional package to assist the calculation.

Custom Functions for HW2

  • plot_helper: Helper function for repetitive plotting
In [1]:
def plot_helper(train_accs, test_accs, ks, title, ylabel = '% Error'):
    # Plot the training and testing error curves against the k values
    fig, ax = plt.subplots(figsize=(5, 3))
    ax.plot(ks, np.array(train_accs), label="Training Error")
    ax.plot(ks, np.array(test_accs), label="Testing Error")
    ax.set_title(title)
    ax.set_xlabel('k')
    ax.legend(loc='lower right')
    ax.set_ylabel(ylabel)
    plt.show()

Question 1 [40 Points] KNN Classification (Diabetes)

Load the Pima Indians Diabetes Database (PimaIndiansDiabetes) from the mlbench package. The provided code also randomly splits the data into training and testing sets; you should preserve this split in the analysis.

Loading Data

I modified the provided code as follows to save the whole dataset and the train/test splits to CSV files so they can be read into Python:

```{r pressure, echo=FALSE}
# install.packages("mlbench") # run this line if you don't have the package
library(mlbench)
data(PimaIndiansDiabetes)

set.seed(2)
trainid = sample(1:nrow(PimaIndiansDiabetes), nrow(PimaIndiansDiabetes)/2)
Diab.train = PimaIndiansDiabetes[trainid, ]
Diab.test = PimaIndiansDiabetes[-trainid, ]

write.csv(PimaIndiansDiabetes, "/Users/harrisnisar/Documents/Stat 542/HW2/data/PimaIndiansDiabetes.csv", row.names = TRUE)
write.csv(Diab.train, "/Users/harrisnisar/Documents/Stat 542/HW2/data/PimaIndiansDiabetesTrain.csv", row.names = TRUE)
write.csv(Diab.test, "/Users/harrisnisar/Documents/Stat 542/HW2/data/PimaIndiansDiabetesTest.csv", row.names = TRUE)
```

Reading Data

I used the following Python code to read in the CSV files that I saved:

In [2]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
plt.rcParams['figure.figsize'] = [3, 2]
# Load data into pandas for pre-processing
pima_train_df = pd.read_csv('./data/PimaIndiansDiabetesTrain.csv', index_col=0)
pima_test_df = pd.read_csv('./data/PimaIndiansDiabetesTest.csv', index_col=0)
pima_train_df.loc[(pima_train_df.diabetes == 'pos'),'diabetes']=1
pima_train_df.loc[(pima_train_df.diabetes == 'neg'),'diabetes']=0
pima_test_df.loc[(pima_test_df.diabetes == 'pos'),'diabetes']=1
pima_test_df.loc[(pima_test_df.diabetes == 'neg'),'diabetes']=0
pima_train_matrix = np.array(pima_train_df, dtype=float)
pima_train_X = pima_train_matrix[:,:-1]
pima_train_Y = pima_train_matrix[:,-1]
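# standardize the features (note: each split is scaled with its own column means and standard deviations)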
pima_train_X = (pima_train_X - np.mean(pima_train_X, axis=0))/np.std(pima_train_X, axis = 0)
pima_test_matrix = np.array(pima_test_df, dtype=float)
pima_test_X = pima_test_matrix[:,:-1]
pima_test_Y = pima_test_matrix[:,-1]
pima_test_X = (pima_test_X - np.mean(pima_test_X, axis=0))/np.std(pima_test_X, axis = 0)

Use a grid of $k$ values (every integer) from 1 to 20.

a) [10 pts] Fit a KNN model using Diab.train and calculate both training and testing errors. For the testing error, use Diab.test. Plot the two errors against the corresponding $k$ values. Make sure that you differentiate them using different colors/shapes and add proper legends.

Sklearn's KNeighborsClassifier was used. It provides a score function that returns the accuracy of the predictions, so I used 1 - score to calculate the error.

In [3]:
ks = np.arange(1,21, dtype=int)
train_results_1 = []
test_results_1 = []
for k in ks:
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(pima_train_X, pima_train_Y)
    train_results_1.append(neigh.score(pima_train_X, pima_train_Y))
    test_results_1.append(neigh.score(pima_test_X, pima_test_Y))

train_results_1 = 1-np.array(train_results_1)
test_results_1 = 1-np.array(test_results_1)
plot_helper(train_results_1, test_results_1, ks, "Training and Testing Errors over K, no CV")
best_index = np.argmin(test_results_1)
best_k1 = ks[best_index]
best_score1 = 1-test_results_1[best_index]
print(f'No CV found {best_k1} (df={pima_test_X.shape[0]/best_k1}) to be best k with testing error: {100.0-best_score1*100.0}%')
No CV found 15 (df=25.6) to be best k with testing error: 23.177083333333343%

b) [15 pts] Does the plot match (approximately) our intuition of the bias-variance trade-off in terms of having a U-shaped error? What is the optimal $k$ value based on this result? For the optimal k, what is the corresponding degrees-of-freedom and its error?

The figure above shows the training error (blue) and testing error (orange) of the KNN model fit for various values of k, where k is the number of neighbors used. As expected, when k = 1 (1NN) the training error is 0 while the testing error is high: the model is overfit (it performs perfectly on training data but poorly on testing data), meaning it has low bias and high variance. As k increases, the training error rises rapidly while the testing error comes down. The best k was found to be 15 (23.17% testing error, $df=\frac{n}{k}=25.6$). Increasing k beyond this will likely underfit the model: since we average over more neighbors, the variance of the model is low while the bias is high, and the model no longer learns the associations in the data. This causes the testing error to increase again as k grows, matching our intuition of the bias-variance trade-off producing a U-shaped testing error (high, then low, then high again).
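For concreteness, with the $n = 384$ observations in the training half of the split, the degrees of freedom at the selected $k$ works out to

$$ df = \frac{n}{k} = \frac{384}{15} \approx 25.6 $$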

c) [15 pts] Suppose we do not have access to Diab.test data. Thus, we need to further split the training data into train and validation data to tune k. For this question, use the caret package to complete the tuning. You are required to

  • Train the knn model with cross-validation using the train() function.
    • Specify the type of cross-validation using the trainControl() function. We need to use three-fold cross-validation.
    • Specify a grid of tuning parameters. This can be done using expand.grid(k = c(1:20)).
  • Report the best parameter with its error. Compare it with your k in b).

Since this report is generated with Python, sklearn's KFold module was used as an alternative to the caret package. This module generates the splits we need to perform k-fold cross-validation. For every k, we iterate over these splits, using two of them for training and one for validation, until every split has been used for validation once.

In [4]:
k_scores = {}
kf = KFold(n_splits=3, random_state=None, shuffle=False)
# for all k's
for k in ks:
    train_scores = []
    test_scores = []
    # for all splits, fit the model and store train and test scores
    for train_index, test_index in kf.split(pima_train_X):
        X_train, X_test = pima_train_X[train_index], pima_train_X[test_index]
        Y_train, Y_test = pima_train_Y[train_index], pima_train_Y[test_index]
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train, Y_train)
        train_scores.append(neigh.score(X_train, Y_train))
        test_scores.append(neigh.score(X_test, Y_test))
    # store the averaged train and test scores across all splits
    result = {'train': np.mean(train_scores), 'test': np.mean(test_scores)}
    k_scores[k] = result
    
train_results_2 = [1-k_scores[k]['train'] for k in k_scores]
test_results_2 = [1-k_scores[k]['test'] for k in k_scores]
plot_helper(train_results_2, test_results_2, ks, 'Training and Testing Errors over K, KFold CV')
best_index_2 = np.argmin(test_results_2)
best_k2 = ks[best_index_2]
best_score2 = 1-test_results_2[best_index_2]
print(f'KFold CV found {best_k2} (df={pima_test_X.shape[0]/best_k2}) to be best k with testing error: {100.0-best_score2*100.0}%')
KFold CV found 8 (df=48.0) to be best k with testing error: 26.302083333333343%

When using k-fold CV, the best value for k was found to be 8 (as opposed to 15 without CV). This gives a cross-validated error of 26.3%, which is higher than the testing error found without CV (23.17%). This makes sense, because k-fold CV should give a more realistic estimate of what our testing performance will be.
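As a side note, the same tuning could be written more compactly with sklearn's GridSearchCV (imported above but otherwise unused). Below is a minimal sketch, not part of the original analysis, assuming the same pima_train_X / pima_train_Y arrays and the same 3-fold, unshuffled splits:

```python
# Sketch: tune k over 1..20 with 3-fold CV using GridSearchCV
param_grid = {'n_neighbors': list(range(1, 21))}
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid,
                    scoring='accuracy',
                    cv=KFold(n_splits=3, shuffle=False))
grid.fit(pima_train_X, pima_train_Y)
print(f"GridSearchCV best k: {grid.best_params_['n_neighbors']}, "
      f"CV error: {100 * (1 - grid.best_score_):.2f}%")
```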

Question 2 [40 Points] Write your own KNN for regression

a. [10 pts] Generate $p=5$ independent standard Normal covariates $X_1, X_2, X_3, X_4, X_5$ of $n = 1000$ independent observations. Then, generate $Y$ from the regression model $$ Y = X_1 + 0.5 \times X_2 - X_3 + \epsilon,$$ with i.i.d. standard normal error $\epsilon$. Make sure to set a random seed 1 for reproducibility.

  • Use a KNN implementation from an existing package. Report the mean squared error (MSE) for your prediction with k = 5. Use the first 500 observations as the training data and the rest as testing data. Predict the $Y$ values using your KNN function with k = 5. Mean squared error is $$\frac{1}{N}\sum_i (y_i - \widehat y_i)^2$$. This question also helps you validate your own function in b); a) and b) are expected to have similar (possibly not identical) results.
  • Hints: this is a regression problem instead of a classification one.

Since this report is generated with Python, KNeighborsRegressor from sklearn was used as the implementation from an existing package.

In [5]:
# generating the data
n = 1000
p = 5
k = 5
data = []
np.random.seed(1)
for _ in range(p):
    data.append(np.random.normal(size=(n)))
X = np.array(data).T
beta = np.array([1,0.5,-1,0,0])
epsilon = np.random.normal(size=(n))
y = X @ beta + epsilon
X_train = X[:int(n/2),:]
Y_train = y[:int(n/2)]
X_test = X[int(n/2):,:]
Y_test = y[int(n/2):]

# custom helper function to find mse
def mse(actual, preds):
    return np.sum(np.power((actual-preds),2)) / len(preds)

# model fitting
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=k)
neigh.fit(X_train, Y_train)
Y_train_pred = neigh.predict(X_train)
Y_test_pred = neigh.predict(X_test)
mse_train = mse(Y_train, Y_train_pred)
mse_test = mse(Y_test, Y_test_pred)
print(f'Training MSE: {mse_train}\nTesting MSE: {mse_test}')
Training MSE: 0.9614093562145081
Testing MSE: 1.2345716288770519

The training MSE was 0.961 and the testing MSE (prediction MSE) was 1.235. As expected, the model does better on the training data than the testing data. After all, it is fitted on the training data.

b. [30 pts] For this question, you cannot use (load) any additional R package. Write your own function myknn(xtrain, ytrain, xtest, k) that fits a KNN model and predict multiple target points xtest. The function should return a variable ytest.

  • Here, xtrain is the training dataset covariate value, ytrain is the training data outcome, and k is the number of nearest neighbors. ytest is the prediction on xtest.
  • Use Euclidean distance to calculate the closeness between two points.
  • Test your code by reporting the mean square error on the testing data.
In [6]:
class MyKNN:
    def __init__(self, k):
        self.k = k

    def fit(self, X, Y):
        self.X = X
        self.Y = Y
        
    def predict(self, X_to_preds):
        # 1. for every point to predict
        preds = []
        for X_to_pred in X_to_preds:
            # 2. calculate the distance between that point and all training points
            distances = np.linalg.norm((X_to_pred - self.X), axis=1)
            # 3. find k smallest distances (https://stackoverflow.com/questions/34226400/find-the-index-of-the-k-smallest-values-of-a-numpy-array_)
            idx = np.argpartition(distances, self.k)
            smallest_idx = idx[:self.k]
            # 4. calculate the mean of the label of those closest points
            preds.append(np.mean(self.Y[smallest_idx]))
        return np.array(preds)
    
myKNN = MyKNN(k)
myKNN.fit(X_train, Y_train)
Y_train_pred = myKNN.predict(X_train)
Y_test_pred = myKNN.predict(X_test)
mse_train = mse(Y_train, Y_train_pred)
mse_test = mse(Y_test, Y_test_pred)
print(f'Training MSE: {mse_train}\nTesting MSE: {mse_test}')
Training MSE: 0.9614093562145081
Testing MSE: 1.2345716288770519

The training MSE was 0.961 and the testing MSE (prediction MSE) was 1.235. These are the same results as the sklearn implementation, a good sign that the code is behaving as expected.
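For completeness, the class above can be wrapped in a function matching the myknn(xtrain, ytrain, xtest, k) signature requested in the question. A minimal sketch (it simply delegates to MyKNN, so it contains no separate logic):

```python
# Thin wrapper around the MyKNN class above, matching the requested interface
def myknn(xtrain, ytrain, xtest, k):
    model = MyKNN(k)
    model.fit(xtrain, ytrain)
    return model.predict(xtest)

# e.g. myknn(X_train, Y_train, X_test, 5) reproduces Y_test_pred from above
```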

Question 3 [30 Points] Curse of Dimensionality

Let's consider a high-dimensional setting. Keep the data-generating model the same as question 2. In addition to the outcomes and covariates from question 2, we will also generate 95 more noisy variables to make p = 100. In this question, you can use a KNN function from any existing package.

We consider two different settings to generate that additional set of 95 covariates. Make sure to set random seeds for reproducibility.

Fit KNN in both settings (with the total of 100 covariates) and select the best $k$ value. Answer the following questions.

  • Generate another 95-dimensional covariates with all independent standard Gaussian entries.
In [7]:
# generate covariates and add them to what we already have
more_covariates = []
for _ in range(95):
    more_covariates.append(np.random.normal(size=(n)))
more_covariates = np.array(more_covariates).T
X1 = np.hstack((X, more_covariates))
# generate beta and noise to generate y
beta1 = np.concatenate((beta, np.zeros(95)))
np.random.seed(1)
epsilon1 = np.random.normal(size=(1000))
Y1 = X1 @ beta1 + epsilon1
#split
X1_train = X1[:int(n/2),:]
Y1_train = Y1[:int(n/2)]
X1_test = X1[int(n/2):,:]
Y1_test = Y1[int(n/2):]
#experiment
train_errors1 = []
test_errors1 = []

for k in ks:
    neigh1 = KNeighborsRegressor(n_neighbors=k)
    neigh1.fit(X1_train, Y1_train)
    Y_train_pred = neigh1.predict(X1_train)
    Y_test_pred = neigh1.predict(X1_test)
    train_errors1.append(mse(Y_train_pred, Y1_train))
    test_errors1.append(mse(Y_test_pred, Y1_test))
    
plot_helper(train_errors1, test_errors1, ks, "Training and Testing Errors for Setting 1", "MSE")
best_index_1 = np.argmin(test_errors1)
best_k1 = ks[best_index_1]
best_score1 = test_errors1[best_index_1]
print(f'Setting 1 found {best_k1} to be best k with testing error: {best_score1}')
Setting 1 found 13 to be best k with testing error: 3.9615008868605694
  • Generate another 95-dimensional covariates using the formula $X^T A$, where $X$ is the original 5-dimensional vector, and $A$ is a $5 \times 95$ dimensional (fixed) matrix that remains the same for all observations. You should generate $A$ just once using i.i.d. uniform $[0, 1]$ entries and then apply $A$ to your current 5-dimensional data.
In [8]:
# generate covariates and add them to what we already have
A = np.random.uniform(size=(5,95))
more_covariates2 = X @ A
X2 = np.hstack((X, more_covariates2))
# generate beta and noise to generate y
beta2 = np.concatenate((beta, np.zeros(95)))
np.random.seed(1)
epsilon2 = np.random.normal(size=(1000))
Y2 = X2 @ beta2 + epsilon2
# splits
X2_train = X2[:int(n/2),:]
Y2_train = Y2[:int(n/2)]
X2_test = X2[int(n/2):,:]
Y2_test = Y2[int(n/2):]
# experiment
train_errors2 = []
test_errors2 = []

for k in ks:
    neigh2 = KNeighborsRegressor(n_neighbors=k)
    neigh2.fit(X2_train, Y2_train)
    Y_train_pred = neigh2.predict(X2_train)
    Y_test_pred = neigh2.predict(X2_test)
    train_errors2.append(mse(Y_train_pred, Y2_train))
    test_errors2.append(mse(Y_test_pred, Y2_test))

plot_helper(train_errors2, test_errors2, ks, "Training and Testing Errors for Setting 2", "MSE")
best_index_2 = np.argmin(test_errors2)
best_k2 = ks[best_index_2]
best_score2 = test_errors2[best_index_2]
print(f'Setting 2 found {best_k2} to be best k with testing error: {best_score2}')
Setting 2 found 4 to be best k with testing error: 1.0404314402313848

a. [10 pts] For the first setting, what is the best $k$ and the best mean squared error for prediction?
Setting 1 found 13 to be the best $k$, with an MSE of 3.961 for prediction.

b. [10 pts] For the second setting, what is the best $k$ and the best mean squared error for prediction?
Setting 2 found 4 to be the best $k$, with an MSE of 1.04 for prediction.

c. [10 pts] In which setting $k$NN performs better? Why?
Setting 2 performed better. The first setting introduced 95 genuinely new, independent noise variables, while the second setting introduced variables that are simply linear combinations of the original 5. The first setting therefore suffers from the curse of dimensionality: as we add variables and move into a higher-dimensional space, it becomes harder and harder to discern the closeness of one point to another (all points appear roughly equidistant). This undermines the very basis of KNN, which assigns predictions based on the closest points; if all pairwise distances look similar in high dimensions, the nearest neighbors are essentially arbitrary. The second setting does not suffer from this because, while we do add 95 new covariates, they are linear combinations of the original ones, so the data still lies on a low-dimensional underlying structure (a manifold) that is easier to learn. This is similar to why KNN does not perform poorly when classifying MNIST data despite its high nominal dimension.
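A quick, illustrative check of this claim (an aside, assuming the X1 and X2 matrices constructed above): the 100-column design in setting 2 still spans only a 5-dimensional subspace, whereas setting 1's columns are genuinely 100-dimensional.

```python
# Illustrative: setting 1 adds independent noise columns (full column rank),
# while setting 2's extra columns are linear combinations of the original 5.
print(np.linalg.matrix_rank(X1))  # 100
print(np.linalg.matrix_rank(X2))  # 5
```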