## Packages: tidymodels attaches parsnip/recipes/rsample/workflows/etc.,
## but NOT readr — load it explicitly so read_csv() is available.
library(tidymodels)
library(readr)

## Load data: keep the variables used in the lab and make the response
## a factor (required for classification models in parsnip).
bodydata <- read_csv("./data/body.csv")
body <- bodydata |>
  select(GENDER, HEIGHT, WAIST, BMI) |>
  mutate(GENDER = as.factor(GENDER))

## Training and test data: 80/20 split, seeded for reproducibility.
set.seed(2024)
df_split <- initial_split(data = body, prop = 0.8)
df_trn <- training(df_split)
df_tst <- testing(df_split)

## KNN training: normalize predictors — KNN is distance-based, so an
## unscaled predictor with a larger range would dominate the distance.
knn_recipe <- recipe(GENDER ~ HEIGHT + WAIST, data = df_trn) |>
  step_normalize(all_predictors())
knn_mdl <- nearest_neighbor(neighbors = 3, mode = "classification")
knn_out <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_mdl) |>
  fit(data = df_trn)

## KNN prediction: test-set confusion matrix (predicted vs. observed).
knn_pred <- pull(predict(knn_out, df_tst))
table(knn_pred, df_tst$GENDER)
mean(knn_pred == df_tst$GENDER)

Lab 22: K Nearest Neighbors
Note
In the `## Lab 22` section of `lab.qmd`,
use `HEIGHT` and `WAIST` to predict `GENDER` using KNN with \(K = 3\).
Generate the (test) confusion matrix.
Calculate (test) accuracy rate.
Does using more predictors predict better?
R Code
Python Code
## All imports up front (confusion_matrix was previously imported mid-script).
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

## Load data: predictors HEIGHT and WAIST, response GENDER.
body = pd.read_csv('./data/body.csv')
X = body[['HEIGHT', 'WAIST']]
y = body['GENDER']

## Training and test data: 80/20 split, seeded for reproducibility.
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.2, random_state=2024)

## KNN training with K = 3.
## Both splits are converted to plain arrays so fit/predict see the same
## input type — presumably to avoid sklearn's feature-name warning when
## predicting on a DataFrame after fitting on an array (TODO confirm).
knn = KNeighborsClassifier(n_neighbors=3)
X_trn = np.array(X_trn)
X_tst = np.array(X_tst)
knn.fit(X_trn, y_trn)

## KNN prediction: print the confusion matrix and accuracy explicitly —
## bare expressions only display in a notebook and are silently
## discarded when this file is run as a script.
y_pred = knn.predict(X_tst)
print(confusion_matrix(y_tst, y_pred))
print(np.mean(y_tst == y_pred))