Lab 22: K Nearest Neighbors

Note

In the lab.qmd ## Lab 22 section, use HEIGHT and WAIST to predict GENDER using KNN with \(K = 3\). Generate the (test) confusion matrix. Calculate the (test) accuracy rate. Does using more predictors give better predictions?

R Code

library(tidymodels)
library(readr)  # read_csv() is not attached by tidymodels

## load data
bodydata <- read_csv("./data/body.csv")
body <- bodydata |>
  select(GENDER, HEIGHT, WAIST, BMI) |>
  mutate(GENDER = as.factor(GENDER))

## training and test data
set.seed(2024)
df_split <- initial_split(data = body, prop = 0.8)
df_trn <- training(df_split)
df_tst <- testing(df_split)

## KNN training
knn_recipe <- recipe(GENDER ~ HEIGHT + WAIST, data = df_trn) |>
  step_normalize(all_predictors())
knn_mdl <- nearest_neighbor(neighbors = 3, mode = "classification")
knn_out <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_mdl) |>
  fit(data = df_trn)

## KNN prediction
knn_pred <- pull(predict(knn_out, df_tst))
table(knn_pred, df_tst$GENDER)
mean(knn_pred == df_tst$GENDER)
Python Code
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
## load data
body = pd.read_csv('./data/body.csv')
X = body[['HEIGHT', 'WAIST']]
y = body['GENDER']
## training and test data
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.2, random_state=2024)
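# test_size=0.2 mirrors prop = 0.8 in the R code; random_state fixes the split
# for reproducibility (the R and Python splits still differ, since the two
# random number generators are unrelated)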
## KNN training
knn = KNeighborsClassifier(n_neighbors=3)
X_trn = np.array(X_trn)
X_tst = np.array(X_tst)
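# Unlike the R recipe, which standardizes predictors via step_normalize(),
# the features here are left on their raw scales. A scikit-learn analogue
# (an assumption, not part of the original lab code) would be
# sklearn.preprocessing.StandardScaler, fit on X_trn and applied to both
# X_trn and X_tst before fitting the classifier.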
knn.fit(X_trn, y_trn)
## KNN prediction
y_pred = knn.predict(X_tst)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_tst, y_pred)
np.mean(y_tst == y_pred)
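The note's last question asks whether using more predictors helps. A minimal sketch of one way to check, shown here in Python: it refits the model with BMI added (that column is in the CSV, per the select() call in the R code) and standardizes the features as the R recipe does. The names X_more and knn_more and the use of StandardScaler are illustrative assumptions, not part of the original lab code.

## more predictors: HEIGHT + WAIST + BMI (sketch)
from sklearn.preprocessing import StandardScaler

X_more = body[['HEIGHT', 'WAIST', 'BMI']]
X_trn2, X_tst2, y_trn2, y_tst2 = train_test_split(
    X_more, y, test_size=0.2, random_state=2024)

# standardize features (mirrors step_normalize() in the R recipe)
scaler = StandardScaler().fit(X_trn2)
X_trn2 = scaler.transform(X_trn2)
X_tst2 = scaler.transform(X_tst2)

knn_more = KNeighborsClassifier(n_neighbors=3)
knn_more.fit(X_trn2, y_trn2)
np.mean(y_tst2 == knn_more.predict(X_tst2))  # compare with the two-predictor accuracy

Comparing this accuracy with the two-predictor accuracy above answers the note's question for this particular split; a single train/test split can be noisy, so the comparison is only suggestive.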