K-Nearest Neighbors (KNN)
Learn the simplest and most intuitive machine learning algorithm
🎯 What is K-Nearest Neighbors?
KNN is a simple algorithm that makes predictions based on the 'k' closest data points. It's like asking your k nearest neighbors for advice: for classification the majority vote wins, and for regression their values are averaged. No complex math required - just find the closest points and see what they say.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Create sample data: 1000 points, 2 features, 2 classes.
# make_classification defaults to n_informative=2 and n_redundant=2; with
# n_features=2 that sum exceeds the feature count and raises ValueError, so
# both values are set explicitly here.
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)
# Seed the split so the printed accuracy is reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create and train KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions
predictions = knn.predict(X_test)
accuracy = knn.score(X_test, y_test)
print(f"Accuracy: {accuracy:.3f}")
KNN Key Concepts
K Neighbors
Number of closest points to consider
# Try different k values: each pass fits a fresh classifier that consults
# k neighbors (the loop body must be indented to be valid Python)
for k in [1, 3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
Distance Metrics
How to measure closeness
# One classifier per distance measure, built in a single unpacking step
knn_euclidean, knn_manhattan = (
    KNeighborsClassifier(metric=metric_name)
    for metric_name in ("euclidean", "manhattan")
)
Voting
How neighbors make decisions
# One classifier per voting scheme: equal votes vs. distance-weighted votes
knn_uniform, knn_distance = (
    KNeighborsClassifier(weights=scheme)
    for scheme in ("uniform", "distance")
)
Both Tasks
Classification and regression
from sklearn.neighbors import KNeighborsRegressor
# For continuous predictions: the regressor counterpart of the classifier,
# configured here to consult the 5 nearest neighbors
knn_reg = KNeighborsRegressor(n_neighbors=5)
🔹 Basic KNN Classification
Start with a simple classification example
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Load the famous iris dataset and separate features from labels
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 30% of the samples for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Build the 3-neighbor classifier and fit it in one expression; fit()
# returns the estimator, and for KNN "training" just stores the data
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict the held-out labels, then measure accuracy
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Per-class breakdown of the predictions
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Classify one new flower: sepal length, sepal width, petal length, petal width
new_flower = [[5.1, 3.5, 1.4, 0.2]]
prediction = knn.predict(new_flower)
print(f"\nNew flower prediction: {iris.target_names[prediction[0]]}")
🔹 Choosing the Right K
Find the optimal number of neighbors
import matplotlib.pyplot as plt
import numpy as np
# Test different k values
k_values = range(1, 21)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Calculate scores
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(k_values, train_scores, 'bo-', label='Training Accuracy')
plt.plot(k_values, test_scores, 'ro-', label='Testing Accuracy')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.title('KNN: Choosing the Right K')
plt.legend()
plt.grid(True)
plt.show()

# Find best k. np.argmax returns the first maximum, so ties go to the
# smallest k.
# NOTE: picking k by test accuracy makes the reported score optimistic; in a
# real project select k on a validation set or with cross-validation.
best_k = k_values[np.argmax(test_scores)]
best_score = max(test_scores)
print(f"Best k: {best_k}")
print(f"Best test accuracy: {best_score:.3f}")

# Rule of thumb: try k = sqrt(n_samples)
suggested_k = int(np.sqrt(len(X_train)))
print(f"Suggested k (sqrt rule): {suggested_k}")
🔹 KNN for Regression
Use KNN to predict continuous values
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score
# Create regression dataset: one feature, Gaussian noise on the target
X_reg, y_reg = make_regression(n_samples=1000, n_features=1, noise=10, random_state=42)

# Split data
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42
)

# Create KNN regressor
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(X_train_reg, y_train_reg)

# Make predictions
y_pred_reg = knn_reg.predict(X_test_reg)

# Evaluate
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.3f}")

# Visualize results (for 1D data)
plt.figure(figsize=(10, 6))
plt.scatter(X_test_reg, y_test_reg, alpha=0.5, label='True values')
plt.scatter(X_test_reg, y_pred_reg, alpha=0.5, label='Predictions')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.title('KNN Regression Results')
plt.legend()
plt.show()

# Compare different k values for regression; the regressor's score() is R²,
# matching the label printed below
k_values = [1, 3, 5, 10, 20]
for k in k_values:
    knn_temp = KNeighborsRegressor(n_neighbors=k)
    knn_temp.fit(X_train_reg, y_train_reg)
    score = knn_temp.score(X_test_reg, y_test_reg)
    print(f"k={k}: R² = {score:.3f}")
🔹 Distance Metrics and Weights
Customize how KNN measures similarity
# Compare different distance metrics with k held at 5
distance_metrics = ['euclidean', 'manhattan', 'chebyshev']
print("Comparing Distance Metrics:")
for metric in distance_metrics:
    knn = KNeighborsClassifier(n_neighbors=5, metric=metric)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    print(f"{metric.capitalize()}: {score:.3f}")

print("\nComparing Weighting Schemes:")
# Compare uniform vs distance weighting with k held at 5
weights = ['uniform', 'distance']
for weight in weights:
    knn = KNeighborsClassifier(n_neighbors=5, weights=weight)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    print(f"{weight.capitalize()} weights: {score:.3f}")
# Custom distance function example
def custom_distance(x1, x2):
    """Return the sum of absolute coordinate differences between two vectors.

    Demonstration only: this is the Manhattan (L1) distance.

    Args:
        x1: First vector, as a NumPy array.
        x2: Second vector, as a NumPy array of the same shape as ``x1``.

    Returns:
        The L1 distance as a NumPy scalar.
    """
    return np.sum(np.abs(x1 - x2))
# Note: sklearn accepts a callable directly, e.g.
# KNeighborsClassifier(metric=custom_distance). Its documentation warns this
# is less efficient than passing a built-in metric name as a string.
# Summarize the two weighting schemes in a single call; sep="\n" puts each
# message on its own line
print(
    "\nDistance weighting gives closer neighbors more influence!",
    "Uniform weighting treats all k neighbors equally.",
    sep="\n",
)
🔹 Feature Scaling for KNN
Why scaling is crucial for distance-based algorithms
⚠️ Important: Always Scale Your Features!
KNN uses distance calculations, so features with larger scales will dominate. A feature ranging 0-1000 will overshadow one ranging 0-1.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.datasets import make_classification
# Create dataset with different scales.
# n_informative/n_redundant are explicit because the make_classification
# defaults (2 informative + 2 redundant) exceed n_features=2 and raise
# ValueError.
X_unscaled, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)
# Make one feature much larger
X_unscaled[:, 1] = X_unscaled[:, 1] * 1000  # Scale second feature by 1000
X_train, X_test, y_train, y_test = train_test_split(
    X_unscaled, y, test_size=0.3, random_state=42
)
print("Feature ranges before scaling:")
print(f"Feature 1: {X_train[:, 0].min():.2f} to {X_train[:, 0].max():.2f}")
print(f"Feature 2: {X_train[:, 1].min():.2f} to {X_train[:, 1].max():.2f}")

# Test without scaling
knn_unscaled = KNeighborsClassifier(n_neighbors=5)
knn_unscaled.fit(X_train, y_train)
score_unscaled = knn_unscaled.score(X_test, y_test)

# Test with StandardScaler. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into it.
scaler_std = StandardScaler()
X_train_std = scaler_std.fit_transform(X_train)
X_test_std = scaler_std.transform(X_test)
knn_std = KNeighborsClassifier(n_neighbors=5)
knn_std.fit(X_train_std, y_train)
score_std = knn_std.score(X_test_std, y_test)

# Test with MinMaxScaler (same fit-on-train, transform-test pattern)
scaler_minmax = MinMaxScaler()
X_train_minmax = scaler_minmax.fit_transform(X_train)
X_test_minmax = scaler_minmax.transform(X_test)
knn_minmax = KNeighborsClassifier(n_neighbors=5)
knn_minmax.fit(X_train_minmax, y_train)
score_minmax = knn_minmax.score(X_test_minmax, y_test)

print("\nResults:")
print(f"No scaling: {score_unscaled:.3f}")
print(f"StandardScaler: {score_std:.3f}")
print(f"MinMaxScaler: {score_minmax:.3f}")
print("\nAfter scaling, features have similar ranges!")
print(f"Std scaled - Feature 1: {X_train_std[:, 0].min():.2f} to {X_train_std[:, 0].max():.2f}")
print(f"Std scaled - Feature 2: {X_train_std[:, 1].min():.2f} to {X_train_std[:, 1].max():.2f}")
🔹 KNN Pros and Cons
Understanding when to use KNN
✅ KNN Advantages:
- Simple to understand - No complex math
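- No distributional assumptions - Non-parametric, so it does not assume the data follow a particular distribution (it only relies on nearby points being similar)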
- Works for both classification and regression
- Adapts to new data - Just add more points
- Good baseline - Quick to try and compare
❌ KNN Disadvantages:
- Slow predictions - Must check all training data
- Memory intensive - Stores all training data
- Sensitive to irrelevant features - Curse of dimensionality
- Sensitive to scale - Requires feature scaling
- Poor with imbalanced data - Majority class dominates
# Demonstrate curse of dimensionality
from sklearn.datasets import make_classification
import time
print("KNN Performance vs Number of Features:")
print("Features | Accuracy | Prediction Time")
print("-" * 35)
for n_features in [2, 10, 50, 100]:
    # Create dataset with varying dimensions. make_classification requires
    # n_informative + n_redundant + n_repeated <= n_features, and its default
    # n_redundant=2 breaks that for the 2- and 10-feature cases, so the
    # redundant count is capped by the features left over.
    n_informative = min(n_features, 10)
    n_redundant = min(2, n_features - n_informative)
    X, y = make_classification(
        n_samples=1000,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        random_state=42,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Scale features (scaler fitted on the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train and time KNN. Only score() is timed, since scoring is where the
    # neighbor search runs; perf_counter is the clock intended for measuring
    # short intervals.
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train_scaled, y_train)
    start_time = time.perf_counter()
    score = knn.score(X_test_scaled, y_test)
    prediction_time = time.perf_counter() - start_time
    print(f"{n_features:8d} | {score:8.3f} | {prediction_time:8.4f}s")

print("\nAs dimensions increase, performance often decreases!")
print("This is the 'curse of dimensionality' for distance-based methods.")