Bootstrap Aggregation (Bagging)
Improve model performance by combining multiple models
🎒 Understanding Bagging
Bootstrap Aggregation (Bagging) trains multiple models on different bootstrap samples of the data (random subsets drawn with replacement) and combines their predictions. It is like asking multiple experts and taking the majority vote or the average opinion.
from sklearn.ensemble import BaggingClassifier

# Build an ensemble of 10 estimators and fit it in a single expression;
# fit() hands back the fitted estimator itself.
# X_train / y_train are assumed to exist already (they are created in Step 2).
bagging = BaggingClassifier(n_estimators=10).fit(X_train, y_train)
Why Use Bagging?
Bagging provides several advantages over single models:
Better Accuracy
Combines multiple predictions
Reduces Overfitting
Less sensitive to noise
Parallel Training
Models train independently
Works with Any Model
Can bag any base estimator
🔹 Step 1: Understanding Bootstrap Sampling
Bootstrap creates multiple datasets by sampling with replacement
import numpy as np

# Original dataset
data = [1, 2, 3, 4, 5]
print("Original data:", data)

# Create bootstrap samples: each one is the same size as the original data and
# is drawn with replacement, so values may repeat or be left out entirely.
np.random.seed(42)  # fixed seed so the printed samples are reproducible
for i in range(3):
    bootstrap_sample = np.random.choice(data, size=len(data), replace=True)
    print(f"Bootstrap sample {i+1}: {bootstrap_sample}")
🔹 Step 2: Simple Bagging Example
Train multiple decision trees on different data samples
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic 4-feature classification problem, split 70/30 into train and test.
X, y = make_classification(n_samples=1000, n_features=4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Ensemble of 10 decision trees; the seed keeps the run reproducible.
base_tree = DecisionTreeClassifier()
bagging = BaggingClassifier(estimator=base_tree, n_estimators=10, random_state=42)

# Fit the whole ensemble on the training split.
bagging.fit(X_train, y_train)
print("Bagging model trained with 10 decision trees")
🔹 Step 3: Compare Single vs Bagged Model
See how bagging improves performance
from sklearn.metrics import accuracy_score

# Baseline: one decision tree fitted on the training split.
single_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the test split with the lone tree and with the bagged ensemble
# (`bagging` is the model trained in the previous step).
single_pred = single_tree.predict(X_test)
bagged_pred = bagging.predict(X_test)

# Score both sets of predictions against the true test labels.
single_accuracy = accuracy_score(y_test, single_pred)
bagged_accuracy = accuracy_score(y_test, bagged_pred)
improvement = bagged_accuracy - single_accuracy

print(f"Single tree accuracy: {single_accuracy:.3f}")
print(f"Bagged trees accuracy: {bagged_accuracy:.3f}")
print(f"Improvement: {improvement:.3f}")
🔹 Step 4: Bagging with Different Base Models
Try bagging with various algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Bagged 3-nearest-neighbour classifiers.
knn_base = KNeighborsClassifier(n_neighbors=3)
bagging_knn = BaggingClassifier(estimator=knn_base, n_estimators=10, random_state=42)

# Bagged SVMs; fewer estimators because each SVM is slower to train.
svm_base = SVC(probability=True)
bagging_svm = BaggingClassifier(estimator=svm_base, n_estimators=5, random_state=42)

# Fit each ensemble on the same training split.
for ensemble in (bagging_knn, bagging_svm):
    ensemble.fit(X_train, y_train)

# Score each ensemble on the held-out test split.
knn_score = bagging_knn.score(X_test, y_test)
svm_score = bagging_svm.score(X_test, y_test)
print(f"Bagged KNN accuracy: {knn_score:.3f}")
print(f"Bagged SVM accuracy: {svm_score:.3f}")
🔹 Step 5: Random Forest (Advanced Bagging)
Random Forest is bagging applied to decision trees, plus random feature selection: each split considers only a random subset of the features.
from sklearn.ensemble import RandomForestClassifier

# Random Forest: 10 trees, with max_features='sqrt' adding feature randomness.
rf = RandomForestClassifier(n_estimators=10, max_features='sqrt', random_state=42)

# Plain bagging of decision trees with the same ensemble size and seed.
tree_base = DecisionTreeClassifier()
bagging_trees = BaggingClassifier(estimator=tree_base, n_estimators=10, random_state=42)

# Fit both ensembles on the same training split.
for ensemble in (rf, bagging_trees):
    ensemble.fit(X_train, y_train)

# Compare the two on the held-out test split.
rf_score = rf.score(X_test, y_test)
bagging_score = bagging_trees.score(X_test, y_test)
print(f"Random Forest accuracy: {rf_score:.3f}")
print(f"Regular Bagging accuracy: {bagging_score:.3f}")
🔹 Step 6: Tuning Bagging Parameters
Optimize the number of estimators and sampling
# Test different numbers of estimators
n_estimators_range = [5, 10, 20, 50]
scores = []
for n_est in n_estimators_range:
    # Same base model and seed on every pass, so only the ensemble size varies.
    bagging = BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=n_est,
        random_state=42,
    )
    bagging.fit(X_train, y_train)
    score = bagging.score(X_test, y_test)
    scores.append(score)
    print(f"n_estimators={n_est}: accuracy={score:.3f}")

# Find best: np.argmax returns the first index of the highest score, so on a
# tie the smaller ensemble wins (the candidate list is in ascending order).
# NOTE(review): the scores come from the test split, so this choice is
# optimistic; a separate validation split or cross-validation would give a
# fairer estimate.
best_n = n_estimators_range[np.argmax(scores)]
print(f"\nBest n_estimators: {best_n}")