Grid Search
Find the best model settings automatically
🔍 Understanding Grid Search
Grid search automatically tests every combination of the model settings you list and reports the best-performing configuration.
# Minimal grid search: tune an SVM classifier on synthetic data
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Synthetic classification data: 100 samples, 4 features (seeded)
X, y = make_classification(n_samples=100, n_features=4, random_state=42)

# Candidate values per hyperparameter: 3 x 2 = 6 combinations in total
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# Score every combination with 3-fold cross-validation
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=3)
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
Key Concepts
Hyperparameters
Settings that control how the model learns
Cross Validation
Tests each combination on multiple data splits
Best Model
Automatically selects the highest-scoring combination
Exhaustive Search
Tests every possible combination systematically
🔹 Step 1: Basic Grid Search
Find best parameters for a simple classifier
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Load the iris data and hold out 30% of it as a test set
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=42,
)

# Candidate values to try: 3 x 3 x 3 = 27 combinations
param_grid = {
    'n_estimators': [10, 50, 100],       # Number of trees
    'max_depth': [3, 5, None],           # Tree depth (None = no limit)
    'min_samples_split': [2, 5, 10],     # Min samples to split
}

# Evaluate each combination by 5-fold cross-validated accuracy
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning settings and their mean cross-validation score
print("Best parameters found:")
for name in grid_search.best_params_:
    setting = grid_search.best_params_[name]
    print(f" {name}: {setting}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")
🔹 Step 2: Understanding the Results
Analyze all tested combinations
import pandas as pd

# cv_results_ holds the scores for every combination that was tried
results = grid_search.cv_results_
results_df = pd.DataFrame(results)

# Keep the five rows with the highest mean cross-validation score
top_results = results_df.nlargest(5, 'mean_test_score')

print("Top 5 parameter combinations:")
scores_and_params = zip(top_results['mean_test_score'], top_results['params'])
for rank, (score, params) in enumerate(scores_and_params, start=1):
    print(f"{rank}. Score: {score:.3f}")
    print(f" Parameters: {params}")
    print()

# Evaluate the refitted best model on the held-out test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test set accuracy: {test_score:.3f}")
🔹 Step 3: Different Scoring Metrics
Optimize for different objectives
# Repeat the same search once per scoring metric and compare the winners
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

for metric in scoring_metrics:
    # A fresh, identically seeded forest for every metric
    forest = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(
        estimator=forest,
        param_grid=param_grid,
        scoring=metric,
        cv=3,
    ).fit(X_train, y_train)

    print(f"Best {metric}: {grid_search.best_score_:.3f}")
    print(f"Best params for {metric}: {grid_search.best_params_}")
    print()

# The "best" parameters can differ from one metric to the next,
# so pick the metric that matches what you actually need to optimize
🔹 Step 4: Efficient Grid Search
Speed up search with smart strategies
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# When the parameter space is large, RandomizedSearchCV samples random
# combinations rather than trying every single one
large_param_grid = {
    'n_estimators': np.arange(10, 200, 10),     # 19 values
    'max_depth': np.arange(1, 20),              # 19 values
    'min_samples_split': np.arange(2, 20),      # 18 values
    'min_samples_leaf': np.arange(1, 10),       # 9 values
}
# Full grid size: 19 × 19 × 18 × 9 = 58,482 combinations!

# Random search evaluates only a subset of them
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=large_param_grid,
    n_iter=50,              # Try just 50 randomly drawn combinations
    scoring='accuracy',
    cv=3,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Random search best score:", random_search.best_score_)
print("Random search best params:", random_search.best_params_)
# Usually reaches good results far sooner than an exhaustive grid search
🔹 Step 5: Real-World Example
Tuning a model for customer churn prediction
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Simulate customer data (seeded so the example is reproducible)
np.random.seed(42)
n_customers = 1000

# Features: [monthly_charges, total_charges, contract_length, support_calls]
X_customers = np.random.rand(n_customers, 4)
X_customers[:, 0] *= 100    # Monthly charges: 0-100
X_customers[:, 1] *= 5000   # Total charges: 0-5000
X_customers[:, 2] *= 24     # Contract length: 0-24 months
X_customers[:, 3] *= 10     # Support calls: 0-10

# Create churn labels: customers with more than 5 support calls or monthly
# charges above 80 are flagged as at-risk (boolean mask). At-risk customers
# churn with probability 0.7; everyone else has probability 0.
churn_prob = (X_customers[:, 3] > 5) | (X_customers[:, 0] > 80)
y_churn = np.random.binomial(1, churn_prob * 0.7, n_customers)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_customers, y_churn, test_size=0.3, random_state=42
)

# The features live on very different scales (0-10 up to 0-5000). The L1/L2
# penalty and the 'saga' solver are both sensitive to feature scale, so
# standardize inside a Pipeline. The scaler is then re-fitted on the training
# part of every CV fold, which avoids leaking validation data into it.
churn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=42, max_iter=1000)),
])

# Grid search for churn prediction.
# Pipeline parameters are addressed as '<step name>__<parameter>'.
churn_param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],       # Regularization strength
    'clf__penalty': ['l1', 'l2'],            # Regularization type
    'clf__solver': ['liblinear', 'saga'],    # Optimization algorithm
}
churn_grid = GridSearchCV(
    churn_pipeline,
    churn_param_grid,
    cv=5,
    scoring='roc_auc',  # Good for imbalanced data
)
churn_grid.fit(X_train, y_train)

print("Best parameters for churn prediction:")
print(churn_grid.best_params_)
print(f"Best ROC-AUC score: {churn_grid.best_score_:.3f}")

# Use best model for predictions; the fitted pipeline scales X_test with the
# training-set statistics before the classifier sees it.
best_churn_model = churn_grid.best_estimator_
churn_predictions = best_churn_model.predict_proba(X_test)[:, 1]

# Identify high-risk customers (predicted churn probability above 0.7)
high_risk = X_test[churn_predictions > 0.7]
print(f"Found {len(high_risk)} high-risk customers for retention campaigns")