Grid Search

Find the best model settings automatically

🔍 Understanding Grid Search

Grid search automatically tests different combinations of model settings to find the best performing configuration.


# Minimal grid search: tune an SVM classifier on a small synthetic dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Synthetic classification data: 100 samples with 4 features each
X, y = make_classification(n_samples=100, n_features=4, random_state=42)

# Candidate values per hyperparameter; every combination is tried
# (3 values of C x 2 kernels = 6 candidates)
param_grid = dict(C=[0.1, 1, 10], kernel=['linear', 'rbf'])

# Evaluate each candidate with 3-fold cross-validation
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=3)
grid_search.fit(X, y)

# Report the winning combination and its cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
                                    
Auto Tuning
Best Parameters
Cross Validation
Key Concepts

Key Concepts

⚙️

Hyperparameters

Settings that control how the model learns

Learning Rate, Regularization
🔄

Cross Validation

Tests each combination on multiple data splits

Reliable, Unbiased
🏆

Best Model

Automatically selects highest scoring combination

Optimal Performance
📊

Exhaustive Search

Tests every possible combination systematically

Thorough, Complete
Step-by-Step Implementation

🔹 Step 1: Basic Grid Search

Find best parameters for a simple classifier


from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Load the iris dataset and hold out 30% of it as a test set
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=42,
)

# Hyperparameter candidates: 3 x 3 x 3 = 27 combinations
param_grid = dict(
    n_estimators=[10, 50, 100],      # number of trees in the forest
    max_depth=[3, 5, None],          # maximum depth of each tree
    min_samples_split=[2, 5, 10],    # minimum samples needed to split a node
)

# Score every combination by 5-fold cross-validated accuracy on the training split
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning settings, one per line, followed by their CV score
print("Best parameters found:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")

🔹 Step 2: Understanding the Results

Analyze all tested combinations


import pandas as pd

# cv_results_ holds one entry per tested parameter combination
results = grid_search.cv_results_

# Load the results into a DataFrame and keep the 5 rows with the
# highest mean cross-validation score
results_df = pd.DataFrame(results)
top_results = results_df.nlargest(5, 'mean_test_score')

print("Top 5 parameter combinations:")
ranked = zip(top_results['mean_test_score'], top_results['params'])
for rank, (score, params) in enumerate(ranked, start=1):
    print(f"{rank}. Score: {score:.3f}")
    print(f"   Parameters: {params}")
    print()

# Evaluate the best estimator found by the search on the held-out test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test set accuracy: {test_score:.3f}")

🔹 Step 3: Different Scoring Metrics

Optimize for different objectives


# Grid search with different scoring metrics, evaluated in a single run.
#
# Passing a list to `scoring` makes GridSearchCV compute every metric on the
# same cross-validation fits, so the grid is fitted once instead of once per
# metric.
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring=scoring_metrics,
    # With several metrics, `refit` must name the one that selects the final
    # model; use the last metric so grid_search.best_* refers to it.
    refit=scoring_metrics[-1],
)
grid_search.fit(X_train, y_train)

# Per-metric results live under "mean_test_<metric>" / "rank_test_<metric>"
metric_results = grid_search.cv_results_

for metric in scoring_metrics:
    # Rank 1 marks the best candidate; argmin picks the first one on ties
    best_index = metric_results[f"rank_test_{metric}"].argmin()
    best_score = metric_results[f"mean_test_{metric}"][best_index]
    best_params = metric_results['params'][best_index]

    print(f"Best {metric}: {best_score:.3f}")
    print(f"Best params for {metric}: {best_params}")
    print()

# Different metrics may give different "best" parameters
# Choose based on your specific needs

🔹 Step 4: Efficient Grid Search

Speed up search with smart strategies


import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# When the parameter space is too large to enumerate, RandomizedSearchCV
# evaluates a random sample of combinations rather than all of them.

large_param_grid = dict(
    n_estimators=np.arange(10, 200, 10),     # 10, 20, ..., 190 -> 19 values
    max_depth=np.arange(1, 20),              # 1..19 -> 19 values
    min_samples_split=np.arange(2, 20),      # 2..19 -> 18 values
    min_samples_leaf=np.arange(1, 10),       # 1..9  ->  9 values
)
# Size of the full grid: 19 × 19 × 18 × 9 = 58,482 combinations

# Sample only 50 of those combinations, each scored by 3-fold CV accuracy
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=large_param_grid,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    random_state=42,
)

random_search.fit(X_train, y_train)
print("Random search best score:", random_search.best_score_)
print("Random search best params:", random_search.best_params_)

# Random search often reaches good results much faster than a full grid search

🔹 Step 5: Real-World Example

Tuning a model for customer churn prediction


# Imports repeated here so this example runs on its own
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Simulate customer data
np.random.seed(42)
n_customers = 1000

# Features: [monthly_charges, total_charges, contract_length, support_calls]
X_customers = np.random.rand(n_customers, 4)
X_customers[:, 0] *= 100  # Monthly charges: 0-100
X_customers[:, 1] *= 5000  # Total charges: 0-5000
X_customers[:, 2] *= 24   # Contract length: 0-24 months
X_customers[:, 3] *= 10   # Support calls: 0-10

# Create churn labels: customers with more than 5 support calls or monthly
# charges above 80 are flagged (boolean mask); flagged customers churn with
# probability 0.7, all others with probability 0
churn_prob = (X_customers[:, 3] > 5) | (X_customers[:, 0] > 80)
y_churn = np.random.binomial(1, churn_prob * 0.7, n_customers)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_customers, y_churn, test_size=0.3, random_state=42
)

# Grid search for churn prediction.
# The features span very different ranges (0-10 up to 0-5000), and the 'saga'
# solver only converges quickly on similarly scaled features, so standardize
# them first. Doing it inside a Pipeline means the scaler is fitted on the
# training part of each CV fold only.
churn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=42, max_iter=1000)),
])

# Pipeline parameters are addressed as "<step name>__<parameter>"
churn_param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],      # Regularization strength
    'clf__penalty': ['l1', 'l2'],           # Regularization type
    'clf__solver': ['liblinear', 'saga']    # Optimization algorithm
}

churn_grid = GridSearchCV(
    churn_pipeline,
    churn_param_grid,
    cv=5,
    scoring='roc_auc'  # Good for imbalanced data
)

churn_grid.fit(X_train, y_train)

print("Best parameters for churn prediction:")
print(churn_grid.best_params_)
print(f"Best ROC-AUC score: {churn_grid.best_score_:.3f}")

# Use best model for predictions (the pipeline scales X_test before predicting)
best_churn_model = churn_grid.best_estimator_
churn_predictions = best_churn_model.predict_proba(X_test)[:, 1]

# Identify high-risk customers: predicted churn probability above 0.7
high_risk = X_test[churn_predictions > 0.7]
print(f"Found {len(high_risk)} high-risk customers for retention campaigns")

🧠 Test Your Knowledge

What does grid search do?

Why use RandomizedSearchCV instead of GridSearchCV?