Cross Validation
Learn how to properly evaluate your machine learning models
🎯 What is Cross Validation?
Cross validation is a technique for estimating how well your machine learning model will perform on unseen data. Instead of relying on a single train-test split, it divides your data into multiple folds, then trains and evaluates the model once per fold, with each fold serving as the test set exactly once. Averaging the per-fold scores gives a more reliable estimate.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
# Generate a synthetic classification dataset (seeded for reproducibility)
X, y = make_classification(
    n_samples=1000,
    n_features=4,
    random_state=42,
)

# The estimator to evaluate
model = LogisticRegression()

# Score the model with 5-fold cross validation
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)

print(f"CV Scores: {scores}")
print(f"Average: {scores.mean():.3f}")
Cross Validation Types
K-Fold CV
Split data into K equal parts
from sklearn.model_selection import KFold
# Shuffle before splitting, with a fixed seed so the folds (and therefore the
# scores) are the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kf)
Stratified K-Fold
Maintains class distribution
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified splitter, handed to cross_val_score as the cv strategy
skf = StratifiedKFold(n_splits=5)
scores = cross_val_score(estimator=model, X=X, y=y, cv=skf)
Leave-One-Out
Use each sample as the test set exactly once
from sklearn.model_selection import LeaveOneOut
# Leave-one-out splitter, handed to cross_val_score as the cv strategy
loo = LeaveOneOut()
scores = cross_val_score(estimator=model, X=X, y=y, cv=loo)
Time Series Split
For time-ordered data
from sklearn.model_selection import TimeSeriesSplit
# 5-split time series splitter, handed to cross_val_score as the cv strategy
tss = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(estimator=model, X=X, y=y, cv=tss)
🔹 Basic Cross Validation
The simplest way to perform cross validation using scikit-learn
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Load the iris sample dataset and separate features from labels
iris = load_iris()
X, y = iris.data, iris.target

# Create a model (fixed random_state so every run builds the same forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform 5-fold cross validation
cv_scores = cross_val_score(model, X, y, cv=5)

print("Cross Validation Scores:")
for i, score in enumerate(cv_scores, 1):  # number the folds from 1 for display
    print(f"Fold {i}: {score:.3f}")

print(f"\nMean CV Score: {cv_scores.mean():.3f}")
print(f"Standard Deviation: {cv_scores.std():.3f}")
# This gives us a more reliable estimate than a single train-test split!
🔹 Custom Cross Validation
More control over the cross validation process
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np
# clone is only needed by this snippet, so it is imported here
from sklearn.base import clone

# Create custom K-Fold (shuffled, with a fixed seed for reproducible folds)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Manual cross validation: X, y and model come from the previous example
scores = []
for train_idx, test_idx in kf.split(X):
    # Split the data using this fold's index arrays
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train a fresh, unfitted copy so the shared `model` is never mutated
    # and no fitted state can carry over from one fold to the next
    fold_model = clone(model)
    fold_model.fit(X_train, y_train)

    # Make predictions on the held-out fold
    y_pred = fold_model.predict(X_test)

    # Calculate accuracy for this fold
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

print(f"Manual CV Scores: {scores}")
print(f"Mean: {np.mean(scores):.3f}")
🔹 Cross Validation for Different Metrics
Evaluate your model using different performance metrics
from sklearn.model_selection import cross_validate
# Multiple metrics at once: a single cross_validate call scores all of them
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)

print("Cross Validation Results:")
for metric in scoring:
    # Test scores for each metric are stored under the key 'test_<metric>'
    scores = cv_results[f'test_{metric}']
    # Report the mean with a +/- band of two standard deviations
    print(f"{metric.capitalize()}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# For regression problems, you might use:
# scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']
🔹 Stratified Cross Validation
Preserve the dataset's overall class proportions in each fold
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification
# NOTE: this snippet reuses np, model and cross_val_score from the earlier
# examples on this page.

# Create imbalanced dataset (three classes weighted 10% / 30% / 60%)
X, y = make_classification(n_samples=1000, n_classes=3,
                           n_informative=3, weights=[0.1, 0.3, 0.6],
                           random_state=42)

print("Class distribution:")
unique, counts = np.unique(y, return_counts=True)
for cls, count in zip(unique, counts):
    print(f"Class {cls}: {count} samples ({count/len(y)*100:.1f}%)")

# Use stratified cross validation (shuffled, seeded for reproducible folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_scores = cross_val_score(model, X, y, cv=skf)

print(f"\nStratified CV Scores: {stratified_scores}")
print(f"Mean: {stratified_scores.mean():.3f}")
# This ensures each fold has similar class proportions!
🔹 Cross Validation Best Practices
Tips for effective cross validation
✅ Best Practices:
- Use 5 or 10 folds - Good balance between bias and variance
- Stratify for classification - Preserves class proportions in every fold
- Shuffle your data - Avoid ordering bias (except for time-ordered data, where the order must be preserved)
- Set random_state - For reproducible results
- Use appropriate CV for your data - Time series needs special handling
# Complete example with best practices
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Create a pipeline (preprocessing + model) so both steps are cross-validated
# together as a single estimator
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Set up stratified cross validation (shuffled, seeded for reproducible folds)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Multiple metrics
scoring = ['accuracy', 'f1_macro', 'roc_auc_ovr']

# Perform cross validation; return_train_score=True also records scores on the
# training folds so they can be compared with the test scores
results = cross_validate(pipeline, X, y, cv=cv, scoring=scoring,
                         return_train_score=True)

# Print results: test vs. train mean, each with a +/- two-standard-deviation band
for metric in scoring:
    test_scores = results[f'test_{metric}']
    train_scores = results[f'train_{metric}']
    print(f"{metric.upper()}:")
    print(f" Test: {test_scores.mean():.3f} (+/- {test_scores.std() * 2:.3f})")
    print(f" Train: {train_scores.mean():.3f} (+/- {train_scores.std() * 2:.3f})")
    print()