Train Test Split
Learn how to split your data for proper machine learning model evaluation
🎯 What is Train Test Split?
Train test split is a fundamental technique in machine learning where we divide our dataset into separate portions for training and testing our model. This helps us evaluate how well our model performs on unseen data.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
# Load the bundled Iris dataset and pull out its feature matrix and labels
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report how many samples ended up in each partition
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
Key Concepts
Training Set
Data used to train the model
Test Set
Data used to evaluate model performance
Random State
Ensures reproducible splits
Stratification
Maintains class distribution
🔹 Basic Train Test Split
The simplest way to split your data
from sklearn.model_selection import train_test_split
import numpy as np
# Build a tiny toy dataset: five samples with two features each
# (rows are [1, 2], [3, 4], [5, 6], [7, 8], [9, 10]) and a binary label per row
X = np.arange(1, 11).reshape(5, 2)
y = np.array([0, 1, 0, 1, 0])

# Reserve 40% of the samples for testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Show which samples landed in each partition
for caption, values in (
    ("Training features:", X_train),
    ("Training labels:", y_train),
    ("Test features:", X_test),
    ("Test labels:", y_test),
):
    print(caption, values)
🔹 Stratified Split
Maintain class proportions in both sets
import numpy as np  # required for np.bincount in the report below
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

# Create an imbalanced 3-class dataset (class weights 10% / 30% / 60%).
# n_informative is raised from its default of 2 because make_classification
# requires n_classes * n_clusters_per_class <= 2**n_informative; with the
# default of 2 clusters per class, three classes would raise a ValueError.
X, y = make_classification(
    n_samples=1000,
    n_classes=3,
    n_informative=3,
    weights=[0.1, 0.3, 0.6],
    random_state=42,
)

# Regular split: class proportions in the test set are left to chance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stratified split: stratify=y maintains the class proportions of y
# in both the training and the test partition
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Compare per-class fractions: np.bincount counts samples per class label,
# and dividing by the set size turns the counts into proportions
print("Original class distribution:", np.bincount(y) / len(y))
print("Regular split test:", np.bincount(y_test) / len(y_test))
print("Stratified split test:", np.bincount(y_test_strat) / len(y_test_strat))
🔹 Common Split Ratios
Different ratios for different scenarios
# NOTE: X and y are not created in this snippet; it expects a feature matrix
# and label vector to already be in scope.
# Each call below rebinds the same four names, so the summary printed at the
# end describes only the last (90-10) split.

# Small dataset (< 1000 samples): 70-30 or 60-40
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Medium dataset (1000-10000): 80-20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Large dataset (> 10000): 90-10 or 95-5
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Summarise the resulting partition sizes and their share of the whole
print(f"Training size: {len(X_train)}")
print(f"Test size: {len(X_test)}")
print(f"Split ratio: {len(X_train)/(len(X_train)+len(X_test)):.1%} - {len(X_test)/(len(X_train)+len(X_test)):.1%}")
🔹 Complete Example
Full workflow with model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
# Load the Iris feature matrix and class labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% for testing; stratify=y maintains the class proportions
# in both partitions, and random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train model.
# max_iter is raised above the library default of 100 to give the solver
# headroom on these unscaled features rather than risk stopping at the
# iteration limit with a ConvergenceWarning.
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict labels for the held-out samples the model has not seen
y_pred = model.predict(X_test)

# Evaluate: accuracy on the training data versus accuracy on the test data
train_accuracy = model.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)

# A large positive difference (train much higher than test) suggests overfitting
print(f"Training accuracy: {train_accuracy:.3f}")
print(f"Test accuracy: {test_accuracy:.3f}")
print(f"Difference: {train_accuracy - test_accuracy:.3f}")