Polynomial Regression
Learn to model non-linear relationships with polynomial features
🎯 Understanding Polynomial Regression
Polynomial regression extends linear regression by adding polynomial terms to capture non-linear relationships between variables.
# Simple polynomial regression example: expand a single feature x into [1, x, x²]
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Create polynomial features
# Four samples, each with one feature value
X = [[1], [2], [3], [4]]
# degree=2 asks for every term up to the square of the input feature
poly = PolynomialFeatures(degree=2)
# Fit the transformer on X and return the expanded feature matrix in one step
X_poly = poly.fit_transform(X)
print(X_poly) # rows are [1, x, x²]: [[1, 1, 1], [1, 2, 4], [1, 3, 9], [1, 4, 16]] (returned as a NumPy array, so values may print as floats, e.g. 1.)
Key Concepts of Polynomial Regression
Polynomial regression helps us model curved relationships by transforming features into polynomial terms:
Polynomial Features
Transform input features into polynomial terms
Degree Selection
Choose the right polynomial degree
Bias-Variance
Balance between underfitting and overfitting
Applications
Real-world use cases
🔹 Basic Polynomial Regression
Let's start with a simple example of polynomial regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Reproducible noisy samples of the quadratic y = 2x + 3x²
np.random.seed(42)
X = np.linspace(0, 4, 20)[:, np.newaxis]  # column vector, shape (20, 1)
x_flat = X[:, 0]
y = 2 * x_flat + 3 * x_flat**2 + np.random.normal(0, 2, 20)

# Two-stage model: polynomial feature expansion, then ordinary least squares
degree = 2
steps = [
    ('poly', PolynomialFeatures(degree=degree)),
    ('linear', LinearRegression()),
]
poly_reg = Pipeline(steps)
poly_reg.fit(X, y)

# Predict on a denser grid covering the same input range
X_test = np.linspace(0, 4, 100)[:, np.newaxis]
y_pred = poly_reg.predict(X_test)

# Report the fit quality on the training samples
r2 = poly_reg.score(X, y)
print(f"R² Score: {r2:.3f}")

# The fitted model has the form y = a + bx + cx²
print("Polynomial regression captures curved relationships!")
🔹 Choosing the Right Degree
The polynomial degree controls model complexity: too low a degree underfits the data, while too high a degree overfits it
from sklearn.model_selection import KFold, validation_curve
from sklearn.metrics import mean_squared_error

# Generate more complex data: one period of a sine wave plus Gaussian noise
X = np.linspace(0, 1, 50).reshape(-1, 1)
y = np.sin(2 * np.pi * X).ravel() + np.random.normal(0, 0.1, 50)

# Candidate polynomial degrees to compare
degrees = range(1, 10)

# validation_curve overrides the degree through the 'poly__degree' parameter,
# so the pipeline is built once and the default degree here is a placeholder.
poly_model = Pipeline([
    ('poly', PolynomialFeatures()),
    ('linear', LinearRegression())
])

# X is sorted, so plain (unshuffled) K-fold would hold out one contiguous
# slice of the x-axis per fold and score the model purely on extrapolation.
# Shuffling spreads every validation fold across the whole input range.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# A single call evaluates every degree; each returned array has one row per
# degree and one column per cross-validation fold.
fold_train_scores, fold_val_scores = validation_curve(
    poly_model, X, y, param_name='poly__degree',
    param_range=list(degrees), cv=cv, scoring='neg_mean_squared_error'
)

# Scores are negated MSE: flip the sign and average over the folds
train_scores = list(-fold_train_scores.mean(axis=1))
val_scores = list(-fold_val_scores.mean(axis=1))

for degree, train_mse, val_mse in zip(degrees, train_scores, val_scores):
    print(f"Degree {degree}: Train MSE = {train_mse:.3f}, "
          f"Val MSE = {val_mse:.3f}")

# Find best degree (lowest validation error)
best_degree = degrees[int(np.argmin(val_scores))]
print(f"\nBest polynomial degree: {best_degree}")
🔹 Regularized Polynomial Regression
Use regularization to prevent overfitting with high-degree polynomials
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler

# Noisy quadratic data: y = 0.5x + 2x² + small Gaussian noise
X = np.random.randn(100, 1)
x_flat = X[:, 0]
y = 0.5 * x_flat + 2 * x_flat**2 + np.random.randn(100) * 0.1

# Regressors to compare: unregularized, L2-penalized and L1-penalized
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
}

# Every regressor is fitted on the same degree-5 polynomial expansion
degree = 5
results = {}
for name, model in models.items():
    # Expand to polynomial terms, standardize them, then fit the regressor
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('scaler', StandardScaler()),
        ('regressor', model),
    ]
    pipeline = Pipeline(steps).fit(X, y)

    # R² of the fitted pipeline on the data it was trained on
    score = results[name] = pipeline.score(X, y)
    print(f"{name} R² Score: {score:.3f}")

# Ridge and Lasso shrink the polynomial coefficients
print("\nRegularization helps control model complexity!")
🔹 Real-World Example: Temperature Prediction
Apply polynomial regression to model temperature patterns
# Simulate daily temperature data for one year
days = np.arange(1, 366)  # Days of the year (1..365)
X = days.reshape(-1, 1)

# Create seasonal temperature pattern: a single sine period over the year.
# sin(2*pi*day/365) reaches +1 at day ~91 (warmest, ~35°C) and -1 at
# day ~274 (coldest, ~5°C); it crosses the 20°C baseline near days 182 and 365.
base_temp = 20  # Base temperature
seasonal = 15 * np.sin(2 * np.pi * days / 365)  # Seasonal variation
noise = np.random.normal(0, 2, 365)  # Random variation
y = base_temp + seasonal + noise

# Split data into training and held-out test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create polynomial model: cubic features, standardized, with ridge penalty
temp_model = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])

# Train model
temp_model.fit(X_train, y_train)

# Evaluate on both splits to compare fit and generalization
train_score = temp_model.score(X_train, y_train)
test_score = temp_model.score(X_test, y_test)
print("Temperature Model Performance:")
print(f"Training R²: {train_score:.3f}")
print(f"Testing R²: {test_score:.3f}")

# Predict temperature at the extremes of the simulated seasonal curve.
# The query days must match the pattern generated above: days 180 and 360
# both sit near the 20°C baseline and would not show a seasonal contrast.
summer_day = [[91]]   # Day 91: peak of the simulated curve (warmest)
winter_day = [[274]]  # Day 274: trough of the simulated curve (coldest)
summer_temp = temp_model.predict(summer_day)[0]
winter_temp = temp_model.predict(winter_day)[0]
print(f"\nPredicted summer temperature: {summer_temp:.1f}°C")
print(f"Predicted winter temperature: {winter_temp:.1f}°C")
🔹 Best Practices
Tips for effective polynomial regression
✅ Do's:
- Start with low degrees (2-3) and increase gradually
- Use cross-validation to select optimal degree
- Apply feature scaling with high-degree polynomials
- Use regularization to prevent overfitting
- Visualize your data to understand relationships
❌ Don'ts:
- Don't use very high degrees without regularization
- Don't ignore validation performance
- Don't extrapolate far beyond training data
- Don't forget to scale features
# Complete polynomial regression workflow
def polynomial_regression_workflow(X, y, max_degree=5, *, alpha=1.0, cv=5):
    """Select a polynomial degree by cross-validation and fit a ridge model.

    The data is split 80/20 into training and test sets (fixed
    ``random_state=42``). Every degree from 1 to ``max_degree`` is scored with
    cross-validation on the training set only, using the pipeline's default
    score. The degree with the highest mean score is refitted on the whole
    training set and evaluated once on the held-out test set. Progress and
    results are printed.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        y: Target values aligned with ``X``.
        max_degree: Highest polynomial degree to try (inclusive). If it is
            below 1 no degree is evaluated and degree 1 is used.
        alpha: Ridge regularization strength used for every candidate.
        cv: Cross-validation setting passed to ``cross_val_score``.

    Returns:
        The fitted ``Pipeline`` (polynomial features -> scaler -> ridge) for
        the best degree.
    """
    # Imported once per call rather than on every loop iteration.
    from sklearn.model_selection import cross_val_score

    def build_model(degree):
        """Return an unfitted poly -> scale -> ridge pipeline for *degree*."""
        return Pipeline([
            ('poly', PolynomialFeatures(degree=degree)),
            ('scaler', StandardScaler()),
            ('ridge', Ridge(alpha=alpha)),
        ])

    # Hold out a test set that plays no part in degree selection
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    best_score = -np.inf
    best_degree = 1

    # Test different degrees; ties keep the lower (simpler) degree because
    # the comparison is strict.
    for degree in range(1, max_degree + 1):
        scores = cross_val_score(build_model(degree), X_train, y_train, cv=cv)
        avg_score = scores.mean()
        print(f"Degree {degree}: CV Score = {avg_score:.3f}")
        if avg_score > best_score:
            best_score = avg_score
            best_degree = degree

    # Train final model on the full training split with the selected degree
    final_model = build_model(best_degree)
    final_model.fit(X_train, y_train)
    test_score = final_model.score(X_test, y_test)
    print(f"\nBest degree: {best_degree}")
    print(f"Final test score: {test_score:.3f}")
    return final_model
# Example usage
# model = polynomial_regression_workflow(X, y)