Polynomial Regression

Learn to model non-linear relationships with polynomial features

🎯 Understanding Polynomial Regression

Polynomial regression extends linear regression by adding polynomial terms to capture non-linear relationships between variables.


# Simple polynomial regression example
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Create polynomial features
# Four samples with a single feature x each
X = [[1], [2], [3], [4]]
# degree=2 expands every sample into the columns [1, x, x²]
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# Rows are [1, 1, 1], [1, 2, 4], [1, 3, 9], [1, 4, 16]; the result is a
# NumPy array, so the values are printed as floats (e.g. [1. 2. 4.])
print(X_poly)
                                    
Non-linear Relationships · Flexible Curve Fitting · Higher Accuracy

Key Concepts of Polynomial Regression

Polynomial regression helps us model curved relationships by transforming features into polynomial terms:

📈

Polynomial Features

Transform input features into polynomial terms

x⁴
🎯

Degree Selection

Choose the right polynomial degree

Degree 2 · Degree 3 · Cross-validation
⚖️

Bias-Variance

Balance between underfitting and overfitting

Regularization · Validation · Model Selection
📊

Applications

Real-world use cases

Growth Curves · Physics Models · Economics

🔹 Basic Polynomial Regression

Let's start with a simple example of polynomial regression

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Synthetic quadratic data: y = 2x + 3x² plus Gaussian noise (sigma = 2)
np.random.seed(42)
X = np.linspace(0, 4, 20).reshape(-1, 1)
x_flat = X.ravel()
y = 2 * x_flat + 3 * x_flat**2 + np.random.normal(0, 2, 20)

# Polynomial regression = polynomial feature expansion + ordinary least squares
degree = 2
steps = [
    ('poly', PolynomialFeatures(degree=degree)),
    ('linear', LinearRegression()),
]
poly_reg = Pipeline(steps)

# Train on the sample data
poly_reg.fit(X, y)

# Evaluate the fitted curve on a dense grid over the same range
X_test = np.linspace(0, 4, 100).reshape(-1, 1)
y_pred = poly_reg.predict(X_test)

# Report goodness of fit on the data the model was trained on
r2 = poly_reg.score(X, y)
print(f"R² Score: {r2:.3f}")

# The fitted model has the form: y = a + bx + cx²
print("Polynomial regression captures curved relationships!")

🔹 Choosing the Right Degree

The polynomial degree affects model complexity and performance

from sklearn.model_selection import KFold, validation_curve
from sklearn.metrics import mean_squared_error

# Generate more complex data: one period of a sine wave plus Gaussian noise
X = np.linspace(0, 1, 50).reshape(-1, 1)
y = np.sin(2 * np.pi * X).ravel() + np.random.normal(0, 0.1, 50)

# Candidate polynomial degrees
degrees = range(1, 10)

# A single pipeline is enough: validation_curve sets 'poly__degree' itself
poly_model = Pipeline([
    ('poly', PolynomialFeatures()),
    ('linear', LinearRegression())
])

# X is sorted, so the default (unshuffled) K-fold used for cv=5 would hold out
# contiguous intervals and score every model on extrapolation. Shuffling the
# folds makes the validation error measure interpolation quality instead.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# One call evaluates every degree; each result has shape (n_degrees, n_folds)
fold_train_scores, fold_val_scores = validation_curve(
    poly_model, X, y, param_name='poly__degree',
    param_range=list(degrees), cv=cv, scoring='neg_mean_squared_error'
)

# The scorer returns negated MSE, so flip the sign to get the mean MSE per degree
train_scores = (-fold_train_scores.mean(axis=1)).tolist()
val_scores = (-fold_val_scores.mean(axis=1)).tolist()

for degree, train_mse, val_mse in zip(degrees, train_scores, val_scores):
    print(f"Degree {degree}: Train MSE = {train_mse:.3f}, "
          f"Val MSE = {val_mse:.3f}")

# Find best degree (lowest validation error)
best_degree = degrees[int(np.argmin(val_scores))]
print(f"\nBest polynomial degree: {best_degree}")

🔹 Regularized Polynomial Regression

Use regularization to prevent overfitting with high-degree polynomials

from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Generate data: a quadratic signal with a small amount of Gaussian noise
X = np.random.randn(100, 1)
y = 0.5 * X.ravel() + 2 * X.ravel()**2 + np.random.randn(100) * 0.1

# Hold out part of the data. Training R² always favours the unregularized
# model, so overfitting can only be compared on samples the models never saw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create regularized polynomial models
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1)
}

# Compare models with polynomial features
degree = 5
results = {}

for name, model in models.items():
    # Create pipeline with polynomial features and scaling.
    # include_bias=False: every regressor fits its own intercept, and the
    # constant column would only be zeroed out by the scaler anyway.
    pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
        ('scaler', StandardScaler()),
        ('regressor', model)
    ])

    # Fit on the training split, evaluate R² on the held-out split
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    results[name] = score

    print(f"{name} R² Score: {score:.3f}")

# Ridge and Lasso help prevent overfitting
print("\nRegularization helps control model complexity!")

🔹 Real-World Example: Temperature Prediction

Apply polynomial regression to model temperature patterns

from sklearn.model_selection import train_test_split

# Simulate daily temperature data
days = np.arange(1, 366)  # Days of the year
X = days.reshape(-1, 1)

# Create seasonal temperature pattern: 20 + 15 * sin(2*pi*day/365) plus noise.
# The sine peaks near day 91 (~35°C) and bottoms out near day 274 (~5°C).
base_temp = 20  # Base temperature
seasonal = 15 * np.sin(2 * np.pi * days / 365)  # Seasonal variation
noise = np.random.normal(0, 2, days.size)  # Random variation
y = base_temp + seasonal + noise

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create polynomial model.
# include_bias=False: Ridge fits its own intercept, and the constant column
# would only be zeroed out by the scaler anyway.
temp_model = Pipeline([
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])

# Train model
temp_model.fit(X_train, y_train)

# Evaluate
train_score = temp_model.score(X_train, y_train)
test_score = temp_model.score(X_test, y_test)

print("Temperature Model Performance:")
print(f"Training R²: {train_score:.3f}")
print(f"Testing R²: {test_score:.3f}")

# Predict temperature at the warmest and coldest points of the simulated year.
# Days 180 and 360 both sit near zero crossings of the sine, where the
# simulated temperature is ~20°C in either case, so they cannot represent
# summer and winter for this data set.
summer_day = [[91]]   # Day 91: peak of the simulated seasonal curve
winter_day = [[274]]  # Day 274: trough of the simulated seasonal curve

summer_temp = temp_model.predict(summer_day)[0]
winter_temp = temp_model.predict(winter_day)[0]

print(f"\nPredicted summer temperature: {summer_temp:.1f}°C")
print(f"Predicted winter temperature: {winter_temp:.1f}°C")

🔹 Best Practices

Tips for effective polynomial regression

✅ Do's:

  • Start with low degrees (2-3) and increase gradually
  • Use cross-validation to select optimal degree
  • Apply feature scaling with high-degree polynomials
  • Use regularization to prevent overfitting
  • Visualize your data to understand relationships

❌ Don'ts:

  • Don't use very high degrees without regularization
  • Don't ignore validation performance
  • Don't extrapolate far beyond training data
  • Don't forget to scale features
# Complete polynomial regression workflow
def polynomial_regression_workflow(X, y, max_degree=5, *, alpha=1.0, cv=5,
                                   test_size=0.2, random_state=42):
    """Pick a polynomial degree by cross-validation and fit a final Ridge model.

    The data is split into train and test parts. Every degree from 1 to
    ``max_degree`` is scored with cross-validation on the training part,
    using a polynomial-features -> standard-scaler -> Ridge pipeline. The
    degree with the highest mean CV score is refitted on the whole training
    part and scored once on the test part. Progress and results are printed.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        y: Target values aligned with ``X``.
        max_degree: Highest polynomial degree to try. If it is below 1, no
            candidate is evaluated and degree 1 is used.
        alpha: Ridge regularization strength.
        cv: Cross-validation setting passed to ``cross_val_score``.
        test_size: Test-set size passed to ``train_test_split``.
        random_state: Seed for the train/test split.

    Returns:
        The fitted pipeline for the selected degree.
    """
    # Imported once here instead of on every loop iteration.
    from sklearn.model_selection import cross_val_score

    def _build_model(degree):
        """Return an unfitted poly -> scale -> Ridge pipeline for ``degree``."""
        # include_bias=False: Ridge fits its own intercept, and the constant
        # column would only be zeroed out by the scaler anyway.
        return Pipeline([
            ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
            ('scaler', StandardScaler()),
            ('ridge', Ridge(alpha=alpha)),
        ])

    # Split data; the test part is touched only once, for the final score
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    best_score = -np.inf
    best_degree = 1

    # Test different degrees
    for degree in range(1, max_degree + 1):
        scores = cross_val_score(_build_model(degree), X_train, y_train, cv=cv)
        avg_score = scores.mean()

        print(f"Degree {degree}: CV Score = {avg_score:.3f}")

        # Strict '>' keeps the lowest degree when two candidates tie
        if avg_score > best_score:
            best_score = avg_score
            best_degree = degree

    # Train final model on the full training part with the selected degree
    final_model = _build_model(best_degree)
    final_model.fit(X_train, y_train)
    test_score = final_model.score(X_test, y_test)

    print(f"\nBest degree: {best_degree}")
    print(f"Final test score: {test_score:.3f}")

    return final_model

# Example usage
# model = polynomial_regression_workflow(X, y)

🧠 Test Your Knowledge

What does polynomial regression help us model?

What happens with very high polynomial degrees?

Which technique helps prevent overfitting in polynomial regression?