Multiple Regression

Learn to predict outcomes using multiple input features

🎯 Understanding Multiple Regression

Multiple regression extends simple linear regression by using multiple input features to predict a target variable, capturing more complex relationships.


# Multiple regression example
from sklearn.linear_model import LinearRegression
import numpy as np

# Multiple features: [size, bedrooms, age]
X = [[1500, 3, 10], [2000, 4, 5], [1200, 2, 15]]
y = [300000, 400000, 250000]  # House prices

model = LinearRegression()
model.fit(X, y)
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
                                    
Multiple
Features
Better
Predictions
Real-world
Applications

Key Concepts of Multiple Regression

Multiple regression allows us to model complex relationships using several input variables:

📊

Multiple Features

Use several input variables for prediction

X₁, X₂, X₃ Feature Matrix Coefficients
🔍

Feature Selection

Choose the most important features

Correlation P-values VIF
⚠️

Multicollinearity

Handle correlated input features

Detection Regularization PCA
📈

Model Evaluation

Assess model performance and assumptions

Adjusted R² Residuals

🔹 Basic Multiple Regression

Let's start with a simple multiple regression example

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Create sample dataset: predicting house prices
np.random.seed(42)
n_samples = 100

# Generate features
size = np.random.normal(1500, 300, n_samples)  # House size
bedrooms = np.random.randint(1, 6, n_samples)  # Number of bedrooms
age = np.random.randint(0, 50, n_samples)      # House age
location_score = np.random.uniform(1, 10, n_samples)  # Location rating

# Create target variable with realistic relationships
price = (
    200 * size +           # Size effect
    15000 * bedrooms +     # Bedroom effect
    -1000 * age +          # Age effect (negative)
    5000 * location_score + # Location effect
    np.random.normal(0, 20000, n_samples)  # Noise
)

# Create feature matrix
X = np.column_stack([size, bedrooms, age, location_score])
feature_names = ['Size', 'Bedrooms', 'Age', 'Location_Score']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, price, test_size=0.2, random_state=42
)

# Create and train model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("Multiple Regression Results:")
print(f"R² Score: {r2:.3f}")
print(f"MSE: {mse:,.0f}")
print(f"RMSE: {np.sqrt(mse):,.0f}")

# Print coefficients
print("\nFeature Coefficients:")
for name, coef in zip(feature_names, model.coef_):
    print(f"{name}: {coef:.2f}")
print(f"Intercept: {model.intercept_:.2f}")

🔹 Feature Selection and Importance

Identify which features are most important for prediction

import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression
from scipy import stats

# Calculate feature importance using F-statistics
f_stats, p_values = f_regression(X_train, y_train)

# Create feature importance dataframe
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': model.coef_,
    'Abs_Coefficient': np.abs(model.coef_),
    'F_Statistic': f_stats,
    'P_Value': p_values
})

# Sort by absolute coefficient value
feature_importance = feature_importance.sort_values(
    'Abs_Coefficient', ascending=False
)

print("Feature Importance Analysis:")
print(feature_importance.round(3))

# Statistical significance check
print("\nStatistically Significant Features (p < 0.05):")
significant_features = feature_importance[feature_importance['P_Value'] < 0.05]
print(significant_features[['Feature', 'P_Value']].round(4))

# Calculate correlation matrix
correlation_matrix = np.corrcoef(X_train.T)
print("\nFeature Correlation Matrix:")
for i, name1 in enumerate(feature_names):
    for j, name2 in enumerate(feature_names):
        if i < j:  # Only show upper triangle
            corr = correlation_matrix[i, j]
            print(f"{name1} - {name2}: {corr:.3f}")

🔹 Handling Multicollinearity

Detect and handle correlated features that can cause problems

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# Calculate Variance Inflation Factor (VIF)
def calculate_vif(X, feature_names):
    """Calculate VIF for each feature"""
    vif_data = pd.DataFrame()
    vif_data["Feature"] = feature_names
    vif_data["VIF"] = [
        variance_inflation_factor(X, i) for i in range(X.shape[1])
    ]
    return vif_data

# Calculate VIF for our features
vif_results = calculate_vif(X_train, feature_names)
print("Variance Inflation Factor (VIF):")
print(vif_results.round(2))
print("\nVIF Interpretation:")
print("VIF < 5: Low multicollinearity")
print("5 ≤ VIF < 10: Moderate multicollinearity") 
print("VIF ≥ 10: High multicollinearity (problematic)")

# Ridge regression to handle multicollinearity
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Compare regular vs Ridge regression
models = {
    'Linear Regression': LinearRegression(),
    'Ridge (α=1.0)': Ridge(alpha=1.0),
    'Ridge (α=10.0)': Ridge(alpha=10.0)
}

print("\nModel Comparison:")
for name, model in models.items():
    if 'Ridge' in name:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: R² = {r2:.3f}")

🔹 Model Diagnostics

Check model assumptions and performance

# Model diagnostics
def diagnose_regression(model, X_test, y_test, y_pred):
    """Perform regression diagnostics"""
    
    # Calculate residuals
    residuals = y_test - y_pred
    
    # 1. R² and Adjusted R²
    n = len(y_test)
    p = X_test.shape[1]  # Number of features
    r2 = r2_score(y_test, y_pred)
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    
    print("Model Performance:")
    print(f"R²: {r2:.3f}")
    print(f"Adjusted R²: {adj_r2:.3f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):,.0f}")
    
    # 2. Residual analysis
    print("\nResidual Analysis:")
    print(f"Mean residual: {np.mean(residuals):.2f}")
    print(f"Std residual: {np.std(residuals):.2f}")
    
    # 3. Normality test for residuals
    from scipy.stats import shapiro
    stat, p_value = shapiro(residuals)
    print(f"Shapiro-Wilk test p-value: {p_value:.4f}")
    if p_value > 0.05:
        print("Residuals appear normally distributed ✓")
    else:
        print("Residuals may not be normally distributed ⚠️")
    
    # 4. Homoscedasticity check
    # Simple check: correlation between absolute residuals and predictions
    abs_residuals = np.abs(residuals)
    corr_coef = np.corrcoef(y_pred, abs_residuals)[0, 1]
    print(f"Correlation(predictions, |residuals|): {corr_coef:.3f}")
    if abs(corr_coef) < 0.1:
        print("Homoscedasticity assumption likely satisfied ✓")
    else:
        print("Potential heteroscedasticity detected ⚠️")

# Run diagnostics
diagnose_regression(model, X_test, y_test, y_pred)

# Feature contribution analysis
def analyze_prediction(model, X_sample, feature_names):
    """Analyze how each feature contributes to a prediction"""
    prediction = model.predict([X_sample])[0]
    
    print(f"\nPrediction Analysis:")
    print(f"Predicted value: {prediction:,.0f}")
    print(f"Base value (intercept): {model.intercept_:,.0f}")
    
    print("\nFeature Contributions:")
    total_contribution = model.intercept_
    for i, (feature, value, coef) in enumerate(
        zip(feature_names, X_sample, model.coef_)
    ):
        contribution = value * coef
        total_contribution += contribution
        print(f"{feature}: {value:.1f} × {coef:.2f} = {contribution:,.0f}")
    
    print(f"Total: {total_contribution:,.0f}")

# Example prediction analysis
sample_house = [1800, 3, 8, 7.5]  # Size, bedrooms, age, location
analyze_prediction(model, sample_house, feature_names)

🔹 Real-World Example: Sales Prediction

Apply multiple regression to predict sales based on marketing spend

# Sales prediction example
np.random.seed(123)

# Generate marketing data
n_months = 60
tv_spend = np.random.uniform(10000, 100000, n_months)
radio_spend = np.random.uniform(5000, 50000, n_months)
online_spend = np.random.uniform(2000, 30000, n_months)
seasonality = np.sin(2 * np.pi * np.arange(n_months) / 12)  # Monthly pattern

# Generate sales with realistic relationships
sales = (
    0.8 * tv_spend +      # TV has strong effect
    1.2 * radio_spend +   # Radio has stronger effect
    2.0 * online_spend +  # Online has strongest effect
    50000 * seasonality + # Seasonal effect
    np.random.normal(0, 20000, n_months)  # Random variation
)

# Create dataset
marketing_data = pd.DataFrame({
    'TV_Spend': tv_spend,
    'Radio_Spend': radio_spend,
    'Online_Spend': online_spend,
    'Month': np.arange(1, n_months + 1),
    'Sales': sales
})

# Add derived features
marketing_data['Total_Spend'] = (
    marketing_data['TV_Spend'] + 
    marketing_data['Radio_Spend'] + 
    marketing_data['Online_Spend']
)
marketing_data['TV_Radio_Ratio'] = (
    marketing_data['TV_Spend'] / marketing_data['Radio_Spend']
)

# Prepare features
feature_cols = ['TV_Spend', 'Radio_Spend', 'Online_Spend', 'Total_Spend']
X = marketing_data[feature_cols].values
y = marketing_data['Sales'].values

# Split and train
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train model
sales_model = LinearRegression()
sales_model.fit(X_train, y_train)

# Evaluate
y_pred = sales_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("Sales Prediction Model:")
print(f"R² Score: {r2:.3f}")
print(f"RMSE: ${np.sqrt(mean_squared_error(y_test, y_pred)):,.0f}")

print("\nMarketing ROI Analysis:")
for feature, coef in zip(feature_cols, sales_model.coef_):
    if 'Spend' in feature and feature != 'Total_Spend':
        roi = coef  # Sales increase per dollar spent
        print(f"{feature}: ${roi:.2f} sales per $1 spent")

# Predict sales for new marketing budget
new_campaign = [[50000, 25000, 15000, 90000]]  # TV, Radio, Online, Total
predicted_sales = sales_model.predict(new_campaign)[0]
print(f"\nPredicted sales for new campaign: ${predicted_sales:,.0f}")

🔹 Best Practices

Guidelines for effective multiple regression

✅ Do's:

  • Check for multicollinearity using VIF
  • Validate model assumptions (linearity, normality, homoscedasticity)
  • Use cross-validation for model selection
  • Scale features when using regularization
  • Interpret coefficients in context of the problem

❌ Don'ts:

  • Don't include too many correlated features
  • Don't ignore statistical significance of features
  • Don't extrapolate beyond the training data range
  • Don't assume causation from correlation
# Complete multiple regression workflow
def multiple_regression_workflow(X, y, feature_names, test_size=0.2):
    """Complete workflow for multiple regression analysis"""
    
    # 1. Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    
    # 2. Check multicollinearity
    print("=== Multicollinearity Check ===")
    vif_data = calculate_vif(X_train, feature_names)
    print(vif_data.round(2))
    
    high_vif = vif_data[vif_data['VIF'] > 10]
    if len(high_vif) > 0:
        print("⚠️ High multicollinearity detected!")
        print("Consider removing features or using regularization")
    
    # 3. Train models
    models = {
        'Linear': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(alpha=0.1)
    }
    
    results = {}
    print("\n=== Model Comparison ===")
    
    for name, model in models.items():
        if name != 'Linear':
            # Scale features for regularized models
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
        
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        results[name] = {'R2': r2, 'RMSE': rmse, 'model': model}
        
        print(f"{name}: R² = {r2:.3f}, RMSE = {rmse:.0f}")
    
    # 4. Select best model
    best_model_name = max(results.keys(), key=lambda k: results[k]['R2'])
    best_model = results[best_model_name]['model']
    
    print(f"\nBest model: {best_model_name}")
    
    return best_model, results

# Example usage
# best_model, results = multiple_regression_workflow(X, y, feature_names)

🧠 Test Your Knowledge

What is the main advantage of multiple regression over simple regression?

What does a high VIF (>10) indicate?

Which technique helps handle multicollinearity?