Multiple Regression
Learn to predict outcomes using multiple input features
🎯 Understanding Multiple Regression
Multiple regression extends simple linear regression by using multiple input features to predict a target variable, capturing more complex relationships.
# Multiple regression example
from sklearn.linear_model import LinearRegression
import numpy as np
# Multiple features: [size, bedrooms, age]
X = [[1500, 3, 10], [2000, 4, 5], [1200, 2, 15]]
y = [300000, 400000, 250000] # House prices
model = LinearRegression()
model.fit(X, y)
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
Key Concepts of Multiple Regression
Multiple regression allows us to model complex relationships using several input variables:
Multiple Features
Use several input variables for prediction
Feature Selection
Choose the most important features
Multicollinearity
Handle correlated input features
Model Evaluation
Assess model performance and assumptions
🔹 Basic Multiple Regression
Let's start with a simple multiple regression example
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# Create sample dataset: predicting house prices
np.random.seed(42)
n_samples = 100
# Generate features
size = np.random.normal(1500, 300, n_samples) # House size
bedrooms = np.random.randint(1, 6, n_samples) # Number of bedrooms
age = np.random.randint(0, 50, n_samples) # House age
location_score = np.random.uniform(1, 10, n_samples) # Location rating
# Create target variable with realistic relationships
price = (
200 * size + # Size effect
15000 * bedrooms + # Bedroom effect
-1000 * age + # Age effect (negative)
5000 * location_score + # Location effect
np.random.normal(0, 20000, n_samples) # Noise
)
# Create feature matrix
X = np.column_stack([size, bedrooms, age, location_score])
feature_names = ['Size', 'Bedrooms', 'Age', 'Location_Score']
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, price, test_size=0.2, random_state=42
)
# Create and train model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("Multiple Regression Results:")
print(f"R² Score: {r2:.3f}")
print(f"MSE: {mse:,.0f}")
print(f"RMSE: {np.sqrt(mse):,.0f}")
# Print coefficients
print("\nFeature Coefficients:")
for name, coef in zip(feature_names, model.coef_):
print(f"{name}: {coef:.2f}")
print(f"Intercept: {model.intercept_:.2f}")
🔹 Feature Selection and Importance
Identify which features are most important for prediction
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression
from scipy import stats
# Calculate feature importance using F-statistics
f_stats, p_values = f_regression(X_train, y_train)
# Create feature importance dataframe
feature_importance = pd.DataFrame({
'Feature': feature_names,
'Coefficient': model.coef_,
'Abs_Coefficient': np.abs(model.coef_),
'F_Statistic': f_stats,
'P_Value': p_values
})
# Sort by absolute coefficient value
feature_importance = feature_importance.sort_values(
'Abs_Coefficient', ascending=False
)
print("Feature Importance Analysis:")
print(feature_importance.round(3))
# Statistical significance check
print("\nStatistically Significant Features (p < 0.05):")
significant_features = feature_importance[feature_importance['P_Value'] < 0.05]
print(significant_features[['Feature', 'P_Value']].round(4))
# Calculate correlation matrix
correlation_matrix = np.corrcoef(X_train.T)
print("\nFeature Correlation Matrix:")
for i, name1 in enumerate(feature_names):
for j, name2 in enumerate(feature_names):
if i < j: # Only show upper triangle
corr = correlation_matrix[i, j]
print(f"{name1} - {name2}: {corr:.3f}")
🔹 Handling Multicollinearity
Detect and handle correlated features that can cause problems
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
# Calculate Variance Inflation Factor (VIF)
def calculate_vif(X, feature_names):
"""Calculate VIF for each feature"""
vif_data = pd.DataFrame()
vif_data["Feature"] = feature_names
vif_data["VIF"] = [
variance_inflation_factor(X, i) for i in range(X.shape[1])
]
return vif_data
# Calculate VIF for our features
vif_results = calculate_vif(X_train, feature_names)
print("Variance Inflation Factor (VIF):")
print(vif_results.round(2))
print("\nVIF Interpretation:")
print("VIF < 5: Low multicollinearity")
print("5 ≤ VIF < 10: Moderate multicollinearity")
print("VIF ≥ 10: High multicollinearity (problematic)")
# Ridge regression to handle multicollinearity
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Compare regular vs Ridge regression
models = {
'Linear Regression': LinearRegression(),
'Ridge (α=1.0)': Ridge(alpha=1.0),
'Ridge (α=10.0)': Ridge(alpha=10.0)
}
print("\nModel Comparison:")
for name, model in models.items():
if 'Ridge' in name:
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
else:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"{name}: R² = {r2:.3f}")
🔹 Model Diagnostics
Check model assumptions and performance
# Model diagnostics
def diagnose_regression(model, X_test, y_test, y_pred):
"""Perform regression diagnostics"""
# Calculate residuals
residuals = y_test - y_pred
# 1. R² and Adjusted R²
n = len(y_test)
p = X_test.shape[1] # Number of features
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Model Performance:")
print(f"R²: {r2:.3f}")
print(f"Adjusted R²: {adj_r2:.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):,.0f}")
# 2. Residual analysis
print("\nResidual Analysis:")
print(f"Mean residual: {np.mean(residuals):.2f}")
print(f"Std residual: {np.std(residuals):.2f}")
# 3. Normality test for residuals
from scipy.stats import shapiro
stat, p_value = shapiro(residuals)
print(f"Shapiro-Wilk test p-value: {p_value:.4f}")
if p_value > 0.05:
print("Residuals appear normally distributed ✓")
else:
print("Residuals may not be normally distributed ⚠️")
# 4. Homoscedasticity check
# Simple check: correlation between absolute residuals and predictions
abs_residuals = np.abs(residuals)
corr_coef = np.corrcoef(y_pred, abs_residuals)[0, 1]
print(f"Correlation(predictions, |residuals|): {corr_coef:.3f}")
if abs(corr_coef) < 0.1:
print("Homoscedasticity assumption likely satisfied ✓")
else:
print("Potential heteroscedasticity detected ⚠️")
# Run diagnostics
diagnose_regression(model, X_test, y_test, y_pred)
# Feature contribution analysis
def analyze_prediction(model, X_sample, feature_names):
"""Analyze how each feature contributes to a prediction"""
prediction = model.predict([X_sample])[0]
print(f"\nPrediction Analysis:")
print(f"Predicted value: {prediction:,.0f}")
print(f"Base value (intercept): {model.intercept_:,.0f}")
print("\nFeature Contributions:")
total_contribution = model.intercept_
for i, (feature, value, coef) in enumerate(
zip(feature_names, X_sample, model.coef_)
):
contribution = value * coef
total_contribution += contribution
print(f"{feature}: {value:.1f} × {coef:.2f} = {contribution:,.0f}")
print(f"Total: {total_contribution:,.0f}")
# Example prediction analysis
sample_house = [1800, 3, 8, 7.5] # Size, bedrooms, age, location
analyze_prediction(model, sample_house, feature_names)
🔹 Real-World Example: Sales Prediction
Apply multiple regression to predict sales based on marketing spend
# Sales prediction example
np.random.seed(123)
# Generate marketing data
n_months = 60
tv_spend = np.random.uniform(10000, 100000, n_months)
radio_spend = np.random.uniform(5000, 50000, n_months)
online_spend = np.random.uniform(2000, 30000, n_months)
seasonality = np.sin(2 * np.pi * np.arange(n_months) / 12) # Monthly pattern
# Generate sales with realistic relationships
sales = (
0.8 * tv_spend + # TV has strong effect
1.2 * radio_spend + # Radio has stronger effect
2.0 * online_spend + # Online has strongest effect
50000 * seasonality + # Seasonal effect
np.random.normal(0, 20000, n_months) # Random variation
)
# Create dataset
marketing_data = pd.DataFrame({
'TV_Spend': tv_spend,
'Radio_Spend': radio_spend,
'Online_Spend': online_spend,
'Month': np.arange(1, n_months + 1),
'Sales': sales
})
# Add derived features
marketing_data['Total_Spend'] = (
marketing_data['TV_Spend'] +
marketing_data['Radio_Spend'] +
marketing_data['Online_Spend']
)
marketing_data['TV_Radio_Ratio'] = (
marketing_data['TV_Spend'] / marketing_data['Radio_Spend']
)
# Prepare features
feature_cols = ['TV_Spend', 'Radio_Spend', 'Online_Spend', 'Total_Spend']
X = marketing_data[feature_cols].values
y = marketing_data['Sales'].values
# Split and train
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
# Train model
sales_model = LinearRegression()
sales_model.fit(X_train, y_train)
# Evaluate
y_pred = sales_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("Sales Prediction Model:")
print(f"R² Score: {r2:.3f}")
print(f"RMSE: ${np.sqrt(mean_squared_error(y_test, y_pred)):,.0f}")
print("\nMarketing ROI Analysis:")
for feature, coef in zip(feature_cols, sales_model.coef_):
if 'Spend' in feature and feature != 'Total_Spend':
roi = coef # Sales increase per dollar spent
print(f"{feature}: ${roi:.2f} sales per $1 spent")
# Predict sales for new marketing budget
new_campaign = [[50000, 25000, 15000, 90000]] # TV, Radio, Online, Total
predicted_sales = sales_model.predict(new_campaign)[0]
print(f"\nPredicted sales for new campaign: ${predicted_sales:,.0f}")
🔹 Best Practices
Guidelines for effective multiple regression
✅ Do's:
- Check for multicollinearity using VIF
- Validate model assumptions (linearity, normality, homoscedasticity)
- Use cross-validation for model selection
- Scale features when using regularization
- Interpret coefficients in context of the problem
❌ Don'ts:
- Don't include too many correlated features
- Don't ignore statistical significance of features
- Don't extrapolate beyond the training data range
- Don't assume causation from correlation
# Complete multiple regression workflow
def multiple_regression_workflow(X, y, feature_names, test_size=0.2):
"""Complete workflow for multiple regression analysis"""
# 1. Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=test_size, random_state=42
)
# 2. Check multicollinearity
print("=== Multicollinearity Check ===")
vif_data = calculate_vif(X_train, feature_names)
print(vif_data.round(2))
high_vif = vif_data[vif_data['VIF'] > 10]
if len(high_vif) > 0:
print("⚠️ High multicollinearity detected!")
print("Consider removing features or using regularization")
# 3. Train models
models = {
'Linear': LinearRegression(),
'Ridge': Ridge(alpha=1.0),
'Lasso': Lasso(alpha=0.1)
}
results = {}
print("\n=== Model Comparison ===")
for name, model in models.items():
if name != 'Linear':
# Scale features for regularized models
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
else:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
results[name] = {'R2': r2, 'RMSE': rmse, 'model': model}
print(f"{name}: R² = {r2:.3f}, RMSE = {rmse:.0f}")
# 4. Select best model
best_model_name = max(results.keys(), key=lambda k: results[k]['R2'])
best_model = results[best_model_name]['model']
print(f"\nBest model: {best_model_name}")
return best_model, results
# Example usage
# best_model, results = multiple_regression_workflow(X, y, feature_names)