Linear Regression
Learn to predict numbers by drawing the best line through your data
📈 What is Linear Regression?
Linear Regression is like drawing the best straight line through scattered dots on a graph. Once you have this line, you can use it to predict new values! It's perfect for questions like "If I study 6 hours, what grade will I get?" or "How much will a 2000 sq ft house cost?"
# Simple Linear Regression Example
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training data: hours studied (input) and the test score earned (target).
# scikit-learn wants the inputs as a 2-D column, hence reshape(-1, 1).
hours = np.array([1, 2, 3, 4, 5, 6, 7, 8]).reshape(-1, 1)
scores = np.array([50, 55, 65, 70, 75, 80, 85, 90])

# fit() hands back the trained estimator, so creating and training chain
model = LinearRegression().fit(hours, scores)

# Ask the model about a study time that is not in the training data.
# predict() takes rows of samples, so the single value is wrapped as [[value]].
new_hours = 6.5
query = [[new_hours]]
predicted_score = model.predict(query)
print(f"If you study {new_hours} hours, predicted score: {predicted_score[0]:.1f}")
Key Concepts
Best Fit Line
Find the line closest to all points
# A fitted line has the form y = m*x + b,
# where m is the slope and b is the y-intercept.
# coef_ holds one slope per input feature; this model has exactly one.
(slope,) = model.coef_
intercept = model.intercept_
print(f"Line: y = {slope:.2f}x + {intercept:.2f}")
Make Predictions
Use the line to predict new values
# Ask the trained model about a new input.
# predict() takes rows of samples, so one value is wrapped as [[value]]
# and the answer comes back as a one-element array.
new_value = 5.5
query = [[new_value]]
prediction = model.predict(query)
print(f"Prediction: {prediction[0]:.1f}")
Measure Accuracy
Check how good your model is
# R-squared score (1.0 is a perfect fit, higher is better).
# Score the model on the same hours/scores data it was trained on above.
score = model.score(hours, scores)
print(f"Model accuracy: {score:.2f}")
🏠 Complete Example: House Price Prediction
Let's predict house prices based on size - a classic ML problem!
# Step-by-step house price prediction
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Create sample data
# House sizes (sq ft) and prices ($1000s)
# Sizes are reshaped into a single column because scikit-learn expects
# a 2-D feature matrix (one row per house, one column per feature).
sizes = np.array([1000, 1200, 1400, 1600, 1800, 2000, 2200, 2400]).reshape(-1, 1)
prices = np.array([150, 180, 210, 240, 270, 300, 330, 360])

print("House Data:")
# Walk sizes and prices in lockstep; ravel() flattens the column of sizes
for size, price in zip(sizes.ravel(), prices):
    print(f"{size} sq ft → ${price}k")

# Step 2: Create and train the model
model = LinearRegression()
model.fit(sizes, prices)

# Step 3: Understand the model
# coef_ holds one slope per feature; there is only one feature here
slope = model.coef_[0]
intercept = model.intercept_
print(f"\nModel equation: Price = {slope:.3f} × Size + {intercept:.1f}")
print(f"This means: Each sq ft adds ${slope:.2f}k to the price")

# Step 4: Make predictions
test_sizes = [1500, 1750, 2100]
for size in test_sizes:
    # predict() takes rows of samples, so wrap the single size as [[size]]
    predicted_price = model.predict([[size]])
    print(f"A {size} sq ft house costs: ${predicted_price[0]:.1f}k")

# Step 5: Check accuracy (measured on the training data itself)
predictions = model.predict(sizes)
r2 = r2_score(prices, predictions)
print(f"\nModel accuracy (R²): {r2:.3f}")
print("(1.0 = perfect, 0.8+ = very good)")
📊 Visualizing Linear Regression
See your model in action with a simple plot
# Create a beautiful visualization
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Sample data (hours as a single column, the 2-D shape scikit-learn expects)
study_hours = np.array([1, 2, 3, 4, 5, 6, 7, 8]).reshape(-1, 1)
test_scores = np.array([45, 55, 65, 70, 75, 80, 85, 95])

# Train model
model = LinearRegression()
model.fit(study_hours, test_scores)

# Create predictions for smooth line:
# 100 evenly spaced x values across the range of the data
line_hours = np.linspace(1, 8, 100).reshape(-1, 1)
line_predictions = model.predict(line_hours)

# Create the plot
plt.figure(figsize=(10, 6))
plt.scatter(study_hours, test_scores, color='blue', s=100, alpha=0.7, label='Actual Data')
plt.plot(line_hours, line_predictions, color='red', linewidth=2, label='Best Fit Line')
plt.xlabel('Study Hours')
plt.ylabel('Test Score')
plt.title('Study Hours vs Test Scores - Linear Regression')
plt.legend()
plt.grid(True, alpha=0.3)

# Show the equation on the plot.
# This has to be drawn BEFORE plt.show(): anything added after show()
# does not appear on the figure that was displayed.
slope = model.coef_[0]
intercept = model.intercept_
plt.text(2, 85, f'y = {slope:.1f}x + {intercept:.1f}', fontsize=12,
         bbox=dict(boxstyle="round,pad=0.3", facecolor="yellow", alpha=0.7))

plt.show()
🔍 Understanding Model Quality
How do we know if our model is good?
📏 Key Metrics:
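- R² Score: Usually between 0 and 1, higher is better (0.8+ is very good; it can even go negative when a model fits worse than always predicting the average)
- Mean Squared Error: Lower is better (0 means every prediction is exactly right)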
- Visual Check: Do points cluster around the line?
# Evaluate model performance
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Sample data
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2, 4, 6, 8, 10])  # Perfect linear relationship

# Train model
model = LinearRegression()
model.fit(X, y)

# Make predictions (on the same data the model was trained on)
predictions = model.predict(X)

# Calculate metrics
r2 = r2_score(y, predictions)
mse = mean_squared_error(y, predictions)
print(f"R² Score: {r2:.3f}")
print(f"Mean Squared Error: {mse:.3f}")

# Turn the R² number into a plain-language verdict.
# Branches are checked from best to worst, so the first match wins.
if r2 > 0.9:
    print("Excellent model! 🎉")
elif r2 > 0.7:
    print("Good model! 👍")
elif r2 > 0.5:
    print("Okay model, could be better 🤔")
else:
    print("Poor model, needs improvement 😞")

# What makes a good model?
print("\nTips for better models:")
print("1. More data usually helps")
print("2. Check for outliers")
print("3. Make sure relationship is actually linear")
print("4. Consider other features that might matter")
⚠️ Common Pitfalls
Watch out for these common mistakes!
# Common mistakes and how to avoid them

# Mistake 1: Assuming all relationships are linear
print(
    "❌ Not everything is a straight line!",
    "Example: Temperature vs Ice cream sales might be curved",
    sep="\n",
)

# Mistake 2: Extrapolating too far
print(
    "\n❌ Don't predict way outside your data range",
    "If you trained on houses 1000-3000 sq ft, don't predict for 10000 sq ft!",
    sep="\n",
)

# Mistake 3: Ignoring outliers
outlier_data = [10, 12, 11, 13, 100, 12, 11]  # 100 is an outlier
# Same readings with the single extreme value (100) dropped
cleaned_data = [value for value in outlier_data if value != 100]
mean_with_outlier = np.mean(outlier_data)
mean_without_outlier = np.mean(cleaned_data)
print("\n❌ Outliers can mess up your line:")
print(f"Data with outlier: {outlier_data}")
print(f"Mean with outlier: {mean_with_outlier:.1f}")
print(f"Mean without outlier: {mean_without_outlier:.1f}")

# Mistake 4: Not checking your assumptions
print(
    "\n✅ Always plot your data first!",
    "✅ Check if the relationship looks linear",
    "✅ Look for patterns in your errors",
    sep="\n",
)

# Simple check for linearity
def check_linearity(x, y):
    """Plot y against x and report how linear the relationship looks.

    Shows a scatter plot for a visual check, then prints the Pearson
    correlation coefficient with a verdict based on its magnitude:
    above 0.8 is "strong", above 0.5 is "moderate", and anything else
    (including NaN, e.g. from constant input) is "weak".

    Args:
        x: Input values; any array-like, either 1-D or a single column.
        y: Target values; array-like with the same number of elements as x.
    """
    # Flatten both inputs so plain lists and column-shaped arrays
    # (like the reshape(-1, 1) features used for scikit-learn) work alike
    x_values = np.asarray(x).ravel()
    y_values = np.asarray(y).ravel()

    plt.scatter(x_values, y_values)
    plt.title("Does this look linear?")
    plt.show()

    # Calculate correlation: corrcoef returns a 2x2 matrix, and the
    # off-diagonal entry [0, 1] is the correlation between x and y
    correlation = np.corrcoef(x_values, y_values)[0, 1]
    print(f"Correlation: {correlation:.3f}")

    # Use the absolute value: a strong negative correlation is just as
    # linear as a strong positive one
    if abs(correlation) > 0.8:
        print("Strong linear relationship! ✅")
    elif abs(correlation) > 0.5:
        print("Moderate linear relationship 🤔")
    else:
        print("Weak linear relationship - consider other models ❌")