Linear Regression

Learn to predict numbers by drawing the best line through your data

📈 What is Linear Regression?

Linear Regression is like drawing the best straight line through scattered dots on a graph. Once you have this line, you can use it to predict new values! It's perfect for questions like "If I study 6 hours, what grade will I get?" or "How much will a 2000 sq ft house cost?"


# Simple Linear Regression Example
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training data: hours studied (1 through 8) and the matching test scores.
# The feature array is reshaped into a single column because the model is
# fitted on 2-D input (one row per sample, one column per feature).
hours = np.arange(1, 9).reshape(-1, 1)
scores = np.array([50, 55, 65, 70, 75, 80, 85, 90])

# Build the model and fit it in one step (fit returns the fitted estimator)
model = LinearRegression().fit(hours, scores)

# Ask the fitted line about a study time that is not in the training data
new_hours = 6.5
predicted_score = model.predict([[new_hours]])
print(f"If you study {new_hours} hours, predicted score: {predicted_score[0]:.1f}")
                                    
Line
Best Fit
Predict
Numbers
Simple
& Powerful
Key Concepts

Key Concepts

📏

Best Fit Line

Find the line closest to all points

# Equation of the fitted line: y = m*x + b
# (m is the slope, b is the y-intercept)
intercept = model.intercept_
slope = model.coef_[0]  # take the first (here: only) coefficient
print(f"Line: y = {slope:.2f}x + {intercept:.2f}")
🎯

Make Predictions

Use the line to predict new values

# Predict the output for a single new input value
new_value = 5.5
sample = [[new_value]]  # 2-D input: one row (sample) with one column (feature)
prediction = model.predict(sample)
print(f"Prediction: {prediction[0]:.1f}")
📊

Measure Accuracy

Check how good your model is

# R-squared score: 1.0 is a perfect fit and higher is better.
# It is usually between 0 and 1, but it can be negative when the model
# does worse than always predicting the mean.
# Score the model on the study-hours data it was trained on above.
score = model.score(hours, scores)
print(f"Model accuracy: {score:.2f}")

🏠 Complete Example: House Price Prediction

Let's predict house prices based on size - a classic ML problem!

# Step-by-step house price prediction
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Create sample data
# House sizes (sq ft) and prices ($1000s).
# This toy data is perfectly linear: every price is exactly 0.15 × size.
sizes = np.array([1000, 1200, 1400, 1600, 1800, 2000, 2200, 2400]).reshape(-1, 1)
prices = np.array([150, 180, 210, 240, 270, 300, 330, 360])

print("House Data:")
# sizes is a column (n, 1); flatten it so each size pairs with its price
for size, price in zip(sizes.ravel(), prices):
    print(f"{size} sq ft → ${price}k")

# Step 2: Create and train the model
model = LinearRegression()
model.fit(sizes, prices)

# Step 3: Understand the model
# The slope is the price change (in $1000s) per extra square foot;
# the intercept is the predicted price at size 0.
slope = model.coef_[0]
intercept = model.intercept_
print(f"\nModel equation: Price = {slope:.3f} × Size + {intercept:.1f}")
print(f"This means: Each sq ft adds ${slope:.2f}k to the price")

# Step 4: Make predictions
# Predict all test sizes in one call, then report them one by one
test_sizes = [1500, 1750, 2100]
predicted_prices = model.predict(np.array(test_sizes).reshape(-1, 1))
for size, predicted_price in zip(test_sizes, predicted_prices):
    print(f"A {size} sq ft house costs: ${predicted_price:.1f}k")

# Step 5: Check accuracy
# Note: this scores the model on its own training data, so it shows how
# well the line fits these points, not how it performs on unseen houses.
predictions = model.predict(sizes)
r2 = r2_score(prices, predictions)
print(f"\nModel accuracy (R²): {r2:.3f}")
print("(1.0 = perfect, 0.8+ = very good)")

📊 Visualizing Linear Regression

See your model in action with a simple plot

# Create a beautiful visualization
import matplotlib.pyplot as plt
import numpy as np

# Sample data
study_hours = np.array([1, 2, 3, 4, 5, 6, 7, 8]).reshape(-1, 1)
test_scores = np.array([45, 55, 65, 70, 75, 80, 85, 95])

# Train model
model = LinearRegression()
model.fit(study_hours, test_scores)

# Create predictions for smooth line
# (100 evenly spaced points across the range of the training data)
line_hours = np.linspace(1, 8, 100).reshape(-1, 1)
line_predictions = model.predict(line_hours)

# Create the plot
plt.figure(figsize=(10, 6))
plt.scatter(study_hours, test_scores, color='blue', s=100, alpha=0.7, label='Actual Data')
plt.plot(line_hours, line_predictions, color='red', linewidth=2, label='Best Fit Line')

# Show the equation on the plot.
# This has to be added BEFORE plt.show(): anything drawn after show()
# does not appear on the figure that was displayed.
slope = model.coef_[0]
intercept = model.intercept_
plt.text(2, 85, f'y = {slope:.1f}x + {intercept:.1f}', fontsize=12,
         bbox=dict(boxstyle="round,pad=0.3", facecolor="yellow", alpha=0.7))

plt.xlabel('Study Hours')
plt.ylabel('Test Score')
plt.title('Study Hours vs Test Scores - Linear Regression')
plt.legend()
plt.grid(True, alpha=0.3)

# Display the finished figure (keep this as the last plotting call)
plt.show()

🔍 Understanding Model Quality

How do we know if our model is good?

📏 Key Metrics:

  • R² Score: usually between 0 and 1, higher is better (0.8+ is very good); it can be negative if the model is worse than always predicting the average
  • Mean Squared Error: Lower is better
  • Visual Check: Do points cluster around the line?
# Evaluate model performance
from sklearn.metrics import mean_squared_error, r2_score

# Sample data: y is exactly 2 * x, i.e. a perfect linear relationship
X = np.arange(1, 6).reshape(-1, 1)
y = 2 * np.arange(1, 6)

# Train the model (fit returns the fitted estimator)
model = LinearRegression().fit(X, y)

# Predict on the same inputs the model was trained on
predictions = model.predict(X)

# Compare the predictions with the true values
mse = mean_squared_error(y, predictions)
r2 = r2_score(y, predictions)

print(f"R² Score: {r2:.3f}")
print(f"Mean Squared Error: {mse:.3f}")

# Verdicts ordered from the strictest threshold down; the first threshold
# that r2 exceeds wins, and anything at or below 0.5 gets the fallback.
verdicts = [
    (0.9, "Excellent model! 🎉"),
    (0.7, "Good model! 👍"),
    (0.5, "Okay model, could be better 🤔"),
]
verdict = next(
    (text for threshold, text in verdicts if r2 > threshold),
    "Poor model, needs improvement 😞",
)
print(verdict)

# What makes a good model?
print("\nTips for better models:")
tips = (
    "1. More data usually helps",
    "2. Check for outliers",
    "3. Make sure relationship is actually linear",
    "4. Consider other features that might matter",
)
for tip in tips:
    print(tip)

⚠️ Common Pitfalls

Watch out for these common mistakes!

# Common mistakes and how to avoid them

# Mistake 1: Assuming all relationships are linear
print("❌ Not everything is a straight line!")
print("Example: Temperature vs Ice cream sales might be curved")

# Mistake 2: Extrapolating too far
print("\n❌ Don't predict way outside your data range")
print("If you trained on houses 1000-3000 sq ft, don't predict for 10000 sq ft!")

# Mistake 3: Ignoring outliers
outlier_data = [10, 12, 11, 13, 100, 12, 11]  # 100 is an outlier
outlier_value = 100
# Derive the outlier-free data from the original list so the two
# can never drift apart if the sample data is edited.
clean_data = [value for value in outlier_data if value != outlier_value]
print("\n❌ Outliers can mess up your line:")
print(f"Data with outlier: {outlier_data}")
print(f"Mean with outlier: {np.mean(outlier_data):.1f}")
print(f"Mean without outlier: {np.mean(clean_data):.1f}")

# Mistake 4: Not checking your assumptions
print("\n✅ Always plot your data first!")
print("✅ Check if the relationship looks linear")
print("✅ Look for patterns in your errors")

# Simple check for linearity
def check_linearity(x, y):
    """Plot y against x and report how strong their linear relationship is.

    Shows a scatter plot for a visual check, then prints the correlation
    coefficient between x and y together with a verdict: above 0.8 in
    absolute value is reported as strong, above 0.5 as moderate, and
    anything else as weak.

    Args:
        x: Input values. Any array-like works (list, 1-D array, or an
            (n, 1) column such as the feature arrays used for fitting).
        y: Target values, with the same number of elements as x.
    """
    # Normalise both inputs to flat 1-D arrays so plain lists and
    # column-shaped arrays are handled the same way.
    x_flat = np.asarray(x).ravel()
    y_flat = np.asarray(y).ravel()

    plt.scatter(x_flat, y_flat)
    plt.title("Does this look linear?")
    plt.show()

    # corrcoef returns a 2x2 matrix; the off-diagonal entry [0, 1]
    # is the correlation between x and y.
    correlation = np.corrcoef(x_flat, y_flat)[0, 1]
    print(f"Correlation: {correlation:.3f}")

    # The sign only gives the direction of the trend; strength is the magnitude.
    strength = abs(correlation)
    if strength > 0.8:
        print("Strong linear relationship! ✅")
    elif strength > 0.5:
        print("Moderate linear relationship 🤔")
    else:
        print("Weak linear relationship - consider other models ❌")

🧠 Test Your Knowledge

What does Linear Regression try to find?

What does an R² score of 0.9 mean?

When should you NOT use Linear Regression?