Logistic Regression
Predict yes/no outcomes with probability-based classification
📊 Understanding Logistic Regression
Logistic regression predicts the probability of binary outcomes (yes/no, pass/fail, spam/not spam) using a sigmoid curve.
# Simple logistic regression
from sklearn.linear_model import LogisticRegression
import numpy as np
# Toy dataset: one feature (hours studied) and a pass/fail label per student
X = [[hours] for hours in range(1, 7)]  # Hours studied: 1 through 6
y = [0] * 3 + [1] * 3  # 0=fail, 1=pass

# Train model (fit returns the estimator, so construction and fitting chain)
model = LogisticRegression().fit(X, y)

# Predict probability of passing with 3.5 hours study.
# predict_proba returns one row per sample; column 1 holds the probability
# of the positive class (1=pass).
prob = model.predict_proba([[3.5]])[0, 1]
print(f"Probability of passing: {prob:.2f}")
Key Concepts
Sigmoid Function
S-shaped curve that maps any real number to the 0-1 range, so the output can be read as a probability
Binary Classification
Predicts one of two possible outcomes
Odds Ratio
Measures how much the odds of the outcome are multiplied when a feature increases by one unit (the exponential of that feature's coefficient)
Fast Training
Quick to train and make predictions
🔹 Step 1: Basic Binary Classification
Predict if an email is spam or not spam
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
# Each row is one email described as
# [word_count, exclamation_marks, capital_letters]
X = np.array(
    [
        [50, 0, 5],     # Normal email
        [30, 0, 3],     # Normal email
        [200, 5, 50],   # Spam email
        [180, 8, 45],   # Spam email
        [40, 1, 8],     # Normal email
        [220, 6, 60],   # Spam email
    ]
)
y = [0, 0, 1, 1, 0, 1]  # 0=normal, 1=spam (same order as the rows of X)

# Train model (fit returns the estimator itself)
model = LogisticRegression().fit(X, y)

# Score a previously unseen email
test_email = [[100, 3, 25]]  # New email features
probability = model.predict_proba(test_email)[0, 1]  # column 1 = spam class
prediction = model.predict(test_email)[0]

if prediction == 1:
    verdict = 'Spam'
else:
    verdict = 'Normal'
print(f"Prediction: {verdict}")
print(f"Spam probability: {probability:.2f}")
🔹 Step 2: Understanding Probabilities
How logistic regression calculates probabilities
# Inspect both class probabilities for a range of example emails.
# Relies on `model` having been fitted on the three email features.
test_emails = [
    [20, 0, 2],      # Very normal email
    [100, 3, 25],    # Suspicious email
    [300, 10, 80],   # Very spammy email
]

for number, email in enumerate(test_emails, start=1):
    # One row comes back per sample: column 0 is class 0 (normal),
    # column 1 is class 1 (spam).
    normal_prob, spam_prob = model.predict_proba([email])[0]

    if spam_prob > 0.5:
        label = 'Spam'
    else:
        label = 'Normal'

    print(f"Email {number}:")
    print(f" Normal probability: {normal_prob:.3f}")
    print(f" Spam probability: {spam_prob:.3f}")
    print(f" Prediction: {label}")
    print()

# The two class probabilities for each email always add up to 1.0
🔹 Step 3: Feature Importance
Understanding which features matter most
# Read the learned weight of each feature from the fitted model.
# coef_ has one row per decision boundary; a binary model has a single row.
features = ['word_count', 'exclamation_marks', 'capital_letters']
coefficients = model.coef_[0]

print("Feature importance (coefficients):")
for name, weight in zip(features, coefficients):
    print(f"{name}: {weight:.3f}")

# How to read the numbers:
#   Positive coefficient = increases spam probability
#   Negative coefficient = decreases spam probability
#   Larger absolute value = stronger effect per unit of that feature.
#   NOTE: magnitudes are only comparable between features that share a
#   similar scale; standardize the features first before ranking them.
#
# Example interpretation:
#   If exclamation_marks has coefficient 0.5, each additional exclamation
#   mark multiplies the spam odds by exp(0.5) ≈ 1.65x
🔹 Step 4: Model Evaluation
Measuring how well the model performs
from sklearn.metrics import accuracy_score, classification_report
# Create a larger synthetic dataset for proper evaluation.
# Each column is drawn on its own realistic scale. Scaling every column to
# 0-200 would make virtually every row satisfy the labelling rule below,
# leaving a single class that LogisticRegression cannot be fitted on.
np.random.seed(42)
feature_max = np.array([300, 10, 80])  # word_count, exclamation_marks, capital_letters
X_large = np.random.rand(100, 3) * feature_max  # Random email features

# Create realistic labels based on features: spam if many ! or CAPS.
# Cast to int so the classes are 0 (normal) and 1 (spam), as in the
# earlier examples.
y_large = ((X_large[:, 1] > 3) | (X_large[:, 2] > 40)).astype(int)

# Split data; stratify keeps the normal/spam ratio the same in both splits,
# so the minority class is present in train and test alike.
X_train, X_test, y_train, y_test = train_test_split(
    X_large, y_large, test_size=0.3, random_state=42, stratify=y_large
)

# Train and evaluate. The raw features are not standardized, so allow the
# solver more than the default number of iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Detailed report. Passing labels explicitly ties each target name to its
# class even if one class happens to be missing from the test predictions.
print("\nDetailed Report:")
print(
    classification_report(
        y_test,
        predictions,
        labels=[0, 1],
        target_names=['Normal', 'Spam'],
    )
)
🔹 Step 5: Real-World Example
Medical diagnosis: Predicting disease risk
# Patient data: one row per patient as [age, blood_pressure, cholesterol]
patients = np.array(
    [
        [25, 120, 180],  # Young, healthy
        [45, 140, 220],  # Middle-aged, moderate risk
        [65, 160, 280],  # Older, high risk
        [30, 110, 160],  # Young, very healthy
        [55, 180, 300],  # High risk
        [40, 130, 200],  # Moderate
    ]
)

# Disease outcomes (0=healthy, 1=disease), aligned with the rows above
outcomes = [0, 0, 1, 0, 1, 0]

# Train model (fit returns the estimator itself)
health_model = LogisticRegression().fit(patients, outcomes)

# Predict for new patient
new_patient = [[50, 150, 250]]  # 50 years old, BP=150, Chol=250
risk_prob = health_model.predict_proba(new_patient)[0, 1]  # column 1 = disease
print(f"Disease risk probability: {risk_prob:.2f}")

# Turn the probability into a recommendation using a 0.5 cut-off
recommendation = (
    "High risk - recommend further testing"
    if risk_prob > 0.5
    else "Low risk - routine monitoring"
)
print(recommendation)

# A risk estimate like this can help doctors make informed decisions