Mean, Median, Mode
Understanding the three most important ways to describe your data
📊 Central Tendency
Mean, Median, and Mode are three ways to find the "center" or "typical value" of your data. Think of them as different ways to answer "What's normal?" in your dataset. Each tells you something different about your data!
import numpy as np
# Example: Test scores in a class
scores = [85, 90, 78, 92, 88, 76, 95, 89, 87, 91]
# Convert once so every measure below works on the same array
score_array = np.asarray(scores)
# Mean: the arithmetic average of all scores
mean_score = score_array.mean()
print(f"Mean score: {mean_score}") # 87.1
# Median: the midpoint of the sorted scores
median_score = np.median(score_array)
print(f"Median score: {median_score}") # 88.5
# Mode: the most frequent score, computed with SciPy
from scipy import stats
mode_score = stats.mode(score_array)
print(f"Mode: {mode_score}")
The Three Measures
Mean (Average)
Add all values and divide by count
# Mean by hand: the total of the values divided by how many there are
numbers = [2, 4, 6, 8, 10]
total = sum(numbers)
count = len(numbers)
mean = total / count
print(f"Mean: {mean}") # 6.0
Median (Middle)
The middle value when sorted
# Find the median
numbers = [1, 3, 5, 7, 9]
# Work on a sorted copy so the original list is left untouched
ordered = sorted(numbers)
mid = len(ordered) // 2
if len(ordered) % 2:
    # Odd count: exactly one middle value
    median = ordered[mid]
else:
    # Even count: average the two values that straddle the middle
    median = (ordered[mid - 1] + ordered[mid]) / 2
print(f"Median: {median}") # 5
Mode (Most Common)
The value that appears most often
# Find the mode: tally every value, then take the one with the highest tally
from collections import Counter
numbers = [1, 2, 2, 3, 4, 2]
frequency = Counter(numbers)
mode, _times_seen = frequency.most_common(1)[0]
print(f"Mode: {mode}") # 2
🧮 Calculating Mean
The mean is the sum of all values divided by the number of values
# Different ways to calculate mean
import statistics
import numpy as np
# Method 1: Manual calculation
ages = [25, 30, 35, 40, 45]
manual_mean = sum(ages) / len(ages)
print(f"Manual mean: {manual_mean}") # 35.0
# Method 2: Using NumPy (recommended)
numpy_mean = np.mean(ages)
print(f"NumPy mean: {numpy_mean}") # 35.0
# Method 3: Using statistics module
# statistics.mean keeps the input type when the result is exact, so
# integer data with a whole-number mean prints without a decimal point.
stats_mean = statistics.mean(ages)
print(f"Statistics mean: {stats_mean}") # 35
# Real example: Average temperature
temperatures = [72, 75, 68, 80, 77, 73, 71]
avg_temp = np.mean(temperatures)
print(f"Average temperature: {avg_temp:.1f}°F") # 73.7°F
🎯 Finding Median
The median is the middle value when data is sorted (or the average of the two middle values when the count is even)
# Finding median step by step
import numpy as np
# Example 1: Odd number of values
scores = [78, 85, 92, 88, 76]
scores_sorted = sorted(scores)
print(f"Sorted scores: {scores_sorted}") # [76, 78, 85, 88, 92]
# Middle position for odd length
middle_pos = len(scores_sorted) // 2
median_odd = scores_sorted[middle_pos]
print(f"Median (odd): {median_odd}") # 85
# Example 2: Even number of values
prices = [10, 15, 20, 25, 30, 35]
# Always sort first: picking values by position only gives the median
# when the data is in order (this list just happens to be sorted already)
prices_sorted = sorted(prices)
# For even length, take average of two middle values
n = len(prices_sorted)
median_even = (prices_sorted[n//2 - 1] + prices_sorted[n//2]) / 2
print(f"Median (even): {median_even}") # 22.5
# Using NumPy (handles both cases, and sorts internally)
print(f"NumPy median: {np.median(prices)}") # 22.5
🔢 Finding Mode
The mode is the value that appears most frequently
# Finding mode in different ways
from collections import Counter
import numpy as np
from scipy import stats
# Example data: favorite colors survey
colors = ['red', 'blue', 'red', 'green', 'blue', 'red', 'yellow']
# Method 1: Using Counter
color_counts = Counter(colors)
mode_color = color_counts.most_common(1)[0][0]
print(f"Most popular color: {mode_color}") # red
# Method 2: For numbers
numbers = [1, 2, 2, 3, 4, 2, 5, 2]
number_counts = Counter(numbers)
mode_number = number_counts.most_common(1)[0][0]
print(f"Mode number: {mode_number}") # 2
# Method 3: Using scipy.stats
mode_result = stats.mode(numbers)
# Newer SciPy releases return plain scalars for 1-D input, while older
# ones return length-1 arrays. np.atleast_1d lets the same indexing work
# on both instead of failing on a scalar.
scipy_mode = np.atleast_1d(mode_result.mode)[0]
scipy_count = np.atleast_1d(mode_result.count)[0]
print(f"Scipy mode: {scipy_mode}") # 2
print(f"Appears {scipy_count} times") # 4 times
# What if there's no mode? (all values appear equally)
no_mode = [1, 2, 3, 4, 5]
print(f"No clear mode in: {no_mode}")
# On a tie SciPy reports the smallest of the tied values
tie_pick = np.atleast_1d(stats.mode(no_mode).mode)[0]
print(f"Scipy still picks: {tie_pick}") # 1 (smallest value)
🤔 When to Use Which?
Each measure tells you something different about your data
📋 Quick Guide:
- Mean: Use when data is roughly symmetric, with no extreme outliers (for example, normally distributed data)
- Median: Use when you have outliers or skewed data
- Mode: Use for categorical data or to find most common value
# Imports repeated here so this example runs on its own
from collections import Counter
import numpy as np
# Example: House prices in a neighborhood
# Most houses: $200k-$300k, but one mansion costs $2M
house_prices = [200, 220, 250, 280, 300, 2000] # in thousands
mean_price = np.mean(house_prices)
median_price = np.median(house_prices)
# 3250 / 6 is about 542: the single mansion nearly doubles the "average"
print(f"Mean price: ${mean_price:.0f}k") # $542k (pulled up by mansion!)
print(f"Median price: ${median_price:.0f}k") # $265k (more typical)
print("\nWhich is more useful?")
print("Median gives better idea of typical house price!")
print("Mean is affected by the expensive mansion.")
# Example: Survey responses (1-5 scale)
responses = [4, 5, 4, 3, 4, 5, 4, 2, 4, 4]
print("\nSurvey Results:")
print(f"Mean rating: {np.mean(responses):.1f}") # 3.9
print(f"Median rating: {np.median(responses)}") # 4.0
print(f"Most common rating: {Counter(responses).most_common(1)[0][0]}") # Mode: 4