Python ML Percentiles
Understand data position and ranking in datasets
Understanding Percentiles
Percentiles show where a value ranks compared to all other values. The 50th percentile is the median - half the data is below it.
import numpy as np
# Seven sample test scores.
scores = [65, 70, 75, 80, 85, 90, 95]

# Score below which 75% of the sample falls.
percentile_75 = np.percentile(scores, q=75)
print(f"75th percentile: {percentile_75}")
What are Percentiles?
Percentiles divide sorted data into 100 equal-sized groups; a value's percentile shows its relative position, i.e. what share of the data lies below it.
25th Percentile (Q1)
25% of data is below this value
50th Percentile (Q2)
The median - middle value
75th Percentile (Q3)
75% of data is below this value
🔹 Calculate Percentiles
Find percentiles using NumPy and understand what they mean
import numpy as np
# Student exam scores
scores = [45, 55, 60, 65, 70, 75, 80, 85, 90, 95]

# Calculate common percentiles, one call per rank
p25 = np.percentile(scores, 25)  # 25th percentile (Q1)
p50 = np.percentile(scores, 50)  # 50th percentile (median)
p75 = np.percentile(scores, 75)  # 75th percentile (Q3)
p90 = np.percentile(scores, 90)  # 90th percentile

print("Exam Scores:", scores)
print(f"25th percentile (Q1): {p25}")
print(f"50th percentile (Median): {p50}")
print(f"75th percentile (Q3): {p75}")
print(f"90th percentile: {p90}")

# What this means
print(f"\nš Interpretation:")
print(f"25% of students scored below {p25}")
print(f"50% of students scored below {p50}")
print(f"75% of students scored below {p75}")
print(f"90% of students scored below {p90}")

# Multiple percentiles at once: passing a list of ranks returns one value
# per rank, in the same order, so the labels below line up with the results.
percentiles = np.percentile(scores, [10, 25, 50, 75, 90])
labels = ['10th', '25th', '50th', '75th', '90th']
print(f"\nš Multiple percentiles:")
for label, value in zip(labels, percentiles):
    # Indented so that it forms the loop body; without the indent the
    # snippet fails with an IndentationError.
    print(f"{label}: {value}")
🔹 Real-World Example: Income Analysis
Analyze income distribution using percentiles
import numpy as np
# Annual incomes in thousands
incomes = [
    25, 30, 35, 40, 45, 50, 55, 60, 65, 70,
    75, 80, 85, 90, 100, 120, 150, 200, 300, 500,
]

# Calculate key percentiles, keyed by a printable label
income_percentiles = {
    '10th': np.percentile(incomes, 10),
    '25th': np.percentile(incomes, 25),
    '50th': np.percentile(incomes, 50),  # Median income
    '75th': np.percentile(incomes, 75),
    '90th': np.percentile(incomes, 90),
    '95th': np.percentile(incomes, 95),
}

print("š° Income Distribution Analysis")
print("=" * 35)
for percentile, value in income_percentiles.items():
    # Indented so that it forms the loop body; without the indent the
    # snippet fails with an IndentationError.
    print(f"{percentile} percentile: ${value:,.0f}k")
# Find your income percentile
def find_income_percentile(your_income, all_incomes):
    """Return the percentile rank of ``your_income`` within ``all_incomes``.

    The rank is the percentage of entries in ``all_incomes`` that are
    strictly less than ``your_income``; entries equal to it are not counted.

    Args:
        your_income: The income to rank, in the same units as ``all_incomes``.
        all_incomes: Non-empty sized collection of incomes to compare against.

    Returns:
        The percentage (0 to 100) of entries below ``your_income``.

    Raises:
        ValueError: If ``all_incomes`` is empty, since a rank is undefined
            without a reference population.
    """
    total = len(all_incomes)
    # Checking the length (rather than truthiness) also works for NumPy arrays.
    if total == 0:
        raise ValueError("all_incomes must contain at least one value")
    below_count = sum(1 for income in all_incomes if income < your_income)
    return (below_count / total) * 100
# Example: rank a $75k income against the sample incomes.
my_income = 75
my_percentile = find_income_percentile(
    your_income=my_income,
    all_incomes=incomes,
)

# Report the rank, then spell out what it means.
print(f"\nšÆ ${my_income}k income is at {my_percentile:.0f}th percentile")
print(f"This means {my_percentile:.0f}% of people earn less than ${my_income}k")
🔹 Percentiles in Machine Learning
Use percentiles for outlier detection and feature engineering
import numpy as np
# Website response times (milliseconds)
response_times = [
    100, 120, 110, 130, 125, 140, 115, 2000, 135, 145,
    150, 160, 155, 165, 170, 3500, 180, 175, 185, 190,
]

print("š Website Response Time Analysis")
print("Response times:", response_times[:10], "... (20 total)")

# Percentiles for performance monitoring: the median plus the two tail
# values, requested in a single call.
p50, p95, p99 = np.percentile(response_times, [50, 95, 99])

print(f"\nā” Performance Metrics:")
print(f"50th percentile (median): {p50:.0f}ms")
print(f"95th percentile: {p95:.0f}ms")
print(f"99th percentile: {p99:.0f}ms")

# Outlier detection using the interquartile range (IQR = Q3 - Q1).
q1, q3 = np.percentile(response_times, [25, 75])
iqr = q3 - q1

# Anything more than 1.5 * IQR outside the quartiles counts as an outlier.
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Split the samples into those inside the bounds and those outside them.
normal_times = [ms for ms in response_times if lower_bound <= ms <= upper_bound]
outliers = [ms for ms in response_times if not (lower_bound <= ms <= upper_bound)]

print(f"\nš Outlier Detection:")
print(f"Q1 (25th): {q1:.0f}ms")
print(f"Q3 (75th): {q3:.0f}ms")
print(f"IQR: {iqr:.0f}ms")
print(f"Normal range: {lower_bound:.0f}ms - {upper_bound:.0f}ms")
print(f"Outliers found: {outliers}")
print(f"Normal response times: {len(normal_times)}/{len(response_times)}")
🔹 Percentile vs Percentage
Understanding the difference between percentiles and percentages
Key Differences:
- Percentile: Position in ranked data (e.g., "75th percentile")
- Percentage: Portion of a whole (e.g., "75% correct")
import numpy as np
# Example: test scores for the whole class
test_scores = [60, 65, 70, 75, 80, 85, 90, 95]

# Student A scored 80
student_score = 80

# 1. PERCENTAGE: the share of questions the student answered correctly.
total_questions = 100
percentage_correct = student_score  # 80% correct

# 2. PERCENTILE: the share of class scores strictly below the student's.
#    Each comparison is a bool, so summing them counts the lower scores.
percentile_rank = (
    sum(score < student_score for score in test_scores) / len(test_scores)
) * 100

print(f"Student scored: {student_score}")
print(f"Percentage correct: {percentage_correct}% of questions")
print(f"Percentile rank: {percentile_rank:.0f}th percentile")
print()

# Plain-language summary of the two numbers
print("š What this means:")
print(f"⢠Got {percentage_correct}% of questions right")
print(f"⢠Performed better than {percentile_rank:.0f}% of other students")

# The reverse lookup: which score sits at the 75th percentile?
print(f"\nšÆ Using np.percentile:")
p75 = np.percentile(test_scores, q=75)
print(f"75th percentile score: {p75}")
print(f"This means 75% of students scored below {p75}")