Python ML Percentiles

Understand data position and ranking in datasets

📈 Understanding Percentiles

Percentiles show where a value ranks compared to all other values. The 50th percentile is the median - half the data is below it.


import numpy as np

# Seven sample test scores used for the introductory example
scores = [65, 70, 75, 80, 85, 90, 95]

# Value at the 75th percentile of the scores (NumPy's default method)
percentile_75 = np.percentile(scores, q=75)
print(f"75th percentile: {percentile_75}")
                                    
Range: 0-100
Shows: Position
Useful For: Ranking
What are Percentiles

What are Percentiles?

Percentiles divide data into 100 equal parts, showing relative position.

🎯

25th Percentile (Q1)

25% of data is below this value

First Quartile Lower Quarter
📊

50th Percentile (Q2)

The median - middle value

Median Middle Point
📈

75th Percentile (Q3)

75% of data is below this value

Third Quartile Upper Quarter

🔹 Calculate Percentiles

Find percentiles using NumPy and understand what they mean

import numpy as np

# Student exam scores (ten values, ascending)
scores = [45, 55, 60, 65, 70, 75, 80, 85, 90, 95]

# Calculate common percentiles one at a time; each call returns a scalar
p25 = np.percentile(scores, 25)  # 25th percentile (Q1)
p50 = np.percentile(scores, 50)  # 50th percentile (median)
p75 = np.percentile(scores, 75)  # 75th percentile (Q3)
p90 = np.percentile(scores, 90)  # 90th percentile

print("Exam Scores:", scores)
print(f"25th percentile (Q1): {p25}")
print(f"50th percentile (Median): {p50}")
print(f"75th percentile (Q3): {p75}")
print(f"90th percentile: {p90}")

# What this means: the p-th percentile is the score below which roughly
# p% of the students fall.
# (The header emoji was mojibake in the original string; restored here.)
print("\n📊 Interpretation:")
print(f"25% of students scored below {p25}")
print(f"50% of students scored below {p50}")
print(f"75% of students scored below {p75}")
print(f"90% of students scored below {p90}")

# Multiple percentiles at once: passing a list of ranks returns an array
# with one value per requested rank, in the same order.
percentiles = np.percentile(scores, [10, 25, 50, 75, 90])
labels = ['10th', '25th', '50th', '75th', '90th']

print("\n📈 Multiple percentiles:")
for label, value in zip(labels, percentiles):
    print(f"{label}: {value}")

🔹 Real-World Example: Income Analysis

Analyze income distribution using percentiles

import numpy as np

# Annual incomes in thousands
incomes = [25, 30, 35, 40, 45, 50, 55, 60, 65, 70,
           75, 80, 85, 90, 100, 120, 150, 200, 300, 500]

# Calculate key percentiles, keyed by labels such as '10th' or '50th'.
# The 50th percentile is the median income. Insertion order is preserved,
# so the report below prints from the lowest rank to the highest.
income_percentiles = {
    f"{rank}th": np.percentile(incomes, rank)
    for rank in (10, 25, 50, 75, 90, 95)
}

# (The title emoji was mojibake in the original string; restored here.)
print("💰 Income Distribution Analysis")
print("=" * 35)

# Values are in thousands, hence the trailing "k"; they are shown rounded
# to whole thousands.
for percentile, value in income_percentiles.items():
    print(f"{percentile} percentile: ${value:,.0f}k")

# Find your income percentile
def find_income_percentile(your_income, all_incomes):
    """Return the percentage of incomes strictly below ``your_income``.

    Args:
        your_income: The income to rank, in the same units as the entries
            of ``all_incomes``.
        all_incomes: Non-empty sized collection of incomes to rank against.

    Returns:
        The share of entries in ``all_incomes`` that are strictly less than
        ``your_income``, as a float between 0 and 100. Entries equal to
        ``your_income`` are not counted as "below".

    Raises:
        ValueError: If ``all_incomes`` is empty, since a rank within an
            empty population is undefined.
    """
    total = len(all_incomes)
    # Compare the length rather than using truthiness so that array-like
    # inputs (whose truth value may be ambiguous) are handled as well.
    if total == 0:
        raise ValueError("all_incomes must contain at least one income")

    below_count = sum(1 for income in all_incomes if income < your_income)
    return (below_count / total) * 100

# Example: What percentile is $75k?
my_income = 75  # same units as the entries of `incomes`
my_percentile = find_income_percentile(my_income, incomes)
# (The leading emoji was mojibake in the original string; restored here.)
print(f"\n🎯 ${my_income}k income is at {my_percentile:.0f}th percentile")
print(f"This means {my_percentile:.0f}% of people earn less than ${my_income}k")

🔹 Percentiles in Machine Learning

Use percentiles for outlier detection and feature engineering

import numpy as np

# Website response times (milliseconds)
response_times = [100, 120, 110, 130, 125, 140, 115, 2000, 135, 145,
                  150, 160, 155, 165, 170, 3500, 180, 175, 185, 190]

print("🌐 Website Response Time Analysis")
# Show only the first ten samples; the total is derived from the data so the
# message cannot drift out of sync if the list is edited.
print("Response times:", response_times[:10], f"... ({len(response_times)} total)")

# Calculate percentiles for performance monitoring
p50 = np.percentile(response_times, 50)  # Median response time
p95 = np.percentile(response_times, 95)  # 95% of requests faster than this
p99 = np.percentile(response_times, 99)  # 99% of requests faster than this

print("\n⚔ Performance Metrics:")
print(f"50th percentile (median): {p50:.0f}ms")
print(f"95th percentile: {p95:.0f}ms")
print(f"99th percentile: {p99:.0f}ms")

# Outlier detection using IQR (Interquartile Range): the spread of the
# middle 50% of the data, i.e. Q3 - Q1.
q1 = np.percentile(response_times, 25)
q3 = np.percentile(response_times, 75)
iqr = q3 - q1

# Outliers are beyond 1.5 * IQR from Q1 or Q3
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Split the samples into the two complementary groups; the bounds
# themselves count as normal. Original ordering is preserved in both lists.
outliers = [x for x in response_times if x < lower_bound or x > upper_bound]
normal_times = [x for x in response_times if lower_bound <= x <= upper_bound]

# (The header emoji was mojibake in the original string; restored here.)
print("\n🔍 Outlier Detection:")
print(f"Q1 (25th): {q1:.0f}ms")
print(f"Q3 (75th): {q3:.0f}ms")
print(f"IQR: {iqr:.0f}ms")
print(f"Normal range: {lower_bound:.0f}ms - {upper_bound:.0f}ms")
print(f"Outliers found: {outliers}")
print(f"Normal response times: {len(normal_times)}/{len(response_times)}")

🔹 Percentile vs Percentage

Understanding the difference between percentiles and percentages

šŸ” Key Differences:

  • Percentile: Position in ranked data (e.g., "75th percentile")
  • Percentage: Portion of a whole (e.g., "75% correct")
import numpy as np

# Example: Test scores for the whole class
test_scores = [60, 65, 70, 75, 80, 85, 90, 95]

# Student A scored 80
student_score = 80

# 1. What PERCENTAGE of questions did they get right?
# Derived from the question count instead of aliasing the raw score, so the
# figure stays correct if `total_questions` is changed. Multiplying before
# dividing keeps the result exact for whole-number inputs like these.
total_questions = 100
percentage_correct = 100 * student_score / total_questions  # 80% correct

# 2. What PERCENTILE are they in?
# Share of classmates who scored strictly below this student.
percentile_rank = (sum(1 for score in test_scores if score < student_score)
                  / len(test_scores)) * 100

print(f"Student scored: {student_score}")
print(f"Percentage correct: {percentage_correct:.0f}% of questions")
print(f"Percentile rank: {percentile_rank:.0f}th percentile")
print()

# What this means:
# (The emoji in the two headers below were mojibake in the original strings;
# restored here.)
print("📚 What this means:")
print(f"• Got {percentage_correct:.0f}% of questions right")
print(f"• Performed better than {percentile_rank:.0f}% of other students")

# Another example with percentiles: going the other way, from a rank to the
# score found at that rank.
print("\n🎯 Using np.percentile:")
p75 = np.percentile(test_scores, 75)
print(f"75th percentile score: {p75}")
print(f"This means 75% of students scored below {p75}")

🧠 Test Your Knowledge

What does the 75th percentile mean?

The 50th percentile is the same as: