Python ML Standard Deviation

📊 Understanding Standard Deviation

Standard deviation measures how spread out data points are from the average. It's crucial for understanding data quality and making ML predictions.


import numpy as np

# Five example ages whose spread around their mean we want to measure
ages = [25, 30, 35, 40, 45]

# Convert to an array and ask it for its standard deviation directly
std_dev = np.asarray(ages).std()
print(f"Standard deviation: {std_dev:.2f}")

σ

Symbol

Spread

Measures

Quality

Data Check

What is Standard Deviation?

Standard deviation tells us how much individual data points differ from the average (mean).

🎯

Low Standard Deviation

Data points are close to the mean

Consistent Predictable

📈

High Standard Deviation

Data points are spread out from the mean

Variable Diverse

🔍

ML Applications

Feature scaling and outlier detection

Scaling Outliers

🔹 Calculate Standard Deviation

Let's calculate standard deviation both manually, step by step, and with NumPy

import numpy as np

# Eight test scores used for every calculation below
scores = [85, 90, 78, 92, 88, 76, 95, 82]

# Method 1: a single NumPy call
std_numpy = np.std(scores)
print(f"Standard deviation (NumPy): {std_numpy:.2f}")

# Method 2: by hand -- mean, then the average squared deviation from it,
# then the square root of that average
mean = sum(scores) / len(scores)
squared_deviations = [(x - mean) ** 2 for x in scores]
variance = sum(squared_deviations) / len(scores)
std_manual = variance ** 0.5

print(f"Mean: {mean:.2f}")
print(f"Variance: {variance:.2f}")
print(f"Standard deviation (manual): {std_manual:.2f}")

# ddof=0 divides by n (population); ddof=1 divides by n - 1 (sample)
std_population, std_sample = (np.std(scores, ddof=ddof) for ddof in (0, 1))

print(f"Population std: {std_population:.2f}")
print(f"Sample std: {std_sample:.2f}")

🔹 Real-World Example

Compare standard deviation in different datasets

import numpy as np

# Two week-long temperature series: one steady, one swinging widely
consistent_temps = [20, 21, 19, 22, 20, 21, 20]  # Consistent weather
variable_temps = [15, 25, 10, 30, 18, 28, 12]    # Variable weather

# Spread of each series
std_consistent = np.std(consistent_temps)
std_variable = np.std(variable_temps)

# Same three-line report (data, mean, std) for each series, followed by a blank line
for label, temps, spread in (
    ("Consistent temperatures:", consistent_temps, std_consistent),
    ("Variable temperatures:", variable_temps, std_variable),
):
    print(label, temps)
    print(f"Mean: {np.mean(temps):.1f}°C")
    print(f"Standard deviation: {spread:.2f}°C")
    print()

# Interpretation: only reported when the variable series really has the larger spread
if std_variable > std_consistent:
    print("✅ Consistent temps have lower std dev (more predictable)")
    print("❌ Variable temps have higher std dev (less predictable)")

🔹 Standard Deviation in ML

How standard deviation helps in machine learning tasks

import numpy as np
from sklearn.preprocessing import StandardScaler

# Three features measured on very different scales
heights = [150, 160, 170, 180, 190]  # cm
weights = [50, 60, 70, 80, 90]       # kg
ages = [20, 25, 30, 35, 40]          # years

# Report the spread of each raw feature
print("Original standard deviations:")
for label, values in (("Heights", heights), ("Weights", weights), ("Ages", ages)):
    print(f"{label}: {np.std(values):.2f}")

# One row per person, one column per feature
data = np.column_stack([heights, weights, ages])
print(f"\nOriginal data shape: {data.shape}")

# Standardize features (mean=0, std=1)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Columns of scaled_data follow the order used in column_stack above
print("\nAfter standardization:")
for column, label in enumerate(("Heights", "Weights", "Ages")):
    print(f"{label} std: {np.std(scaled_data[:, column]):.2f}")

# Why this matters for ML
print("\n🎯 Why standardization matters:")
for reason in (
    "- All features now have equal importance",
    "- Algorithms work better with similar scales",
    "- Prevents one feature from dominating others",
):
    print(reason)

🔹 Detecting Outliers

Use standard deviation to find unusual data points

import numpy as np

# Eight salaries, one of which is far larger than the rest
salaries = [45000, 48000, 52000, 47000, 51000, 49000, 150000, 46000]

# Centre and spread of the full dataset (the extreme value is included)
mean_salary = np.mean(salaries)
std_salary = np.std(salaries)

print(f"Mean salary: ${mean_salary:,.0f}")
print(f"Standard deviation: ${std_salary:,.0f}")
print()

# Split the salaries into outliers (more than 2 standard deviations
# from the mean) and everything else
outliers, normal_values = [], []

for salary in salaries:
    # Distance from the mean, expressed in standard deviations
    z_score = abs(salary - mean_salary) / std_salary
    is_outlier = z_score > 2

    (outliers if is_outlier else normal_values).append(salary)
    if is_outlier:
        print(f"${salary:,} is an outlier (z-score: {z_score:.2f})")

print(f"\nNormal salaries: {len(normal_values)}")
print(f"Outliers found: {len(outliers)}")

# When something was flagged, recompute the statistics on the remaining values
if outliers:
    clean_mean = np.mean(normal_values)
    clean_std = np.std(normal_values)
    print(f"\nCleaned dataset:")
    print(f"Mean: ${clean_mean:,.0f}")
    print(f"Std dev: ${clean_std:,.0f}")