Python ML Standard Deviation

Understand data spread and variability in machine learning

📊 Understanding Standard Deviation

Standard deviation measures how spread out data points are from the average. It's crucial for understanding data quality and making ML predictions.


import numpy as np

# Small sample of ages to demonstrate the calculation
ages = [25, 30, 35, 40, 45]

# ndarray.std() uses ddof=0 by default, i.e. the population standard deviation
std_dev = np.asarray(ages).std()
print(f"Standard deviation: {std_dev:.2f}")
                                    
σ
Symbol
Spread
Measures
Quality
Data Check

What is Standard Deviation?

Standard deviation tells us how much individual data points differ from the average (mean).

🎯

Low Standard Deviation

Data points are close to the mean

Consistent Predictable
📈

High Standard Deviation

Data points are spread out from mean

Variable Diverse
🔍

ML Applications

Feature scaling and outlier detection

Scaling Outliers

🔹 Calculate Standard Deviation

Let's calculate the standard deviation two ways: with NumPy, and step by step by hand

import numpy as np

# Sample dataset - test scores
scores = [85, 90, 78, 92, 88, 76, 95, 82]

# Method 1: let NumPy do the work (ddof defaults to 0)
std_numpy = np.std(scores)
print(f"Standard deviation (NumPy): {std_numpy:.2f}")

# Method 2: by hand - mean, then the mean of the squared deviations,
# then the square root of that variance
n = len(scores)
mean = sum(scores) / n
squared_deviations = [(score - mean) ** 2 for score in scores]
variance = sum(squared_deviations) / n
std_manual = variance ** 0.5

print(f"Mean: {mean:.2f}")
print(f"Variance: {variance:.2f}")
print(f"Standard deviation (manual): {std_manual:.2f}")

# Population vs sample standard deviation:
# ddof=0 divides by n (population), ddof=1 divides by n - 1 (sample)
std_population = np.std(scores, ddof=0)
std_sample = np.std(scores, ddof=1)

print(f"Population std: {std_population:.2f}")
print(f"Sample std: {std_sample:.2f}")

🔹 Real-World Example

Compare standard deviation in different datasets

import numpy as np

# Two temperature series (°C) with similar means but very different spread,
# to show what the standard deviation captures that the mean does not.
consistent_temps = [20, 21, 19, 22, 20, 21, 20]  # Consistent weather
variable_temps = [15, 25, 10, 30, 18, 28, 12]    # Variable weather

# Calculate standard deviations (np.std defaults to ddof=0, population form)
std_consistent = np.std(consistent_temps)
std_variable = np.std(variable_temps)

print("Consistent temperatures:", consistent_temps)
print(f"Mean: {np.mean(consistent_temps):.1f}°C")
print(f"Standard deviation: {std_consistent:.2f}°C")
print()

print("Variable temperatures:", variable_temps)
print(f"Mean: {np.mean(variable_temps):.1f}°C")
print(f"Standard deviation: {std_variable:.2f}°C")
print()

# Interpretation: the smaller spread belongs to the more predictable series.
# The check/cross marks were mis-encoded (mojibake) and are restored here.
if std_consistent < std_variable:
    print("✅ Consistent temps have lower std dev (more predictable)")
    print("❌ Variable temps have higher std dev (less predictable)")

🔹 Standard Deviation in ML

How standard deviation helps in machine learning tasks

import numpy as np
from sklearn.preprocessing import StandardScaler

# Sample feature data (different units and value ranges)
heights = [150, 160, 170, 180, 190]  # cm
weights = [50, 60, 70, 80, 90]       # kg
ages = [20, 25, 30, 35, 40]          # years

# Check standard deviations of the raw features
print("Original standard deviations:")
print(f"Heights: {np.std(heights):.2f}")
print(f"Weights: {np.std(weights):.2f}")
print(f"Ages: {np.std(ages):.2f}")

# Combine features: one row per sample, one column per feature
data = np.column_stack([heights, weights, ages])
print(f"\nOriginal data shape: {data.shape}")

# Standardize features (mean=0, std=1).
# StandardScaler scales by the population standard deviation (ddof=0), which
# matches np.std's default, so each scaled column reports a std of 1.00 below.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

print("\nAfter standardization:")
print(f"Heights std: {np.std(scaled_data[:, 0]):.2f}")
print(f"Weights std: {np.std(scaled_data[:, 1]):.2f}")
print(f"Ages std: {np.std(scaled_data[:, 2]):.2f}")

# Why this matters for ML
# (the leading emoji was mis-encoded mojibake and is restored here)
print("\n🎯 Why standardization matters:")
print("- All features now have equal importance")
print("- Algorithms work better with similar scales")
print("- Prevents one feature from dominating others")

🔹 Detecting Outliers

Use standard deviation to find unusual data points

import numpy as np

# Dataset with potential outliers
salaries = [45000, 48000, 52000, 47000, 51000, 49000, 150000, 46000]

mean_salary = np.mean(salaries)
std_salary = np.std(salaries)

print(f"Mean salary: ${mean_salary:,.0f}")
print(f"Standard deviation: ${std_salary:,.0f}")
print()

# z-score: how many standard deviations each salary lies from the mean
z_scores = [abs(amount - mean_salary) / std_salary for amount in salaries]

# Split on the 2-standard-deviation rule: above 2 is an outlier, else normal
outliers = [amount for amount, z in zip(salaries, z_scores) if z > 2]
normal_values = [amount for amount, z in zip(salaries, z_scores) if not z > 2]

# Report each outlier in its original order
for salary, z_score in zip(salaries, z_scores):
    if z_score > 2:
        print(f"${salary:,} is an outlier (z-score: {z_score:.2f})")

print(f"\nNormal salaries: {len(normal_values)}")
print(f"Outliers found: {len(outliers)}")

# Recompute the statistics without the outliers to show their influence
if outliers:
    clean_mean = np.mean(normal_values)
    clean_std = np.std(normal_values)
    print("\nCleaned dataset:")
    print(f"Mean: ${clean_mean:,.0f}")
    print(f"Std dev: ${clean_std:,.0f}")

🧠 Test Your Knowledge

What does a low standard deviation indicate?

Which NumPy function calculates standard deviation?