Python ML Standard Deviation
Understand data spread and variability in machine learning
📊 Understanding Standard Deviation
Standard deviation measures how spread out data points are from the average. It's crucial for understanding data quality and making ML predictions.
import numpy as np
# A tiny illustrative dataset: five ages.
ages = [25, 30, 35, 40, 45]

# ndarray.std() uses ddof=0 by default, i.e. the population standard
# deviation -- the same quantity np.std(ages) returns.
std_dev = np.asarray(ages).std()

print(f"Standard deviation: {std_dev:.2f}")
What is Standard Deviation?
Standard deviation tells us how much individual data points differ from the average (mean).
Low Standard Deviation
Data points are close to the mean
High Standard Deviation
Data points are spread out from mean
ML Applications
Feature scaling and outlier detection
🔹 Calculate Standard Deviation
Let's calculate standard deviation step by step and using NumPy
import numpy as np
# Test scores used throughout this example.
scores = [85, 90, 78, 92, 88, 76, 95, 82]

# --- Approach 1: let NumPy do the work (population std, ddof=0) ---
std_numpy = np.std(scores)
print(f"Standard deviation (NumPy): {std_numpy:.2f}")

# --- Approach 2: spell the formula out by hand ---
# variance = average squared distance from the mean; std = sqrt(variance)
count = len(scores)
mean = sum(scores) / count
squared_deviations = [(score - mean) ** 2 for score in scores]
variance = sum(squared_deviations) / count
std_manual = pow(variance, 0.5)
for label, value in (
    ("Mean", mean),
    ("Variance", variance),
    ("Standard deviation (manual)", std_manual),
):
    print(f"{label}: {value:.2f}")

# --- Population versus sample standard deviation ---
# ddof=0 divides by N (whole population); ddof=1 divides by N - 1
# (Bessel's correction, for a sample drawn from a larger population).
std_population, std_sample = (np.std(scores, ddof=ddof) for ddof in (0, 1))
print(f"Population std: {std_population:.2f}")
print(f"Sample std: {std_sample:.2f}")
🔹 Real-World Example
Compare standard deviation in different datasets
import numpy as np
# Two week-long temperature series (°C) with similar means but very
# different spread, to show what the standard deviation captures.
consistent_temps = [20, 21, 19, 22, 20, 21, 20]  # Consistent weather
variable_temps = [15, 25, 10, 30, 18, 28, 12]  # Variable weather

# Calculate standard deviations (np.std defaults to the population form)
std_consistent = np.std(consistent_temps)
std_variable = np.std(variable_temps)

print("Consistent temperatures:", consistent_temps)
print(f"Mean: {np.mean(consistent_temps):.1f}°C")
print(f"Standard deviation: {std_consistent:.2f}°C")
print()
print("Variable temperatures:", variable_temps)
print(f"Mean: {np.mean(variable_temps):.1f}°C")
print(f"Standard deviation: {std_variable:.2f}°C")
print()

# Interpretation: a smaller standard deviation means the readings stay
# closer to their mean, so the series is easier to predict.
if std_consistent < std_variable:
    print("✅ Consistent temps have lower std dev (more predictable)")
    print("❌ Variable temps have higher std dev (less predictable)")
🔹 Standard Deviation in ML
How standard deviation helps in machine learning tasks
import numpy as np
from sklearn.preprocessing import StandardScaler
# Sample feature data (different scales)
heights = [150, 160, 170, 180, 190]  # cm
weights = [50, 60, 70, 80, 90]  # kg
ages = [20, 25, 30, 35, 40]  # years

# Check standard deviations of the raw features
print("Original standard deviations:")
print(f"Heights: {np.std(heights):.2f}")
print(f"Weights: {np.std(weights):.2f}")
print(f"Ages: {np.std(ages):.2f}")

# Combine features: one row per person, one column per feature
data = np.column_stack([heights, weights, ages])
print(f"\nOriginal data shape: {data.shape}")

# Standardize features (mean=0, std=1).
# StandardScaler centres each column on its mean and divides by its
# population standard deviation (ddof=0), which is the same convention
# np.std uses by default -- hence every column reports 1.00 below.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

print("\nAfter standardization:")
print(f"Heights std: {np.std(scaled_data[:, 0]):.2f}")
print(f"Weights std: {np.std(scaled_data[:, 1]):.2f}")
print(f"Ages std: {np.std(scaled_data[:, 2]):.2f}")

# Why this matters for ML
print("\n🎯 Why standardization matters:")
print("- All features now have equal importance")
print("- Algorithms work better with similar scales")
print("- Prevents one feature from dominating others")
🔹 Detecting Outliers
Use standard deviation to find unusual data points
import numpy as np
# Dataset with potential outliers
salaries = [45000, 48000, 52000, 47000, 51000, 49000, 150000, 46000]

mean_salary = np.mean(salaries)
std_salary = np.std(salaries)
print(f"Mean salary: ${mean_salary:,.0f}")
print(f"Standard deviation: ${std_salary:,.0f}")
print()

# Find outliers (more than 2 standard deviations from mean)
outliers = []
normal_values = []
for salary in salaries:
    # z-score: how many standard deviations this value lies from the mean
    z_score = abs(salary - mean_salary) / std_salary
    if z_score > 2:  # More than 2 std devs away
        outliers.append(salary)
        print(f"${salary:,} is an outlier (z-score: {z_score:.2f})")
    else:
        normal_values.append(salary)

print(f"\nNormal salaries: {len(normal_values)}")
print(f"Outliers found: {len(outliers)}")

# Clean dataset without outliers: recompute the statistics on the
# remaining values to show how much the outlier distorted them.
if outliers:
    clean_mean = np.mean(normal_values)
    clean_std = np.std(normal_values)
    print("\nCleaned dataset:")
    print(f"Mean: ${clean_mean:,.0f}")
    print(f"Std dev: ${clean_std:,.0f}")