Python ML Data Distribution
Understand how data is spread and distributed
Understanding Data Distribution
Data distribution shows how values are spread across a dataset. Understanding distribution helps choose the right ML algorithms and preprocessing steps.
import numpy as np
import matplotlib.pyplot as plt
# Draw 1,000 values from a normal distribution (mean 50, std 15)
# and show how they are spread using a 30-bin histogram.
data = np.random.normal(loc=50, scale=15, size=1000)
fig, ax = plt.subplots()
ax.hist(data, bins=30)
ax.set_title("Data Distribution")
plt.show()
Types of Data Distributions
Different datasets follow different distribution patterns.
Normal Distribution
Bell-shaped, symmetric curve
Skewed Distribution
Asymmetric, tail on one side
Uniform Distribution
All values equally likely
🔹 Visualizing Distributions
Create histograms to see data distribution patterns
import numpy as np
import matplotlib.pyplot as plt
# Build three 1,000-point samples, one per distribution shape.
# The draw order (normal, exponential, uniform) is kept as-is.
normal_data = np.random.normal(loc=50, scale=10, size=1000)  # bell-shaped, symmetric
skewed_data = np.random.exponential(scale=2, size=1000)  # long tail on the right
uniform_data = np.random.uniform(low=0, high=100, size=1000)  # flat between 0 and 100
# One row of three histograms, one panel per sample
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (sample, bar colour, panel title), listed left to right
panels = [
    (normal_data, 'blue', 'Normal Distribution'),
    (skewed_data, 'red', 'Right-Skewed Distribution'),
    (uniform_data, 'green', 'Uniform Distribution'),
]
for ax, (sample, colour, title) in zip(axes, panels):
    ax.hist(sample, bins=30, alpha=0.7, color=colour)
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# Keep titles and axis labels from overlapping, then display the figure
plt.tight_layout()
plt.show()
# Print the mean and standard deviation of each sample.
# The header used to start with a mis-decoded emoji whose original could not
# be recovered, so the header is now plain text.
print("Distribution Statistics:")
for label, sample in (
    ("Normal", normal_data),
    ("Skewed", skewed_data),
    ("Uniform", uniform_data),
):
    print(f"{label} - Mean: {np.mean(sample):.2f}, Std: {np.std(sample):.2f}")
🔹 Real-World Example: Student Heights
Analyze the distribution of student heights
import numpy as np
import matplotlib.pyplot as plt
# Simulate student heights (in cm) - normally distributed
np.random.seed(42)  # fixed seed so every run produces the same sample
heights = np.random.normal(170, 10, 500)  # mean = 170 cm, std = 10 cm

# Summary statistics, reused by the report below
mean_height = np.mean(heights)
std_height = np.std(heights)  # NumPy's default is the population std (ddof=0)
median_height = np.median(heights)

# The header emoji had been mis-decoded into "š„"; restored to the intended 👥.
print("👥 Student Height Analysis")
print("=" * 30)
print(f"Number of students: {len(heights)}")
print(f"Mean height: {mean_height:.1f} cm")
print(f"Median height: {median_height:.1f} cm")
print(f"Standard deviation: {std_height:.1f} cm")
print(f"Min height: {np.min(heights):.1f} cm")
print(f"Max height: {np.max(heights):.1f} cm")
# Histogram of the heights with the mean and median marked as dashed lines
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(heights, bins=25, alpha=0.7, color='skyblue', edgecolor='black')

# (value, line colour, legend prefix) for each vertical marker
markers = (
    (mean_height, 'red', 'Mean'),
    (median_height, 'green', 'Median'),
)
for value, colour, prefix in markers:
    ax.axvline(value, color=colour, linestyle='--', linewidth=2, label=f'{prefix}: {value:.1f}cm')

ax.set_title('Distribution of Student Heights')
ax.set_xlabel('Height (cm)')
ax.set_ylabel('Number of Students')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()
# Analyze distribution shape.
# The strings in this section had been corrupted by a bad encoding round-trip:
# the check-mark message was split across two lines (a syntax error) and the
# symbols were garbled. They are restored to the intended characters; the
# header's emoji could not be recovered and is dropped.
print("\nDistribution Analysis:")
# For a symmetric distribution the mean and median nearly coincide;
# the tolerance used here is 1 cm.
if abs(mean_height - median_height) < 1:
    print("✅ Distribution appears symmetric (mean ≈ median)")
else:
    print("⚠️ Distribution may be skewed (mean ≠ median)")

# Count students more than one standard deviation below / above the mean.
# Vectorised comparison instead of looping over the array element by element.
lower_bound = mean_height - std_height
upper_bound = mean_height + std_height
short_students = int(np.count_nonzero(heights < lower_bound))
tall_students = int(np.count_nonzero(heights > upper_bound))
print(f"Students shorter than {lower_bound:.1f}cm: {short_students}")
print(f"Students taller than {upper_bound:.1f}cm: {tall_students}")
🔹 Checking Distribution Shape
Use statistical measures to understand distribution characteristics
import numpy as np
from scipy import stats
# Three 1,000-point samples with different shapes.
# The draw order (normal, then the two exponentials) is kept as-is.
normal_data = np.random.normal(loc=50, scale=10, size=1000)
right_skewed = np.random.exponential(scale=2, size=1000)
# Mirroring an exponential sample around 10 puts the long tail on the left
left_skewed = 10 - np.random.exponential(scale=2, size=1000)
def analyze_distribution(data, name):
    """Print and return summary statistics describing a distribution's shape.

    Args:
        data: Numeric samples (expected to be a one-dimensional array-like).
        name: Label used in the printed report header.

    Returns:
        dict with the keys 'mean', 'median', 'std', 'skewness' and 'kurtosis'.
    """
    mean = np.mean(data)
    median = np.median(data)
    std = np.std(data)
    # Skewness measures asymmetry: positive means a longer right tail,
    # negative a longer left tail.
    skewness = stats.skew(data)
    # Kurtosis measures tail heaviness. SciPy's default is the Fisher
    # definition (excess kurtosis), which is 0 for a normal distribution.
    kurt = stats.kurtosis(data)

    # The header used to start with a mis-decoded emoji that could not be
    # recovered, so it is plain text now.
    print(f"\n{name} Distribution Analysis:")
    print(f"Mean: {mean:.2f}")
    print(f"Median: {median:.2f}")
    print(f"Standard Deviation: {std:.2f}")
    print(f"Skewness: {skewness:.2f}")
    print(f"Kurtosis: {kurt:.2f}")

    # Interpret skewness: |skewness| below 0.5 counts as symmetric; otherwise
    # the sign gives the direction. Testing the sign (not "> 0.5") keeps a
    # skewness of exactly 0.5 from falling through to the left-skewed branch.
    if abs(skewness) < 0.5:
        skew_interpretation = "Approximately symmetric"
    elif skewness > 0:
        skew_interpretation = "Right-skewed (positive skew)"
    else:
        skew_interpretation = "Left-skewed (negative skew)"
    print(f"Shape: {skew_interpretation}")

    return {
        'mean': mean, 'median': median, 'std': std,
        'skewness': skewness, 'kurtosis': kurt
    }
# Analyze each dataset and keep the returned statistics under a short key.
# The entries are evaluated top to bottom, so the reports print in this order.
results = {
    'normal': analyze_distribution(normal_data, "Normal"),
    'right_skewed': analyze_distribution(right_skewed, "Right-Skewed"),
    'left_skewed': analyze_distribution(left_skewed, "Left-Skewed"),
}

# Summary insights: compare mean and median for each shape.
# The emoji, bullet and "approximately equal" characters in these strings had
# been garbled by a bad encoding round-trip and are restored here.
normal_stats = results['normal']
right_stats = results['right_skewed']
left_stats = results['left_skewed']
print("\n🎯 Key Insights:")
print(f"• Normal data: Mean ≈ Median ({normal_stats['mean']:.1f} ≈ {normal_stats['median']:.1f})")
print(f"• Right-skewed: Mean > Median ({right_stats['mean']:.1f} > {right_stats['median']:.1f})")
print(f"• Left-skewed: Mean < Median ({left_stats['mean']:.1f} < {left_stats['median']:.1f})")
🔹 Distribution in Machine Learning
How data distribution affects ML model performance
import numpy as np
from sklearn.preprocessing import StandardScaler, PowerTransformer
# This snippet calls stats.skew, but its own import list only brings in NumPy
# and scikit-learn, so SciPy's stats module is imported here.
from scipy import stats

# Simulate salary data (typically right-skewed)
np.random.seed(42)  # fixed seed so every run produces the same sample
salaries = np.random.lognormal(10.5, 0.5, 1000)  # log-normal distribution

# The emoji / symbol characters in the strings below had been garbled by a bad
# encoding round-trip (one message was even split across two lines, which is a
# syntax error); they are restored to the intended characters.
print("💰 Salary Data Analysis")
print("=" * 25)
print(f"Mean salary: ${np.mean(salaries):,.0f}")
print(f"Median salary: ${np.median(salaries):,.0f}")
print(f"Standard deviation: ${np.std(salaries):,.0f}")

# Check if data is skewed
skewness = stats.skew(salaries)
print(f"Skewness: {skewness:.2f}")
if skewness > 1:
    print("⚠️ Data is highly right-skewed!")
    print("This can cause problems for ML algorithms")

# Solution 1: Log transformation.
# The log of log-normal data is normally distributed, which removes the skew.
log_salaries = np.log(salaries)
log_skewness = stats.skew(log_salaries)
print("\n🔧 After log transformation:")
print(f"Skewness: {log_skewness:.2f}")
if abs(log_skewness) < 0.5:
    print("✅ Data is now approximately normal!")
# Solution 2: Power transformation (Yeo-Johnson)
pt = PowerTransformer(method='yeo-johnson')
# The data is reshaped to a single-column 2-D array for the transformer and
# flattened back to 1-D afterwards.
transformed_salaries = pt.fit_transform(salaries.reshape(-1, 1)).flatten()
transformed_skewness = stats.skew(transformed_salaries)
# The header emoji had been mis-decoded into "š§"; restored to the intended 🔧.
print("\n🔧 After power transformation:")
print(f"Skewness: {transformed_skewness:.2f}")
# Why this matters for ML.
# The emoji and bullet characters in these strings had been garbled by a bad
# encoding round-trip; they are restored to the intended 🤖 and • characters.
print("\n🤖 Impact on Machine Learning:")
for point in (
    "Many algorithms assume normal distribution",
    "Skewed data can lead to biased predictions",
    "Transformations help algorithms perform better",
    "Linear regression, SVM, and neural networks benefit from normalized data",
):
    print(f"• {point}")
# Practical example: Feature scaling.
# The header used to start with a mis-decoded emoji whose original could not
# be recovered, so the header is now plain text.
print("\nFeature Scaling Example:")
scaler = StandardScaler()
# Each series is reshaped to a single-column 2-D array before scaling.
scaled_original = scaler.fit_transform(salaries.reshape(-1, 1))
# fit_transform refits the same scaler, this time on the transformed series.
scaled_transformed = scaler.fit_transform(transformed_salaries.reshape(-1, 1))
print(f"Original data std after scaling: {np.std(scaled_original):.3f}")
print(f"Transformed data std after scaling: {np.std(scaled_transformed):.3f}")
print("Both should be close to 1.0 for optimal ML performance")