Python ML Data Distribution

Understand how data is spread and distributed

📊 Understanding Data Distribution

Data distribution shows how values are spread across a dataset. Understanding distribution helps choose the right ML algorithms and preprocessing steps.


import numpy as np
import matplotlib.pyplot as plt

# Draw 1,000 samples from a normal distribution (mean 50, std 15)
data = np.random.normal(loc=50, scale=15, size=1000)

# A 30-bin histogram makes the spread of the samples visible
plt.hist(data, bins=30)
plt.title("Data Distribution")
plt.show()
                                    
Shape
Shows
Pattern
Reveals
Insights
Provides
Types of Distributions

Types of Data Distributions

Different datasets follow different distribution patterns.

🔔

Normal Distribution

Bell-shaped, symmetric curve

Bell Curve Symmetric
📈

Skewed Distribution

Asymmetric, tail on one side

Left Skew Right Skew
📊

Uniform Distribution

All values equally likely

Flat Equal

🔹 Visualizing Distributions

Create histograms to see data distribution patterns

import numpy as np
import matplotlib.pyplot as plt

# Create different types of data. The generation order is kept fixed so a
# seeded run always draws the same samples for each dataset.
normal_data = np.random.normal(50, 10, 1000)      # Normal distribution
skewed_data = np.random.exponential(2, 1000)      # Right-skewed
uniform_data = np.random.uniform(0, 100, 1000)    # Uniform distribution

# One entry per panel: (short label, plot title, samples, bar colour).
# Driving both the plots and the statistics from this table keeps them in sync.
distributions = [
    ("Normal", "Normal Distribution", normal_data, "blue"),
    ("Skewed", "Right-Skewed Distribution", skewed_data, "red"),
    ("Uniform", "Uniform Distribution", uniform_data, "green"),
]

# Create one subplot per distribution, side by side
fig, axes = plt.subplots(1, len(distributions), figsize=(15, 5))

# Draw a 30-bin histogram of each dataset on its own axes
for ax, (_, title, values, color) in zip(axes, distributions):
    ax.hist(values, bins=30, alpha=0.7, color=color)
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Print basic statistics
print("📊 Distribution Statistics:")
for label, _, values, _ in distributions:
    print(f"{label} - Mean: {np.mean(values):.2f}, Std: {np.std(values):.2f}")

🔹 Real-World Example: Student Heights

Analyze the distribution of student heights

import numpy as np
import matplotlib.pyplot as plt

# Simulate student heights (in cm) - normally distributed
np.random.seed(42)  # For reproducible results
heights = np.random.normal(170, 10, 500)  # Mean=170cm, Std=10cm

# Basic statistics
mean_height = np.mean(heights)
std_height = np.std(heights)
median_height = np.median(heights)

print("👥 Student Height Analysis")
print("=" * 30)
print(f"Number of students: {len(heights)}")
print(f"Mean height: {mean_height:.1f} cm")
print(f"Median height: {median_height:.1f} cm")
print(f"Standard deviation: {std_height:.1f} cm")
print(f"Min height: {np.min(heights):.1f} cm")
print(f"Max height: {np.max(heights):.1f} cm")

# Create histogram with the mean and median marked as vertical lines
plt.figure(figsize=(10, 6))
plt.hist(heights, bins=25, alpha=0.7, color='skyblue', edgecolor='black')
plt.axvline(mean_height, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_height:.1f}cm')
plt.axvline(median_height, color='green', linestyle='--', linewidth=2, label=f'Median: {median_height:.1f}cm')

plt.title('Distribution of Student Heights')
plt.xlabel('Height (cm)')
plt.ylabel('Number of Students')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Analyze distribution shape: a mean within 1 cm of the median is treated
# as a sign of symmetry.
print("\n📈 Distribution Analysis:")
if abs(mean_height - median_height) < 1:
    print("✅ Distribution appears symmetric (mean ≈ median)")
else:
    print("⚠️ Distribution may be skewed (mean ≠ median)")

# Count students more than one standard deviation away from the mean.
# Vectorised comparisons avoid a Python-level loop over the NumPy array.
lower_bound = mean_height - std_height
upper_bound = mean_height + std_height
short_students = int(np.count_nonzero(heights < lower_bound))
tall_students = int(np.count_nonzero(heights > upper_bound))

print(f"Students shorter than {lower_bound:.1f}cm: {short_students}")
print(f"Students taller than {upper_bound:.1f}cm: {tall_students}")

🔹 Checking Distribution Shape

Use statistical measures to understand distribution characteristics

import numpy as np
from scipy import stats

# Build three sample sets of 1,000 values, each with a different shape
normal_data = np.random.normal(loc=50, scale=10, size=1000)    # symmetric bell curve
right_skewed = np.random.exponential(scale=2, size=1000)       # long tail towards larger values
left_skewed = 10 - np.random.exponential(scale=2, size=1000)   # mirrored: long tail towards smaller values

def analyze_distribution(data, name):
    """Print a short shape report for ``data`` and return its statistics.

    Args:
        data: Numeric samples accepted by the NumPy/SciPy reductions used
            below (for example a 1-D array or a list of numbers).
        name: Label inserted into the header line of the printed report.

    Returns:
        dict with the keys ``'mean'``, ``'median'``, ``'std'``,
        ``'skewness'`` and ``'kurtosis'``.
    """
    mean = np.mean(data)
    median = np.median(data)
    std = np.std(data)

    # Skewness measures asymmetry: 0 for a symmetric sample, positive when
    # the longer tail is on the right, negative when it is on the left.
    skewness = stats.skew(data)

    # Kurtosis measures tail heaviness. SciPy's default (fisher=True)
    # reports excess kurtosis, so a normal distribution scores about 0.
    kurt = stats.kurtosis(data)

    print(f"\n📊 {name} Distribution Analysis:")
    print(f"Mean: {mean:.2f}")
    print(f"Median: {median:.2f}")
    print(f"Standard Deviation: {std:.2f}")
    print(f"Skewness: {skewness:.2f}")
    print(f"Kurtosis: {kurt:.2f}")

    # Interpret skewness. Once |skewness| >= 0.5 only the sign decides the
    # direction, so a value of exactly 0.5 is right-skewed rather than
    # falling through to the left-skewed branch. NaN compares False with
    # everything and is reported explicitly.
    if np.isnan(skewness):
        skew_interpretation = "Undefined (skewness is NaN)"
    elif abs(skewness) < 0.5:
        skew_interpretation = "Approximately symmetric"
    elif skewness > 0:
        skew_interpretation = "Right-skewed (positive skew)"
    else:
        skew_interpretation = "Left-skewed (negative skew)"

    print(f"Shape: {skew_interpretation}")

    return {
        'mean': mean, 'median': median, 'std': std,
        'skewness': skewness, 'kurtosis': kurt
    }

# Analyze each dataset. The dict literal is evaluated top to bottom, so the
# reports are printed in the order Normal, Right-Skewed, Left-Skewed.
results = {
    'normal': analyze_distribution(normal_data, "Normal"),
    'right_skewed': analyze_distribution(right_skewed, "Right-Skewed"),
    'left_skewed': analyze_distribution(left_skewed, "Left-Skewed"),
}

# Summary insights: compare mean and median for each distribution shape
print("\n🎯 Key Insights:")
print(f"• Normal data: Mean ≈ Median ({results['normal']['mean']:.1f} ≈ {results['normal']['median']:.1f})")
print(f"• Right-skewed: Mean > Median ({results['right_skewed']['mean']:.1f} > {results['right_skewed']['median']:.1f})")
print(f"• Left-skewed: Mean < Median ({results['left_skewed']['mean']:.1f} < {results['left_skewed']['median']:.1f})")

🔹 Distribution in Machine Learning

How data distribution affects ML model performance

import numpy as np
from scipy import stats  # provides stats.skew, used throughout this snippet
from sklearn.preprocessing import StandardScaler, PowerTransformer

# Simulate salary data (typically right-skewed)
np.random.seed(42)
salaries = np.random.lognormal(10.5, 0.5, 1000)  # Log-normal distribution

print("💰 Salary Data Analysis")
print("=" * 25)
print(f"Mean salary: ${np.mean(salaries):,.0f}")
print(f"Median salary: ${np.median(salaries):,.0f}")
print(f"Standard deviation: ${np.std(salaries):,.0f}")

# Check if data is skewed
skewness = stats.skew(salaries)
print(f"Skewness: {skewness:.2f}")

if skewness > 1:
    print("⚠️ Data is highly right-skewed!")
    print("This can cause problems for ML algorithms")

# Solution 1: Log transformation (valid here because log-normal samples
# are strictly positive)
log_salaries = np.log(salaries)
log_skewness = stats.skew(log_salaries)

print("\n🔧 After log transformation:")
print(f"Skewness: {log_skewness:.2f}")

if abs(log_skewness) < 0.5:
    print("✅ Data is now approximately normal!")

# Solution 2: Power transformation (Yeo-Johnson)
# scikit-learn transformers expect a 2-D (n_samples, n_features) array, so
# the 1-D salaries are reshaped to a single column and flattened afterwards.
pt = PowerTransformer(method='yeo-johnson')
transformed_salaries = pt.fit_transform(salaries.reshape(-1, 1)).flatten()
transformed_skewness = stats.skew(transformed_salaries)

print("\n🔧 After power transformation:")
print(f"Skewness: {transformed_skewness:.2f}")

# Why this matters for ML
print("\n🤖 Impact on Machine Learning:")
print("• Many algorithms assume normal distribution")
print("• Skewed data can lead to biased predictions")
print("• Transformations help algorithms perform better")
print("• Linear regression, SVM, and neural networks benefit from normalized data")

# Practical example: Feature scaling
# The same scaler object is re-fitted on each call to fit_transform, so the
# two results are scaled independently of each other.
print("\n📏 Feature Scaling Example:")
scaler = StandardScaler()
scaled_original = scaler.fit_transform(salaries.reshape(-1, 1))
scaled_transformed = scaler.fit_transform(transformed_salaries.reshape(-1, 1))

# NOTE: StandardScaler rescales to unit variance by construction, so both
# values print as ~1.000 regardless of how skewed the input was.
print(f"Original data std after scaling: {np.std(scaled_original):.3f}")
print(f"Transformed data std after scaling: {np.std(scaled_transformed):.3f}")
print("Both should be close to 1.0 for optimal ML performance")

🧠 Test Your Knowledge

In a right-skewed distribution, which is typically larger?

What does a histogram show?