Matplotlib Histograms

Visualize data distributions and frequency patterns with histograms

📊 Distribution Analysis

A histogram shows the distribution of numerical data by dividing the value range into intervals (bins) and counting how many values fall into each bin.


import matplotlib.pyplot as plt
import numpy as np

# Generate sample data: 1000 draws from a normal distribution (mean 50, std 15).
# Seed the generator first so the figure is identical on every run, matching
# the other examples on this page.
np.random.seed(42)
data = np.random.normal(50, 15, 1000)

# Split the data range into 30 equal-width bins and draw one bar per bin.
plt.hist(data, bins=30)
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()
                                    
Frequency Distribution
Bin Control
Statistical Analysis

Histogram Features

Key elements for creating effective histograms:

📊

Basic Histogram

Simple frequency distribution

plt.hist()
🗂️

Bin Control

Customize number and size of bins

bins parameter
📈

Multiple Series

Compare distributions

alpha overlay
📏

Normalization

Density and probability plots

density

🔹 Basic Histograms

Create simple histograms to visualize data distribution

import matplotlib.pyplot as plt
import numpy as np

# Reproducible sample: 500 simulated test scores ~ Normal(mean=75, std=12)
np.random.seed(42)
test_scores = np.random.normal(75, 12, 500)

# Basic histogram, drawn through the Axes interface.
# The bar appearance is collected in one place so it is easy to tweak.
bar_style = dict(color='skyblue', alpha=0.7, edgecolor='black')

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(test_scores, bins=20, **bar_style)
ax.set(
    xlabel='Test Scores',
    ylabel='Number of Students',
    title='Distribution of Test Scores',
)
ax.grid(axis='y', alpha=0.3)  # faint horizontal guide lines only
plt.show()

# Same data at three bin resolutions, one panel per setting
bin_counts = [10, 20, 50]
fig, axes = plt.subplots(1, len(bin_counts), figsize=(15, 4))

# Pair each panel with its bin count instead of indexing into `axes`
for panel, n_bins in zip(axes, bin_counts):
    panel.hist(test_scores, bins=n_bins, color='lightcoral', alpha=0.7)
    panel.set_title(f'{n_bins} Bins')
    panel.set_xlabel('Test Scores')
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

🔹 Customizing Bins

Control bin size and ranges for better visualization

import matplotlib.pyplot as plt
import numpy as np

# Reproducible sample: 1000 simulated ages ~ Normal(mean=35, std=10)
np.random.seed(42)
ages = np.random.normal(35, 10, 1000)

# Explicit bin boundaries for the second panel (note: not evenly spaced)
bin_edges = [10, 20, 25, 30, 35, 40, 45, 50, 60]

# One entry per panel: (title, colour, binning arguments passed to hist)
panel_specs = [
    # Method 1: Specify number of bins
    ('15 Bins (Auto Range)', 'green', dict(bins=15)),
    # Method 2: Specify bin edges
    ('Custom Bin Edges', 'orange', dict(bins=bin_edges)),
    # Method 3: Equal-width bins with range
    ('20 Bins (0-70 Range)', 'purple', dict(bins=20, range=(0, 70))),
]

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, (title, fill, binning) in zip(axes, panel_specs):
    ax.hist(ages, color=fill, alpha=0.7, **binning)
    ax.set_title(title)
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Automatic bin selection: pass the name of a binning rule as `bins`
# and compare four rules on a 2x2 grid.
rule_colors = {
    'auto': 'blue',
    'sturges': 'red',
    'fd': 'green',
    'scott': 'orange',
}

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# axes.flat walks the grid row by row, so the rules fill the panels
# left-to-right, top-to-bottom.
for ax, (rule, fill) in zip(axes.flat, rule_colors.items()):
    ax.hist(ages, bins=rule, color=fill, alpha=0.7)
    ax.set_title(f'Method: {rule}')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

🔹 Multiple Histograms

Compare distributions of different datasets

import matplotlib.pyplot as plt
import numpy as np

# Generate sample data for two groups
np.random.seed(42)
group_a = np.random.normal(70, 10, 500)  # Group A scores
group_b = np.random.normal(75, 8, 500)   # Group B scores

# Overlapping histograms.
# Passing bins=20 to each call separately would bin every group over its own
# min-max range, giving the two series different bar widths and positions.
# Compute one set of 20 equal-width bins spanning both groups and reuse it,
# so the overlaid bars line up and their heights are directly comparable.
shared_edges = np.histogram_bin_edges(
    np.concatenate([group_a, group_b]), bins=20
)

plt.figure(figsize=(10, 6))
plt.hist(group_a, bins=shared_edges, alpha=0.7, label='Group A', color='blue')
plt.hist(group_b, bins=shared_edges, alpha=0.7, label='Group B', color='red')

plt.xlabel('Scores')
plt.ylabel('Frequency')
plt.title('Score Distribution Comparison')
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.show()

# Side-by-side histograms.
# For a fair visual comparison both panels use the same bin edges and share
# their x and y axes; otherwise each panel would pick its own bins and scale.
common_edges = np.histogram_bin_edges(
    np.concatenate([group_a, group_b]), bins=20
)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)

ax1.hist(group_a, bins=common_edges, color='blue', alpha=0.7)
ax1.set_title('Group A Distribution')
ax1.set_xlabel('Scores')
ax1.set_ylabel('Frequency')

ax2.hist(group_b, bins=common_edges, color='red', alpha=0.7)
ax2.set_title('Group B Distribution')
ax2.set_xlabel('Scores')
ax2.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Stacked histogram: passing both samples in one call bins them together,
# and stacked=True piles the groups' bars on top of each other.
samples = [group_a, group_b]
names = ['Group A', 'Group B']
palette = ['blue', 'red']

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(samples, bins=20, stacked=True, alpha=0.7, color=palette, label=names)

ax.set_xlabel('Scores')
ax.set_ylabel('Frequency')
ax.set_title('Stacked Distribution Comparison')
ax.legend()
plt.show()

🔹 Normalized Histograms

Create density plots and probability distributions

import matplotlib.pyplot as plt
import numpy as np

# Reproducible sample: 1000 draws ~ Normal(mean=100, std=15)
np.random.seed(42)
data = np.random.normal(100, 15, 1000)

# Compare frequency vs density: same data and bins, only the y-axis
# scaling differs between the two panels.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, title, y-label, fill colour, value for hist's `density` flag)
panel_specs = [
    (ax1, 'Frequency Histogram', 'Frequency', 'skyblue', False),
    (ax2, 'Density Histogram', 'Density', 'lightcoral', True),
]
for ax, title, y_label, fill, as_density in panel_specs:
    ax.hist(data, bins=30, density=as_density,
            color=fill, alpha=0.7, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel('Values')
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

# Overlay with theoretical distribution
from scipy import stats

# Sample mean and standard deviation, computed once and reused for both the
# curve and its legend label.
mu = data.mean()
sigma = data.std()

plt.figure(figsize=(10, 6))

# Density histogram: density=True puts the bars on the same scale as a
# probability density function, so a PDF curve can be drawn over them.
# The return value (counts, edges, patches) is not needed here.
plt.hist(data, bins=30, density=True,
         alpha=0.7, color='lightblue',
         edgecolor='black', label='Data')

# Normal PDF using the sample's own mean and std, evaluated on 100 points
# spanning the observed data range.
x = np.linspace(data.min(), data.max(), 100)
plt.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', linewidth=2,
         label=f'Normal(μ={mu:.1f}, σ={sigma:.1f})')

plt.xlabel('Values')
plt.ylabel('Density')
plt.title('Data Distribution vs Theoretical Normal')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Cumulative histogram: each bar shows the running total up to that bin;
# combined with density=True the last bar reaches 1.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data, bins=30, density=True, cumulative=True,
        color='green', edgecolor='black', alpha=0.7)
ax.set(
    xlabel='Values',
    ylabel='Cumulative Probability',
    title='Cumulative Distribution',
)
ax.grid(alpha=0.3)
plt.show()

🔹 Advanced Histogram Styling

Create professional-looking histograms with custom styling

import matplotlib.pyplot as plt
import numpy as np

# Reproducible sample: 1000 simulated heights in cm ~ Normal(mean=170, std=10)
np.random.seed(42)
heights = np.random.normal(170, 10, 1000)  # Heights in cm

# Summary statistics, used for the reference lines and the text box below
mean_height = heights.mean()
std_height = heights.std()

# Advanced styled histogram
fig, ax = plt.subplots(figsize=(12, 8))

# hist returns (counts, bin edges, bar patches); only the patches are needed
_, _, bars = ax.hist(heights, bins=25, alpha=0.8,
                     edgecolor='black', linewidth=1.2)

# Walk the viridis colormap from the left-most bar to the right-most bar
gradient = plt.cm.viridis(np.linspace(0, 1, len(bars)))
for bar, rgba in zip(bars, gradient):
    bar.set_facecolor(rgba)

# Axis labels and title
label_font = dict(fontsize=14, fontweight='bold')
ax.set_xlabel('Height (cm)', **label_font)
ax.set_ylabel('Number of People', **label_font)
ax.set_title('Distribution of Heights in Population\n(Color-coded by Height Range)',
             fontsize=16, fontweight='bold', pad=20)

# Vertical reference lines at the mean and one standard deviation either side:
# (x position, colour, line style, legend label)
reference_lines = [
    (mean_height, 'red', '--', f'Mean: {mean_height:.1f} cm'),
    (mean_height + std_height, 'orange', ':',
     f'+1 SD: {mean_height + std_height:.1f} cm'),
    (mean_height - std_height, 'orange', ':',
     f'-1 SD: {mean_height - std_height:.1f} cm'),
]
for x_pos, line_color, line_style, caption in reference_lines:
    ax.axvline(x_pos, color=line_color, linestyle=line_style,
               linewidth=2, label=caption)

# Grid and legend (the legend picks up the labelled reference lines)
ax.grid(axis='y', alpha=0.3, linestyle='-', linewidth=0.5)
ax.legend(fontsize=12)

# Statistics box pinned to the top-left corner in axes-relative coordinates
stats_text = '\n'.join([
    f'n = {len(heights)}',
    f'Mean = {mean_height:.1f} cm',
    f'Std = {std_height:.1f} cm',
])
box_style = dict(boxstyle='round', facecolor='wheat', alpha=0.8)
ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
        verticalalignment='top', bbox=box_style)

# Hide the top and right borders of the plot area
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.show()

# Histogram with custom bin colors based on values
fig, ax = plt.subplots(figsize=(10, 6))

# hist returns the per-bin counts, the bin edges and the bar patches;
# the edges are not needed here.
n, _, patches = ax.hist(heights, bins=20, alpha=0.8, edgecolor='black')

# Color bins based on frequency.
# Each count is scaled relative to the tallest bin (0..1) and then mapped to
# 0.3..1.0 on the Blues colormap; the 0.3 offset keeps low-count bars away
# from the near-white end of the map. The colormap is evaluated once for all
# bins and yields one RGBA row per bar.
intensity = n / n.max()
bar_colors = plt.cm.Blues(0.3 + 0.7 * intensity)
for patch, color in zip(patches, bar_colors):
    patch.set_facecolor(color)

ax.set_xlabel('Height (cm)')
ax.set_ylabel('Frequency')
ax.set_title('Height Distribution (Color intensity = Frequency)')
plt.show()

🧠 Test Your Knowledge

Which parameter controls the number of bins in a histogram?

Which parameter creates a density histogram?