Matplotlib Histograms
Visualize data distributions and frequency patterns with histograms
📊 Distribution Analysis
Histograms show the distribution of numerical data by dividing the range of values into bins and counting how many values fall into each bin.
import matplotlib.pyplot as plt
import numpy as np

# Seed the generator so the example draws the same sample on every run
np.random.seed(42)

# Generate sample data: 1000 draws from a normal distribution (mean 50, std 15)
data = np.random.normal(50, 15, 1000)

# Split the sample into 30 equal-width bins and plot the count in each bin
plt.hist(data, bins=30)
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()
Histogram Features
Key elements for creating effective histograms:
Basic Histogram
Simple frequency distribution
Bin Control
Customize number and size of bins
Multiple Series
Compare distributions
Normalization
Density and probability plots
🔹 Basic Histograms
Create simple histograms to visualize data distribution
import matplotlib.pyplot as plt
import numpy as np

# Generate sample data (seeded so the figures are reproducible)
np.random.seed(42)
test_scores = np.random.normal(75, 12, 500)  # Mean=75, Std=12

# Basic histogram
plt.figure(figsize=(10, 6))
plt.hist(test_scores, bins=20, color='skyblue', alpha=0.7, edgecolor='black')
plt.xlabel('Test Scores')
plt.ylabel('Number of Students')
plt.title('Distribution of Test Scores')
plt.grid(axis='y', alpha=0.3)
plt.show()

# Different bin numbers: draw the same data once per bin count, side by side,
# to show how the choice of bin count changes the apparent shape
bin_counts = [10, 20, 50]
fig, axes = plt.subplots(1, len(bin_counts), figsize=(15, 4))
for ax, bin_count in zip(axes, bin_counts):
    ax.hist(test_scores, bins=bin_count, color='lightcoral', alpha=0.7)
    ax.set_title(f'{bin_count} Bins')
    ax.set_xlabel('Test Scores')
    ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()
🔹 Customizing Bins
Control bin size and ranges for better visualization
import matplotlib.pyplot as plt
import numpy as np

# Sample data (seeded so the figures are reproducible)
np.random.seed(42)
ages = np.random.normal(35, 10, 1000)

plt.figure(figsize=(12, 4))

# Method 1: Specify number of bins (equal-width bins spanning the data's min..max)
plt.subplot(1, 3, 1)
plt.hist(ages, bins=15, color='green', alpha=0.7)
plt.title('15 Bins (Auto Range)')
plt.xlabel('Age')
plt.ylabel('Frequency')

# Method 2: Specify bin edges
# The edges below give bins of unequal width (10-20 is twice as wide as 20-25),
# and values below the first edge or above the last edge are not counted.
bin_edges = [10, 20, 25, 30, 35, 40, 45, 50, 60]
plt.subplot(1, 3, 2)
plt.hist(ages, bins=bin_edges, color='orange', alpha=0.7)
plt.title('Custom Bin Edges')
plt.xlabel('Age')
plt.ylabel('Frequency')

# Method 3: Equal-width bins with range
# `range` fixes the span that the 20 bins divide; values outside it are ignored.
plt.subplot(1, 3, 3)
plt.hist(ages, bins=20, range=(0, 70), color='purple', alpha=0.7)
plt.title('20 Bins (0-70 Range)')
plt.xlabel('Age')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Automatic bin selection methods
# Passing a string as `bins` lets NumPy pick the bin edges from the data
# using the named estimator.
methods = ['auto', 'sturges', 'fd', 'scott']
colors = ['blue', 'red', 'green', 'orange']

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
# axes.flat walks the 2x2 grid row by row, one panel per method
for ax, method, color in zip(axes.flat, methods, colors):
    ax.hist(ages, bins=method, color=color, alpha=0.7)
    ax.set_title(f'Method: {method}')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()
🔹 Multiple Histograms
Compare distributions of different datasets
import matplotlib.pyplot as plt
import numpy as np

# Generate sample data for two groups (seeded so the figures are reproducible)
np.random.seed(42)
group_a = np.random.normal(70, 10, 500)  # Group A scores
group_b = np.random.normal(75, 8, 500)   # Group B scores

# Compute one set of 20 bin edges over both groups together. If each call
# received bins=20 separately, every group would be binned over its own
# min..max, producing bars of different width and position that cannot be
# compared directly.
shared_bins = np.histogram_bin_edges(np.concatenate([group_a, group_b]), bins=20)

# Overlapping histograms
plt.figure(figsize=(10, 6))
plt.hist(group_a, bins=shared_bins, alpha=0.7, label='Group A', color='blue')
plt.hist(group_b, bins=shared_bins, alpha=0.7, label='Group B', color='red')
plt.xlabel('Scores')
plt.ylabel('Frequency')
plt.title('Score Distribution Comparison')
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.show()

# Side-by-side histograms
# Shared axes and shared bin edges keep the two panels on the same scale.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)

ax1.hist(group_a, bins=shared_bins, color='blue', alpha=0.7)
ax1.set_title('Group A Distribution')
ax1.set_xlabel('Scores')
ax1.set_ylabel('Frequency')

ax2.hist(group_b, bins=shared_bins, color='red', alpha=0.7)
ax2.set_title('Group B Distribution')
ax2.set_xlabel('Scores')
ax2.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Stacked histogram
# Passing both datasets in one call bins them on common edges and stacks
# Group B's counts on top of Group A's.
plt.figure(figsize=(10, 6))
plt.hist(
    [group_a, group_b],
    bins=20,
    label=['Group A', 'Group B'],
    color=['blue', 'red'],
    alpha=0.7,
    stacked=True,
)
plt.xlabel('Scores')
plt.ylabel('Frequency')
plt.title('Stacked Distribution Comparison')
plt.legend()
plt.show()
🔹 Normalized Histograms
Normalize histograms to show probability density or cumulative probability instead of raw counts
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Sample data (seeded so the figures are reproducible)
np.random.seed(42)
data = np.random.normal(100, 15, 1000)

# Sample statistics, computed once and reused for the fitted curve and its label
mu = data.mean()
sigma = data.std()

# Compare frequency vs density
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Frequency histogram: bar height is the number of values in the bin
ax1.hist(data, bins=30, color='skyblue', alpha=0.7, edgecolor='black')
ax1.set_title('Frequency Histogram')
ax1.set_xlabel('Values')
ax1.set_ylabel('Frequency')

# Density histogram: bar heights are scaled so the total bar area is 1
ax2.hist(data, bins=30, density=True, color='lightcoral', alpha=0.7, edgecolor='black')
ax2.set_title('Density Histogram')
ax2.set_xlabel('Values')
ax2.set_ylabel('Density')

plt.tight_layout()
plt.show()

# Overlay with theoretical distribution
plt.figure(figsize=(10, 6))

# Create density histogram (density=True puts it on the same scale as a PDF)
plt.hist(data, bins=30, density=True,
         alpha=0.7, color='lightblue',
         edgecolor='black', label='Data')

# Overlay the normal PDF that has the sample's own mean and standard deviation
x = np.linspace(data.min(), data.max(), 100)
theoretical = stats.norm.pdf(x, mu, sigma)
plt.plot(x, theoretical, 'r-', linewidth=2,
         label=f'Normal(μ={mu:.1f}, σ={sigma:.1f})')

plt.xlabel('Values')
plt.ylabel('Density')
plt.title('Data Distribution vs Theoretical Normal')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Cumulative histogram: each bar adds up all bins to its left, and with
# density=True the last bar reaches 1
plt.figure(figsize=(10, 6))
plt.hist(data, bins=30, cumulative=True, density=True,
         alpha=0.7, color='green', edgecolor='black')
plt.xlabel('Values')
plt.ylabel('Cumulative Probability')
plt.title('Cumulative Distribution')
plt.grid(alpha=0.3)
plt.show()
🔹 Advanced Histogram Styling
Create professional-looking histograms with custom styling
import matplotlib.pyplot as plt
import numpy as np

# Generate sample data (seeded so the figures are reproducible)
np.random.seed(42)
heights = np.random.normal(170, 10, 1000)  # Heights in cm

# Advanced styled histogram
fig, ax = plt.subplots(figsize=(12, 8))

# Create histogram with custom styling; only the bar patches are needed here
_, _, patches = ax.hist(heights, bins=25,
                        alpha=0.8,
                        edgecolor='black',
                        linewidth=1.2)

# Color bars by position along the height axis: the leftmost bar gets the
# start of the viridis colormap and the rightmost bar gets its end
colors = plt.cm.viridis(np.linspace(0, 1, len(patches)))
for patch, color in zip(patches, colors):
    patch.set_facecolor(color)

# Customize the plot
ax.set_xlabel('Height (cm)', fontsize=14, fontweight='bold')
ax.set_ylabel('Number of People', fontsize=14, fontweight='bold')
ax.set_title('Distribution of Heights in Population\n(Color-coded by Height Range)',
             fontsize=16, fontweight='bold', pad=20)

# Mark the mean and one standard deviation either side with vertical lines
mean_height = heights.mean()
std_height = heights.std()
ax.axvline(mean_height, color='red', linestyle='--', linewidth=2,
           label=f'Mean: {mean_height:.1f} cm')
ax.axvline(mean_height + std_height, color='orange', linestyle=':', linewidth=2,
           label=f'+1 SD: {mean_height + std_height:.1f} cm')
ax.axvline(mean_height - std_height, color='orange', linestyle=':', linewidth=2,
           label=f'-1 SD: {mean_height - std_height:.1f} cm')

# Add grid and legend
ax.grid(axis='y', alpha=0.3, linestyle='-', linewidth=0.5)
ax.legend(fontsize=12)

# Add statistics box in the top-left corner (position given in axes coordinates)
stats_text = f'n = {len(heights)}\nMean = {mean_height:.1f} cm\nStd = {std_height:.1f} cm'
ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
        verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

# Customize spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()
plt.show()

# Histogram with custom bin colors based on values
fig, ax = plt.subplots(figsize=(10, 6))

# Create histogram, keeping the per-bin counts for the coloring step
counts, _, patches = ax.hist(heights, bins=20, alpha=0.8, edgecolor='black')

# Color bins based on frequency: scale each count by the tallest bin, then map
# it into the 0.3-1.0 part of the Blues colormap so that sparse bins stay
# visible instead of fading to white
shades = plt.cm.Blues(0.3 + 0.7 * counts / counts.max())
for patch, shade in zip(patches, shades):
    patch.set_facecolor(shade)

ax.set_xlabel('Height (cm)')
ax.set_ylabel('Frequency')
ax.set_title('Height Distribution (Color intensity = Frequency)')
plt.show()