Hash Tables
Master fast key-value storage with O(1) operations
šļø What is a Hash Table?
A Hash Table (or Hash Map) stores key-value pairs and uses a hash function to compute an index for fast access. Think of it like a dictionary where you can instantly find any word!
# Python dictionary is a hash table
student_grades = {}
# Insert (O(1) average)
student_grades["Alice"] = 95
student_grades["Bob"] = 87
student_grades["Charlie"] = 92
# Access (O(1) average)
print(student_grades["Alice"]) # 95
# Check existence
if "Bob" in student_grades:
print(f"Bob's grade: {student_grades['Bob']}")
# Delete
del student_grades["Charlie"]
print(student_grades) # {'Alice': 95, 'Bob': 87}
O(1)
Average Access
Key-Value
Pairs
Hash Function
Core
Lesson 1: Simple Hash Table
Create a basic hash table from scratch:
Basic Hash Table
class HashTable:
def __init__(self, size=10):
self.size = size
self.table = [[] for _ in range(size)] # List of lists for chaining
def _hash(self, key):
"""Simple hash function"""
return hash(key) % self.size
def put(self, key, value):
"""Insert key-value pair"""
index = self._hash(key)
bucket = self.table[index]
# Check if key already exists
for i, (k, v) in enumerate(bucket):
if k == key:
bucket[i] = (key, value) # Update existing
return
# Add new key-value pair
bucket.append((key, value))
def get(self, key):
"""Get value by key"""
index = self._hash(key)
bucket = self.table[index]
for k, v in bucket:
if k == key:
return v
raise KeyError(f"Key '{key}' not found")
def delete(self, key):
"""Remove key-value pair"""
index = self._hash(key)
bucket = self.table[index]
for i, (k, v) in enumerate(bucket):
if k == key:
del bucket[i]
return
raise KeyError(f"Key '{key}' not found")
def display(self):
"""Show hash table contents"""
for i, bucket in enumerate(self.table):
if bucket:
print(f"Bucket {i}: {bucket}")
# Example usage
ht = HashTable(5)
# Insert data
ht.put("name", "Alice")
ht.put("age", 25)
ht.put("city", "New York")
# Access data
print(ht.get("name")) # Alice
print(ht.get("age")) # 25
# Display structure
ht.display()
# Update value
ht.put("age", 26)
print(ht.get("age")) # 26
Lesson 2: Hash Functions
Understanding different hash functions:
Hash Functions
def simple_hash(key, table_size):
"""Simple hash: sum of ASCII values"""
return sum(ord(c) for c in str(key)) % table_size
def djb2_hash(key, table_size):
"""DJB2 hash algorithm - better distribution"""
hash_value = 5381
for char in str(key):
hash_value = ((hash_value << 5) + hash_value) + ord(char)
return hash_value % table_size
def polynomial_hash(key, table_size):
"""Polynomial rolling hash"""
hash_value = 0
p = 31 # Prime number
for char in str(key):
hash_value = (hash_value * p + ord(char)) % table_size
return hash_value
# Test different hash functions
keys = ["apple", "banana", "cherry", "date", "elderberry"]
table_size = 7
print("Hash Function Comparison:")
print("Key\t\tSimple\tDJB2\tPolynomial")
print("-" * 40)
for key in keys:
simple = simple_hash(key, table_size)
djb2 = djb2_hash(key, table_size)
poly = polynomial_hash(key, table_size)
print(f"{key}\t\t{simple}\t{djb2}\t{poly}")
# Check for collisions
def check_collisions(hash_func, keys, table_size):
"""Count collisions for a hash function"""
indices = [hash_func(key, table_size) for key in keys]
unique_indices = len(set(indices))
collisions = len(keys) - unique_indices
return collisions
print(f"\nCollisions (out of {len(keys)} keys):")
print(f"Simple hash: {check_collisions(simple_hash, keys, table_size)}")
print(f"DJB2 hash: {check_collisions(djb2_hash, keys, table_size)}")
print(f"Polynomial hash: {check_collisions(polynomial_hash, keys, table_size)}")
Lesson 3: Handling Collisions
Two main approaches to handle hash collisions:
Chaining
Store multiple items in same bucket using linked lists
Simple
Dynamic
Open Addressing
Find another empty slot when collision occurs
Memory efficient
Cache friendly
Open Addressing Hash Table
class OpenAddressingHashTable:
def __init__(self, size=10):
self.size = size
self.keys = [None] * size
self.values = [None] * size
self.deleted = [False] * size # Track deleted slots
def _hash(self, key):
return hash(key) % self.size
def _probe(self, key):
"""Linear probing to find slot"""
index = self._hash(key)
while self.keys[index] is not None:
if self.keys[index] == key and not self.deleted[index]:
return index # Found existing key
index = (index + 1) % self.size # Linear probing
return index # Found empty slot
def put(self, key, value):
"""Insert key-value pair"""
index = self._probe(key)
self.keys[index] = key
self.values[index] = value
self.deleted[index] = False
def get(self, key):
"""Get value by key"""
index = self._hash(key)
while self.keys[index] is not None:
if self.keys[index] == key and not self.deleted[index]:
return self.values[index]
index = (index + 1) % self.size
raise KeyError(f"Key '{key}' not found")
def delete(self, key):
"""Mark key as deleted"""
index = self._hash(key)
while self.keys[index] is not None:
if self.keys[index] == key and not self.deleted[index]:
self.deleted[index] = True
return
index = (index + 1) % self.size
raise KeyError(f"Key '{key}' not found")
def display(self):
"""Show hash table state"""
for i in range(self.size):
status = "DELETED" if self.deleted[i] else "ACTIVE"
if self.keys[i] is not None:
print(f"Index {i}: {self.keys[i]} -> {self.values[i]} ({status})")
else:
print(f"Index {i}: EMPTY")
# Example usage
oht = OpenAddressingHashTable(7)
# Insert data
oht.put("apple", 5)
oht.put("banana", 7)
oht.put("cherry", 3)
oht.display()
print()
# This might cause collision and probing
oht.put("grape", 9)
oht.display()
Lesson 4: Real-World Applications
Databases
- Database indexing
- Primary key lookup
- Query optimization
Caching
- Web page caching
- Memory caches
- CDN systems
Programming
- Symbol tables
- Variable storage
- Function lookup
Security
- Password storage
- Digital signatures
- Checksums
Practice Project: Word Counter
Count word frequency using a hash table:
Word Counter
class WordCounter:
def __init__(self):
self.word_count = {}
def add_text(self, text):
"""Add text and count words"""
# Clean and split text
words = text.lower().replace(',', '').replace('.', '').split()
for word in words:
if word in self.word_count:
self.word_count[word] += 1
else:
self.word_count[word] = 1
def get_count(self, word):
"""Get count for specific word"""
return self.word_count.get(word.lower(), 0)
def most_common(self, n=5):
"""Get n most common words"""
# Sort by count (descending)
sorted_words = sorted(self.word_count.items(),
key=lambda x: x[1], reverse=True)
return sorted_words[:n]
def total_words(self):
"""Get total word count"""
return sum(self.word_count.values())
def unique_words(self):
"""Get number of unique words"""
return len(self.word_count)
def display_stats(self):
"""Show word statistics"""
print(f"Total words: {self.total_words()}")
print(f"Unique words: {self.unique_words()}")
print(f"\nTop 5 most common words:")
for word, count in self.most_common():
print(f" '{word}': {count} times")
# Example usage
counter = WordCounter()
# Add some text
text1 = "The quick brown fox jumps over the lazy dog"
text2 = "The dog was lazy but the fox was quick"
text3 = "Quick brown animals are amazing"
counter.add_text(text1)
counter.add_text(text2)
counter.add_text(text3)
# Show statistics
counter.display_stats()
# Check specific words
print(f"\n'the' appears {counter.get_count('the')} times")
print(f"'quick' appears {counter.get_count('quick')} times")
# Advanced: Find words that appear only once
rare_words = [word for word, count in counter.word_count.items() if count == 1]
print(f"\nWords appearing only once: {rare_words}")