HashSet
HashSet is the most commonly used Set implementation in Java. It stores elements in a hash table, providing O(1) average-case performance for add, remove, and contains operations regardless of the set's size. HashSet makes no guarantee about the order in which elements are returned during iteration — the order depends on hash codes and internal bucket structure, which can change as the set is modified. Understanding HashSet means understanding the hash table beneath it, the critical equals/hashCode contract that determines correctness, load factor and resizing behaviour, and when the unordered O(1) semantics are the right trade-off. This entry covers the hash table internals, the equals/hashCode contract and its consequences, capacity and load factor tuning, null element handling, and practical patterns for correct HashSet usage.
Hash Table Internals — How HashSet Works
// ── HashSet construction ─────────────────────────────────────────────
HashSet<String> set1 = new HashSet<>(); // default: capacity=16, LF=0.75
HashSet<String> set2 = new HashSet<>(100); // initial capacity hint
HashSet<String> set3 = new HashSet<>(100, 0.5f); // capacity + load factor
HashSet<String> set4 = new HashSet<>(existingCollection); // copy constructor
// ── Core operations — all O(1) average ───────────────────────────────
Set<String> set = new HashSet<>();
boolean added = set.add("Alice"); // true — new element
boolean dup = set.add("Alice"); // false — already present (no duplicate)
boolean removed = set.remove("Alice"); // true — was present
boolean missing = set.remove("Dave"); // false — was not present
boolean has = set.contains("Bob"); // true/false — O(1) average
// ── Internal structure visualisation ─────────────────────────────────
//
// HashSet<String> with "Alice", "Bob", "Carol", "Dave"
// (simplified — actual bucket count is a power of 2)
//
// Bucket 0: ["Alice"]
// Bucket 1: []
// Bucket 2: ["Bob", "Dave"] ← collision: Bob and Dave have same bucket
// Bucket 3: ["Carol"]
// ...
//
// contains("Bob"):
// hash("Bob") → 2 → scan bucket 2 → Bob.equals("Bob") → true O(1)
//
// contains("Eve"):
// hash("Eve") → 5 → bucket 5 is empty → false O(1)
// ── No ordering guarantee ─────────────────────────────────────────────
Set<Integer> numbers = new HashSet<>();
for (int i = 1; i <= 10; i++) numbers.add(i);
// Iteration order is NOT 1 2 3 4 5 6 7 8 9 10:
for (int n : numbers) System.out.print(n + " ");
// Might print: 1 2 3 4 5 6 7 8 9 10
// Or might print: 3 7 1 8 4 9 2 5 6 10
// Order depends on hashCode and internal bucket structure
// NEVER rely on HashSet iteration order
// ── Null element — HashSet allows exactly one null ────────────────────
Set<String> withNull = new HashSet<>();
withNull.add(null); // allowed — null maps to bucket 0
withNull.add(null); // false — only one null allowed
System.out.println(withNull.contains(null)); // true
System.out.println(withNull.size()); // 1The equals/hashCode Contract — Correctness Foundation
// ── equals/hashCode contract — correct implementation ────────────────
public class Point {
private final int x;
private final int y;
public Point(int x, int y) { this.x = x; this.y = y; }
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Point p)) return false;
return x == p.x && y == p.y;
}
@Override
public int hashCode() {
return Objects.hash(x, y); // combines x and y into one hash code
}
}
// ── Correct behaviour with proper contract ────────────────────────────
Set<Point> points = new HashSet<>();
points.add(new Point(3, 4));
points.add(new Point(3, 4)); // same logical value → not added (duplicate)
System.out.println(points.size()); // 1
System.out.println(points.contains(new Point(3, 4))); // true — correct!
// ── BROKEN: equal objects with different hashCodes ────────────────────
public class BrokenPoint {
int x, y;
public BrokenPoint(int x, int y) { this.x = x; this.y = y; }
@Override
public boolean equals(Object o) {
if (!(o instanceof BrokenPoint p)) return false;
return x == p.x && y == p.y;
}
// MISSING hashCode() — uses Object's identity hash code!
// Two equal BrokenPoints will have DIFFERENT hash codes (different objects)
}
Set<BrokenPoint> broken = new HashSet<>();
broken.add(new BrokenPoint(3, 4));
broken.add(new BrokenPoint(3, 4)); // treated as different — WRONG
System.out.println(broken.size()); // 2 — BUG!
System.out.println(broken.contains(new BrokenPoint(3, 4))); // false — BUG!
// ── Mutable object in HashSet — extremely dangerous ───────────────────
class MutableKey {
int value;
MutableKey(int v) { this.value = v; }
@Override public boolean equals(Object o) {
return o instanceof MutableKey m && value == m.value;
}
@Override public int hashCode() { return value; }
}
Set<MutableKey> mutableSet = new HashSet<>();
MutableKey key = new MutableKey(5);
mutableSet.add(key);
System.out.println(mutableSet.contains(key)); // true
key.value = 10; // MUTATE THE KEY — hash code changes!
System.out.println(mutableSet.contains(key)); // false — SET IS BROKEN
System.out.println(mutableSet.size()); // 1 — it's "in" the set but unfindable
mutableSet.add(key); // re-adds as if new — now size=2!Capacity, Load Factor, and Performance Tuning
// ── Capacity tuning for known sizes ──────────────────────────────────
// WRONG: default capacity causes 6 resize operations for 1000 elements
// 16 → 32 → 64 → 128 → 256 → 512 → 1024
Set<String> slow = new HashSet<>(); // will resize multiple times
// CORRECT: provide capacity hint — ceil(1000 / 0.75) = 1334
// No resizing needed during population
Set<String> fast = new HashSet<>(1334);
// Or use the common formula:
int expectedSize = 1000;
int initialCapacity = (int) Math.ceil(expectedSize / 0.75) + 1;
Set<String> tuned = new HashSet<>(initialCapacity);
// ── Load factor trade-offs ────────────────────────────────────────────
// Low load factor (0.25): fewer collisions, more memory
// Use when: read-heavy set, memory not a constraint
Set<String> fastLookup = new HashSet<>(100, 0.25f);
// High load factor (0.9): more collisions, less memory
// Use when: write-once-read-rarely, memory is tight
Set<String> compact = new HashSet<>(100, 0.9f);
// ── HashSet with poor hashCode — performance collapse ─────────────────
// If all elements return the same hashCode:
class BadHash {
String value;
BadHash(String v) { this.value = v; }
@Override public int hashCode() { return 42; } // TERRIBLE
@Override public boolean equals(Object o) {
return o instanceof BadHash b && value.equals(b.value);
}
}
Set<BadHash> bad = new HashSet<>();
for (int i = 0; i < 1000; i++) bad.add(new BadHash("item" + i));
// All 1000 elements in ONE bucket → Java 8+ converts to tree → O(log n)
// Still far worse than O(1) with good hash codes
// ── Bulk operations ───────────────────────────────────────────────────
Set<String> a = new HashSet<>(Set.of("A", "B", "C", "D"));
Set<String> b = new HashSet<>(Set.of("C", "D", "E", "F"));
// Union: a ∪ b
Set<String> union = new HashSet<>(a);
union.addAll(b); // {A, B, C, D, E, F}
// Intersection: a ∩ b
Set<String> intersection = new HashSet<>(a);
intersection.retainAll(b); // {C, D}
// Difference: a b
Set<String> difference = new HashSet<>(a);
difference.removeAll(b); // {A, B}
// Is a subset of b?
boolean isSubset = b.containsAll(intersection); // trueHashSet in Practice — Patterns and Pitfalls
// ── Deduplication ────────────────────────────────────────────────────
List<String> withDuplicates = List.of("apple", "banana", "apple",
"cherry", "banana", "date");
Set<String> unique = new HashSet<>(withDuplicates);
System.out.println(unique.size()); // 4 — duplicates removed
// Order not guaranteed: might be {date, cherry, apple, banana}
// ── Visited set in graph traversal ───────────────────────────────────
Set<Integer> visited = new HashSet<>();
void bfs(Graph graph, int start) {
Queue<Integer> queue = new ArrayDeque<>();
queue.offer(start);
visited.add(start);
while (!queue.isEmpty()) {
int node = queue.poll();
process(node);
for (int neighbour : graph.neighbours(node)) {
if (visited.add(neighbour)) { // add returns false if already present
queue.offer(neighbour); // only enqueue if newly visited
}
}
}
}
// ── Set operations for data analysis ──────────────────────────────────
Set<Long> activeUsers = getActiveUsersLastWeek();
Set<Long> purchasedUsers = getUsersWhoMadePurchase();
// Users who were active but did not purchase:
Set<Long> nonPurchasers = new HashSet<>(activeUsers);
nonPurchasers.removeAll(purchasedUsers); // active purchased
// Are there users who purchased without being in the active set?
boolean anomaly = !activeUsers.containsAll(purchasedUsers);
// ── Thread-safe HashSet options ────────────────────────────────────────
// Option 1: synchronised wrapper (simple, low concurrency)
Set<String> syncSet =
Collections.synchronizedSet(new HashSet<>());
// All methods synchronised — but iterator still needs external sync:
synchronized (syncSet) {
for (String s : syncSet) { ... } // must hold lock during iteration
}
// Option 2: ConcurrentHashMap.newKeySet() (better concurrency)
Set<String> concurrentSet = ConcurrentHashMap.newKeySet();
concurrentSet.add("value"); // lock-free in many cases
concurrentSet.contains("value"); // lock-free read
// Option 3: CopyOnWriteArraySet (read-heavy, writes are expensive)
Set<String> cowSet = new CopyOnWriteArraySet<>();
// All reads are lock-free; writes copy the entire array
// Suitable only when reads vastly outnumber writes