Unicode Handling
Java was designed from the ground up for Unicode. The char type holds a UTF-16 code unit, String stores characters in UTF-16 encoding, and the entire I/O system supports character encoding conversion. However, Unicode's growth beyond the original 65,536-character Basic Multilingual Plane means that char is no longer sufficient to represent every Unicode character — supplementary characters require two chars (a surrogate pair). Understanding this distinction, knowing how to correctly process Unicode text, handling character encodings, and working with Unicode-aware string operations are essential for any application that handles international text. This entry covers the Unicode standard essentials, Java's char vs code point model, encoding handling, normalisation, and correct Unicode-aware string operations.
Unicode Fundamentals — Code Points and Planes
// ── Code point vs char ───────────────────────────────────────────────
// A BMP character is one code point and fits in a single char.
char latinA = 'A'; // U+0041
System.out.println((int) 'A'); // 65 = 0x0041
// A supplementary character is still ONE code point, but it occupies
// TWO chars (a surrogate pair) in a Java String.
String emoji = "😀"; // U+1F600 GRINNING FACE
int emojiUnits = emoji.length();
int emojiCodePoints = emoji.codePointCount(0, emojiUnits);
System.out.println(emojiUnits); // 2 — UTF-16 code units
System.out.println(emojiCodePoints); // 1 — code points
// ── The surrogate pair that represents U+1F600 ────────────────────────
int codePoint = 0x1F600; // 128512 decimal
char high = Character.highSurrogate(codePoint); // 0xD83D
char low = Character.lowSurrogate(codePoint); // 0xDE00
System.out.printf("High surrogate: U+%04X%n", (int) high); // High surrogate: U+D83D
System.out.printf("Low surrogate: U+%04X%n", (int) low); // Low surrogate: U+DE00
// Combining the two halves yields the original code point again.
System.out.println(Character.toCodePoint(high, low) == codePoint); // true
// ── String with mixed BMP and supplementary chars ─────────────────────
String mixed = "Hi 🌍"; // 'H', 'i', ' ' (BMP) + Earth emoji (supplementary)
int mixedUnits = mixed.length();
System.out.println(mixedUnits); // 5 — 3 chars + 2 surrogates
System.out.println(mixed.codePointCount(0, mixedUnits)); // 4 — 4 characters
// ── Unicode planes summary ────────────────────────────────────────────
// Plane 0 (BMP) U+0000 – U+FFFF Latin, Greek, CJK, etc.
// Plane 1 (SMP) U+10000 – U+1FFFF Emoji, historic scripts, music
// Plane 2 (SIP) U+20000 – U+2FFFF CJK extensions
// Planes 3-13 (mostly unassigned)
// Plane 14 (SSP) U+E0000 – U+EFFFF Tags
// Planes 15-16 (private use areas)
Code Point API — Processing Unicode Correctly
// ── Code point iteration — correct for all Unicode ───────────────────
String text = "Hello 😀 World 🌍";
// WRONG — walking the string char by char visits each half of a
// surrogate pair separately:
for (char unit : text.toCharArray()) {
    // unit may be a lone high or low surrogate here
    System.out.print(unit + " "); // garbage for emoji positions
}
// CORRECT — walk the string one code point at a time:
text.codePoints()
    .mapToObj(cp -> String.format("U+%04X (%s) ", cp, new String(Character.toChars(cp))))
    .forEach(System.out::print);
// ── codePointCount vs length ──────────────────────────────────────────
String withEmoji = "Hello 😀!";
int units = withEmoji.length();
System.out.println(units); // 9 — UTF-16 code units
System.out.println(withEmoji.codePointCount(0, units)); // 8 — code points
// ── Character class code point methods ───────────────────────────────
// The int overloads of the Character methods accept any code point,
// including supplementary ones that do not fit in a char.
int cpA = 'A';
int cpAlpha = 0x03B1; // α (Greek small letter alpha)
int cpEmoji = 0x1F600; // 😀
int cpCJK = 0x4E2D; // 中 (Chinese character for "middle")
System.out.println(Character.isLetter(cpA)); // true
System.out.println(Character.isLetter(cpAlpha)); // true
System.out.println(Character.isLetter(cpCJK)); // true
System.out.println(Character.isLetter(cpEmoji)); // false
System.out.println(Character.isEmoji(cpEmoji)); // true (Java 21+)
// Case conversion for ALL Unicode:
// Character.toLowerCase(int) returns an int code point, so printing it
// directly would show the number 963. Convert it to a String first.
System.out.println(Character.toString(Character.toLowerCase(0x03A3))); // σ (Σ → σ)
// String-level case conversion handles multi-char cases like 'ß' → "SS":
System.out.println("straße".toUpperCase(Locale.GERMANY)); // STRASSE
// ── Reverse a string correctly ────────────────────────────────────────
// Built-in reversal: StringBuilder.reverse() treats a surrogate pair as a
// single unit, so supplementary characters such as emoji survive intact.
// It still reverses code point by code point, so a combining mark ends up
// in front of its base character instead of after it.
public static String reverseChars(String s) {
    StringBuilder buffer = new StringBuilder(s);
    buffer.reverse();
    return buffer.toString();
}
// Manual reversal via code points: decode the string into code points,
// then append them back-to-front so each surrogate pair stays together.
public static String reverseByCodePoints(String s) {
    int[] decoded = s.codePoints().toArray();
    StringBuilder reversed = new StringBuilder(s.length());
    for (int k = decoded.length - 1; k >= 0; k--) {
        reversed.appendCodePoint(decoded[k]);
    }
    return reversed.toString();
}
System.out.println(reverseByCodePoints("Hello 😀")); // 😀 olleH
Character Encodings — Charset and I/O
// ── StandardCharsets — always use these constants ────────────────────
import java.nio.charset.StandardCharsets;
// Encode and decode with an explicit charset constant.
byte[] utf8Bytes = "Hello 世界".getBytes(StandardCharsets.UTF_8);
byte[] latin1Bytes = "Hello".getBytes(StandardCharsets.ISO_8859_1);
String fromUtf8 = new String(utf8Bytes, StandardCharsets.UTF_8);
System.out.println(fromUtf8); // Hello 世界
// ── Encoding mismatch — common source of garbled text ─────────────────
String original = "Héllo Wörld";
// The bytes themselves are valid UTF-8; the mistake is decoding them
// with a different charset than the one used to encode them.
byte[] wrongBytes = original.getBytes(StandardCharsets.UTF_8);
// ISO-8859-1 maps every byte to one char, so each two-byte UTF-8
// sequence (é = C3 A9, ö = C3 B6) becomes two unrelated characters.
String garbled = new String(wrongBytes, StandardCharsets.ISO_8859_1);
System.out.println(garbled); // HÃ©llo WÃ¶rld — garbled!
// Decoding with the same charset that produced the bytes round-trips.
String correct = new String(wrongBytes, StandardCharsets.UTF_8);
System.out.println(correct); // Héllo Wörld — correct
// ── Always specify encoding for I/O ──────────────────────────────────
// WRONG — no charset is given, so the reader falls back to the default
// charset (before Java 18 this is the platform charset, which varies by
// OS and JVM settings):
BufferedReader badReader = new BufferedReader(
new FileReader("data.txt"));
// CORRECT — explicit UTF-8, independent of the machine's configuration:
BufferedReader goodReader = new BufferedReader(
new InputStreamReader(
new FileInputStream("data.txt"),
StandardCharsets.UTF_8));
// NOTE: neither reader is closed in this snippet; real code should wrap
// them in try-with-resources.
// Or Java 11+ Files API (always specify charset):
String content = Files.readString(Path.of("data.txt"),
StandardCharsets.UTF_8);
Files.writeString(Path.of("out.txt"), content,
StandardCharsets.UTF_8);
// ── Detecting encoding — Charset.forName() with fallback ──────────────
// Pull the charset parameter out of a Content-Type value such as
//   text/html; charset=UTF-8
//   text/html; Charset="utf-8"; boundary=x
// The parameter name is matched case-insensitively, optional quotes are
// dropped and any parameters after the charset are ignored.
// NOTE(review): the null check assumes getContentType() may return null
// when no content type is set; confirm against the response type in use.
String contentType = response.getContentType();
String charsetName = "";
if (contentType != null) {
    java.util.regex.Matcher charsetParam = java.util.regex.Pattern
            .compile(";\\s*charset\\s*=\\s*\"?([^\\s;\"]+)",
                    java.util.regex.Pattern.CASE_INSENSITIVE)
            .matcher(contentType);
    if (charsetParam.find()) {
        charsetName = charsetParam.group(1);
    }
}
// Default to UTF-8 when the header is missing, has no charset parameter,
// or names a charset this JVM rejects.
Charset charset = StandardCharsets.UTF_8;
if (!charsetName.isEmpty()) {
    try {
        charset = Charset.forName(charsetName);
    } catch (IllegalArgumentException ignored) {
        // Illegal or unsupported charset name: keep the UTF-8 default.
    }
}
// ── UTF-8 BOM handling ────────────────────────────────────────────────
// Some UTF-8 files start with a BOM (U+FEFF, bytes EF BB BF).
// Java does not strip the BOM automatically: after decoding it shows up
// as a leading U+FEFF char at the start of the string.
byte[] withBom = Files.readAllBytes(Path.of("with-bom.txt"));
String text = new String(withBom, StandardCharsets.UTF_8);
if (text.startsWith("\uFEFF")) {
    // U+FEFF is a single char, so dropping one code unit removes the BOM.
    text = text.substring(1);
}
Unicode Normalisation
// ── The normalisation problem ─────────────────────────────────────────
import java.text.Normalizer;
// Precomposed spelling: the single code point U+00E9.
String precomposed = "\u00E9"; // é — NFC form
// Decomposed spelling: base letter U+0065 followed by combining acute U+0301.
String decomposed = "\u0065\u0301"; // é — NFD form
// Both render identically...
System.out.println(precomposed); // é
System.out.println(decomposed); // é
// ...but they are different sequences of chars.
System.out.println(precomposed.length()); // 1 — one code unit
System.out.println(decomposed.length()); // 2 — base + combining mark
boolean rawEqual = precomposed.equals(decomposed);
System.out.println(rawEqual); // false — equals() compares code units
// ── Normalise before comparing ────────────────────────────────────────
// Bring both spellings to the same form (NFC here) before comparing.
String nfc1 = Normalizer.normalize(precomposed, Normalizer.Form.NFC);
String nfc2 = Normalizer.normalize(decomposed, Normalizer.Form.NFC);
boolean normalisedEqual = nfc1.equals(nfc2);
System.out.println(normalisedEqual); // true — both now NFC
// ── The four normalisation forms ──────────────────────────────────────
// Input: é (U+00E9) followed by the fi ligature (U+FB01).
String original = "\u00e9\ufb01";
// Lengths printed, in this order:
//   NFC  2: é (precomposed) + ligature preserved
//   NFD  3: e + combining acute + ligature preserved
//   NFKC 3: é (composed) + f + i (ligature decomposed)
//   NFKD 4: e + combining acute + f + i (both decomposed)
Normalizer.Form[] forms = {
    Normalizer.Form.NFC,
    Normalizer.Form.NFD,
    Normalizer.Form.NFKC,
    Normalizer.Form.NFKD
};
for (Normalizer.Form form : forms) {
    String normalised = Normalizer.normalize(original, form);
    System.out.println(normalised.length());
}
// ── Username normalisation for deduplication ──────────────────────────
// Builds a comparison key for a username in three steps:
//   1. NFC: canonical composition, so accented characters have one spelling
//   2. lower-casing with Locale.ROOT, so the result is locale-independent
//   3. strip: removal of leading/trailing whitespace
public static String normaliseUsername(String username) {
    String composed = Normalizer.normalize(username, Normalizer.Form.NFC);
    String lowered = composed.toLowerCase(Locale.ROOT);
    return lowered.strip();
}
System.out.println(
normaliseUsername("Al\u0069\u0301ce").equals(
normaliseUsername("Al\u00EDce"))); // true — same user