Files
Odin/core/unicode/utf8/grapheme.odin
T
2024-06-29 19:07:34 +01:00

412 lines
12 KiB
Odin
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package utf8
import "core:unicode"
// Package-local aliases for the `core:unicode` constant and procedures used
// by the grapheme cluster segmentation in this file, so they can be called
// without the `unicode.` prefix.
ZERO_WIDTH_JOINER :: unicode.ZERO_WIDTH_JOINER
is_control :: unicode.is_control
is_hangul_syllable_leading :: unicode.is_hangul_syllable_leading
is_hangul_syllable_vowel :: unicode.is_hangul_syllable_vowel
is_hangul_syllable_trailing :: unicode.is_hangul_syllable_trailing
is_hangul_syllable_lv :: unicode.is_hangul_syllable_lv
is_hangul_syllable_lvt :: unicode.is_hangul_syllable_lvt
is_indic_conjunct_break_extend :: unicode.is_indic_conjunct_break_extend
is_indic_conjunct_break_linker :: unicode.is_indic_conjunct_break_linker
is_indic_conjunct_break_consonant :: unicode.is_indic_conjunct_break_consonant
is_gcb_extend_class :: unicode.is_gcb_extend_class
is_spacing_mark :: unicode.is_spacing_mark
is_gcb_prepend_class :: unicode.is_gcb_prepend_class
is_emoji_extended_pictographic :: unicode.is_emoji_extended_pictographic
is_regional_indicator :: unicode.is_regional_indicator
normalized_east_asian_width :: unicode.normalized_east_asian_width
// Position data recorded for one grapheme cluster by `decode_grapheme_clusters`
// (only collected when that procedure is called with `track_graphemes` set).
Grapheme :: struct {
// Byte offset, within the decoded string, of the rune that starts the grapheme.
byte_index: int,
// Zero-based index of that starting rune, counted in runes.
rune_index: int,
// Width in monospace cells added to the running total when the grapheme was
// started, i.e. `normalized_east_asian_width` of its first rune.
width: int,
}
/*
Count the graphemes, runes, and monospace cells of a UTF-8 string.

This is a convenience wrapper that forwards to `decode_grapheme_clusters` with
`track_graphemes` set to `false` and discards the per-grapheme array it returns.

Inputs:
- str: The input string.

Returns:
- graphemes: The number of graphemes in the string.
- runes: The number of runes in the string.
- width: The width of the string in number of monospace cells.
*/
@(require_results)
grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
	// The first result is the per-grapheme array, which is not requested here.
	_, cluster_total, rune_total, cell_total := decode_grapheme_clusters(str, false)
	return cluster_total, rune_total, cell_total
}
/*
Decode the individual graphemes in a UTF-8 string.

*Allocates Using Provided Allocator*

Inputs:
- str: The input string.
- track_graphemes: Whether or not to allocate and return `graphemes` with extra data about each grapheme.
- allocator: (default: context.allocator)

Returns:
- graphemes: Extra data about each grapheme. Nothing is appended when `track_graphemes` is `false`.
- grapheme_count: The number of graphemes in the string.
- rune_count: The number of runes in the string.
- width: The width of the string in number of monospace cells.
*/
@(require_results)
decode_grapheme_clusters :: proc(
	str: string,
	track_graphemes := true,
	allocator := context.allocator,
) -> (
	graphemes: [dynamic]Grapheme,
	grapheme_count: int,
	rune_count: int,
	width: int,
) {
	// The following procedure implements text segmentation by breaking on
	// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
	// the Unicode® Standard Annex #29, entitled:
	//
	//     UNICODE TEXT SEGMENTATION
	//
	// Version: Unicode 15.1.0
	// Date:    2023-08-16
	// Revision: 43
	//
	// This procedure is conformant[4] to UAX29-C1-1, otherwise known as the
	// extended, non-legacy ruleset.
	//
	// Please see the references below for more information.
	//
	//
	// NOTE(Feoramund): This procedure has not been highly optimized.
	// A couple opportunities were taken to bypass repeated checking when a
	// rune is outside of certain codepoint ranges, but little else has been
	// done. Standard switches, conditionals, and binary search are used to
	// see if a rune fits into a certain category.
	//
	// I did find that only one prior rune of state was necessary to build an
	// algorithm that successfully passes all 4,835 test cases provided with
	// this implementation from the Unicode organization's website.
	//
	// My initial implementation tracked explicit breaks and counted them once
	// the string iteration had terminated. I've found this current
	// implementation to be far simpler and need no allocations (unless the
	// caller wants position data).
	//
	// Most rules work backwards instead of forwards which has helped keep this
	// simple, despite its length and verbosity.
	//
	//
	// NOTE(review): Besides `last_rune`, a small amount of sequence state is
	// carried between runes: which kind of sequence is in progress, the number
	// of regional indicators seen in the current run, and whether an Indic
	// linker has been seen since the last Indic consonant. The last one is
	// required by GB9c, which cannot be decided from the previous rune alone.
	//
	//
	// The implementation has been left verbose and in the order described by
	// the specification, to enable better readability and future upkeep.
	//
	// Some possible optimizations might include:
	//
	// - saving the type of `last_rune` instead of the exact rune.
	// - reordering rules.
	// - combining tables.
	//
	//
	// [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
	// [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
	// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
	// [4]: https://www.unicode.org/reports/tr29/#Conformance

	// Additionally, this procedure now takes into account Standard Annex #11,
	// in order to estimate how visually wide the string will appear on a
	// monospaced display. This can only ever be a rough guess, as this tends
	// to be an implementation detail relating to which fonts are being used,
	// how codepoints are interpreted and drawn, if codepoint sequences are
	// interpreted correctly, and et cetera.
	//
	// For example, a program may not properly interpret an emoji modifier
	// sequence and print the component glyphs instead of one whole glyph.
	//
	// Only the rune that starts a grapheme contributes to `width`.
	//
	// See here for more information: https://www.unicode.org/reports/tr11/
	//
	// NOTE: There is no explicit mention of what to do with zero-width spaces
	// as far as grapheme cluster segmentation goes, therefore this
	// implementation may count and return graphemes with a `width` of zero.
	//
	// Treat them as any other space.

	Grapheme_Cluster_Sequence :: enum {
		None,
		Indic,
		Emoji,
		Regional,
	}

	// `append` below allocates through the context allocator.
	context.allocator = allocator

	last_rune: rune
	last_rune_breaks_forward: bool

	last_width: int
	last_grapheme_count: int

	bypass_next_rune: bool

	regional_indicator_counter: int

	current_sequence: Grapheme_Cluster_Sequence
	continue_sequence: bool

	// GB9c: set once a Linker has followed the consonant that opened the
	// current Indic sequence; only then may the next consonant be joined.
	indic_linker_seen: bool

	for this_rune, byte_index in str {
		// Runs at the end of every iteration, including those left by `continue`.
		defer {
			// "Break at the start and end of text, unless the text is empty."
			//
			// GB1:  sot ÷ Any
			// GB2:  Any ÷ eot
			if rune_count == 0 && grapheme_count == 0 {
				grapheme_count += 1
			}

			// A new grapheme was started by this rune: account for its width
			// and, if requested, record where it begins.
			if grapheme_count > last_grapheme_count {
				width += normalized_east_asian_width(this_rune)
				if track_graphemes {
					append(&graphemes, Grapheme{
						byte_index,
						rune_count,
						width - last_width,
					})
				}
				last_grapheme_count = grapheme_count
				last_width = width
			}

			last_rune = this_rune
			rune_count += 1

			// Any rule that did not explicitly keep the sequence alive ends it.
			if !continue_sequence {
				current_sequence = .None
				regional_indicator_counter = 0
				indic_linker_seen = false
			}
			continue_sequence = false
		}

		// "Do not break between a CR and LF. Otherwise, break before and after controls."
		//
		// GB3:                 CR × LF
		// GB4:  (Control | CR | LF) ÷
		// GB5:                    ÷ (Control | CR | LF)
		if this_rune == '\n' && last_rune == '\r' {
			// GB3 glues this LF onto the CR, but GB4 still demands a break
			// after the pair, even if an Extend, ZWJ or SpacingMark follows.
			last_rune_breaks_forward = true
			bypass_next_rune = true
			continue
		}

		if is_control(this_rune) {
			grapheme_count += 1
			last_rune_breaks_forward = true
			bypass_next_rune = true
			continue
		}

		// (This check is for rules that work forwards, instead of backwards.)
		if bypass_next_rune {
			if last_rune_breaks_forward {
				grapheme_count += 1
				last_rune_breaks_forward = false
			}

			bypass_next_rune = false

			// The boundary in front of this rune has been decided by the
			// previous rune (GB4 or GB9b), but the rune itself may still
			// open a sequence that later rules need to know about.
			// (Low runes cannot be in any of these classes; see Optimization 1.)
			if this_rune == 0xA9 || this_rune == 0xAE || this_rune > 0x2FF {
				if is_gcb_prepend_class(this_rune) {
					// GB9b: this Prepend also binds to whatever follows it.
					bypass_next_rune = true
				} else if is_indic_conjunct_break_consonant(this_rune) {
					current_sequence = .Indic
					continue_sequence = true
					indic_linker_seen = false
				} else if is_emoji_extended_pictographic(this_rune) {
					current_sequence = .Emoji
					continue_sequence = true
				} else if is_regional_indicator(this_rune) {
					current_sequence = .Regional
					continue_sequence = true
					regional_indicator_counter = 1
				}
			}
			continue
		}

		// (Optimization 1: Prevent low runes from proceeding further.)
		//
		// * 0xA9 and 0xAE are in the Extended_Pictographic range,
		//   which is checked later in GB11.
		if this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF {
			grapheme_count += 1
			continue
		}

		// (Optimization 2: Check if the rune is in the Hangul space before getting specific.)
		if 0x1100 <= this_rune && this_rune <= 0xD7FB {
			// "Do not break Hangul syllable sequences."
			//
			// GB6:        L × (L | V | LV | LVT)
			// GB7:  (LV | V) × (V | T)
			// GB8: (LVT | T) × T
			if is_hangul_syllable_leading(this_rune) ||
			   is_hangul_syllable_lv(this_rune) ||
			   is_hangul_syllable_lvt(this_rune) {
				if !is_hangul_syllable_leading(last_rune) {
					grapheme_count += 1
				}
				continue
			}

			if is_hangul_syllable_vowel(this_rune) {
				if is_hangul_syllable_leading(last_rune) ||
				   is_hangul_syllable_vowel(last_rune) ||
				   is_hangul_syllable_lv(last_rune) {
					continue
				}
				grapheme_count += 1
				continue
			}

			if is_hangul_syllable_trailing(this_rune) {
				if is_hangul_syllable_trailing(last_rune) ||
				   is_hangul_syllable_lvt(last_rune) ||
				   is_hangul_syllable_lv(last_rune) ||
				   is_hangul_syllable_vowel(last_rune) {
					continue
				}
				grapheme_count += 1
				continue
			}
		}

		// "Do not break before extending characters or ZWJ."
		//
		// GB9:                     × (Extend | ZWJ)
		if this_rune == ZERO_WIDTH_JOINER {
			switch current_sequence {
			case .Regional:
				// GB12/GB13 only pair regional indicators that are directly
				// adjacent, so a ZWJ ends the run.
			case .Emoji:
				// GB11 allows exactly one ZWJ after `ExtPict Extend*`.
				continue_sequence = last_rune != ZERO_WIDTH_JOINER
			case .Indic, .None:
				// ZWJ may appear anywhere between Indic consonants (GB9c).
				continue_sequence = true
			}
			continue
		}

		if is_gcb_extend_class(this_rune) {
			// (Support for GB9c.)
			if current_sequence == .Indic {
				if is_indic_conjunct_break_linker(this_rune) {
					indic_linker_seen = true
					continue_sequence = true
				} else if is_indic_conjunct_break_extend(this_rune) {
					continue_sequence = true
				}
				// Any other Extend ends the conjunct sequence.
				continue
			}

			// (Support for GB11.)
			if current_sequence == .Emoji && (
			   is_gcb_extend_class(last_rune) ||
			   is_emoji_extended_pictographic(last_rune) ) {
				continue_sequence = true
			}

			continue
		}

		// _The GB9a and GB9b rules only apply to extended grapheme clusters:_
		// "Do not break before SpacingMarks, or after Prepend characters."
		//
		// GB9a:                     × SpacingMark
		// GB9b:             Prepend ×
		if is_spacing_mark(this_rune) {
			continue
		}

		if is_gcb_prepend_class(this_rune) {
			grapheme_count += 1
			bypass_next_rune = true
			continue
		}

		// _The GB9c rule only applies to extended grapheme clusters:_
		// "Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker."
		//
		// GB9c:  \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]*  ×  \p{InCB=Consonant}
		if is_indic_conjunct_break_consonant(this_rune) {
			// Join only when the run since the previous consonant consisted
			// solely of InCB Extend/Linker runes and contained a Linker.
			if current_sequence != .Indic || !indic_linker_seen {
				grapheme_count += 1
			}
			// Either way, this consonant is the base for what follows.
			current_sequence = .Indic
			continue_sequence = true
			indic_linker_seen = false
			continue
		}

		// NOTE(review): InCB=Linker and InCB=Extend runes are expected to be
		// caught by the Extend/ZWJ checks above; the two branches below are
		// kept as a fallback and, like the original, do not start a grapheme
		// outside of an Indic sequence.
		if is_indic_conjunct_break_linker(this_rune) {
			if current_sequence == .Indic {
				indic_linker_seen = true
				continue_sequence = true
			}
			continue
		}

		if is_indic_conjunct_break_extend(this_rune) {
			if current_sequence == .Indic {
				continue_sequence = true
			}
			continue
		}

		//
		// (Curiously, there is no GB10.)
		//

		// "Do not break within emoji modifier sequences or emoji zwj sequences."
		//
		// GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
		if is_emoji_extended_pictographic(this_rune) {
			if current_sequence != .Emoji || last_rune != ZERO_WIDTH_JOINER {
				grapheme_count += 1
			}
			current_sequence = .Emoji
			continue_sequence = true
			continue
		}

		// "Do not break within emoji flag sequences.
		// That is, do not break between regional indicator (RI) symbols
		// if there is an odd number of RI characters before the break point."
		//
		// GB12:   sot (RI RI)* RI × RI
		// GB13: [^RI] (RI RI)* RI × RI
		if is_regional_indicator(this_rune) {
			if regional_indicator_counter & 1 == 0 {
				grapheme_count += 1
			}
			current_sequence = .Regional
			continue_sequence = true
			regional_indicator_counter += 1
			continue
		}

		// "Otherwise, break everywhere."
		//
		// GB999:                Any ÷ Any
		grapheme_count += 1
	}

	return
}