Fix character counting in document statistics to use graphemes
- Add unicode-segmentation dependency for proper grapheme cluster support
- Replace chars() iteration with graphemes(true) for accurate character counting
- Fix counting of complex Unicode characters like emojis, combining characters, and multi-codepoint sequences
- Resolve TODO: 'do graphemes?' in document_statistics function

This change provides more accurate character counts for international text, emojis with skin tones, combined characters, and other multi-codepoint graphemes.

Examples of improved accuracy:
- 👍🏾 now counts as 1 character instead of 2
- é (e + combining acute) counts as 1 character instead of 2
- 🧑‍💻 (person technologist) counts as 1 character instead of 3
This commit is contained in:
parent
0d84055362
commit
801c7fa68c
4 changed files with 127 additions and 3 deletions
|
|
@@ -24,6 +24,7 @@ syntect = "5.2.0"
|
|||
two-face = "0.4.3"
|
||||
# Internationalization
|
||||
icu = { version = "2.0.0", features = ["compiled_data"] }
|
||||
unicode-segmentation = "1.12"
|
||||
i18n-embed = { version = "0.16", features = [
|
||||
"fluent-system",
|
||||
"desktop-requester",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue