Fix character counting in document statistics to use graphemes

- Add unicode-segmentation dependency for proper grapheme cluster support
- Replace chars() iteration with graphemes(true) for accurate character counting
- Fix counting of complex Unicode characters like emojis, combining characters, and multi-code-point sequences
- Resolves TODO: 'do graphemes?' in document_statistics function

This change provides more accurate character counts for international text, emojis with skin tones, combined characters, and other multi-codepoint graphemes.

Examples of improved accuracy:
- 👍🏾 now counts as 1 character instead of 2
- é (e + combining acute) counts as 1 character instead of 2
- 🧑‍💻 (person technologist) counts as 1 character instead of 3 (person + zero-width joiner + laptop)
2025-10-05 06:39:31 +00:00 · 2025-10-05 06:39:31 +00:00 · 801c7fa68c
commit 801c7fa68c
parent 0d84055362
4 changed files with 127 additions and 3 deletions
--- a/src/main.rs
+++ b/src/main.rs
@@ -27,6 +27,7 @@ use cosmic_files::{
 };
 use cosmic_text::{Cursor, Edit, Family, Selection, SwashCache, SyntaxSystem, ViMode};
 use serde::{Deserialize, Serialize};
+use unicode_segmentation::UnicodeSegmentation;
 use std::{
    any::TypeId,
    collections::HashMap,
@@ -868,11 +869,13 @@ impl App {
            editor.with_buffer(|buffer| {
                line_count = buffer.lines.len();
                for line in buffer.lines.iter() {
+                    let text = line.text();
                    let mut last_whitespace = true;
-                    //TODO: do graphemes?
-                    for c in line.text().chars() {
+                    
+                    // Count graphemes instead of Unicode scalar values for accurate character count
+                    for grapheme in text.graphemes(true) {
                        character_count += 1;
-                        let is_whitespace = c.is_whitespace();
+                        let is_whitespace = grapheme.chars().all(|c| c.is_whitespace());
                        if !is_whitespace {
                            character_count_no_spaces += 1;
                            if last_whitespace {