Fix character counting in document statistics to use graphemes

- Add unicode-segmentation dependency for proper grapheme cluster support
- Replace chars() iteration with graphemes(true) for accurate character counting
- Fix counting of complex Unicode characters like emojis, combining characters, and other multi-codepoint sequences
- Resolves TODO: 'do graphemes?' in document_statistics function

This change provides more accurate character counts for international text,
emojis with skin tones, combined characters, and other multi-codepoint graphemes.

Examples of improved accuracy:
- 👍🏾 now counts as 1 character instead of 2
- é (e + combining acute) counts as 1 character instead of 2
- 🧑‍💻 (person technologist: person + zero-width joiner + laptop) counts as 1 character instead of 3
This commit is contained in:
aquiles 2025-10-05 06:39:31 +00:00 committed by Jeremy Soller
parent 0d84055362
commit 801c7fa68c
4 changed files with 127 additions and 3 deletions

View file

@@ -27,6 +27,7 @@ use cosmic_files::{
};
use cosmic_text::{Cursor, Edit, Family, Selection, SwashCache, SyntaxSystem, ViMode};
use serde::{Deserialize, Serialize};
use unicode_segmentation::UnicodeSegmentation;
use std::{
any::TypeId,
collections::HashMap,
@@ -868,11 +869,13 @@ impl App {
editor.with_buffer(|buffer| {
line_count = buffer.lines.len();
for line in buffer.lines.iter() {
let text = line.text();
let mut last_whitespace = true;
//TODO: do graphemes?
for c in line.text().chars() {
// Count graphemes instead of Unicode scalar values for accurate character count
for grapheme in text.graphemes(true) {
character_count += 1;
let is_whitespace = c.is_whitespace();
let is_whitespace = grapheme.chars().all(|c| c.is_whitespace());
if !is_whitespace {
character_count_no_spaces += 1;
if last_whitespace {