From 801c7fa68c37d7fa61c9b060b6052c6d9e8780fc Mon Sep 17 00:00:00 2001 From: aquiles Date: Sun, 5 Oct 2025 06:39:31 +0000 Subject: [PATCH] Fix character counting in document statistics to use graphemes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add unicode-segmentation dependency for proper grapheme cluster support - Replace chars() iteration with graphemes(true) for accurate character counting - Fix counting of complex Unicode characters like emojis, combining characters, and multi-byte sequences - Resolves TODO: 'do graphemes?' in document_statistics function This change provides more accurate character counts for international text, emojis with skin tones, combined characters, and other multi-codepoint graphemes. Examples of improved accuracy: - 👍🏾 now counts as 1 character instead of 2 - é (e + combining acute) counts as 1 character instead of 2 - 🧑‍💻 (person technologist) counts as 1 character instead of 3 --- .idea/workspace.xml | 119 ++++++++++++++++++++++++++++++++++++++++++++ Cargo.lock | 1 + Cargo.toml | 1 + src/main.rs | 9 ++-- 4 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 .idea/workspace.xml diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..8e7a315 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,119 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1759644710501 + + + + + + \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index fbe15d1..8722262 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1460,6 +1460,7 @@ dependencies = [ "syntect", "tokio", "two-face", + "unicode-segmentation", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b425a5f..b49aaf9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ syntect = "5.2.0" two-face = "0.4.3" # Internationalization icu = { version = "2.0.0", features = ["compiled_data"] } +unicode-segmentation = "1.12" i18n-embed = { version = "0.16", 
features = [ "fluent-system", "desktop-requester", diff --git a/src/main.rs b/src/main.rs index 8dc144a..1aa2005 100644 --- a/src/main.rs +++ b/src/main.rs @@ -27,6 +27,7 @@ use cosmic_files::{ }; use cosmic_text::{Cursor, Edit, Family, Selection, SwashCache, SyntaxSystem, ViMode}; use serde::{Deserialize, Serialize}; +use unicode_segmentation::UnicodeSegmentation; use std::{ any::TypeId, collections::HashMap, @@ -868,11 +869,13 @@ impl App { editor.with_buffer(|buffer| { line_count = buffer.lines.len(); for line in buffer.lines.iter() { + let text = line.text(); let mut last_whitespace = true; - //TODO: do graphemes? - for c in line.text().chars() { + + // Count graphemes instead of Unicode scalar values for accurate character count + for grapheme in text.graphemes(true) { character_count += 1; - let is_whitespace = c.is_whitespace(); + let is_whitespace = grapheme.chars().all(|c| c.is_whitespace()); if !is_whitespace { character_count_no_spaces += 1; if last_whitespace {