Fix character counting in document statistics to use graphemes
- Add unicode-segmentation dependency for proper grapheme cluster support
- Replace chars() iteration with graphemes(true) for accurate character counting
- Fix counting of complex Unicode characters like emojis, combining characters, and other multi-codepoint sequences
- Resolves TODO: 'do graphemes?' in document_statistics function

This change provides more accurate character counts for international text, emojis with skin tones, combined characters, and other multi-codepoint graphemes.

Examples of improved accuracy:
- 👍🏾 now counts as 1 character instead of 2
- é (e + combining acute) counts as 1 character instead of 2
- 🧑‍💻 (person technologist) counts as 1 character instead of 3
This commit is contained in:
parent
0d84055362
commit
801c7fa68c
4 changed files with 127 additions and 3 deletions
|
|
@ -27,6 +27,7 @@ use cosmic_files::{
|
|||
};
|
||||
use cosmic_text::{Cursor, Edit, Family, Selection, SwashCache, SyntaxSystem, ViMode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
use std::{
|
||||
any::TypeId,
|
||||
collections::HashMap,
|
||||
|
|
@ -868,11 +869,13 @@ impl App {
|
|||
editor.with_buffer(|buffer| {
|
||||
line_count = buffer.lines.len();
|
||||
for line in buffer.lines.iter() {
|
||||
let text = line.text();
|
||||
let mut last_whitespace = true;
|
||||
//TODO: do graphemes?
|
||||
for c in line.text().chars() {
|
||||
|
||||
// Count graphemes instead of Unicode scalar values for accurate character count
|
||||
for grapheme in text.graphemes(true) {
|
||||
character_count += 1;
|
||||
let is_whitespace = c.is_whitespace();
|
||||
let is_whitespace = grapheme.chars().all(|c| c.is_whitespace());
|
||||
if !is_whitespace {
|
||||
character_count_no_spaces += 1;
|
||||
if last_whitespace {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue