Fix character counting in document statistics to use graphemes

- Add unicode-segmentation dependency for proper grapheme cluster support
- Replace chars() iteration with graphemes(true) for accurate character counting
- Fix counting of complex Unicode characters like emojis, combining characters, and other multi-codepoint sequences
- Resolves TODO: 'do graphemes?' in document_statistics function

This change provides more accurate character counts for international text,
emojis with skin tones, combined characters, and other multi-codepoint graphemes.

Examples of improved accuracy:
- 👍🏾 now counts as 1 character instead of 2
- é (e + combining acute) counts as 1 character instead of 2
- 🧑‍💻 (person technologist: 🧑 + zero-width joiner + 💻) counts as 1 character instead of 3
This commit is contained in:
aquiles 2025-10-05 06:39:31 +00:00 committed by Jeremy Soller
parent 0d84055362
commit 801c7fa68c
4 changed files with 127 additions and 3 deletions

119
.idea/workspace.xml generated Normal file
View file

@ -0,0 +1,119 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="ALL" />
</component>
<component name="CargoProjects">
<cargoProject FILE="$PROJECT_DIR$/Cargo.toml">
<package file="$PROJECT_DIR$">
<feature name="default" enabled="true" />
</package>
</cargoProject>
</component>
<component name="ChangeListManager">
<list default="true" id="dcd1dad2-6701-46c3-a277-b30862871a25" name="Changes" comment="" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="ExecutionTargetManager" SELECTED_TARGET="RsBuildProfile:dev" />
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="GitHubPullRequestSearchHistory"><![CDATA[{
"lastFilter": {
"state": "OPEN",
"assignee": "Aquilesorei"
}
}]]></component>
<component name="GithubPullRequestsUISettings"><![CDATA[{
"selectedUrlAndAccountId": {
"url": "https://github.com/Aquilesorei/cosmic-edit.git",
"accountId": "f43afa1a-5616-4b4c-85c3-ceca38badb67"
}
}]]></component>
<component name="MacroExpansionManager">
<option name="directoryName" value="ZsQYuuik" />
</component>
<component name="ProjectColorInfo"><![CDATA[{
"associatedIndex": 7
}]]></component>
<component name="ProjectId" id="33dQwtLbmBMJywCKQ4NtK87ggG3" />
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"ModuleVcsDetector.initialDetectionPerformed": "true",
"RunOnceActivity.ShowReadmeOnStart": "true",
"RunOnceActivity.git.unshallow": "true",
"RunOnceActivity.rust.reset.selective.auto.import": "true",
"git-widget-placeholder": "master",
"last_opened_file_path": "/home/aquiles/RustroverProjects/cosmic-edit",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
"node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"org.rust.cargo.project.model.PROJECT_DISCOVERY": "true",
"org.rust.first.attach.projects": "true",
"vue.rearranger.settings.migration": "true"
}
}]]></component>
<component name="RunManager" selected="Cargo.Run cosmic-edit">
<configuration name="Run cosmic-edit" type="CargoCommandRunConfiguration" factoryName="Cargo Command">
<option name="buildProfileId" value="dev" />
<option name="command" value="run --package cosmic-edit --bin cosmic-edit" />
<option name="workingDirectory" value="file://$PROJECT_DIR$" />
<envs />
<option name="emulateTerminal" value="true" />
<option name="channel" value="DEFAULT" />
<option name="requiredFeatures" value="true" />
<option name="allFeatures" value="false" />
<option name="withSudo" value="false" />
<option name="buildTarget" value="REMOTE" />
<option name="backtrace" value="SHORT" />
<option name="isRedirectInput" value="false" />
<option name="redirectInputPath" value="" />
<method v="2">
<option name="CARGO.BUILD_TASK_PROVIDER" enabled="true" />
</method>
</configuration>
<configuration name="Test cosmic-edit" type="CargoCommandRunConfiguration" factoryName="Cargo Command">
<option name="command" value="test --workspace" />
<option name="workingDirectory" value="file://$PROJECT_DIR$" />
<envs />
<option name="emulateTerminal" value="true" />
<option name="channel" value="DEFAULT" />
<option name="requiredFeatures" value="true" />
<option name="allFeatures" value="false" />
<option name="withSudo" value="false" />
<option name="buildTarget" value="REMOTE" />
<option name="backtrace" value="SHORT" />
<option name="isRedirectInput" value="false" />
<option name="redirectInputPath" value="" />
<method v="2">
<option name="CARGO.BUILD_TASK_PROVIDER" enabled="true" />
</method>
</configuration>
</component>
<component name="RustProjectSettings">
<option name="toolchainHomeDirectory" value="$USER_HOME$/.cargo/bin" />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="dcd1dad2-6701-46c3-a277-b30862871a25" name="Changes" comment="" />
<created>1759644710501</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1759644710501</updated>
<workItem from="1759644711720" duration="720000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
</project>

1
Cargo.lock generated
View file

@ -1460,6 +1460,7 @@ dependencies = [
"syntect",
"tokio",
"two-face",
"unicode-segmentation",
]
[[package]]

View file

@ -24,6 +24,7 @@ syntect = "5.2.0"
two-face = "0.4.3"
# Internationalization
icu = { version = "2.0.0", features = ["compiled_data"] }
unicode-segmentation = "1.12"
i18n-embed = { version = "0.16", features = [
"fluent-system",
"desktop-requester",

View file

@ -27,6 +27,7 @@ use cosmic_files::{
};
use cosmic_text::{Cursor, Edit, Family, Selection, SwashCache, SyntaxSystem, ViMode};
use serde::{Deserialize, Serialize};
use unicode_segmentation::UnicodeSegmentation;
use std::{
any::TypeId,
collections::HashMap,
@ -868,11 +869,13 @@ impl App {
editor.with_buffer(|buffer| {
line_count = buffer.lines.len();
for line in buffer.lines.iter() {
let text = line.text();
let mut last_whitespace = true;
//TODO: do graphemes?
for c in line.text().chars() {
// Count graphemes instead of Unicode scalar values for accurate character count
for grapheme in text.graphemes(true) {
character_count += 1;
let is_whitespace = c.is_whitespace();
let is_whitespace = grapheme.chars().all(|c| c.is_whitespace());
if !is_whitespace {
character_count_no_spaces += 1;
if last_whitespace {