Fix character counting in document statistics to use graphemes
- Add the unicode-segmentation dependency for proper grapheme cluster support
- Replace chars() iteration with graphemes(true) for accurate character counting
- Fix counting of complex Unicode characters such as emojis, combining characters, and other multi-code-point sequences
- Resolve the TODO 'do graphemes?' in the document_statistics function

This change provides more accurate character counts for international text, emojis with skin tones, combined characters, and other multi-codepoint graphemes.

Examples of improved accuracy:
- 👍🏾 now counts as 1 character instead of 2
- é (e + combining acute) counts as 1 character instead of 2
- 🧑‍💻 (person technologist: person + zero-width joiner + laptop) counts as 1 character instead of 3
This commit is contained in:
parent
0d84055362
commit
801c7fa68c
4 changed files with 127 additions and 3 deletions
119
.idea/workspace.xml
generated
Normal file
119
.idea/workspace.xml
generated
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="AutoImportSettings">
|
||||||
|
<option name="autoReloadType" value="ALL" />
|
||||||
|
</component>
|
||||||
|
<component name="CargoProjects">
|
||||||
|
<cargoProject FILE="$PROJECT_DIR$/Cargo.toml">
|
||||||
|
<package file="$PROJECT_DIR$">
|
||||||
|
<feature name="default" enabled="true" />
|
||||||
|
</package>
|
||||||
|
</cargoProject>
|
||||||
|
</component>
|
||||||
|
<component name="ChangeListManager">
|
||||||
|
<list default="true" id="dcd1dad2-6701-46c3-a277-b30862871a25" name="Changes" comment="" />
|
||||||
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
|
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||||
|
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||||
|
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||||
|
</component>
|
||||||
|
<component name="ExecutionTargetManager" SELECTED_TARGET="RsBuildProfile:dev" />
|
||||||
|
<component name="Git.Settings">
|
||||||
|
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
||||||
|
</component>
|
||||||
|
<component name="GitHubPullRequestSearchHistory"><![CDATA[{
|
||||||
|
"lastFilter": {
|
||||||
|
"state": "OPEN",
|
||||||
|
"assignee": "Aquilesorei"
|
||||||
|
}
|
||||||
|
}]]></component>
|
||||||
|
<component name="GithubPullRequestsUISettings"><![CDATA[{
|
||||||
|
"selectedUrlAndAccountId": {
|
||||||
|
"url": "https://github.com/Aquilesorei/cosmic-edit.git",
|
||||||
|
"accountId": "f43afa1a-5616-4b4c-85c3-ceca38badb67"
|
||||||
|
}
|
||||||
|
}]]></component>
|
||||||
|
<component name="MacroExpansionManager">
|
||||||
|
<option name="directoryName" value="ZsQYuuik" />
|
||||||
|
</component>
|
||||||
|
<component name="ProjectColorInfo"><![CDATA[{
|
||||||
|
"associatedIndex": 7
|
||||||
|
}]]></component>
|
||||||
|
<component name="ProjectId" id="33dQwtLbmBMJywCKQ4NtK87ggG3" />
|
||||||
|
<component name="ProjectViewState">
|
||||||
|
<option name="hideEmptyMiddlePackages" value="true" />
|
||||||
|
<option name="showLibraryContents" value="true" />
|
||||||
|
</component>
|
||||||
|
<component name="PropertiesComponent"><![CDATA[{
|
||||||
|
"keyToString": {
|
||||||
|
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
||||||
|
"RunOnceActivity.ShowReadmeOnStart": "true",
|
||||||
|
"RunOnceActivity.git.unshallow": "true",
|
||||||
|
"RunOnceActivity.rust.reset.selective.auto.import": "true",
|
||||||
|
"git-widget-placeholder": "master",
|
||||||
|
"last_opened_file_path": "/home/aquiles/RustroverProjects/cosmic-edit",
|
||||||
|
"node.js.detected.package.eslint": "true",
|
||||||
|
"node.js.detected.package.tslint": "true",
|
||||||
|
"node.js.selected.package.eslint": "(autodetect)",
|
||||||
|
"node.js.selected.package.tslint": "(autodetect)",
|
||||||
|
"nodejs_package_manager_path": "npm",
|
||||||
|
"org.rust.cargo.project.model.PROJECT_DISCOVERY": "true",
|
||||||
|
"org.rust.first.attach.projects": "true",
|
||||||
|
"vue.rearranger.settings.migration": "true"
|
||||||
|
}
|
||||||
|
}]]></component>
|
||||||
|
<component name="RunManager" selected="Cargo.Run cosmic-edit">
|
||||||
|
<configuration name="Run cosmic-edit" type="CargoCommandRunConfiguration" factoryName="Cargo Command">
|
||||||
|
<option name="buildProfileId" value="dev" />
|
||||||
|
<option name="command" value="run --package cosmic-edit --bin cosmic-edit" />
|
||||||
|
<option name="workingDirectory" value="file://$PROJECT_DIR$" />
|
||||||
|
<envs />
|
||||||
|
<option name="emulateTerminal" value="true" />
|
||||||
|
<option name="channel" value="DEFAULT" />
|
||||||
|
<option name="requiredFeatures" value="true" />
|
||||||
|
<option name="allFeatures" value="false" />
|
||||||
|
<option name="withSudo" value="false" />
|
||||||
|
<option name="buildTarget" value="REMOTE" />
|
||||||
|
<option name="backtrace" value="SHORT" />
|
||||||
|
<option name="isRedirectInput" value="false" />
|
||||||
|
<option name="redirectInputPath" value="" />
|
||||||
|
<method v="2">
|
||||||
|
<option name="CARGO.BUILD_TASK_PROVIDER" enabled="true" />
|
||||||
|
</method>
|
||||||
|
</configuration>
|
||||||
|
<configuration name="Test cosmic-edit" type="CargoCommandRunConfiguration" factoryName="Cargo Command">
|
||||||
|
<option name="command" value="test --workspace" />
|
||||||
|
<option name="workingDirectory" value="file://$PROJECT_DIR$" />
|
||||||
|
<envs />
|
||||||
|
<option name="emulateTerminal" value="true" />
|
||||||
|
<option name="channel" value="DEFAULT" />
|
||||||
|
<option name="requiredFeatures" value="true" />
|
||||||
|
<option name="allFeatures" value="false" />
|
||||||
|
<option name="withSudo" value="false" />
|
||||||
|
<option name="buildTarget" value="REMOTE" />
|
||||||
|
<option name="backtrace" value="SHORT" />
|
||||||
|
<option name="isRedirectInput" value="false" />
|
||||||
|
<option name="redirectInputPath" value="" />
|
||||||
|
<method v="2">
|
||||||
|
<option name="CARGO.BUILD_TASK_PROVIDER" enabled="true" />
|
||||||
|
</method>
|
||||||
|
</configuration>
|
||||||
|
</component>
|
||||||
|
<component name="RustProjectSettings">
|
||||||
|
<option name="toolchainHomeDirectory" value="$USER_HOME$/.cargo/bin" />
|
||||||
|
</component>
|
||||||
|
<component name="TaskManager">
|
||||||
|
<task active="true" id="Default" summary="Default task">
|
||||||
|
<changelist id="dcd1dad2-6701-46c3-a277-b30862871a25" name="Changes" comment="" />
|
||||||
|
<created>1759644710501</created>
|
||||||
|
<option name="number" value="Default" />
|
||||||
|
<option name="presentableId" value="Default" />
|
||||||
|
<updated>1759644710501</updated>
|
||||||
|
<workItem from="1759644711720" duration="720000" />
|
||||||
|
</task>
|
||||||
|
<servers />
|
||||||
|
</component>
|
||||||
|
<component name="TypeScriptGeneratedFilesManager">
|
||||||
|
<option name="version" value="3" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -1460,6 +1460,7 @@ dependencies = [
|
||||||
"syntect",
|
"syntect",
|
||||||
"tokio",
|
"tokio",
|
||||||
"two-face",
|
"two-face",
|
||||||
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@ syntect = "5.2.0"
|
||||||
two-face = "0.4.3"
|
two-face = "0.4.3"
|
||||||
# Internationalization
|
# Internationalization
|
||||||
icu = { version = "2.0.0", features = ["compiled_data"] }
|
icu = { version = "2.0.0", features = ["compiled_data"] }
|
||||||
|
unicode-segmentation = "1.12"
|
||||||
i18n-embed = { version = "0.16", features = [
|
i18n-embed = { version = "0.16", features = [
|
||||||
"fluent-system",
|
"fluent-system",
|
||||||
"desktop-requester",
|
"desktop-requester",
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,7 @@ use cosmic_files::{
|
||||||
};
|
};
|
||||||
use cosmic_text::{Cursor, Edit, Family, Selection, SwashCache, SyntaxSystem, ViMode};
|
use cosmic_text::{Cursor, Edit, Family, Selection, SwashCache, SyntaxSystem, ViMode};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
use std::{
|
use std::{
|
||||||
any::TypeId,
|
any::TypeId,
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
|
|
@ -868,11 +869,13 @@ impl App {
|
||||||
editor.with_buffer(|buffer| {
|
editor.with_buffer(|buffer| {
|
||||||
line_count = buffer.lines.len();
|
line_count = buffer.lines.len();
|
||||||
for line in buffer.lines.iter() {
|
for line in buffer.lines.iter() {
|
||||||
|
let text = line.text();
|
||||||
let mut last_whitespace = true;
|
let mut last_whitespace = true;
|
||||||
//TODO: do graphemes?
|
|
||||||
for c in line.text().chars() {
|
// Count graphemes instead of Unicode scalar values for accurate character count
|
||||||
|
for grapheme in text.graphemes(true) {
|
||||||
character_count += 1;
|
character_count += 1;
|
||||||
let is_whitespace = c.is_whitespace();
|
let is_whitespace = grapheme.chars().all(|c| c.is_whitespace());
|
||||||
if !is_whitespace {
|
if !is_whitespace {
|
||||||
character_count_no_spaces += 1;
|
character_count_no_spaces += 1;
|
||||||
if last_whitespace {
|
if last_whitespace {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue