summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authornerdopolis <bluescreen_avenger@verizon.net>2023-12-08 21:48:47 -0500
committerRay Strode <rstrode@redhat.com>2023-12-10 13:13:35 -0500
commit6da1b27c46da3e4f2e7287be6415fccc9c3b2a35 (patch)
tree5c45e1d1b71a426d8b032fd222e151709ba5769b
parent7a7c43612cac5d60a9a2fe04ec64601eb153ce65 (diff)
ply-terminal-emulator: Handle incomplete UTF-8 characters betterresilientunicode
This commit introduces a state machine to better handle when part of a UTF-8 character comes in immediately, and the rest of it comes in later. It also tries to better handle cases where control characters are interleaved in the middle of UTF-8 characters.
-rw-r--r--src/libply-splash-core/ply-terminal-emulator.c49
1 files changed, 26 insertions, 23 deletions
diff --git a/src/libply-splash-core/ply-terminal-emulator.c b/src/libply-splash-core/ply-terminal-emulator.c
index 8d704954..a50eff98 100644
--- a/src/libply-splash-core/ply-terminal-emulator.c
+++ b/src/libply-splash-core/ply-terminal-emulator.c
@@ -95,7 +95,6 @@ typedef struct
struct _ply_terminal_emulator
{
ply_terminal_emulator_terminal_state_t state;
- ply_terminal_emulator_utf8_character_parse_state_t unicode_state;
size_t number_of_rows;
size_t number_of_columns;
@@ -114,8 +113,9 @@ struct _ply_terminal_emulator
ply_terminal_emulator_command_t *staged_command;
ply_list_t *pending_commands;
+ ply_terminal_emulator_utf8_character_parse_state_t pending_character_state;
ply_buffer_t *pending_character;
- int pending_character_unicode_bytes;
+ int pending_character_size;
ply_rich_text_t *current_line;
ply_rich_text_character_style_t current_style;
@@ -151,6 +151,8 @@ ply_terminal_emulator_new (size_t number_of_rows,
terminal_emulator->lines = ply_array_new (PLY_ARRAY_ELEMENT_TYPE_POINTER);
terminal_emulator->pending_character = ply_buffer_new ();
+ terminal_emulator->pending_character_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE;
+ terminal_emulator->pending_character_size = 0;
span.offset = 0;
span.range = terminal_emulator->number_of_columns;
@@ -164,7 +166,6 @@ ply_terminal_emulator_new (size_t number_of_rows,
terminal_emulator->cursor_row_offset = 0;
terminal_emulator->state = PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED;
- terminal_emulator->unicode_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE;
terminal_emulator->last_parameter_was_integer = false;
terminal_emulator->pending_parameter_value = 0;
@@ -1109,8 +1110,6 @@ ply_terminal_emulator_parse_substring (ply_terminal_emulator_t *terminal_emulato
ply_terminal_emulator_command_t *command;
ply_rich_text_span_t span;
size_t maximum_characters;
-
- int character_unicode_bytes;
ply_list_node_t *node;
terminal_emulator->current_line = terminal_emulator_line;
@@ -1134,17 +1133,19 @@ ply_terminal_emulator_parse_substring (ply_terminal_emulator_t *terminal_emulato
fill_offsets_with_padding (terminal_emulator, new_length, terminal_emulator->cursor_column);
while (i < input_length) {
- if (break_string == PLY_TERMINAL_EMULATOR_BREAK_STRING && terminal_emulator->state == PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED) {
+ ply_utf8_character_byte_type_t character_byte_type;
+
+ if (break_string == PLY_TERMINAL_EMULATOR_BREAK_STRING && terminal_emulator->state == PLY_TERMINAL_EMULATOR_PARSE_STATE_UNESCAPED) {
break_string = PLY_TERMINAL_EMULATOR_BREAK_STRING_NONE;
break;
}
terminal_emulator->break_action = PLY_TERMINAL_EMULATOR_BREAK_STRING_ACTION_PRESERVE_CURSOR_COLUMN;
- character_unicode_bytes = ply_utf8_character_get_size (&input[i], PLY_UTF8_CHARACTER_SIZE_MAX);
+ character_byte_type = ply_utf8_character_get_byte_type (input[i]);
/* If the previous byte was also a UTF-8 prefix byte, handle it as an invalid character */
- if (terminal_emulator->unicode_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE && character_unicode_bytes != PLY_UTF8_CHARACTER_IS_SEQUENCE_BYTE) {
+ if (terminal_emulator->pending_character_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE && character_byte_type != PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION) {
ply_buffer_clear (terminal_emulator->pending_character);
if (terminal_emulator->state == PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED) {
@@ -1157,38 +1158,40 @@ ply_terminal_emulator_parse_substring (ply_terminal_emulator_t *terminal_emulato
}
}
- if (character_unicode_bytes > 1) {
+ if (PLY_UTF8_CHARACTER_BYTE_TYPE_IS_MULTI_BYTE (character_byte_type)) {
/* Multi-byte Unicode characters */
- terminal_emulator->unicode_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE;
- terminal_emulator->pending_character_unicode_bytes = character_unicode_bytes;
+ terminal_emulator->pending_character_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE;
+ terminal_emulator->pending_character_size = ply_utf8_character_get_size_from_byte_type (character_byte_type);
ply_buffer_append_bytes (terminal_emulator->pending_character, &input[i], 1);
i++;
continue;
- } else if (character_unicode_bytes == 1) {
+ } else if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE) {
/* Ascii characters could potentially be used in escape sequences */
- terminal_emulator->unicode_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE;
- terminal_emulator->pending_character_unicode_bytes = character_unicode_bytes;
+ terminal_emulator->pending_character_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE;
+ terminal_emulator->pending_character_size = ply_utf8_character_get_size_from_byte_type (character_byte_type);
ply_buffer_clear (terminal_emulator->pending_character);
- } else if (character_unicode_bytes == 0 || character_unicode_bytes == -1) {
- /* This should not happen with ply_utf8_character_get_size */
+ } else if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING) {
+ i++;
+ continue;
+ } else if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID) {
i++;
continue;
- } else if (character_unicode_bytes == PLY_UTF8_CHARACTER_IS_SEQUENCE_BYTE) {
- if (terminal_emulator->unicode_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE) {
+ } else if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION) {
+ if (terminal_emulator->pending_character_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE) {
/* Handle the auxiliary unicode byte if handling a multi-byte character */
- if (terminal_emulator->unicode_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE)
+ if (terminal_emulator->pending_character_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE)
ply_buffer_append_bytes (terminal_emulator->pending_character, &input[i], 1);
i++;
/* The multi-byte character is not finished yet, continue the loop */
- if (ply_buffer_get_size (terminal_emulator->pending_character) < terminal_emulator->pending_character_unicode_bytes)
+ if (ply_buffer_get_size (terminal_emulator->pending_character) < terminal_emulator->pending_character_size)
continue;
} else {
/* If this is an auxiliary Unicode byte when not handling a multi-byte character, replace it with a placeholder */
- terminal_emulator->pending_character_unicode_bytes = 1;
+ terminal_emulator->pending_character_size = 1;
ply_buffer_clear (terminal_emulator->pending_character);
ply_buffer_append_bytes (terminal_emulator->pending_character, "?", 1);
@@ -1203,14 +1206,14 @@ ply_terminal_emulator_parse_substring (ply_terminal_emulator_t *terminal_emulato
}
/* If the current character is a multi-byte character, and all the bytes are received */
- if (terminal_emulator->unicode_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE) {
+ if (terminal_emulator->pending_character_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE) {
/* Drop and skip the multi-byte character if is still escaped */
if (terminal_emulator->state != PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED) {
ply_buffer_clear (terminal_emulator->pending_character);
continue;
}
- terminal_emulator->unicode_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE;
+ terminal_emulator->pending_character_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE;
break_string = ply_terminal_emulator_flush_pending_character_to_line (terminal_emulator,
maximum_characters,
ply_buffer_steal_bytes (terminal_emulator->pending_character),