diff options
author | nerdopolis <bluescreen_avenger@verizon.net> | 2023-12-08 21:48:47 -0500 |
---|---|---|
committer | Ray Strode <rstrode@redhat.com> | 2023-12-10 22:13:58 -0500 |
commit | 27fd8d115c3f9fea55940018e3cff8939ab5d22a (patch) | |
tree | a7afa375b65c96072523c92ddbcde98a1643c41b | |
parent | a354385ce88252708dd962a788c7427a8483186a (diff) |
ply-terminal-emulator: Handle incomplete UTF-8 characters better
This commit introduces a state machine to better handle when part
of a UTF-8 character comes in immediately, and the rest of it
comes in later.
It also tries to better handle cases where control characters
are interleaved in the middle of UTF-8 characters.
-rw-r--r-- | src/libply-splash-core/ply-terminal-emulator.c | 231 |
1 files changed, 155 insertions, 76 deletions
diff --git a/src/libply-splash-core/ply-terminal-emulator.c b/src/libply-splash-core/ply-terminal-emulator.c index 6f83e314..95058315 100644 --- a/src/libply-splash-core/ply-terminal-emulator.c +++ b/src/libply-splash-core/ply-terminal-emulator.c @@ -47,10 +47,16 @@ typedef enum { - PLY_TERMINAL_EMULATOR_PARSE_STATE_UNESCAPED, - PLY_TERMINAL_EMULATOR_PARSE_STATE_ESCAPED, - PLY_TERMINAL_EMULATOR_PARSE_STATE_CONTROL_SEQUENCE_PARAMETER -} ply_terminal_emulator_parse_state_t; + PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED, + PLY_TERMINAL_EMULATOR_TERMINAL_STATE_ESCAPED, + PLY_TERMINAL_EMULATOR_TERMINAL_STATE_CONTROL_SEQUENCE_PARAMETER +} ply_terminal_emulator_terminal_state_t; + +typedef enum +{ + PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE, + PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE, +} ply_terminal_emulator_utf8_character_parse_state_t; typedef enum { @@ -88,26 +94,31 @@ typedef struct struct _ply_terminal_emulator { - ply_terminal_emulator_parse_state_t state; + ply_terminal_emulator_terminal_state_t state; + + size_t number_of_rows; + size_t number_of_columns; - size_t number_of_rows; - size_t number_of_columns; + size_t line_count; + ply_array_t *lines; - size_t line_count; - ply_array_t *lines; + ply_trigger_t *output_trigger; - ply_trigger_t *output_trigger; + ssize_t cursor_row_offset; /* Relative to the bottom-most allocated line */ + size_t cursor_column; + ply_terminal_emulator_break_string_action_t break_action; - ssize_t cursor_row_offset; /* Relative to the bottom-most allocated line */ - size_t cursor_column; - ply_terminal_emulator_break_string_action_t break_action; + uint32_t last_parameter_was_integer : 1; + uint32_t pending_parameter_value; + ply_terminal_emulator_command_t *staged_command; + ply_list_t *pending_commands; - uint32_t last_parameter_was_integer : 1; - ply_terminal_emulator_command_t *staged_command; - ply_list_t *pending_commands; + ply_terminal_emulator_utf8_character_parse_state_t pending_character_state; + ply_buffer_t *pending_character; + int pending_character_size; - ply_rich_text_t *current_line; - ply_rich_text_character_style_t current_style; + ply_rich_text_t *current_line; + ply_rich_text_character_style_t current_style; }; typedef ply_terminal_emulator_break_string_t (*ply_terminal_emulator_dispatch_handler_t)(); @@ -139,6 +150,10 @@ ply_terminal_emulator_new (size_t number_of_rows, terminal_emulator->number_of_columns = number_of_columns; terminal_emulator->lines = ply_array_new (PLY_ARRAY_ELEMENT_TYPE_POINTER); + terminal_emulator->pending_character = ply_buffer_new (); + terminal_emulator->pending_character_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE; + terminal_emulator->pending_character_size = 0; + span.offset = 0; span.range = terminal_emulator->number_of_columns; @@ -150,7 +165,10 @@ ply_terminal_emulator_new (size_t number_of_rows, terminal_emulator->cursor_row_offset = 0; - terminal_emulator->state = PLY_TERMINAL_EMULATOR_PARSE_STATE_UNESCAPED; + terminal_emulator->state = PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED; + + terminal_emulator->last_parameter_was_integer = false; + terminal_emulator->pending_parameter_value = 0; terminal_emulator->break_action = PLY_TERMINAL_EMULATOR_BREAK_STRING_ACTION_PRESERVE_CURSOR_COLUMN; terminal_emulator->output_trigger = ply_trigger_new (NULL); @@ -1052,6 +1070,40 @@ ply_terminal_emulator_get_line_count (ply_terminal_emulator_t *terminal_emulator return terminal_emulator->line_count; } +static ply_terminal_emulator_break_string_t +ply_terminal_emulator_flush_pending_character_to_line (ply_terminal_emulator_t *terminal_emulator) +{ + ply_terminal_emulator_break_string_t break_string = PLY_TERMINAL_EMULATOR_BREAK_STRING_NONE; + ply_rich_text_span_t span; + const char *character_bytes; + size_t character_size; + size_t maximum_characters; + + character_bytes = ply_buffer_get_bytes (terminal_emulator->pending_character); + character_size = ply_buffer_get_size (terminal_emulator->pending_character); + + ply_rich_text_set_character (terminal_emulator->current_line, + terminal_emulator->current_style, + terminal_emulator->cursor_column, + character_bytes, + character_size); + ply_buffer_clear (terminal_emulator->pending_character); + + terminal_emulator->cursor_column++; + + ply_rich_text_get_mutable_span (terminal_emulator->current_line, &span); + + maximum_characters = span.offset + span.range; + + if (terminal_emulator->cursor_column >= maximum_characters) { + terminal_emulator->cursor_row_offset++; + terminal_emulator->break_action = PLY_TERMINAL_EMULATOR_BREAK_STRING_ACTION_RESET_CURSOR_COLUMN; + break_string = PLY_TERMINAL_EMULATOR_BREAK_STRING; + } + + return break_string; +} + void ply_terminal_emulator_parse_substring (ply_terminal_emulator_t *terminal_emulator, ply_rich_text_t *terminal_emulator_line, @@ -1060,17 +1112,13 @@ ply_terminal_emulator_parse_substring (ply_terminal_emulator_t *terminal_emulato const char **unparsed_input, size_t *number_of_unparsed_bytes) { - char character_string[PLY_UTF8_CHARACTER_MAX_SIZE]; size_t input_length = number_of_bytes_to_parse; size_t new_length; size_t i = 0; ply_terminal_emulator_break_string_t break_string = PLY_TERMINAL_EMULATOR_BREAK_STRING_NONE; - int parameter_value; ply_terminal_emulator_command_t *command; ply_rich_text_span_t span; size_t maximum_characters; - - int character_length; ply_list_node_t *node; terminal_emulator->current_line = terminal_emulator_line; @@ -1094,89 +1142,127 @@ ply_terminal_emulator_parse_substring (ply_terminal_emulator_t *terminal_emulato fill_offsets_with_padding (terminal_emulator, new_length, terminal_emulator->cursor_column); while (i < input_length) { - if (break_string == PLY_TERMINAL_EMULATOR_BREAK_STRING && terminal_emulator->state == PLY_TERMINAL_EMULATOR_PARSE_STATE_UNESCAPED) { + ply_utf8_character_byte_type_t character_byte_type; + + if (break_string == PLY_TERMINAL_EMULATOR_BREAK_STRING && terminal_emulator->state == PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED) { break_string = PLY_TERMINAL_EMULATOR_BREAK_STRING_NONE; break; } - parameter_value = 0; - terminal_emulator->break_action = PLY_TERMINAL_EMULATOR_BREAK_STRING_ACTION_PRESERVE_CURSOR_COLUMN; - /* Non-ASCII Unicode characters have no impact on escape code handling */ - character_length = ply_utf8_character_get_size (&input[i], 4); + character_byte_type = ply_utf8_character_get_byte_type (input[i]); + + if (character_byte_type != PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION) + ply_buffer_clear (terminal_emulator->pending_character); + + /* If the previous byte was also a UTF-8 leading byte, handle it as an invalid character */ + if (terminal_emulator->pending_character_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE && + character_byte_type != PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION && + terminal_emulator->state == PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED) { + ply_buffer_append_bytes (terminal_emulator->pending_character, "?", 1); + break_string = ply_terminal_emulator_flush_pending_character_to_line (terminal_emulator); + } + + if (PLY_UTF8_CHARACTER_BYTE_TYPE_IS_MULTI_BYTE (character_byte_type)) { + /* Multi-byte Unicode characters */ + terminal_emulator->pending_character_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE; + terminal_emulator->pending_character_size = ply_utf8_character_get_size_from_byte_type (character_byte_type); + + ply_buffer_append_bytes (terminal_emulator->pending_character, &input[i], 1); - /* skip, if the character_length is -2, it's a auxiliary unicode byte */ - if (character_length < 0) { i++; continue; - } else if (character_length > 1) { - /* Last element is a nullchar */ - character_string[character_length] = '\0'; + } else if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE) { + /* Ascii characters could potentially be used in escape sequences */ + terminal_emulator->pending_character_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE; + terminal_emulator->pending_character_size = ply_utf8_character_get_size_from_byte_type (character_byte_type); + } else if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING) { + i++; + continue; + } else if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID) { + i++; + continue; + } else if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION) { + if (terminal_emulator->pending_character_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE) { + /* Handle the auxiliary unicode byte if handling a multi-byte character */ + if (terminal_emulator->pending_character_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE) + ply_buffer_append_bytes (terminal_emulator->pending_character, &input[i], 1); - for (int j = 0; j < character_length; j++) { - character_string[j] = input[i]; + i++; + + /* The multi-byte character is not finished yet, continue the loop */ + if (ply_buffer_get_size (terminal_emulator->pending_character) < terminal_emulator->pending_character_size) + continue; + } else { + /* If this is an auxiliary Unicode byte when not handling a multi-byte character, replace it with a placeholder */ + terminal_emulator->pending_character_size = 1; + ply_buffer_clear (terminal_emulator->pending_character); + ply_buffer_append_bytes (terminal_emulator->pending_character, "?", 1); + break_string = ply_terminal_emulator_flush_pending_character_to_line (terminal_emulator); i++; - if (i >= maximum_characters) - break; + continue; } - ply_rich_text_set_character (terminal_emulator->current_line, terminal_emulator->current_style, terminal_emulator->cursor_column, character_string, character_length); - terminal_emulator->cursor_column++; + } + + /* If the current character is a multi-byte character, and all the bytes are received */ + if (terminal_emulator->pending_character_state == PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_MULTI_BYTE) { + /* Drop and skip the multi-byte character if is still escaped */ + if (terminal_emulator->state != PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED) { + ply_buffer_clear (terminal_emulator->pending_character); + continue; + } + + terminal_emulator->pending_character_state = PLY_TERMINAL_EMULATOR_UTF8_CHARACTER_PARSE_STATE_SINGLE_BYTE; + break_string = ply_terminal_emulator_flush_pending_character_to_line (terminal_emulator); continue; } switch (terminal_emulator->state) { - case PLY_TERMINAL_EMULATOR_PARSE_STATE_UNESCAPED: + case PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED: if (input[i] == '\e') { terminal_emulator->staged_command = ply_terminal_emulator_command_new (); - terminal_emulator->state = PLY_TERMINAL_EMULATOR_PARSE_STATE_ESCAPED; + terminal_emulator->state = PLY_TERMINAL_EMULATOR_TERMINAL_STATE_ESCAPED; } else if (iscntrl (input[i]) && input[i] != '\e') { terminal_emulator->staged_command = ply_terminal_emulator_command_new (); terminal_emulator->staged_command->code = input[i]; terminal_emulator->staged_command->type = PLY_TERMINAL_EMULATOR_COMMAND_TYPE_CONTROL_CHARACTER; ply_list_append_data (terminal_emulator->pending_commands, terminal_emulator->staged_command); } else { - character_string[0] = input[i]; - character_string[1] = '\0'; - ply_rich_text_set_character (terminal_emulator->current_line, terminal_emulator->current_style, terminal_emulator->cursor_column, character_string, 1); - terminal_emulator->cursor_column++; - - if (terminal_emulator->cursor_column >= maximum_characters) { - terminal_emulator->cursor_row_offset++; - terminal_emulator->break_action = PLY_TERMINAL_EMULATOR_BREAK_STRING_ACTION_RESET_CURSOR_COLUMN; - break_string = PLY_TERMINAL_EMULATOR_BREAK_STRING; - } + ply_buffer_append_bytes (terminal_emulator->pending_character, &input[i], 1); + break_string = ply_terminal_emulator_flush_pending_character_to_line (terminal_emulator); } break; - case PLY_TERMINAL_EMULATOR_PARSE_STATE_ESCAPED: + case PLY_TERMINAL_EMULATOR_TERMINAL_STATE_ESCAPED: if (input[i] == '[') { + terminal_emulator->pending_parameter_value = 0; terminal_emulator->staged_command->parameters = ply_array_new (PLY_ARRAY_ELEMENT_TYPE_UINT32); terminal_emulator->staged_command->type = PLY_TERMINAL_EMULATOR_COMMAND_TYPE_CONTROL_SEQUENCE; terminal_emulator->staged_command->parameters_valid = true; terminal_emulator->last_parameter_was_integer = false; - terminal_emulator->state = PLY_TERMINAL_EMULATOR_PARSE_STATE_CONTROL_SEQUENCE_PARAMETER; + terminal_emulator->state = PLY_TERMINAL_EMULATOR_TERMINAL_STATE_CONTROL_SEQUENCE_PARAMETER; } else { terminal_emulator->staged_command->code = input[i]; terminal_emulator->staged_command->type = PLY_TERMINAL_EMULATOR_COMMAND_TYPE_ESCAPE; ply_list_append_data (terminal_emulator->pending_commands, terminal_emulator->staged_command); - terminal_emulator->state = PLY_TERMINAL_EMULATOR_PARSE_STATE_UNESCAPED; + terminal_emulator->state = PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED; } break; - case PLY_TERMINAL_EMULATOR_PARSE_STATE_CONTROL_SEQUENCE_PARAMETER: + case PLY_TERMINAL_EMULATOR_TERMINAL_STATE_CONTROL_SEQUENCE_PARAMETER: /* Characters that end the control sequence, and define the command */ if ((unsigned char) input[i] >= PLY_TERMINAL_ESCAPE_CODE_COMMAND_MINIMUM && (unsigned char) input[i] <= PLY_TERMINAL_ESCAPE_CODE_COMMAND_MAXIMUM) { - terminal_emulator->state = PLY_TERMINAL_EMULATOR_PARSE_STATE_UNESCAPED; + terminal_emulator->state = PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED; terminal_emulator->staged_command->code = input[i]; - if (terminal_emulator->last_parameter_was_integer == false) - ply_array_add_uint32_element (terminal_emulator->staged_command->parameters, parameter_value); + ply_array_add_uint32_element (terminal_emulator->staged_command->parameters, terminal_emulator->pending_parameter_value); + terminal_emulator->pending_parameter_value = 0; ply_list_append_data (terminal_emulator->pending_commands, terminal_emulator->staged_command); break; @@ -1187,28 +1273,22 @@ ply_terminal_emulator_parse_substring (ply_terminal_emulator_t *terminal_emulato ply_list_append_data (terminal_emulator->pending_commands, nested_command); } else if (input[i] == ';' || (isdigit (input[i]))) { if (isdigit (input[i])) { - /* If the previous character was an integer, and this one is an integer, it is probably the next digit*/ - if (terminal_emulator->last_parameter_was_integer == true) { - parameter_value = -1; - } else { - parameter_value = atoi (&input[i]); - } + /* If the previous character was an integer, and this one is an integer, it is probably the next digit */ + terminal_emulator->pending_parameter_value = terminal_emulator->pending_parameter_value * 10; + terminal_emulator->pending_parameter_value += input[i] - '0'; terminal_emulator->last_parameter_was_integer = true; } else if (input[i] == ';') { - /* Skip, and do not add the default value of 0 if the last character encountered was a valid parameter - * Double ;;'s imply a 0 - */ - if (terminal_emulator->last_parameter_was_integer == true) - parameter_value = -1; + /* Double ;;'s imply a 0 */ + if (terminal_emulator->last_parameter_was_integer == false) { + ply_array_add_uint32_element (terminal_emulator->staged_command->parameters, 0); + } else { + ply_array_add_uint32_element (terminal_emulator->staged_command->parameters, terminal_emulator->pending_parameter_value); + } + terminal_emulator->pending_parameter_value = 0; terminal_emulator->last_parameter_was_integer = false; } - - /* Skip parameter if less than 0 */ - if (parameter_value >= 0) - ply_array_add_uint32_element (terminal_emulator->staged_command->parameters, parameter_value); - break; } else { /* invalid characters in the middle of the escape sequence invalidate it */ @@ -1217,7 +1297,7 @@ ply_terminal_emulator_parse_substring (ply_terminal_emulator_t *terminal_emulato break; } - if (terminal_emulator->state == PLY_TERMINAL_EMULATOR_PARSE_STATE_UNESCAPED) { + if (terminal_emulator->state == PLY_TERMINAL_EMULATOR_TERMINAL_STATE_UNESCAPED) { ply_list_foreach (terminal_emulator->pending_commands, node) { ply_terminal_emulator_break_string_t break_string_value = PLY_TERMINAL_EMULATOR_BREAK_STRING_NONE; @@ -1273,7 +1353,6 @@ ply_terminal_emulator_parse_lines (ply_terminal_emulator_t *terminal_emulator, unparsed_text = text; unparsed_text_length = size; while (unparsed_text_length > 0) { - assert (terminal_emulator->line_count != 0); first_row = 0; |