diff options
author | Ray Strode <rstrode@redhat.com> | 2023-12-10 10:18:30 -0500 |
---|---|---|
committer | Ray Strode <rstrode@redhat.com> | 2023-12-10 22:13:58 -0500 |
commit | 9969852ad97f943938e5e6e19496446526a9f3cb (patch) | |
tree | cc67d02d94259a50a9b639a5c7d0038f88b58f53 | |
parent | 944f2210cda40a7c8063497c01deb93a055362b0 (diff) |
utils: Rework UTF-8 handling
ply_utf8_character_get_size currently takes an odd trailing
argument that is often just set to PLY_UTF8_CHARACTER_SIZE_MAX.
The function also returns magic values for cases where it
can't figure out the size because the byte isn't a leading
byte or is otherwise not valid UTF-8.
That means the API has a nuance to it that makes the code hard
to follow at a glance.
This commit attempts to improve the situation by dropping the
extra argument and adding a way to get the byte type separately
from the size, for clarity.
At the same time, this commit updates all the callers to use the
new API. There are two cases where the callers are trying to
remove the last character from a UTF-8 string, so this commit
adds a new function to consolidate that logic as well.
-rw-r--r-- | src/libply-splash-core/ply-keyboard.c | 33 | ||||
-rw-r--r-- | src/libply/ply-utils.c | 119 | ||||
-rw-r--r-- | src/libply/ply-utils.h | 22 | ||||
-rw-r--r-- | src/main.c | 19 |
4 files changed, 136 insertions, 57 deletions
diff --git a/src/libply-splash-core/ply-keyboard.c b/src/libply-splash-core/ply-keyboard.c index 128d3ced..ab375f37 100644 --- a/src/libply-splash-core/ply-keyboard.c +++ b/src/libply-splash-core/ply-keyboard.c @@ -153,26 +153,15 @@ ply_keyboard_new_for_renderer (ply_renderer_t *renderer) static void process_backspace (ply_keyboard_t *keyboard) { - size_t bytes_to_remove; - ssize_t previous_character_size; - const char *bytes; + char *bytes; size_t size; + size_t capacity; ply_list_node_t *node; - bytes = ply_buffer_get_bytes (keyboard->line_buffer); - size = ply_buffer_get_size (keyboard->line_buffer); - - bytes_to_remove = MIN (size, PLY_UTF8_CHARACTER_SIZE_MAX); - while ((previous_character_size = ply_utf8_character_get_size (bytes + size - bytes_to_remove, bytes_to_remove)) < (ssize_t) bytes_to_remove) { - if (previous_character_size > 0) - bytes_to_remove -= previous_character_size; - else - bytes_to_remove--; + ply_buffer_borrow_bytes (keyboard->line_buffer, &bytes, &size, &capacity) { + ply_utf8_string_remove_last_character (&bytes, &size); } - if (bytes_to_remove <= size) - ply_buffer_remove_bytes_at_end (keyboard->line_buffer, bytes_to_remove); - for (node = ply_list_get_first_node (keyboard->backspace_handler_list); node; node = ply_list_get_next_node (keyboard->backspace_handler_list, node)) { ply_keyboard_closure_t *closure = ply_list_node_get_data (node); @@ -277,6 +266,7 @@ on_key_event (ply_keyboard_t *keyboard, i = 0; while (i < size) { + ply_utf8_character_byte_type_t character_byte_type; ssize_t character_size; char *keyboard_input; size_t bytes_left = size - i; @@ -318,18 +308,23 @@ on_key_event (ply_keyboard_t *keyboard, continue; } - character_size = (ssize_t) ply_utf8_character_get_size (bytes + i, bytes_left); + character_byte_type = ply_utf8_character_get_byte_type (bytes[i]); - if (character_size < 0) + if (PLY_UTF8_CHARACTER_BYTE_TYPE_IS_NOT_LEADING (character_byte_type)) break; /* If we're at a NUL character walk through it */ - if 
(character_size == 0) { + if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING) { i++; continue; } + character_size = ply_utf8_character_get_size_from_byte_type (character_byte_type); + + if (character_size > bytes_left) + break; + keyboard_input = strndup (bytes + i, character_size); process_keyboard_input (keyboard, keyboard_input, character_size); @@ -665,4 +660,4 @@ ply_keyboard_get_capslock_state (ply_keyboard_t *keyboard) } return NULL; -}
\ No newline at end of file +} diff --git a/src/libply/ply-utils.c b/src/libply/ply-utils.c index c5b0847e..95b505b1 100644 --- a/src/libply/ply-utils.c +++ b/src/libply/ply-utils.c @@ -742,21 +742,93 @@ ply_detach_daemon (ply_daemon_handle_t *handle, * 11100000-11101111 E0-EF Start of 3-byte sequence * 11110000-11110100 F0-F4 Start of 4-byte sequence */ -int -ply_utf8_character_get_size (const char *string, - size_t n) +ply_utf8_character_byte_type_t +ply_utf8_character_get_byte_type (const char byte) +{ + ply_utf8_character_byte_type_t byte_type; + + if (byte == '\0') + byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING; + else if ((byte & 0x80) == 0x00) + byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE; + else if ((byte & 0xE0) == 0xC0) + byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES; + else if ((byte & 0xF0) == 0xE0) + byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES; + else if ((byte & 0xF8) == 0xF0) + byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES; + else + byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION; + + return byte_type; +} + +ssize_t +ply_utf8_character_get_size_from_byte_type (ply_utf8_character_byte_type_t byte_type) +{ + size_t size; + + switch (byte_type) { + case PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE: + size = 1; + break; + case PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES: + size = 2; + break; + case PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES: + size = 3; + break; + case PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES: + size = 4; + break; + case PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION: + case PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID: + case PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING: + size = 0; + break; + } + return size; +} + +ssize_t +ply_utf8_character_get_size (const char *bytes) +{ + ply_utf8_character_byte_type_t byte_type; + ssize_t size; + + byte_type = ply_utf8_character_get_byte_type (bytes[0]); + size = ply_utf8_character_get_size_from_byte_type (byte_type); + + return size; +} + +void +ply_utf8_string_remove_last_character (char **string, + 
size_t *size) { - int length; - - if (n < 1) return -1; - if (string[0] == 0x00) length = 0; - else if ((string[0] & 0x80) == 0x00) length = 1; - else if ((string[0] & 0xE0) == 0xC0) length = 2; - else if ((string[0] & 0xF0) == 0xE0) length = 3; - else if ((string[0] & 0xF8) == 0xF0) length = 4; - else return -2; - if (length > (int) n) return -1; - return length; + char *bytes = *string; + size_t size_in = *size, end_offset; + + if (size_in == 0) + return; + + end_offset = size_in - 1; + do { + ply_utf8_character_byte_type_t byte_type; + + byte_type = ply_utf8_character_get_byte_type (bytes[end_offset]); + + if (byte_type != PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION) { + memset (bytes + end_offset, '\0', size_in - end_offset); + *size = end_offset; + break; + } + + if (end_offset == 0) + break; + + end_offset--; + } while (true); } int @@ -766,10 +838,16 @@ ply_utf8_string_get_length (const char *string, size_t count = 0; while (true) { - int charlen = ply_utf8_character_get_size (string, n); - if (charlen <= 0) break; - string += charlen; - n -= charlen; + size_t size = ply_utf8_character_get_size (string); + + if (size == 0) + break; + + if (size > n) + break; + + string += size; + n -= size; count++; } return count; @@ -783,7 +861,7 @@ ply_utf8_string_get_byte_offset_from_character_offset (const char *string, size_t i; for (i = 0; i < character_offset && string[byte_offset] != '\0'; i++) { - byte_offset += ply_utf8_character_get_size (string + byte_offset, PLY_UTF8_CHARACTER_SIZE_MAX); + byte_offset += ply_utf8_character_get_size (string + byte_offset); } return byte_offset; @@ -818,8 +896,7 @@ ply_utf8_string_iterator_next (ply_utf8_string_iterator_t *iterator, if (iterator->string[iterator->current_byte_offset] == '\0') return false; - size_of_current_character = ply_utf8_character_get_size (iterator->string + iterator->current_byte_offset, - PLY_UTF8_CHARACTER_SIZE_MAX); + size_of_current_character = ply_utf8_character_get_size (iterator->string + 
iterator->current_byte_offset); if (size_of_current_character == 0) return false; diff --git a/src/libply/ply-utils.h b/src/libply/ply-utils.h index b99d2b23..7cbbb2f4 100644 --- a/src/libply/ply-utils.h +++ b/src/libply/ply-utils.h @@ -55,6 +55,20 @@ typedef enum PLY_UNIX_SOCKET_TYPE_TRIMMED_ABSTRACT } ply_unix_socket_type_t; +typedef enum +{ + PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION = -2, + PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID = -1, + PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING = 0, + PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE = 1, + PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES = 2, + PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES = 3, + PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES = 4 +} ply_utf8_character_byte_type_t; + +#define PLY_UTF8_CHARACTER_BYTE_TYPE_IS_NOT_LEADING(t) ((t) == PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION) +#define PLY_UTF8_CHARACTER_BYTE_TYPE_IS_MULTI_BYTE(t) (((t) == PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES)) + typedef struct { const char *string; @@ -120,8 +134,12 @@ ply_daemon_handle_t *ply_create_daemon (void); bool ply_detach_daemon (ply_daemon_handle_t *handle, int exit_code); -int ply_utf8_character_get_size (const char *string, - size_t n); +ply_utf8_character_byte_type_t ply_utf8_character_get_byte_type (const char byte); +ssize_t ply_utf8_character_get_size_from_byte_type (ply_utf8_character_byte_type_t byte_type); +ssize_t ply_utf8_character_get_size (const char *bytes); + +void ply_utf8_string_remove_last_character (char **string, + size_t *n); int ply_utf8_string_get_length (const char *string, size_t n); @@ -1654,28 +1654,17 @@ on_keyboard_input (state_t *state, static void on_backspace (state_t *state) { - ssize_t bytes_to_remove; - ssize_t previous_character_size; - const char *bytes; + char *bytes; size_t size; + size_t capacity; ply_list_node_t *node = ply_list_get_first_node (state->entry_triggers); if (!node) return; - 
bytes = ply_buffer_get_bytes (state->entry_buffer); - size = ply_buffer_get_size (state->entry_buffer); - if (size == 0) - return; - - bytes_to_remove = MIN (size, PLY_UTF8_CHARACTER_SIZE_MAX); - while ((previous_character_size = ply_utf8_character_get_size (bytes + size - bytes_to_remove, bytes_to_remove)) < bytes_to_remove) { - if (previous_character_size > 0) - bytes_to_remove -= previous_character_size; - else - bytes_to_remove--; + ply_buffer_borrow_bytes (state->entry_buffer, &bytes, &size, &capacity) { + ply_utf8_string_remove_last_character (&bytes, &size); } - ply_buffer_remove_bytes_at_end (state->entry_buffer, bytes_to_remove); update_display (state); } |