summary refs log tree commit diff
diff options
context:
space:
mode:
author Ray Strode <rstrode@redhat.com> 2023-12-10 10:18:30 -0500
committer Ray Strode <rstrode@redhat.com> 2023-12-10 22:13:58 -0500
commit 9969852ad97f943938e5e6e19496446526a9f3cb (patch)
tree cc67d02d94259a50a9b639a5c7d0038f88b58f53
parent 944f2210cda40a7c8063497c01deb93a055362b0 (diff)
utils: Rework UTF-8 handling
ply_utf8_character_get_size currently takes an odd trailing argument that is often just set to PLY_UTF8_CHARACTER_SIZE_MAX, and it returns magic values when it can't figure out the size because the byte isn't a leading byte or is otherwise not valid UTF-8. That nuance in the API makes the calling code hard to follow at a glance. This commit attempts to improve the situation by dropping the extra argument and adding a way to get the byte type separately from the size, for clarity. At the same time, this commit updates all the callers to use the new API. In two cases the callers are trying to remove the last character from a UTF-8 string, so this commit also adds a new function to consolidate that logic.
-rw-r--r-- src/libply-splash-core/ply-keyboard.c 33
-rw-r--r-- src/libply/ply-utils.c 119
-rw-r--r-- src/libply/ply-utils.h 22
-rw-r--r-- src/main.c 19
4 files changed, 136 insertions, 57 deletions
diff --git a/src/libply-splash-core/ply-keyboard.c b/src/libply-splash-core/ply-keyboard.c
index 128d3ced..ab375f37 100644
--- a/src/libply-splash-core/ply-keyboard.c
+++ b/src/libply-splash-core/ply-keyboard.c
@@ -153,26 +153,15 @@ ply_keyboard_new_for_renderer (ply_renderer_t *renderer)
static void
process_backspace (ply_keyboard_t *keyboard)
{
- size_t bytes_to_remove;
- ssize_t previous_character_size;
- const char *bytes;
+ char *bytes;
size_t size;
+ size_t capacity;
ply_list_node_t *node;
- bytes = ply_buffer_get_bytes (keyboard->line_buffer);
- size = ply_buffer_get_size (keyboard->line_buffer);
-
- bytes_to_remove = MIN (size, PLY_UTF8_CHARACTER_SIZE_MAX);
- while ((previous_character_size = ply_utf8_character_get_size (bytes + size - bytes_to_remove, bytes_to_remove)) < (ssize_t) bytes_to_remove) {
- if (previous_character_size > 0)
- bytes_to_remove -= previous_character_size;
- else
- bytes_to_remove--;
+ ply_buffer_borrow_bytes (keyboard->line_buffer, &bytes, &size, &capacity) {
+ ply_utf8_string_remove_last_character (&bytes, &size);
}
- if (bytes_to_remove <= size)
- ply_buffer_remove_bytes_at_end (keyboard->line_buffer, bytes_to_remove);
-
for (node = ply_list_get_first_node (keyboard->backspace_handler_list);
node; node = ply_list_get_next_node (keyboard->backspace_handler_list, node)) {
ply_keyboard_closure_t *closure = ply_list_node_get_data (node);
@@ -277,6 +266,7 @@ on_key_event (ply_keyboard_t *keyboard,
i = 0;
while (i < size) {
+ ply_utf8_character_byte_type_t character_byte_type;
ssize_t character_size;
char *keyboard_input;
size_t bytes_left = size - i;
@@ -318,18 +308,23 @@ on_key_event (ply_keyboard_t *keyboard,
continue;
}
- character_size = (ssize_t) ply_utf8_character_get_size (bytes + i, bytes_left);
+ character_byte_type = ply_utf8_character_get_byte_type (bytes[i]);
- if (character_size < 0)
+ if (PLY_UTF8_CHARACTER_BYTE_TYPE_IS_NOT_LEADING (character_byte_type))
break;
/* If we're at a NUL character walk through it
*/
- if (character_size == 0) {
+ if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING) {
i++;
continue;
}
+ character_size = ply_utf8_character_get_size_from_byte_type (character_byte_type);
+
+ if (character_size > bytes_left)
+ break;
+
keyboard_input = strndup (bytes + i, character_size);
process_keyboard_input (keyboard, keyboard_input, character_size);
@@ -665,4 +660,4 @@ ply_keyboard_get_capslock_state (ply_keyboard_t *keyboard)
}
return NULL;
-}
\ No newline at end of file
+}
diff --git a/src/libply/ply-utils.c b/src/libply/ply-utils.c
index c5b0847e..95b505b1 100644
--- a/src/libply/ply-utils.c
+++ b/src/libply/ply-utils.c
@@ -742,21 +742,93 @@ ply_detach_daemon (ply_daemon_handle_t *handle,
* 11100000-11101111 E0-EF Start of 3-byte sequence
* 11110000-11110100 F0-F4 Start of 4-byte sequence
*/
-int
-ply_utf8_character_get_size (const char *string,
- size_t n)
+ply_utf8_character_byte_type_t
+ply_utf8_character_get_byte_type (const char byte)
+{
+ ply_utf8_character_byte_type_t byte_type;
+
+ if (byte == '\0')
+ byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING;
+ else if ((byte & 0x80) == 0x00)
+ byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE;
+ else if ((byte & 0xE0) == 0xC0)
+ byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES;
+ else if ((byte & 0xF0) == 0xE0)
+ byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES;
+ else if ((byte & 0xF8) == 0xF0)
+ byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES;
+ else
+ byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION;
+
+ return byte_type;
+}
+
+ssize_t
+ply_utf8_character_get_size_from_byte_type (ply_utf8_character_byte_type_t byte_type)
+{
+ size_t size;
+
+ switch (byte_type) {
+ case PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE:
+ size = 1;
+ break;
+ case PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES:
+ size = 2;
+ break;
+ case PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES:
+ size = 3;
+ break;
+ case PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES:
+ size = 4;
+ break;
+ case PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION:
+ case PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID:
+ case PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING:
+ size = 0;
+ break;
+ }
+ return size;
+}
+
+ssize_t
+ply_utf8_character_get_size (const char *bytes)
+{
+ ply_utf8_character_byte_type_t byte_type;
+ ssize_t size;
+
+ byte_type = ply_utf8_character_get_byte_type (bytes[0]);
+ size = ply_utf8_character_get_size_from_byte_type (byte_type);
+
+ return size;
+}
+
+void
+ply_utf8_string_remove_last_character (char **string,
+ size_t *size)
{
- int length;
-
- if (n < 1) return -1;
- if (string[0] == 0x00) length = 0;
- else if ((string[0] & 0x80) == 0x00) length = 1;
- else if ((string[0] & 0xE0) == 0xC0) length = 2;
- else if ((string[0] & 0xF0) == 0xE0) length = 3;
- else if ((string[0] & 0xF8) == 0xF0) length = 4;
- else return -2;
- if (length > (int) n) return -1;
- return length;
+ char *bytes = *string;
+ size_t size_in = *size, end_offset;
+
+ if (size_in == 0)
+ return;
+
+ end_offset = size_in - 1;
+ do {
+ ply_utf8_character_byte_type_t byte_type;
+
+ byte_type = ply_utf8_character_get_byte_type (bytes[end_offset]);
+
+ if (byte_type != PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION) {
+ memset (bytes + end_offset, '\0', size_in - end_offset);
+ *size = end_offset;
+ break;
+ }
+
+ if (end_offset == 0)
+ break;
+
+ end_offset--;
+ } while (true);
}
int
@@ -766,10 +838,16 @@ ply_utf8_string_get_length (const char *string,
size_t count = 0;
while (true) {
- int charlen = ply_utf8_character_get_size (string, n);
- if (charlen <= 0) break;
- string += charlen;
- n -= charlen;
+ size_t size = ply_utf8_character_get_size (string);
+
+ if (size == 0)
+ break;
+
+ if (size > n)
+ break;
+
+ string += size;
+ n -= size;
count++;
}
return count;
@@ -783,7 +861,7 @@ ply_utf8_string_get_byte_offset_from_character_offset (const char *string,
size_t i;
for (i = 0; i < character_offset && string[byte_offset] != '\0'; i++) {
- byte_offset += ply_utf8_character_get_size (string + byte_offset, PLY_UTF8_CHARACTER_SIZE_MAX);
+ byte_offset += ply_utf8_character_get_size (string + byte_offset);
}
return byte_offset;
@@ -818,8 +896,7 @@ ply_utf8_string_iterator_next (ply_utf8_string_iterator_t *iterator,
if (iterator->string[iterator->current_byte_offset] == '\0')
return false;
- size_of_current_character = ply_utf8_character_get_size (iterator->string + iterator->current_byte_offset,
- PLY_UTF8_CHARACTER_SIZE_MAX);
+ size_of_current_character = ply_utf8_character_get_size (iterator->string + iterator->current_byte_offset);
if (size_of_current_character == 0)
return false;
diff --git a/src/libply/ply-utils.h b/src/libply/ply-utils.h
index b99d2b23..7cbbb2f4 100644
--- a/src/libply/ply-utils.h
+++ b/src/libply/ply-utils.h
@@ -55,6 +55,20 @@ typedef enum
PLY_UNIX_SOCKET_TYPE_TRIMMED_ABSTRACT
} ply_unix_socket_type_t;
+typedef enum
+{
+ PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION = -2,
+ PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID = -1,
+ PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING = 0,
+ PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE = 1,
+ PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES = 2,
+ PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES = 3,
+ PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES = 4
+} ply_utf8_character_byte_type_t;
+
+#define PLY_UTF8_CHARACTER_BYTE_TYPE_IS_NOT_LEADING(t) ((t) == PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION)
+#define PLY_UTF8_CHARACTER_BYTE_TYPE_IS_MULTI_BYTE(t) (((t) == PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES))
+
typedef struct
{
const char *string;
@@ -120,8 +134,12 @@ ply_daemon_handle_t *ply_create_daemon (void);
bool ply_detach_daemon (ply_daemon_handle_t *handle,
int exit_code);
-int ply_utf8_character_get_size (const char *string,
- size_t n);
+ply_utf8_character_byte_type_t ply_utf8_character_get_byte_type (const char byte);
+ssize_t ply_utf8_character_get_size_from_byte_type (ply_utf8_character_byte_type_t byte_type);
+ssize_t ply_utf8_character_get_size (const char *bytes);
+
+void ply_utf8_string_remove_last_character (char **string,
+ size_t *n);
int ply_utf8_string_get_length (const char *string,
size_t n);
diff --git a/src/main.c b/src/main.c
index e301051b..09ca6854 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1654,28 +1654,17 @@ on_keyboard_input (state_t *state,
static void
on_backspace (state_t *state)
{
- ssize_t bytes_to_remove;
- ssize_t previous_character_size;
- const char *bytes;
+ char *bytes;
size_t size;
+ size_t capacity;
ply_list_node_t *node = ply_list_get_first_node (state->entry_triggers);
if (!node) return;
- bytes = ply_buffer_get_bytes (state->entry_buffer);
- size = ply_buffer_get_size (state->entry_buffer);
- if (size == 0)
- return;
-
- bytes_to_remove = MIN (size, PLY_UTF8_CHARACTER_SIZE_MAX);
- while ((previous_character_size = ply_utf8_character_get_size (bytes + size - bytes_to_remove, bytes_to_remove)) < bytes_to_remove) {
- if (previous_character_size > 0)
- bytes_to_remove -= previous_character_size;
- else
- bytes_to_remove--;
+ ply_buffer_borrow_bytes (state->entry_buffer, &bytes, &size, &capacity) {
+ ply_utf8_string_remove_last_character (&bytes, &size);
}
- ply_buffer_remove_bytes_at_end (state->entry_buffer, bytes_to_remove);
update_display (state);
}