diff options
Diffstat (limited to 'src/dfi-text-index.c')
-rw-r--r-- | src/dfi-text-index.c | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/src/dfi-text-index.c b/src/dfi-text-index.c new file mode 100644 index 0000000..19811ff --- /dev/null +++ b/src/dfi-text-index.c @@ -0,0 +1,253 @@ +/* + * Copyright © 2013 Canonical Limited + * + * update-desktop-database is free software; you can redistribute it + * and/or modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * update-desktop-database is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with update-desktop-database; see the file COPYING. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite + * 330, Boston, MA 02111-1307, USA. + * + * Author: Ryan Lortie <desrt@desrt.ca> + */ + +#include "dfi-text-index.h" + +#include "dfi-string-table.h" +#include "dfi-id-list.h" + +#include <string.h> + +typedef struct +{ + /* Our GSequence compare function treats DesktopFileIndexTextIndexItem + * as a subclass of 'string' for purposes of comparison. + * + * The string, therefore, must come first. + */ + gchar *token; + + GArray *id_list; +} DesktopFileIndexTextIndexItem; + +static gint +dfi_text_index_string_compare (gconstpointer a, + gconstpointer b, + gpointer user_data) +{ + /* As mentioned above: the pointers can equivalently be pointers to a + * 'DesktopFileIndexTextIndexItem' or to a 'gchar *'. + */ + const gchar * const *str_a = a; + const gchar * const *str_b = b; + + return strcmp (*str_a, *str_b); +} + +static DesktopFileIndexTextIndexItem * +dfi_text_index_item_new (const gchar *token) +{ + DesktopFileIndexTextIndexItem *item; + + item = g_slice_new (DesktopFileIndexTextIndexItem); + item->token = g_strdup (token); + item->id_list = dfi_id_list_new (); + + return item; +} + +static void +dfi_text_index_item_free (gpointer data) +{ + DesktopFileIndexTextIndexItem *item = data; + + dfi_id_list_free (item->id_list); + g_free (item->token); + + g_slice_free (DesktopFileIndexTextIndexItem, item); +} + +GSequence * +dfi_text_index_new (void) +{ + return g_sequence_new (dfi_text_index_item_free); +} + +void +dfi_text_index_free (gpointer data) +{ + g_sequence_free (data); +} + +void +dfi_text_index_add_ids (GSequence *text_index, + const gchar *token, + const guint16 *ids, + gint n_ids) +{ + DesktopFileIndexTextIndexItem *item; + GSequenceIter *iter; + + iter = g_sequence_lookup (text_index, &token, dfi_text_index_string_compare, NULL); + if (iter) + { + item = g_sequence_get (iter); + } + else + { + item = dfi_text_index_item_new (token); + g_sequence_insert_sorted (text_index, item, dfi_text_index_string_compare, NULL); + } + + dfi_id_list_add_ids (item->id_list, ids, n_ids); +} + +static void +dfi_text_index_add_folded (GPtrArray *array, + const gchar *start, + const gchar *end) +{ + gchar *normal; + + normal = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL_COMPOSE); + + /* TODO: Invent time machine. Converse with Mustafa Ataturk... */ + if (strstr (normal, "ı") || strstr (normal, "İ")) + { + gchar *s = normal; + GString *tmp; + + tmp = g_string_new (NULL); + + while (*s) + { + gchar *i, *I, *e; + + i = strstr (s, "ı"); + I = strstr (s, "İ"); + + if (!i && !I) + break; + else if (i && !I) + e = i; + else if (I && !i) + e = I; + else if (i < I) + e = i; + else + e = I; + + g_string_append_len (tmp, s, e - s); + g_string_append_c (tmp, 'i'); + s = g_utf8_next_char (e); + } + + g_string_append (tmp, s); + g_free (normal); + normal = g_string_free (tmp, FALSE); + } + + g_ptr_array_add (array, g_utf8_casefold (normal, -1)); + g_free (normal); +} + +static gchar ** +dfi_text_index_split_words (const gchar *value) +{ + const gchar *start = NULL; + GPtrArray *result; + const gchar *s; + + result = g_ptr_array_new (); + + for (s = value; *s; s = g_utf8_next_char (s)) + { + gunichar c = g_utf8_get_char (s); + + if (start == NULL) + { + if (g_unichar_isalnum (c)) + start = s; + } + else + { + if (!g_unichar_isalnum (c)) + { + dfi_text_index_add_folded (result, start, s); + start = NULL; + } + } + } + + if (start) + dfi_text_index_add_folded (result, start, s); + + g_ptr_array_add (result, NULL); + + return (gchar **) g_ptr_array_free (result, FALSE); +} + +void +dfi_text_index_add_ids_tokenised (GSequence *text_index, + const gchar *string_to_tokenise, + const guint16 *ids, + gint n_ids) +{ + gchar **tokens; + gint i; + + tokens = dfi_text_index_split_words (string_to_tokenise); + for (i = 0; tokens[i]; i++) + { + gint j; + + for (j = 0; j < i; j++) + if (g_str_equal (tokens[i], tokens[j])) + break; + + if (j < i) + continue; + + dfi_text_index_add_ids (text_index, tokens[i], ids, n_ids); + } + +} + +void +dfi_text_index_get_item (GSequenceIter *iter, + const gchar **token, + GArray **id_list) +{ + DesktopFileIndexTextIndexItem *item; + + item = g_sequence_get (iter); + + *token = item->token; + *id_list = item->id_list; +} + +void +dfi_text_index_populate_strings (GSequence *text_index, + DfiStringTable *string_table) +{ + GSequenceIter *iter; + + iter = g_sequence_get_begin_iter (text_index); + + while (!g_sequence_iter_is_end (iter)) + { + DesktopFileIndexTextIndexItem *item = g_sequence_get (iter); + + dfi_string_table_add_string (string_table, item->token); + + iter = g_sequence_iter_next (iter); + } +} |