summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--contrib/README9
-rw-r--r--contrib/harfbuzz-freetype.c149
-rw-r--r--contrib/harfbuzz-freetype.h9
-rw-r--r--contrib/harfbuzz-unicode-glib.c169
-rw-r--r--contrib/harfbuzz-unicode-tables.c84
-rw-r--r--contrib/harfbuzz-unicode.c264
-rw-r--r--contrib/harfbuzz-unicode.h54
-rw-r--r--contrib/tables/README17
-rw-r--r--contrib/tables/category-parse.py70
-rw-r--r--contrib/tables/combining-class-parse.py34
-rw-r--r--contrib/tables/grapheme-break-parse.py45
-rw-r--r--contrib/tables/scripts-parse.py75
-rw-r--r--contrib/tables/unicode_parse_common.py70
13 files changed, 1049 insertions, 0 deletions
diff --git a/contrib/README b/contrib/README
new file mode 100644
index 0000000..074cc52
--- /dev/null
+++ b/contrib/README
@@ -0,0 +1,9 @@
+Harfbuzz requires the host to define several functions that give it access to
+the platform's Unicode tables and related services.
+
+If you are building on top of Qt4 you should look at the code in the tests/
+directory for examples of how to hook up Qt4 functions to Harfbuzz.
+
+Otherwise, this directory contains examples of using downloaded Unicode tables
+and/or glib to host Harfbuzz. You should read the README file in tables/ for how
+to build the header files for some of the Unicode tables.
diff --git a/contrib/harfbuzz-freetype.c b/contrib/harfbuzz-freetype.c
new file mode 100644
index 0000000..a2962df
--- /dev/null
+++ b/contrib/harfbuzz-freetype.c
@@ -0,0 +1,149 @@
+#include <stdint.h>
+
+#include <ft2build.h>
+#include FT_FREETYPE_H
+#include FT_TRUETYPE_TABLES_H
+
+#if 0
+#include <freetype/freetype.h>
+#include <freetype/tttables.h>
+#endif
+
+#include <harfbuzz-shaper.h>
+#include "harfbuzz-unicode.h"
+
+// Convert a UTF-16 string into glyph indices using the FreeType face stored in
+// font->userData.
+//
+// chars/len: the UTF-16 input, @len words long
+// glyphs: (output) receives one glyph index per decoded code point
+// numGlyphs: (in/out) capacity of @glyphs on entry, glyphs written on exit
+// is_rtl: unused by this implementation
+//
+// returns: 0 if @glyphs has room for fewer than @len entries (nothing is
+//   written in that case), 1 otherwise.
+static HB_Bool
+hb_freetype_string_to_glyphs(HB_Font font,
+                             const HB_UChar16 *chars, hb_uint32 len,
+                             HB_Glyph *glyphs, hb_uint32 *numGlyphs,
+                             HB_Bool is_rtl) {
+  FT_Face face = (FT_Face) font->userData;
+  if (len > *numGlyphs)
+    return 0;
+
+  // utf16_to_code_point() advances a ssize_t iterator; handing it the address
+  // of a size_t is an incompatible-pointer conversion, so use the right type.
+  ssize_t i = 0;
+  hb_uint32 j = 0;
+  while (i < (ssize_t) len) {
+    // A malformed surrogate decodes to HB_InvalidCodePoint, which is looked up
+    // like any other value.
+    const uint32_t cp = utf16_to_code_point(chars, len, &i);
+    glyphs[j++] = FT_Get_Char_Index(face, cp);
+  }
+
+  // Surrogate pairs collapse to one glyph, so j may be smaller than len.
+  *numGlyphs = j;
+
+  return 1;
+}
+
+// Store the horizontal advance of each of the @len glyphs in @advances.
+// A glyph that FreeType fails to load gets an advance of 0. @flags is not
+// consulted; every glyph is loaded with FT_LOAD_DEFAULT.
+static void
+hb_freetype_advances_get(HB_Font font, const HB_Glyph *glyphs, hb_uint32 len,
+                         HB_Fixed *advances, int flags) {
+  FT_Face face = (FT_Face) font->userData;
+
+  for (hb_uint32 idx = 0; idx < len; ++idx) {
+    if (FT_Load_Glyph(face, glyphs[idx], FT_LOAD_DEFAULT) != 0)
+      advances[idx] = 0;
+    else
+      advances[idx] = face->glyph->advance.x;
+  }
+}
+
+// Report whether the face in font->userData has a glyph for every code point
+// of the UTF-16 string @chars (@len words).
+//
+// returns: 0 as soon as FT_Get_Char_Index() yields glyph index 0 for a code
+//   point, 1 if every code point maps to a non-zero glyph index.
+static HB_Bool
+hb_freetype_can_render(HB_Font font, const HB_UChar16 *chars, hb_uint32 len) {
+  FT_Face face = (FT_Face)font->userData;
+
+  // The iterator must be a ssize_t to match utf16_to_code_point()'s prototype;
+  // passing the address of a size_t is an incompatible-pointer conversion.
+  ssize_t i = 0;
+  while (i < (ssize_t) len) {
+    const uint32_t cp = utf16_to_code_point(chars, len, &i);
+    if (FT_Get_Char_Index(face, cp) == 0)
+      return 0;
+  }
+
+  return 1;
+}
+
+// Fetch the coordinates of one point of a glyph's outline.
+//
+// glyph: the glyph index to load
+// flags: if HB_ShaperFlag_UseDesignMetrics is set the glyph is loaded with
+//   FT_LOAD_NO_HINTING, otherwise with FT_LOAD_DEFAULT
+// point: index of the requested outline point
+// xpos, ypos: (output) coordinates of the point; left untouched when the
+//   outline has no points or an error is returned
+// n_points: (output) number of points in the glyph's outline
+//
+// returns: the FreeType load error cast to HB_Error if loading fails,
+//   HB_Err_Invalid_SubTable if the glyph is not an outline or @point is out of
+//   range, and HB_Err_Ok otherwise.
+static HB_Error
+hb_freetype_outline_point_get(HB_Font font, HB_Glyph glyph, int flags,
+                              hb_uint32 point, HB_Fixed *xpos, HB_Fixed *ypos,
+                              hb_uint32 *n_points) {
+  HB_Error error = HB_Err_Ok;
+  FT_Face face = (FT_Face) font->userData;
+
+  int load_flags = (flags & HB_ShaperFlag_UseDesignMetrics) ? FT_LOAD_NO_HINTING : FT_LOAD_DEFAULT;
+
+  if ((error = (HB_Error) FT_Load_Glyph(face, glyph, load_flags)))
+    return error;
+
+  if (face->glyph->format != ft_glyph_format_outline)
+    return (HB_Error)HB_Err_Invalid_SubTable;
+
+  *n_points = face->glyph->outline.n_points;
+  if (!(*n_points))
+    return HB_Err_Ok;
+
+  // Valid indices are 0 .. n_points - 1. The previous `>` test let
+  // point == n_points through and read one element past the points array.
+  if (point >= *n_points)
+    return (HB_Error)HB_Err_Invalid_SubTable;
+
+  *xpos = face->glyph->outline.points[point].x;
+  *ypos = face->glyph->outline.points[point].y;
+
+  return HB_Err_Ok;
+}
+
+// Copy FreeType's metrics for @glyph into @metrics. The glyph is loaded with
+// FT_LOAD_DEFAULT; if loading fails every field of @metrics is set to 0.
+static void
+hb_freetype_glyph_metrics_get(HB_Font font, HB_Glyph glyph,
+                              HB_GlyphMetrics *metrics) {
+  FT_Face face = (FT_Face) font->userData;
+
+  if (FT_Load_Glyph(face, glyph, FT_LOAD_DEFAULT)) {
+    // Nothing was loaded: report an all-zero box.
+    metrics->x = 0;
+    metrics->y = 0;
+    metrics->width = 0;
+    metrics->height = 0;
+    metrics->xOffset = 0;
+    metrics->yOffset = 0;
+    return;
+  }
+
+  const FT_Glyph_Metrics *loaded = &face->glyph->metrics;
+  metrics->x = loaded->horiAdvance;
+  metrics->y = 0;  // unclear what this is
+  metrics->width = loaded->width;
+  metrics->height = loaded->height;
+  metrics->xOffset = loaded->horiBearingX;
+  metrics->yOffset = loaded->horiBearingY;
+}
+
+// Return a font-wide metric for the face in font->userData. Only
+// HB_FontAscent is supported; any other metric yields 0.
+static HB_Fixed
+hb_freetype_font_metric_get(HB_Font font, HB_FontMetric metric) {
+  FT_Face face = (FT_Face) font->userData;
+
+  if (metric == HB_FontAscent) {
+    // Note that we aren't scanning the VDMX table which we probably would in
+    // an ideal world.
+    return face->ascender;
+  }
+
+  return 0;
+}
+
+// Callback table that exposes the FreeType-backed functions defined above as
+// an HB_FontClass. Each callback reads the FT_Face from font->userData.
+const HB_FontClass hb_freetype_class = {
+ hb_freetype_string_to_glyphs,
+ hb_freetype_advances_get,
+ hb_freetype_can_render,
+ hb_freetype_outline_point_get,
+ hb_freetype_glyph_metrics_get,
+ hb_freetype_font_metric_get,
+};
+
+// Load the SFNT table @tag from an FT_Face passed as an opaque pointer.
+//
+// voidface: the FT_Face to read from
+// buffer: destination handed straight to FT_Load_Sfnt_Table()
+// len: (in/out) passed to FreeType as the length and overwritten with the
+//   length FreeType reports back
+//
+// returns: HB_Err_Invalid_Argument if the face is not SFNT-based, otherwise
+//   FreeType's error code cast to HB_Error.
+HB_Error
+hb_freetype_table_sfnt_get(void *voidface, const HB_Tag tag, HB_Byte *buffer, HB_UInt *len) {
+  FT_Face face = (FT_Face) voidface;
+  FT_ULong table_len = *len;
+
+  if (!FT_IS_SFNT(face))
+    return HB_Err_Invalid_Argument;
+
+  // Offset 0: always read from the start of the table.
+  const FT_Error status = FT_Load_Sfnt_Table(face, tag, 0, buffer, &table_len);
+  *len = table_len;
+  return (HB_Error) status;
+}
diff --git a/contrib/harfbuzz-freetype.h b/contrib/harfbuzz-freetype.h
new file mode 100644
index 0000000..628be16
--- /dev/null
+++ b/contrib/harfbuzz-freetype.h
@@ -0,0 +1,9 @@
+#ifndef HB_FREETYPE_H_
+#define HB_FREETYPE_H_
+
+extern const HB_FontClass hb_freetype_class;
+
+HB_Error hb_freetype_table_sfnt_get(void *voidface, const HB_Tag tag,
+ HB_Byte *buffer, HB_UInt *len);
+
+#endif // HB_FREETYPE_H_
diff --git a/contrib/harfbuzz-unicode-glib.c b/contrib/harfbuzz-unicode-glib.c
new file mode 100644
index 0000000..6a13433
--- /dev/null
+++ b/contrib/harfbuzz-unicode-glib.c
@@ -0,0 +1,169 @@
+#include "harfbuzz-external.h"
+
+#include <glib.h>
+
+// Translate the type glib's g_unichar_type() reports for @ch into the
+// corresponding HB_* character category constant. Any type without an explicit
+// case below is reported as HB_Symbol_Other.
+static int
+hb_category_for_char(HB_UChar32 ch) {
+ switch (g_unichar_type(ch)) {
+ case G_UNICODE_CONTROL:
+ return HB_Other_Control;
+ case G_UNICODE_FORMAT:
+ return HB_Other_Format;
+ case G_UNICODE_UNASSIGNED:
+ return HB_Other_NotAssigned;
+ case G_UNICODE_PRIVATE_USE:
+ return HB_Other_PrivateUse;
+ case G_UNICODE_SURROGATE:
+ return HB_Other_Surrogate;
+ case G_UNICODE_LOWERCASE_LETTER:
+ return HB_Letter_Lowercase;
+ case G_UNICODE_MODIFIER_LETTER:
+ return HB_Letter_Modifier;
+ case G_UNICODE_OTHER_LETTER:
+ return HB_Letter_Other;
+ case G_UNICODE_TITLECASE_LETTER:
+ return HB_Letter_Titlecase;
+ case G_UNICODE_UPPERCASE_LETTER:
+ return HB_Letter_Uppercase;
+ case G_UNICODE_COMBINING_MARK:
+ return HB_Mark_SpacingCombining;
+ case G_UNICODE_ENCLOSING_MARK:
+ return HB_Mark_Enclosing;
+ case G_UNICODE_NON_SPACING_MARK:
+ return HB_Mark_NonSpacing;
+ case G_UNICODE_DECIMAL_NUMBER:
+ return HB_Number_DecimalDigit;
+ case G_UNICODE_LETTER_NUMBER:
+ return HB_Number_Letter;
+ case G_UNICODE_OTHER_NUMBER:
+ return HB_Number_Other;
+ case G_UNICODE_CONNECT_PUNCTUATION:
+ return HB_Punctuation_Connector;
+ case G_UNICODE_DASH_PUNCTUATION:
+ return HB_Punctuation_Dash;
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ return HB_Punctuation_Close;
+ case G_UNICODE_FINAL_PUNCTUATION:
+ return HB_Punctuation_FinalQuote;
+ case G_UNICODE_INITIAL_PUNCTUATION:
+ return HB_Punctuation_InitialQuote;
+ case G_UNICODE_OTHER_PUNCTUATION:
+ return HB_Punctuation_Other;
+ case G_UNICODE_OPEN_PUNCTUATION:
+ return HB_Punctuation_Open;
+ case G_UNICODE_CURRENCY_SYMBOL:
+ return HB_Symbol_Currency;
+ case G_UNICODE_MODIFIER_SYMBOL:
+ return HB_Symbol_Modifier;
+ case G_UNICODE_MATH_SYMBOL:
+ return HB_Symbol_Math;
+ case G_UNICODE_OTHER_SYMBOL:
+ return HB_Symbol_Other;
+ case G_UNICODE_LINE_SEPARATOR:
+ return HB_Separator_Line;
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ return HB_Separator_Paragraph;
+ case G_UNICODE_SPACE_SEPARATOR:
+ return HB_Separator_Space;
+ default:
+ return HB_Symbol_Other;
+ }
+}
+
+// Translate the break type glib's g_unichar_break_type() reports for @ch into
+// an HB_LineBreak_* class. The CONTINGENT, AMBIGUOUS, UNKNOWN and NEXT_LINE
+// break types, and any type without an explicit case, are all reported as
+// HB_LineBreak_AL.
+HB_LineBreakClass
+HB_GetLineBreakClass(HB_UChar32 ch) {
+ switch (g_unichar_break_type(ch)) {
+ case G_UNICODE_BREAK_MANDATORY:
+ return HB_LineBreak_BK;
+ case G_UNICODE_BREAK_CARRIAGE_RETURN:
+ return HB_LineBreak_CR;
+ case G_UNICODE_BREAK_LINE_FEED:
+ return HB_LineBreak_LF;
+ case G_UNICODE_BREAK_COMBINING_MARK:
+ return HB_LineBreak_CM;
+ case G_UNICODE_BREAK_SURROGATE:
+ return HB_LineBreak_SG;
+ case G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
+ return HB_LineBreak_ZW;
+ case G_UNICODE_BREAK_INSEPARABLE:
+ return HB_LineBreak_IN;
+ case G_UNICODE_BREAK_NON_BREAKING_GLUE:
+ return HB_LineBreak_GL;
+ case G_UNICODE_BREAK_CONTINGENT:
+ return HB_LineBreak_AL;
+ case G_UNICODE_BREAK_SPACE:
+ return HB_LineBreak_SP;
+ case G_UNICODE_BREAK_AFTER:
+ return HB_LineBreak_BA;
+ case G_UNICODE_BREAK_BEFORE:
+ return HB_LineBreak_BB;
+ case G_UNICODE_BREAK_BEFORE_AND_AFTER:
+ return HB_LineBreak_B2;
+ case G_UNICODE_BREAK_HYPHEN:
+ return HB_LineBreak_HY;
+ case G_UNICODE_BREAK_NON_STARTER:
+ return HB_LineBreak_NS;
+ case G_UNICODE_BREAK_OPEN_PUNCTUATION:
+ return HB_LineBreak_OP;
+ case G_UNICODE_BREAK_CLOSE_PUNCTUATION:
+ return HB_LineBreak_CL;
+ case G_UNICODE_BREAK_QUOTATION:
+ return HB_LineBreak_QU;
+ case G_UNICODE_BREAK_EXCLAMATION:
+ return HB_LineBreak_EX;
+ case G_UNICODE_BREAK_IDEOGRAPHIC:
+ return HB_LineBreak_ID;
+ case G_UNICODE_BREAK_NUMERIC:
+ return HB_LineBreak_NU;
+ case G_UNICODE_BREAK_INFIX_SEPARATOR:
+ return HB_LineBreak_IS;
+ case G_UNICODE_BREAK_SYMBOL:
+ return HB_LineBreak_SY;
+ case G_UNICODE_BREAK_ALPHABETIC:
+ return HB_LineBreak_AL;
+ case G_UNICODE_BREAK_PREFIX:
+ return HB_LineBreak_PR;
+ case G_UNICODE_BREAK_POSTFIX:
+ return HB_LineBreak_PO;
+ case G_UNICODE_BREAK_COMPLEX_CONTEXT:
+ return HB_LineBreak_SA;
+ case G_UNICODE_BREAK_AMBIGUOUS:
+ return HB_LineBreak_AL;
+ case G_UNICODE_BREAK_UNKNOWN:
+ return HB_LineBreak_AL;
+ case G_UNICODE_BREAK_NEXT_LINE:
+ return HB_LineBreak_AL;
+ case G_UNICODE_BREAK_WORD_JOINER:
+ return HB_LineBreak_WJ;
+ case G_UNICODE_BREAK_HANGUL_L_JAMO:
+ return HB_LineBreak_JL;
+ case G_UNICODE_BREAK_HANGUL_V_JAMO:
+ return HB_LineBreak_JV;
+ case G_UNICODE_BREAK_HANGUL_T_JAMO:
+ return HB_LineBreak_JT;
+ case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
+ return HB_LineBreak_H2;
+ case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
+ return HB_LineBreak_H3;
+ default:
+ return HB_LineBreak_AL;
+ }
+}
+
+// Return the combining class of @ch as reported by glib's
+// g_unichar_combining_class().
+int
+HB_GetUnicodeCharCombiningClass(HB_UChar32 ch) {
+ return g_unichar_combining_class(ch);
+}
+
+// Look up both the category and the combining class of @ch in one call.
+// category: (output) result of hb_category_for_char()
+// combiningClass: (output) result of g_unichar_combining_class()
+void
+HB_GetUnicodeCharProperties(HB_UChar32 ch,
+ HB_CharCategory *category,
+ int *combiningClass) {
+ *category = hb_category_for_char(ch);
+ *combiningClass = g_unichar_combining_class(ch);
+}
+
+// Return the category of @ch; a thin public wrapper around
+// hb_category_for_char().
+HB_CharCategory
+HB_GetUnicodeCharCategory(HB_UChar32 ch) {
+ return hb_category_for_char(ch);
+}
diff --git a/contrib/harfbuzz-unicode-tables.c b/contrib/harfbuzz-unicode-tables.c
new file mode 100644
index 0000000..3c3fead
--- /dev/null
+++ b/contrib/harfbuzz-unicode-tables.c
@@ -0,0 +1,84 @@
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <harfbuzz-external.h>
+
+#include "tables/category-properties.h"
+#include "tables/combining-properties.h"
+
+// Not implemented for the table-based host: calling this aborts the process.
+// The return statement after abort() is never reached.
+HB_LineBreakClass
+HB_GetLineBreakClass(HB_UChar32 ch) {
+ abort();
+ return 0;
+}
+
+// bsearch() comparator for the combining_properties table. The "key" is not a
+// real pointer: it is the code point itself, smuggled through the void *.
+// Returns -1 / 1 when the code point lies below / above the candidate's
+// [range_start, range_end] interval, and 0 when it falls inside it.
+static int
+combining_property_cmp(const void *vkey, const void *vcandidate) {
+  const struct combining_property *entry = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < entry->range_start)
+    return -1;
+  return (cp > entry->range_end) ? 1 : 0;
+}
+
+// Binary-search the combining_properties table for @cp and return the class
+// of the range containing it, or 0 when no range matches.
+static int
+code_point_to_combining_class(HB_UChar32 cp) {
+  // The code point travels to the comparator inside the key pointer.
+  const struct combining_property *match =
+      bsearch((void *) (intptr_t) cp, combining_properties,
+              combining_properties_count, sizeof(combining_properties[0]),
+              combining_property_cmp);
+
+  return match ? match->klass : 0;
+}
+
+// Return the combining class of @ch from the generated Unicode tables
+// (0 when the code point is not listed).
+int
+HB_GetUnicodeCharCombiningClass(HB_UChar32 ch) {
+  // The trailing `return 0;` that used to follow was unreachable dead code.
+  return code_point_to_combining_class(ch);
+}
+
+// bsearch() comparator for the category_properties table. The "key" is the
+// code point itself carried in the void *, not a real pointer. Returns -1 / 1
+// when the code point lies below / above the candidate's range and 0 when it
+// is inside [range_start, range_end].
+static int
+category_property_cmp(const void *vkey, const void *vcandidate) {
+  const struct category_property *entry = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < entry->range_start)
+    return -1;
+  return (cp > entry->range_end) ? 1 : 0;
+}
+
+// Binary-search the category_properties table for @cp and return the category
+// of the range containing it, or HB_NoCategory when no range matches.
+static HB_CharCategory
+code_point_to_category(HB_UChar32 cp) {
+  // The code point travels to the comparator inside the key pointer.
+  const struct category_property *match =
+      bsearch((void *) (intptr_t) cp, category_properties,
+              category_properties_count, sizeof(category_properties[0]),
+              category_property_cmp);
+
+  if (match)
+    return match->category;
+  return HB_NoCategory;
+}
+
+// Look up both the category and the combining class of @ch in the generated
+// tables.
+// category: (output) result of code_point_to_category()
+// combiningClass: (output) result of code_point_to_combining_class()
+void
+HB_GetUnicodeCharProperties(HB_UChar32 ch,
+ HB_CharCategory *category,
+ int *combiningClass) {
+ *category = code_point_to_category(ch);
+ *combiningClass = code_point_to_combining_class(ch);
+}
+
+// Return the category of @ch; a thin public wrapper around
+// code_point_to_category().
+HB_CharCategory
+HB_GetUnicodeCharCategory(HB_UChar32 ch) {
+ return code_point_to_category(ch);
+}
diff --git a/contrib/harfbuzz-unicode.c b/contrib/harfbuzz-unicode.c
new file mode 100644
index 0000000..9b3c43e
--- /dev/null
+++ b/contrib/harfbuzz-unicode.c
@@ -0,0 +1,264 @@
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <harfbuzz-external.h>
+#include <harfbuzz-impl.h>
+#include <harfbuzz-shaper.h>
+#include "harfbuzz-unicode.h"
+
+#include "tables/script-properties.h"
+#include "tables/grapheme-break-properties.h"
+
+// Decode one code point from @chars starting at index *iter, moving *iter
+// forward past every word that was read (also on failure).
+//
+// returns: the decoded code point, or HB_InvalidCodePoint for a lone low
+//   surrogate, a high surrogate at the very end of the input, or a high
+//   surrogate that is not followed by a low surrogate.
+uint32_t
+utf16_to_code_point(const uint16_t *chars, size_t len, ssize_t *iter) {
+  const uint16_t lead = chars[(*iter)++];
+
+  if (!HB_IsHighSurrogate(lead)) {
+    // A lone low surrogate isn't a valid code point; anything else stands
+    // for itself.
+    return HB_IsLowSurrogate(lead) ? HB_InvalidCodePoint : lead;
+  }
+
+  // High surrogate: a second word is required to complete the pair.
+  if (*iter >= len)
+    return HB_InvalidCodePoint;
+
+  const uint16_t trail = chars[(*iter)++];
+  if (!HB_IsLowSurrogate(trail))
+    return HB_InvalidCodePoint;
+
+  return HB_SurrogateToUcs4(lead, trail);
+}
+
+// Decode one code point from @chars ending at index *iter, walking backwards:
+// *iter is moved down past every word that was read (also on failure) and
+// becomes negative once the start of the string has been consumed.
+//
+// returns: the decoded code point, or HB_InvalidCodePoint for a lone high
+//   surrogate, a low surrogate at index 0, or a low surrogate that is not
+//   preceded by a high surrogate.
+uint32_t
+utf16_to_code_point_prev(const uint16_t *chars, size_t len, ssize_t *iter) {
+  const uint16_t trail = chars[(*iter)--];
+
+  if (!HB_IsLowSurrogate(trail)) {
+    // A lone high surrogate isn't a valid code point; anything else stands
+    // for itself.
+    return HB_IsHighSurrogate(trail) ? HB_InvalidCodePoint : trail;
+  }
+
+  // Low surrogate: the preceding word must complete the pair.
+  if (*iter < 0)
+    return HB_InvalidCodePoint;
+
+  const uint16_t lead = chars[(*iter)--];
+  if (!HB_IsHighSurrogate(lead))
+    return HB_InvalidCodePoint;
+
+  return HB_SurrogateToUcs4(lead, trail);
+}
+
+// bsearch() comparator for the script_properties table. The "key" is the code
+// point itself carried in the void *, not a real pointer. Returns -1 / 1 when
+// the code point lies below / above the candidate's range and 0 when it is
+// inside [range_start, range_end].
+static int
+script_property_cmp(const void *vkey, const void *vcandidate) {
+  const struct script_property *entry = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < entry->range_start)
+    return -1;
+  return (cp > entry->range_end) ? 1 : 0;
+}
+
+// Binary-search the script_properties table for @cp and return the script of
+// the range containing it, or HB_Script_Common when no range matches.
+HB_Script
+code_point_to_script(uint32_t cp) {
+  // The code point travels to the comparator inside the key pointer.
+  const struct script_property *match =
+      bsearch((void *) (intptr_t) cp, script_properties,
+              script_properties_count, sizeof(script_properties[0]),
+              script_property_cmp);
+
+  if (match)
+    return match->script;
+  return HB_Script_Common;
+}
+
+// Find the script run that starts at index *iter of the UTF-16 string @chars.
+//
+// num_code_points: (output, may be NULL) number of code points in the run
+// output: (output) @pos, @length and @script are filled in on success; @pos
+//   and @script may already have been written when 0 is returned
+// len: length of @chars in words
+// iter: (in/out) index of the first word to look at; on success it is left
+//   on the first word after the run
+//
+// returns: 0 if *iter is already at the end of the string or an invalid
+//   surrogate sequence is met, 1 if a run was found.
+char
+hb_utf16_script_run_next(unsigned *num_code_points, HB_ScriptItem *output,
+                         const uint16_t *chars, size_t len, ssize_t *iter) {
+  if ((size_t) *iter == len)
+    return 0;
+
+  output->pos = *iter;
+  const uint32_t init_cp = utf16_to_code_point(chars, len, iter);
+  unsigned cps = 1;
+  if (init_cp == HB_InvalidCodePoint)
+    return 0;
+  const HB_Script init_script = code_point_to_script(init_cp);
+  HB_Script current_script = init_script;
+  output->script = init_script;
+
+  for (;;) {
+    if ((size_t) *iter == len)
+      break;
+    const ssize_t prev_iter = *iter;
+    const uint32_t cp = utf16_to_code_point(chars, len, iter);
+    if (cp == HB_InvalidCodePoint)
+      return 0;
+    cps++;
+    const HB_Script script = code_point_to_script(cp);
+
+    if (script != current_script) {
+      // C has no chained comparison: `a == b == c` compares the 0/1 result of
+      // `a == b` with c. Both operands must be tested against Inherited
+      // explicitly.
+      if (current_script == HB_Script_Inherited &&
+          init_script == HB_Script_Inherited) {
+        // If we started off as inherited, we take whatever we can find.
+        output->script = script;
+        current_script = script;
+        continue;
+      } else if (script == HB_Script_Inherited) {
+        current_script = script;
+        continue;
+      } else {
+        // A different script starts here: un-read this code point and stop.
+        *iter = prev_iter;
+        cps--;
+        break;
+      }
+    }
+  }
+
+  // A run made only of inherited code points is reported as Common.
+  if (output->script == HB_Script_Inherited)
+    output->script = HB_Script_Common;
+
+  output->length = *iter - output->pos;
+  if (num_code_points)
+    *num_code_points = cps;
+  return 1;
+}
+
+// Find the script run that ends at index *iter of the UTF-16 string @chars,
+// scanning backwards.
+//
+// num_code_points: (output, may be NULL) number of code points in the run
+// output: (output) @pos, @length and @script are filled in on success;
+//   @script may already have been written when 0 is returned
+// len: length of @chars in words
+// iter: (in/out) index of the last word to look at (|len| - 1 on the first
+//   call); on success it is left on the last word before the run, or at -1
+//   once the start of the string has been consumed
+//
+// returns: 0 if *iter is negative or an invalid surrogate sequence is met,
+//   1 if a run was found.
+char
+hb_utf16_script_run_prev(unsigned *num_code_points, HB_ScriptItem *output,
+                         const uint16_t *chars, size_t len, ssize_t *iter) {
+  // The iterator is signed, so test its sign directly instead of comparing it
+  // with (size_t) -1.
+  if (*iter < 0)
+    return 0;
+
+  const size_t ending_index = *iter;
+  const uint32_t init_cp = utf16_to_code_point_prev(chars, len, iter);
+  unsigned cps = 1;
+  if (init_cp == HB_InvalidCodePoint)
+    return 0;
+  const HB_Script init_script = code_point_to_script(init_cp);
+  HB_Script current_script = init_script;
+  output->script = init_script;
+
+  for (;;) {
+    if (*iter < 0)
+      break;
+    const ssize_t prev_iter = *iter;
+    const uint32_t cp = utf16_to_code_point_prev(chars, len, iter);
+    if (cp == HB_InvalidCodePoint)
+      return 0;
+    cps++;
+    const HB_Script script = code_point_to_script(cp);
+
+    if (script != current_script) {
+      // C has no chained comparison: `a == b == c` compares the 0/1 result of
+      // `a == b` with c. Both operands must be tested against Inherited
+      // explicitly.
+      if (current_script == HB_Script_Inherited &&
+          init_script == HB_Script_Inherited) {
+        // If we started off as inherited, we take whatever we can find.
+        output->script = script;
+        current_script = script;
+        continue;
+      } else if (script == HB_Script_Inherited) {
+        current_script = script;
+        continue;
+      } else {
+        // A different script starts here: un-read this code point and stop.
+        *iter = prev_iter;
+        cps--;
+        break;
+      }
+    }
+  }
+
+  // A run made only of inherited code points is reported as Common.
+  if (output->script == HB_Script_Inherited)
+    output->script = HB_Script_Common;
+
+  // *iter now sits one word before the run.
+  output->pos = *iter + 1;
+  output->length = ending_index - *iter;
+  if (num_code_points)
+    *num_code_points = cps;
+  return 1;
+}
+
+// bsearch() comparator for the grapheme_break_properties table. The "key" is
+// the code point itself carried in the void *, not a real pointer. Returns
+// -1 / 1 when the code point lies below / above the candidate's range and 0
+// when it is inside [range_start, range_end].
+static int
+grapheme_break_property_cmp(const void *vkey, const void *vcandidate) {
+  const struct grapheme_break_property *entry = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < entry->range_start)
+    return -1;
+  return (cp > entry->range_end) ? 1 : 0;
+}
+
+// Binary-search the grapheme_break_properties table for @ch and return the
+// class of the range containing it, or HB_Grapheme_Other when no range
+// matches.
+HB_GraphemeClass
+HB_GetGraphemeClass(HB_UChar32 ch) {
+  // The code point travels to the comparator inside the key pointer.
+  const struct grapheme_break_property *match =
+      bsearch((void *) (intptr_t) ch, grapheme_break_properties,
+              grapheme_break_properties_count,
+              sizeof(grapheme_break_properties[0]),
+              grapheme_break_property_cmp);
+
+  if (match)
+    return match->klass;
+  return HB_Grapheme_Other;
+}
+
+// Not implemented in this host: calling it aborts the process. The return
+// statement after abort() is never reached.
+HB_WordClass
+HB_GetWordClass(HB_UChar32 ch) {
+ abort();
+ return 0;
+}
+
+// Not implemented in this host: calling it aborts the process. The return
+// statement after abort() is never reached.
+HB_SentenceClass
+HB_GetSentenceClass(HB_UChar32 ch) {
+ abort();
+ return 0;
+}
+
+// Look up the grapheme class and the line-break class of @ch in one call.
+// gclass: (output) result of HB_GetGraphemeClass()
+// breakclass: (output) result of HB_GetLineBreakClass(), which is defined in
+//   another file
+void
+HB_GetGraphemeAndLineBreakClass(HB_UChar32 ch, HB_GraphemeClass *gclass, HB_LineBreakClass *breakclass) {
+ *gclass = HB_GetGraphemeClass(ch);
+ *breakclass = HB_GetLineBreakClass(ch);
+}
+
+// Not implemented in this host: calling it aborts the process. The return
+// statement after abort() is never reached.
+HB_UChar16
+HB_GetMirroredChar(HB_UChar16 ch) {
+ abort();
+ return 0;
+}
+
+// Not implemented in this host: calling it aborts the process. The return
+// statement after abort() is never reached.
+void *
+HB_Library_Resolve(const char *library, const char *symbol) {
+ abort();
+ return NULL;
+}
+
+// Not implemented in this host: calling it aborts the process. The return
+// statement after abort() is never reached.
+void *
+HB_TextCodecForMib(int mib) {
+ abort();
+ return NULL;
+}
+
+// Not implemented in this host: calling it aborts the process. The return
+// statement after abort() is never reached.
+char *
+HB_TextCodec_ConvertFromUnicode(void *codec, const HB_UChar16 *unicode, hb_uint32 length, hb_uint32 *outputLength) {
+ abort();
+ return NULL;
+}
+
+// Not implemented in this host: calling it aborts the process.
+void
+HB_TextCodec_FreeResult(char *v) {
+ abort();
+}
diff --git a/contrib/harfbuzz-unicode.h b/contrib/harfbuzz-unicode.h
new file mode 100644
index 0000000..f28b3c3
--- /dev/null
+++ b/contrib/harfbuzz-unicode.h
@@ -0,0 +1,54 @@
+#ifndef SCRIPT_IDENTIFY_H_
+#define SCRIPT_IDENTIFY_H_
+
+#include <stdint.h>
+
+#include <harfbuzz-shaper.h>
+
+static const uint32_t HB_InvalidCodePoint = 0xffffffffu;
+
+// -----------------------------------------------------------------------------
+// Return the next Unicode code point from a UTF-16 vector
+// chars: a pointer to @len words
+// iter: (input/output) an index into @chars. This is updated.
+// returns: HB_InvalidCodePoint on error and the code point otherwise.
+// -----------------------------------------------------------------------------
+uint32_t utf16_to_code_point(const uint16_t *chars, size_t len, ssize_t *iter);
+
+// -----------------------------------------------------------------------------
+// Like utf16_to_code_point(), except that the code points are traversed
+// backwards. Thus, on the first call, |iter| should be |len| - 1.
+//   chars: a pointer to @len words
+//   iter: (input/output) an index into @chars. This is updated.
+//   returns: HB_InvalidCodePoint on error and the code point otherwise.
+// -----------------------------------------------------------------------------
+uint32_t utf16_to_code_point_prev(const uint16_t *chars, size_t len, ssize_t *iter);
+
+// -----------------------------------------------------------------------------
+// Return the script of the given code point
+// -----------------------------------------------------------------------------
+HB_Script code_point_to_script(uint32_t cp);
+
+// -----------------------------------------------------------------------------
+// Find the next script run in a UTF-16 string.
+//
+// A script run is a subvector of codepoints, all of which are in the same
+// script. A run will never cut a surrogate pair in half at either end.
+//
+// num_code_points: (output, maybe NULL) the number of code points in the run
+// output: (output) the @pos, @length and @script fields are set on success
+// chars: the UTF-16 string
+// len: the length of @chars, in words
+// iter: (in/out) the current index into the string. This should be 0 for the
+// first call and is updated on exit.
+//
+// returns: non-zero if a script run was found and returned.
+// -----------------------------------------------------------------------------
+char hb_utf16_script_run_next(unsigned *num_code_points, HB_ScriptItem *output,
+ const uint16_t *chars, size_t len, ssize_t *iter);
+
+// -----------------------------------------------------------------------------
+// This is the same as above, except that the input is traversed backwards.
+// Thus, on the first call, |iter| should be |len| - 1.
+// -----------------------------------------------------------------------------
+char hb_utf16_script_run_prev(unsigned *num_code_points, HB_ScriptItem *output,
+ const uint16_t *chars, size_t len, ssize_t *iter);
+
+#endif
diff --git a/contrib/tables/README b/contrib/tables/README
new file mode 100644
index 0000000..605d1c0
--- /dev/null
+++ b/contrib/tables/README
@@ -0,0 +1,17 @@
+This directory contains Python scripts that parse several of the Unicode tables
+that are downloadable from the web and generate C header files from them.
+
+These are the locations of the files which are parsed. You should download these
+files and put them in this directory.
+
+http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedGeneralCategory.txt
+http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedCombiningClass.txt
+http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
+http://www.unicode.org/Public/5.1.0/ucd/Scripts.txt
+
+Then you can run the following python scripts to generate the header files:
+
+python category-parse.py DerivedGeneralCategory.txt category-properties.h
+python combining-class-parse.py DerivedCombiningClass.txt combining-properties.h
+python grapheme-break-parse.py GraphemeBreakProperty.txt grapheme-break-properties.h
+python scripts-parse.py Scripts.txt script-properties.h
diff --git a/contrib/tables/category-parse.py b/contrib/tables/category-parse.py
new file mode 100644
index 0000000..6818c1d
--- /dev/null
+++ b/contrib/tables/category-parse.py
@@ -0,0 +1,70 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedGeneralCategory.txt
+
+category_to_harfbuzz = {
+ 'Mn': 'HB_Mark_NonSpacing',
+ 'Mc': 'HB_Mark_SpacingCombining',
+ 'Me': 'HB_Mark_Enclosing',
+
+ 'Nd': 'HB_Number_DecimalDigit',
+ 'Nl': 'HB_Number_Letter',
+ 'No': 'HB_Number_Other',
+
+ 'Zs': 'HB_Separator_Space',
+ 'Zl': 'HB_Separator_Line',
+ 'Zp': 'HB_Separator_Paragraph',
+
+ 'Cc': 'HB_Other_Control',
+ 'Cf': 'HB_Other_Format',
+ 'Cs': 'HB_Other_Surrogate',
+ 'Co': 'HB_Other_PrivateUse',
+ 'Cn': 'HB_Other_NotAssigned',
+
+ 'Lu': 'HB_Letter_Uppercase',
+ 'Ll': 'HB_Letter_Lowercase',
+ 'Lt': 'HB_Letter_Titlecase',
+ 'Lm': 'HB_Letter_Modifier',
+ 'Lo': 'HB_Letter_Other',
+
+ 'Pc': 'HB_Punctuation_Connector',
+ 'Pd': 'HB_Punctuation_Dash',
+ 'Ps': 'HB_Punctuation_Open',
+ 'Pe': 'HB_Punctuation_Close',
+ 'Pi': 'HB_Punctuation_InitialQuote',
+ 'Pf': 'HB_Punctuation_FinalQuote',
+ 'Po': 'HB_Punctuation_Other',
+
+ 'Sm': 'HB_Symbol_Math',
+ 'Sc': 'HB_Symbol_Currency',
+ 'Sk': 'HB_Symbol_Modifier',
+ 'So': 'HB_Symbol_Other',
+}
+
+def main(infile, outfile):
+ ranges = unicode_file_parse(infile, category_to_harfbuzz)
+ ranges = sort_and_merge(ranges)
+
+ print >>outfile, '// Generated from Unicode script tables\n'
+ print >>outfile, '#ifndef CATEGORY_PROPERTIES_H_'
+ print >>outfile, '#define CATEGORY_PROPERTIES_H_\n'
+ print >>outfile, '#include <stdint.h>'
+ print >>outfile, '#include "harfbuzz-external.h"\n'
+ print >>outfile, 'struct category_property {'
+ print >>outfile, ' uint32_t range_start;'
+ print >>outfile, ' uint32_t range_end;'
+ print >>outfile, ' HB_CharCategory category;'
+ print >>outfile, '};\n'
+ print >>outfile, 'static const struct category_property category_properties[] = {'
+ for (start, end, value) in ranges:
+ print >>outfile, ' {0x%x, 0x%x, %s},' % (start, end, value)
+ print >>outfile, '};\n'
+ print >>outfile, 'static const unsigned category_properties_count = %d;\n' % len(ranges)
+ print >>outfile, '#endif // CATEGORY_PROPERTIES_H_'
+
+if __name__ == '__main__':
+ if len(sys.argv) != 3:
+ print 'Usage: %s <input .txt> <output .h>' % sys.argv[0]
+ else:
+ main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/combining-class-parse.py b/contrib/tables/combining-class-parse.py
new file mode 100644
index 0000000..c591ddd
--- /dev/null
+++ b/contrib/tables/combining-class-parse.py
@@ -0,0 +1,34 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedCombiningClass.txt
+
+class IdentityMap(object):
+ def __getitem__(_, key):
+ return key
+
+def main(infile, outfile):
+ ranges = unicode_file_parse(infile, IdentityMap(), '0')
+ ranges = sort_and_merge(ranges)
+
+ print >>outfile, '// Generated from Unicode tables\n'
+ print >>outfile, '#ifndef COMBINING_PROPERTIES_H_'
+ print >>outfile, '#define COMBINING_PROPERTIES_H_\n'
+ print >>outfile, '#include <stdint.h>'
+ print >>outfile, 'struct combining_property {'
+ print >>outfile, ' uint32_t range_start;'
+ print >>outfile, ' uint32_t range_end;'
+ print >>outfile, ' uint8_t klass;'
+ print >>outfile, '};\n'
+ print >>outfile, 'static const struct combining_property combining_properties[] = {'
+ for (start, end, value) in ranges:
+ print >>outfile, ' {0x%x, 0x%x, %s},' % (start, end, value)
+ print >>outfile, '};\n'
+ print >>outfile, 'static const unsigned combining_properties_count = %d;\n' % len(ranges)
+ print >>outfile, '#endif // COMBINING_PROPERTIES_H_'
+
+if __name__ == '__main__':
+ if len(sys.argv) != 3:
+ print 'Usage: %s <input .txt> <output .h>' % sys.argv[0]
+ else:
+ main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/grapheme-break-parse.py b/contrib/tables/grapheme-break-parse.py
new file mode 100644
index 0000000..a4b3534
--- /dev/null
+++ b/contrib/tables/grapheme-break-parse.py
@@ -0,0 +1,45 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
+
+property_to_harfbuzz = {
+ 'CR': 'HB_Grapheme_CR',
+ 'LF': 'HB_Grapheme_LF',
+ 'Control': 'HB_Grapheme_Control',
+ 'Extend': 'HB_Grapheme_Extend',
+ 'Prepend': 'HB_Grapheme_Other',
+ 'SpacingMark': 'HB_Grapheme_Other',
+ 'L': 'HB_Grapheme_L',
+ 'V': 'HB_Grapheme_V',
+ 'T': 'HB_Grapheme_T',
+ 'LV': 'HB_Grapheme_LV',
+ 'LVT': 'HB_Grapheme_LVT',
+}
+
+def main(infile, outfile):
+ ranges = unicode_file_parse(infile, property_to_harfbuzz)
+ ranges.sort()
+
+ print >>outfile, '// Generated from Unicode Grapheme break tables\n'
+ print >>outfile, '#ifndef GRAPHEME_BREAK_PROPERTY_H_'
+ print >>outfile, '#define GRAPHEME_BREAK_PROPERTY_H_\n'
+ print >>outfile, '#include <stdint.h>'
+ print >>outfile, '#include "harfbuzz-external.h"\n'
+ print >>outfile, 'struct grapheme_break_property {'
+ print >>outfile, ' uint32_t range_start;'
+ print >>outfile, ' uint32_t range_end;'
+ print >>outfile, ' HB_GraphemeClass klass;'
+ print >>outfile, '};\n'
+ print >>outfile, 'static const struct grapheme_break_property grapheme_break_properties[] = {'
+ for (start, end, value) in ranges:
+ print >>outfile, ' {0x%x, 0x%x, %s},' % (start, end, value)
+ print >>outfile, '};\n'
+ print >>outfile, 'static const unsigned grapheme_break_properties_count = %d;\n' % len(ranges)
+ print >>outfile, '#endif // GRAPHEME_BREAK_PROPERTY_H_'
+
+if __name__ == '__main__':
+ if len(sys.argv) != 3:
+ print 'Usage: %s <input .txt> <output .h>' % sys.argv[0]
+ else:
+ main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/scripts-parse.py b/contrib/tables/scripts-parse.py
new file mode 100644
index 0000000..23bac10
--- /dev/null
+++ b/contrib/tables/scripts-parse.py
@@ -0,0 +1,75 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/5.1.0/ucd/Scripts.txt
+
+script_to_harfbuzz = {
+ # This is the list of HB_Script_* at the time of writing
+ 'Common': 'HB_Script_Common',
+ 'Greek': 'HB_Script_Greek',
+ 'Cyrillic': 'HB_Script_Cyrillic',
+ 'Armenian': 'HB_Script_Armenian',
+ 'Hebrew': 'HB_Script_Hebrew',
+ 'Arabic': 'HB_Script_Arabic',
+ 'Syriac': 'HB_Script_Syriac',
+ 'Thaana': 'HB_Script_Thaana',
+ 'Devanagari': 'HB_Script_Devanagari',
+ 'Bengali': 'HB_Script_Bengali',
+ 'Gurmukhi': 'HB_Script_Gurmukhi',
+ 'Gujarati': 'HB_Script_Gujarati',
+ 'Oriya': 'HB_Script_Oriya',
+ 'Tamil': 'HB_Script_Tamil',
+ 'Telugu': 'HB_Script_Telugu',
+ 'Kannada': 'HB_Script_Kannada',
+ 'Malayalam': 'HB_Script_Malayalam',
+ 'Sinhala': 'HB_Script_Sinhala',
+ 'Thai': 'HB_Script_Thai',
+ 'Lao': 'HB_Script_Lao',
+ 'Tibetan': 'HB_Script_Tibetan',
+ 'Myanmar': 'HB_Script_Myanmar',
+ 'Georgian': 'HB_Script_Georgian',
+ 'Hangul': 'HB_Script_Hangul',
+ 'Ogham': 'HB_Script_Ogham',
+ 'Runic': 'HB_Script_Runic',
+ 'Khmer': 'HB_Script_Khmer',
+ 'Inherited': 'HB_Script_Inherited',
+}
+
+class ScriptDict(object):
+ def __init__(self, base):
+ self.base = base
+
+ def __getitem__(self, key):
+ r = self.base.get(key, None)
+ if r is None:
+ return 'HB_Script_Common'
+ return r
+
+def main(infile, outfile):
+ ranges = unicode_file_parse(infile,
+ ScriptDict(script_to_harfbuzz),
+ 'HB_Script_Common')
+ ranges = sort_and_merge(ranges)
+
+ print >>outfile, '// Generated from Unicode script tables\n'
+ print >>outfile, '#ifndef SCRIPT_PROPERTIES_H_'
+ print >>outfile, '#define SCRIPT_PROPERTIES_H_\n'
+ print >>outfile, '#include <stdint.h>'
+ print >>outfile, '#include "harfbuzz-shaper.h"\n'
+ print >>outfile, 'struct script_property {'
+ print >>outfile, ' uint32_t range_start;'
+ print >>outfile, ' uint32_t range_end;'
+ print >>outfile, ' HB_Script script;'
+ print >>outfile, '};\n'
+ print >>outfile, 'static const struct script_property script_properties[] = {'
+ for (start, end, value) in ranges:
+ print >>outfile, ' {0x%x, 0x%x, %s},' % (start, end, value)
+ print >>outfile, '};\n'
+ print >>outfile, 'static const unsigned script_properties_count = %d;\n' % len(ranges)
+ print >>outfile, '#endif // SCRIPT_PROPERTIES_H_'
+
+if __name__ == '__main__':
+ if len(sys.argv) != 3:
+ print 'Usage: %s <input .txt> <output .h>' % sys.argv[0]
+ else:
+ main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/unicode_parse_common.py b/contrib/tables/unicode_parse_common.py
new file mode 100644
index 0000000..ac26eca
--- /dev/null
+++ b/contrib/tables/unicode_parse_common.py
@@ -0,0 +1,70 @@
+def lines_get(f):
+ '''Parse a file like object, removing comments and returning a list of
+ lines.'''
+ def cut_comment(line):
+ first_hash = line.find('#')
+ if first_hash == -1:
+ return line
+ return line[:first_hash]
+
+ return [x for x in [cut_comment(x[:-1]) for x in f.readlines()] if len(x)]
+
+def line_split(line):
+ '''Split a line based on a semicolon separator.'''
+ def normalise(word):
+ return word.lstrip().rstrip()
+ return [normalise(x) for x in line.split(';')]
+
+def codepoints_parse(token):
+ '''Parse a Unicode style code-point range. Return either a single value or a
+ tuple of (start, end) for a range of code-points.'''
+ def fromHex(token):
+ return int(token, 16)
+ parts = token.split('..')
+ if len(parts) == 2:
+ return (fromHex(parts[0]), fromHex(parts[1]))
+ elif len(parts) == 1:
+ return fromHex(parts[0])
+ else:
+ raise ValueError(token)
+
+def unicode_file_parse(input, map, default_value = None):
+ '''Parse a file like object, @input where the first column is a code-point
+ range and the second column is mapped via the given dict, @map.'''
+ ranges = []
+ tokens = [line_split(x) for x in lines_get(input)]
+ for line in tokens:
+ if len(line) == 2:
+ codepoints = codepoints_parse(line[0])
+ value = map[line[1]]
+ if value == default_value:
+ continue
+
+ if type(codepoints) == int:
+ codepoints = (codepoints, codepoints)
+
+ ranges.append((codepoints[0], codepoints[1], value))
+ else:
+ raise ValueError(line)
+
+ return ranges
+
+def sort_and_merge(ranges):
+ '''Given a list of (start, end, value), merge elements where the ranges are
+ continuous and the values are the same.'''
+ output = []
+ ranges.sort()
+ current = None
+ for v in ranges:
+ if current is None:
+ current = v
+ continue
+ if current[1] + 1 == v[0] and current[2] == v[2]:
+ current = (current[0], v[1], v[2])
+ else:
+ output.append(current)
+ current = v
+ if current is not None:
+ output.append(current)
+
+ return output