13 files changed, 1049 insertions, 0 deletions
diff --git a/contrib/README b/contrib/README
new file mode 100644
index 0000000..074cc52
--- /dev/null
+++ b/contrib/README
@@ -0,0 +1,9 @@
+Harfbuzz requires several functions to be defined in order to work with the
+platform's Unicode tables etc.
+
+If you are building on top of Qt4 you should look at the code in the tests/
+directory for examples of how to hook up Qt4 functions to Harfbuzz.
+
+Otherwise, this directory contains examples of using downloaded Unicode tables
+and/or glib to host Harfbuzz. You should read the README file in tables/ for how
+to build the header files for some of the Unicode tables.
diff --git a/contrib/harfbuzz-freetype.c b/contrib/harfbuzz-freetype.c
new file mode 100644
index 0000000..a2962df
--- /dev/null
+++ b/contrib/harfbuzz-freetype.c
@@ -0,0 +1,149 @@
+#include <stdint.h>
+
+#include <ft2build.h>
+#include FT_FREETYPE_H
+#include FT_TRUETYPE_TABLES_H
+
+#if 0
+#include <freetype/freetype.h>
+#include <freetype/tttables.h>
+#endif
+
+#include <harfbuzz-shaper.h>
+#include "harfbuzz-unicode.h"
+
+// Convert a UTF-16 string into glyph indices using the face's character map.
+//   font: the Harfbuzz font; its |userData| is used as the FT_Face.
+//   chars, len: the UTF-16 input and its length in 16-bit units.
+//   glyphs: (output) receives one glyph index per code point.
+//   numGlyphs: (in/out) capacity of |glyphs| on entry; on success, the number
+//     of glyphs written.
+//   is_rtl: not consulted.
+//   returns: 0 if |glyphs| has fewer than |len| slots, 1 otherwise.
+static HB_Bool
+hb_freetype_string_to_glyphs(HB_Font font,
+                             const HB_UChar16 *chars, hb_uint32 len,
+                             HB_Glyph *glyphs, hb_uint32 *numGlyphs,
+                             HB_Bool is_rtl) {
+  FT_Face face = (FT_Face) font->userData;
+  if (len > *numGlyphs)
+    return 0;
+
+  // utf16_to_code_point() takes a ssize_t iterator; handing it the address of
+  // a size_t is an incompatible pointer type.
+  ssize_t i = 0;
+  hb_uint32 j = 0;
+  while ((size_t) i < len) {
+    const uint32_t cp = utf16_to_code_point(chars, len, &i);
+    glyphs[j++] = FT_Get_Char_Index(face, cp);
+  }
+
+  *numGlyphs = j;
+
+  return 1;
+}
+
+// Fill |advances| with the x advance of each of the |len| glyphs in |glyphs|.
+// A glyph that FreeType fails to load gets an advance of zero. |flags| is not
+// consulted; glyphs are always loaded with FT_LOAD_DEFAULT.
+static void
+hb_freetype_advances_get(HB_Font font, const HB_Glyph *glyphs, hb_uint32 len,
+                         HB_Fixed *advances, int flags) {
+  FT_Face face = (FT_Face) font->userData;
+  hb_uint32 idx = 0;
+
+  while (idx < len) {
+    if (FT_Load_Glyph(face, glyphs[idx], FT_LOAD_DEFAULT))
+      advances[idx] = 0;
+    else
+      advances[idx] = face->glyph->advance.x;
+    ++idx;
+  }
+}
+
+// Report whether the face has a glyph for every code point of a UTF-16 string.
+//   chars, len: the UTF-16 input and its length in 16-bit units.
+//   returns: 0 as soon as a code point maps to glyph index 0, otherwise 1.
+static HB_Bool
+hb_freetype_can_render(HB_Font font, const HB_UChar16 *chars, hb_uint32 len) {
+  FT_Face face = (FT_Face)font->userData;
+
+  // The iterator must be a ssize_t to match utf16_to_code_point()'s prototype.
+  ssize_t i = 0;
+  while ((size_t) i < len) {
+    const uint32_t cp = utf16_to_code_point(chars, len, &i);
+    if (FT_Get_Char_Index(face, cp) == 0)
+      return 0;
+  }
+
+  return 1;
+}
+
+// Fetch the position of one point of a glyph's outline.
+//   flags: HB_ShaperFlag_UseDesignMetrics selects FT_LOAD_NO_HINTING,
+//     otherwise the glyph is loaded with FT_LOAD_DEFAULT.
+//   point: zero-based index of the outline point wanted.
+//   xpos, ypos: (output) the coordinates stored for that point.
+//   n_points: (output) the number of points in the glyph's outline.
+//   returns: HB_Err_Ok on success (also when the outline has no points, in
+//     which case xpos/ypos are left untouched); the FreeType error if the
+//     glyph cannot be loaded; HB_Err_Invalid_SubTable if the glyph is not an
+//     outline or |point| is out of range.
+static HB_Error
+hb_freetype_outline_point_get(HB_Font font, HB_Glyph glyph, int flags,
+                              hb_uint32 point, HB_Fixed *xpos, HB_Fixed *ypos,
+                              hb_uint32 *n_points) {
+  HB_Error error = HB_Err_Ok;
+  FT_Face face = (FT_Face) font->userData;
+
+  int load_flags = (flags & HB_ShaperFlag_UseDesignMetrics) ? FT_LOAD_NO_HINTING : FT_LOAD_DEFAULT;
+
+  if ((error = (HB_Error) FT_Load_Glyph(face, glyph, load_flags)))
+    return error;
+
+  if (face->glyph->format != ft_glyph_format_outline)
+    return (HB_Error)HB_Err_Invalid_SubTable;
+
+  *n_points = face->glyph->outline.n_points;
+  if (!(*n_points))
+    return HB_Err_Ok;
+
+  // Valid indices are 0 .. n_points - 1, so |point| == n_points must be
+  // rejected as well; otherwise we would read one element past the array.
+  if (point >= *n_points)
+    return (HB_Error)HB_Err_Invalid_SubTable;
+
+  *xpos = face->glyph->outline.points[point].x;
+  *ypos = face->glyph->outline.points[point].y;
+
+  return HB_Err_Ok;
+}
+
+// Fill |metrics| from the metrics FreeType reports for |glyph|. Every field
+// is set to zero if the glyph cannot be loaded.
+static void
+hb_freetype_glyph_metrics_get(HB_Font font, HB_Glyph glyph,
+                              HB_GlyphMetrics *metrics) {
+  FT_Face face = (FT_Face) font->userData;
+
+  if (FT_Load_Glyph(face, glyph, FT_LOAD_DEFAULT)) {
+    metrics->x = 0;
+    metrics->y = 0;
+    metrics->width = 0;
+    metrics->height = 0;
+    metrics->xOffset = 0;
+    metrics->yOffset = 0;
+    return;
+  }
+
+  const FT_Glyph_Metrics *src = &face->glyph->metrics;
+  metrics->x = src->horiAdvance;
+  metrics->y = 0;  // unclear what this is
+  metrics->width = src->width;
+  metrics->height = src->height;
+  metrics->xOffset = src->horiBearingX;
+  metrics->yOffset = src->horiBearingY;
+}
+
+// Return a face-wide metric. Only HB_FontAscent is handled (as the face's
+// |ascender| field); every other metric yields 0.
+static HB_Fixed
+hb_freetype_font_metric_get(HB_Font font, HB_FontMetric metric) {
+  FT_Face face = (FT_Face) font->userData;
+
+  if (metric != HB_FontAscent)
+    return 0;
+
+  // Note that we aren't scanning the VDMX table which we probably would in
+  // an ideal world.
+  return face->ascender;
+}
+
+// Table of the FreeType-backed callbacks defined in this file, exported for
+// use as a font's HB_FontClass. Each callback treats the font's |userData|
+// as an FT_Face.
+const HB_FontClass hb_freetype_class = {
+  hb_freetype_string_to_glyphs,
+  hb_freetype_advances_get,
+  hb_freetype_can_render,
+  hb_freetype_outline_point_get,
+  hb_freetype_glyph_metrics_get,
+  hb_freetype_font_metric_get,
+};
+
+// Load an SFNT table from a FreeType face.
+//   voidface: the FT_Face, passed as an opaque pointer.
+//   tag: tag of the table to load.
+//   buffer: destination, handed straight to FT_Load_Sfnt_Table.
+//   len: (in/out) given to FreeType as the length; updated with the value
+//     FreeType writes back.
+//   returns: HB_Err_Invalid_Argument if the face is not SFNT based, otherwise
+//     FreeType's result cast to HB_Error.
+HB_Error
+hb_freetype_table_sfnt_get(void *voidface, const HB_Tag tag, HB_Byte *buffer, HB_UInt *len) {
+  FT_Face face = (FT_Face) voidface;
+
+  if (!FT_IS_SFNT(face))
+    return HB_Err_Invalid_Argument;
+
+  FT_ULong table_len = *len;
+  const FT_Error status = FT_Load_Sfnt_Table(face, tag, 0, buffer, &table_len);
+  *len = table_len;
+  return (HB_Error) status;
+}
diff --git a/contrib/harfbuzz-freetype.h b/contrib/harfbuzz-freetype.h
new file mode 100644
index 0000000..628be16
--- /dev/null
+++ b/contrib/harfbuzz-freetype.h
@@ -0,0 +1,9 @@
+#ifndef HB_FREETYPE_H_
+#define HB_FREETYPE_H_
+
+// Font class whose callbacks are implemented on top of FreeType; defined in
+// harfbuzz-freetype.c.
+extern const HB_FontClass hb_freetype_class;
+
+// Load the SFNT table |tag| of the FT_Face passed as |voidface| into |buffer|.
+// |len| is in/out: it is given to FreeType and updated with what FreeType
+// writes back. Returns HB_Err_Invalid_Argument for a non-SFNT face.
+HB_Error hb_freetype_table_sfnt_get(void *voidface, const HB_Tag tag,
+                                    HB_Byte *buffer, HB_UInt *len);
+
+#endif  // HB_FREETYPE_H_
diff --git a/contrib/harfbuzz-unicode-glib.c b/contrib/harfbuzz-unicode-glib.c
new file mode 100644
index 0000000..6a13433
--- /dev/null
+++ b/contrib/harfbuzz-unicode-glib.c
@@ -0,0 +1,169 @@
+#include "harfbuzz-external.h"
+
+#include <glib.h>
+
+// Translate the glib general category of |ch| into the matching Harfbuzz
+// category constant. Any glib type without an entry below is reported as
+// HB_Symbol_Other.
+static int
+hb_category_for_char(HB_UChar32 ch) {
+  switch (g_unichar_type(ch)) {
+    // Other
+    case G_UNICODE_CONTROL:             return HB_Other_Control;
+    case G_UNICODE_FORMAT:              return HB_Other_Format;
+    case G_UNICODE_UNASSIGNED:          return HB_Other_NotAssigned;
+    case G_UNICODE_PRIVATE_USE:         return HB_Other_PrivateUse;
+    case G_UNICODE_SURROGATE:           return HB_Other_Surrogate;
+    // Letters
+    case G_UNICODE_LOWERCASE_LETTER:    return HB_Letter_Lowercase;
+    case G_UNICODE_MODIFIER_LETTER:     return HB_Letter_Modifier;
+    case G_UNICODE_OTHER_LETTER:        return HB_Letter_Other;
+    case G_UNICODE_TITLECASE_LETTER:    return HB_Letter_Titlecase;
+    case G_UNICODE_UPPERCASE_LETTER:    return HB_Letter_Uppercase;
+    // Marks
+    case G_UNICODE_COMBINING_MARK:      return HB_Mark_SpacingCombining;
+    case G_UNICODE_ENCLOSING_MARK:      return HB_Mark_Enclosing;
+    case G_UNICODE_NON_SPACING_MARK:    return HB_Mark_NonSpacing;
+    // Numbers
+    case G_UNICODE_DECIMAL_NUMBER:      return HB_Number_DecimalDigit;
+    case G_UNICODE_LETTER_NUMBER:       return HB_Number_Letter;
+    case G_UNICODE_OTHER_NUMBER:        return HB_Number_Other;
+    // Punctuation
+    case G_UNICODE_CONNECT_PUNCTUATION: return HB_Punctuation_Connector;
+    case G_UNICODE_DASH_PUNCTUATION:    return HB_Punctuation_Dash;
+    case G_UNICODE_CLOSE_PUNCTUATION:   return HB_Punctuation_Close;
+    case G_UNICODE_FINAL_PUNCTUATION:   return HB_Punctuation_FinalQuote;
+    case G_UNICODE_INITIAL_PUNCTUATION: return HB_Punctuation_InitialQuote;
+    case G_UNICODE_OTHER_PUNCTUATION:   return HB_Punctuation_Other;
+    case G_UNICODE_OPEN_PUNCTUATION:    return HB_Punctuation_Open;
+    // Symbols
+    case G_UNICODE_CURRENCY_SYMBOL:     return HB_Symbol_Currency;
+    case G_UNICODE_MODIFIER_SYMBOL:     return HB_Symbol_Modifier;
+    case G_UNICODE_MATH_SYMBOL:         return HB_Symbol_Math;
+    case G_UNICODE_OTHER_SYMBOL:        return HB_Symbol_Other;
+    // Separators
+    case G_UNICODE_LINE_SEPARATOR:      return HB_Separator_Line;
+    case G_UNICODE_PARAGRAPH_SEPARATOR: return HB_Separator_Paragraph;
+    case G_UNICODE_SPACE_SEPARATOR:     return HB_Separator_Space;
+    default:                            return HB_Symbol_Other;
+  }
+}
+
+// Translate the glib line-break type of |ch| into the matching Harfbuzz line
+// break class. Several glib types (contingent, alphabetic, ambiguous, unknown,
+// next-line) and anything not listed are reported as HB_LineBreak_AL.
+HB_LineBreakClass
+HB_GetLineBreakClass(HB_UChar32 ch) {
+  switch (g_unichar_break_type(ch)) {
+    case G_UNICODE_BREAK_MANDATORY:           return HB_LineBreak_BK;
+    case G_UNICODE_BREAK_CARRIAGE_RETURN:     return HB_LineBreak_CR;
+    case G_UNICODE_BREAK_LINE_FEED:           return HB_LineBreak_LF;
+    case G_UNICODE_BREAK_COMBINING_MARK:      return HB_LineBreak_CM;
+    case G_UNICODE_BREAK_SURROGATE:           return HB_LineBreak_SG;
+    case G_UNICODE_BREAK_ZERO_WIDTH_SPACE:    return HB_LineBreak_ZW;
+    case G_UNICODE_BREAK_INSEPARABLE:         return HB_LineBreak_IN;
+    case G_UNICODE_BREAK_NON_BREAKING_GLUE:   return HB_LineBreak_GL;
+    case G_UNICODE_BREAK_SPACE:               return HB_LineBreak_SP;
+    case G_UNICODE_BREAK_AFTER:               return HB_LineBreak_BA;
+    case G_UNICODE_BREAK_BEFORE:              return HB_LineBreak_BB;
+    case G_UNICODE_BREAK_BEFORE_AND_AFTER:    return HB_LineBreak_B2;
+    case G_UNICODE_BREAK_HYPHEN:              return HB_LineBreak_HY;
+    case G_UNICODE_BREAK_NON_STARTER:         return HB_LineBreak_NS;
+    case G_UNICODE_BREAK_OPEN_PUNCTUATION:    return HB_LineBreak_OP;
+    case G_UNICODE_BREAK_CLOSE_PUNCTUATION:   return HB_LineBreak_CL;
+    case G_UNICODE_BREAK_QUOTATION:           return HB_LineBreak_QU;
+    case G_UNICODE_BREAK_EXCLAMATION:         return HB_LineBreak_EX;
+    case G_UNICODE_BREAK_IDEOGRAPHIC:         return HB_LineBreak_ID;
+    case G_UNICODE_BREAK_NUMERIC:             return HB_LineBreak_NU;
+    case G_UNICODE_BREAK_INFIX_SEPARATOR:     return HB_LineBreak_IS;
+    case G_UNICODE_BREAK_SYMBOL:              return HB_LineBreak_SY;
+    case G_UNICODE_BREAK_PREFIX:              return HB_LineBreak_PR;
+    case G_UNICODE_BREAK_POSTFIX:             return HB_LineBreak_PO;
+    case G_UNICODE_BREAK_COMPLEX_CONTEXT:     return HB_LineBreak_SA;
+    case G_UNICODE_BREAK_WORD_JOINER:         return HB_LineBreak_WJ;
+    case G_UNICODE_BREAK_HANGUL_L_JAMO:       return HB_LineBreak_JL;
+    case G_UNICODE_BREAK_HANGUL_V_JAMO:       return HB_LineBreak_JV;
+    case G_UNICODE_BREAK_HANGUL_T_JAMO:       return HB_LineBreak_JT;
+    case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:  return HB_LineBreak_H2;
+    case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE: return HB_LineBreak_H3;
+
+    // Everything below is treated as an ordinary alphabetic character.
+    case G_UNICODE_BREAK_CONTINGENT:
+    case G_UNICODE_BREAK_ALPHABETIC:
+    case G_UNICODE_BREAK_AMBIGUOUS:
+    case G_UNICODE_BREAK_UNKNOWN:
+    case G_UNICODE_BREAK_NEXT_LINE:
+    default:
+      return HB_LineBreak_AL;
+  }
+}
+
+// Return the combining class glib reports for |ch|.
+int
+HB_GetUnicodeCharCombiningClass(HB_UChar32 ch) {
+  const int klass = g_unichar_combining_class(ch);
+  return klass;
+}
+
+// Look up both the category and the combining class of |ch| in one call.
+//   category: (output) the Harfbuzz category of |ch|.
+//   combiningClass: (output) the combining class glib reports for |ch|.
+void
+HB_GetUnicodeCharProperties(HB_UChar32 ch,
+                            HB_CharCategory *category,
+                            int *combiningClass) {
+  const int klass = g_unichar_combining_class(ch);
+
+  *category = hb_category_for_char(ch);
+  *combiningClass = klass;
+}
+
+// Return the Harfbuzz category of |ch|, derived from glib's character type.
+HB_CharCategory
+HB_GetUnicodeCharCategory(HB_UChar32 ch) {
+  const HB_CharCategory category = hb_category_for_char(ch);
+  return category;
+}
diff --git a/contrib/harfbuzz-unicode-tables.c b/contrib/harfbuzz-unicode-tables.c
new file mode 100644
index 0000000..3c3fead
--- /dev/null
+++ b/contrib/harfbuzz-unicode-tables.c
@@ -0,0 +1,84 @@
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <harfbuzz-external.h>
+
+#include "tables/category-properties.h"
+#include "tables/combining-properties.h"
+
+// Not implemented for the table-based host: calling this aborts the process.
+// The return statement after abort() is never reached.
+HB_LineBreakClass
+HB_GetLineBreakClass(HB_UChar32 ch) {
+  abort();
+  return 0;
+}
+
+// bsearch() comparator for the combining-class table. |vkey| is not a real
+// pointer: it carries the code point itself in the pointer value. |vcandidate|
+// points at a table entry. Returns -1, 0 or 1 as the code point lies below,
+// inside or above the entry's inclusive range.
+static int
+combining_property_cmp(const void *vkey, const void *vcandidate) {
+  const struct combining_property *entry = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < entry->range_start)
+    return -1;
+  return cp > entry->range_end ? 1 : 0;
+}
+
+// Binary-search the generated combining-class table for |cp|. Code points
+// that fall in no listed range have combining class 0.
+static int
+code_point_to_combining_class(HB_UChar32 cp) {
+  const struct combining_property *match =
+      bsearch((void *) (intptr_t) cp, combining_properties,
+              combining_properties_count, sizeof(struct combining_property),
+              combining_property_cmp);
+
+  return match ? match->klass : 0;
+}
+
+// Return the combining class of |ch| from the generated table, or 0 if the
+// code point is not listed.
+int
+HB_GetUnicodeCharCombiningClass(HB_UChar32 ch) {
+  return code_point_to_combining_class(ch);
+}
+
+// bsearch() comparator for the category table. |vkey| is not a real pointer:
+// it carries the code point itself in the pointer value. |vcandidate| points
+// at a table entry. Returns -1, 0 or 1 as the code point lies below, inside
+// or above the entry's inclusive range.
+static int
+category_property_cmp(const void *vkey, const void *vcandidate) {
+  const struct category_property *entry = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < entry->range_start)
+    return -1;
+  return cp > entry->range_end ? 1 : 0;
+}
+
+// Binary-search the generated category table for |cp|. Code points that fall
+// in no listed range are reported as HB_NoCategory.
+static HB_CharCategory
+code_point_to_category(HB_UChar32 cp) {
+  const struct category_property *match =
+      bsearch((void *) (intptr_t) cp, category_properties,
+              category_properties_count, sizeof(struct category_property),
+              category_property_cmp);
+
+  if (match)
+    return match->category;
+  return HB_NoCategory;
+}
+
+// Look up both the category and the combining class of |ch| in the generated
+// tables.
+//   category: (output) the category of |ch|.
+//   combiningClass: (output) the combining class of |ch|.
+void
+HB_GetUnicodeCharProperties(HB_UChar32 ch,
+                            HB_CharCategory *category,
+                            int *combiningClass) {
+  const HB_CharCategory found_category = code_point_to_category(ch);
+  const int found_class = code_point_to_combining_class(ch);
+
+  *category = found_category;
+  *combiningClass = found_class;
+}
+
+// Return the category of |ch| from the generated table.
+HB_CharCategory
+HB_GetUnicodeCharCategory(HB_UChar32 ch) {
+  const HB_CharCategory category = code_point_to_category(ch);
+  return category;
+}
diff --git a/contrib/harfbuzz-unicode.c b/contrib/harfbuzz-unicode.c
new file mode 100644
index 0000000..9b3c43e
--- /dev/null
+++ b/contrib/harfbuzz-unicode.c
@@ -0,0 +1,264 @@
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <harfbuzz-external.h>
+#include <harfbuzz-impl.h>
+#include <harfbuzz-shaper.h>
+#include "harfbuzz-unicode.h"
+
+#include "tables/script-properties.h"
+#include "tables/grapheme-break-properties.h"
+
+// Decode the code point starting at chars[*iter] and advance |iter| past the
+// UTF-16 units consumed (one, or two when a second unit is read for a
+// surrogate pair). Returns HB_InvalidCodePoint for a lone low surrogate, for
+// a high surrogate at the end of the input, and for a high surrogate that is
+// not followed by a low surrogate.
+uint32_t
+utf16_to_code_point(const uint16_t *chars, size_t len, ssize_t *iter) {
+  const uint16_t first = chars[*iter];
+  *iter += 1;
+
+  if (!HB_IsHighSurrogate(first)) {
+    // A lone low surrogate isn't a valid code point; anything else stands
+    // for itself.
+    return HB_IsLowSurrogate(first) ? HB_InvalidCodePoint : first;
+  }
+
+  // |first| opens a surrogate pair, so a second unit is required.
+  if (*iter >= len) {
+    // the surrogate is incomplete.
+    return HB_InvalidCodePoint;
+  }
+
+  const uint16_t second = chars[*iter];
+  *iter += 1;
+  if (HB_IsLowSurrogate(second)) {
+    return HB_SurrogateToUcs4(first, second);
+  }
+
+  // invalid surrogate pair.
+  return HB_InvalidCodePoint;
+}
+
+// Decode the code point that ends at chars[*iter], walking backwards: |iter|
+// is decremented past the UTF-16 units consumed (one, or two when a second
+// unit is read for a surrogate pair). Returns HB_InvalidCodePoint for a lone
+// high surrogate, for a low surrogate at the start of the input, and for a
+// low surrogate that is not preceded by a high surrogate. |len| is not used.
+uint32_t
+utf16_to_code_point_prev(const uint16_t *chars, size_t len, ssize_t *iter) {
+  const uint16_t last = chars[*iter];
+  *iter -= 1;
+
+  if (!HB_IsLowSurrogate(last)) {
+    // A lone high surrogate isn't a valid code point; anything else stands
+    // for itself.
+    return HB_IsHighSurrogate(last) ? HB_InvalidCodePoint : last;
+  }
+
+  // |last| closes a surrogate pair, so a preceding unit is required.
+  if (*iter < 0) {
+    // the surrogate is incomplete.
+    return HB_InvalidCodePoint;
+  }
+
+  const uint16_t lead = chars[*iter];
+  *iter -= 1;
+  if (HB_IsHighSurrogate(lead)) {
+    return HB_SurrogateToUcs4(lead, last);
+  }
+
+  // invalid surrogate pair.
+  return HB_InvalidCodePoint;
+}
+
+// bsearch() comparator for the script table. |vkey| is not a real pointer: it
+// carries the code point itself in the pointer value. |vcandidate| points at
+// a table entry. Returns -1, 0 or 1 as the code point lies below, inside or
+// above the entry's inclusive range.
+static int
+script_property_cmp(const void *vkey, const void *vcandidate) {
+  const struct script_property *entry = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < entry->range_start)
+    return -1;
+  return cp > entry->range_end ? 1 : 0;
+}
+
+// Binary-search the generated script table for |cp|. Code points that fall in
+// no listed range are reported as HB_Script_Common.
+HB_Script
+code_point_to_script(uint32_t cp) {
+  const struct script_property *match =
+      bsearch((void *) (intptr_t) cp, script_properties,
+              script_properties_count, sizeof(struct script_property),
+              script_property_cmp);
+
+  if (match)
+    return match->script;
+  return HB_Script_Common;
+}
+
+// Find the script run that starts at chars[*iter], scanning forwards.
+//   num_code_points: (output, may be NULL) number of code points in the run.
+//   output: (output) |pos|, |length| and |script| describe the run on success.
+//   chars, len: the UTF-16 string and its length in 16-bit units.
+//   iter: (in/out) index of the first unit to examine; on success it is left
+//     on the first unit after the run.
+//   returns: 1 if a run was found; 0 at the end of the input or when invalid
+//     UTF-16 is met (|iter| and |output| may then be partially updated).
+//
+// Code points whose script is Inherited never end a run. A run that starts
+// with Inherited code points takes the script of the first other code point,
+// and a run consisting only of Inherited code points is reported as Common.
+char
+hb_utf16_script_run_next(unsigned *num_code_points, HB_ScriptItem *output,
+                         const uint16_t *chars, size_t len, ssize_t *iter) {
+  if ((size_t) *iter >= len)
+    return 0;
+
+  output->pos = *iter;
+  const uint32_t init_cp = utf16_to_code_point(chars, len, iter);
+  unsigned cps = 1;
+  if (init_cp == HB_InvalidCodePoint)
+    return 0;
+  const HB_Script init_script = code_point_to_script(init_cp);
+  HB_Script current_script = init_script;
+  output->script = init_script;
+
+  for (;;) {
+    if ((size_t) *iter >= len)
+      break;
+    const ssize_t prev_iter = *iter;
+    const uint32_t cp = utf16_to_code_point(chars, len, iter);
+    if (cp == HB_InvalidCodePoint)
+      return 0;
+    cps++;
+    const HB_Script script = code_point_to_script(cp);
+
+    if (script != current_script) {
+      // Note: "a == b == c" would compare the 0/1 result of "a == b" with c,
+      // so both comparisons have to be spelled out.
+      if (current_script == HB_Script_Inherited &&
+          init_script == HB_Script_Inherited) {
+        // If we started off as inherited, we take whatever we can find.
+        output->script = script;
+        current_script = script;
+        continue;
+      } else if (script == HB_Script_Inherited) {
+        // An inherited code point joins the current run. current_script is
+        // deliberately left alone so that the code points that follow are
+        // still compared against the run's real script.
+        continue;
+      } else {
+        // A different script: un-read this code point and end the run.
+        *iter = prev_iter;
+        cps--;
+        break;
+      }
+    }
+  }
+
+  if (output->script == HB_Script_Inherited)
+    output->script = HB_Script_Common;
+
+  output->length = *iter - output->pos;
+  if (num_code_points)
+    *num_code_points = cps;
+  return 1;
+}
+
+// Find the script run that ends at chars[*iter], scanning backwards.
+//   num_code_points: (output, may be NULL) number of code points in the run.
+//   output: (output) |pos|, |length| and |script| describe the run on success.
+//   chars, len: the UTF-16 string and its length in 16-bit units.
+//   iter: (in/out) index of the last unit to examine; on success it is left
+//     on the unit just before the run (-1 once the start is reached).
+//   returns: 1 if a run was found; 0 when |iter| is already before the start
+//     of the input or when invalid UTF-16 is met (|iter| and |output| may
+//     then be partially updated).
+//
+// Code points whose script is Inherited never end a run. A run whose last
+// code points are Inherited takes the script of the first other code point
+// met, and a run consisting only of Inherited code points is reported as
+// Common.
+char
+hb_utf16_script_run_prev(unsigned *num_code_points, HB_ScriptItem *output,
+                         const uint16_t *chars, size_t len, ssize_t *iter) {
+  if (*iter < 0)
+    return 0;
+
+  const ssize_t ending_index = *iter;
+  const uint32_t init_cp = utf16_to_code_point_prev(chars, len, iter);
+  unsigned cps = 1;
+  if (init_cp == HB_InvalidCodePoint)
+    return 0;
+  const HB_Script init_script = code_point_to_script(init_cp);
+  HB_Script current_script = init_script;
+  output->script = init_script;
+
+  for (;;) {
+    if (*iter < 0)
+      break;
+    const ssize_t prev_iter = *iter;
+    const uint32_t cp = utf16_to_code_point_prev(chars, len, iter);
+    if (cp == HB_InvalidCodePoint)
+      return 0;
+    cps++;
+    const HB_Script script = code_point_to_script(cp);
+
+    if (script != current_script) {
+      // Note: "a == b == c" would compare the 0/1 result of "a == b" with c,
+      // so both comparisons have to be spelled out.
+      if (current_script == HB_Script_Inherited &&
+          init_script == HB_Script_Inherited) {
+        // If we started off as inherited, we take whatever we can find.
+        output->script = script;
+        current_script = script;
+        continue;
+      } else if (script == HB_Script_Inherited) {
+        // An inherited code point joins the current run. current_script is
+        // deliberately left alone so that the code points that follow are
+        // still compared against the run's real script.
+        continue;
+      } else {
+        // A different script: un-read this code point and end the run.
+        *iter = prev_iter;
+        cps--;
+        break;
+      }
+    }
+  }
+
+  if (output->script == HB_Script_Inherited)
+    output->script = HB_Script_Common;
+
+  output->pos = *iter + 1;
+  output->length = ending_index - *iter;
+  if (num_code_points)
+    *num_code_points = cps;
+  return 1;
+}
+
+// bsearch() comparator for the grapheme-break table. |vkey| is not a real
+// pointer: it carries the code point itself in the pointer value.
+// |vcandidate| points at a table entry. Returns -1, 0 or 1 as the code point
+// lies below, inside or above the entry's inclusive range.
+static int
+grapheme_break_property_cmp(const void *vkey, const void *vcandidate) {
+  const struct grapheme_break_property *entry = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < entry->range_start)
+    return -1;
+  return cp > entry->range_end ? 1 : 0;
+}
+
+// Binary-search the generated grapheme-break table for |ch|. Code points that
+// fall in no listed range are reported as HB_Grapheme_Other.
+HB_GraphemeClass
+HB_GetGraphemeClass(HB_UChar32 ch) {
+  const struct grapheme_break_property *match =
+      bsearch((void *) (intptr_t) ch, grapheme_break_properties,
+              grapheme_break_properties_count,
+              sizeof(struct grapheme_break_property),
+              grapheme_break_property_cmp);
+
+  if (match)
+    return match->klass;
+  return HB_Grapheme_Other;
+}
+
+// Not implemented: calling this aborts the process. The return statement
+// after abort() is never reached.
+HB_WordClass
+HB_GetWordClass(HB_UChar32 ch) {
+  abort();
+  return 0;
+}
+
+// Not implemented: calling this aborts the process. The return statement
+// after abort() is never reached.
+HB_SentenceClass
+HB_GetSentenceClass(HB_UChar32 ch) {
+  abort();
+  return 0;
+}
+
+// Look up the grapheme class and the line-break class of |ch| in one call.
+//   gclass: (output) result of HB_GetGraphemeClass(ch).
+//   breakclass: (output) result of HB_GetLineBreakClass(ch), which is
+//     provided by a separate translation unit.
+void
+HB_GetGraphemeAndLineBreakClass(HB_UChar32 ch, HB_GraphemeClass *gclass, HB_LineBreakClass *breakclass) {
+  *gclass = HB_GetGraphemeClass(ch);
+  *breakclass = HB_GetLineBreakClass(ch);
+}
+
+// Not implemented: calling this aborts the process. The return statement
+// after abort() is never reached.
+HB_UChar16
+HB_GetMirroredChar(HB_UChar16 ch) {
+  abort();
+  return 0;
+}
+
+// Not implemented: calling this aborts the process. The return statement
+// after abort() is never reached.
+void *
+HB_Library_Resolve(const char *library, const char *symbol) {
+  abort();
+  return NULL;
+}
+
+// Not implemented: calling this aborts the process. The return statement
+// after abort() is never reached.
+void *
+HB_TextCodecForMib(int mib) {
+  abort();
+  return NULL;
+}
+
+// Not implemented: calling this aborts the process. The return statement
+// after abort() is never reached.
+char *
+HB_TextCodec_ConvertFromUnicode(void *codec, const HB_UChar16 *unicode, hb_uint32 length, hb_uint32 *outputLength) {
+  abort();
+  return NULL;
+}
+
+// Not implemented: calling this aborts the process.
+void
+HB_TextCodec_FreeResult(char *v) {
+  abort();
+}
diff --git a/contrib/harfbuzz-unicode.h b/contrib/harfbuzz-unicode.h
new file mode 100644
index 0000000..f28b3c3
--- /dev/null
+++ b/contrib/harfbuzz-unicode.h
@@ -0,0 +1,54 @@
+#ifndef SCRIPT_IDENTIFY_H_
+#define SCRIPT_IDENTIFY_H_
+
+#include <stdint.h>
+
+#include <harfbuzz-shaper.h>
+
+// Sentinel returned by the UTF-16 decoding functions declared below when the
+// input is not well formed.
+static const uint32_t HB_InvalidCodePoint = 0xffffffffu;
+
+// -----------------------------------------------------------------------------
+// Return the next Unicode code point from a UTF-16 vector
+//   chars: a pointer to @len words
+//   iter: (input/output) an index into @chars. This is updated: it is advanced
+//     past the one or two words that were read.
+//   returns: HB_InvalidCodePoint on error (an unpaired or truncated surrogate)
+//     and the code point otherwise.
+// -----------------------------------------------------------------------------
+uint32_t utf16_to_code_point(const uint16_t *chars, size_t len, ssize_t *iter);
+
+// -----------------------------------------------------------------------------
+// Like utf16_to_code_point(), except that the code points are traversed
+// backwards: |iter| is moved back past the one or two words that were read.
+// Thus, on the first call, |iter| should be |len| - 1.
+//   returns: HB_InvalidCodePoint on error and the code point otherwise.
+// -----------------------------------------------------------------------------
+uint32_t utf16_to_code_point_prev(const uint16_t *chars, size_t len, ssize_t *iter);
+
+// -----------------------------------------------------------------------------
+// Return the script of the given code point. Code points that are not listed
+// in the script table are reported as HB_Script_Common.
+// -----------------------------------------------------------------------------
+HB_Script code_point_to_script(uint32_t cp);
+
+// -----------------------------------------------------------------------------
+// Find the next script run in a UTF-16 string.
+//
+// A script run is a subvector of codepoints, all of which are in the same
+// script. A run will never cut a surrogate pair in half at either end.
+//
+// num_code_points: (output, maybe NULL) the number of code points in the run
+// output: (output) the @pos, @length and @script fields are set on success
+// chars: the UTF-16 string
+// len: the length of @chars, in words
+// iter: (in/out) the current index into the string. This should be 0 for the
+//   first call and is updated on exit.
+//
+// returns: non-zero if a script run was found and returned; zero at the end
+//   of the string or if the string contains an invalid surrogate sequence.
+// -----------------------------------------------------------------------------
+char hb_utf16_script_run_next(unsigned *num_code_points, HB_ScriptItem *output,
+                              const uint16_t *chars, size_t len, ssize_t *iter);
+
+// -----------------------------------------------------------------------------
+// This is the same as hb_utf16_script_run_next(), except that the input is
+// traversed backwards. Thus, on the first call, |iter| should be |len| - 1.
+// Zero is returned once |iter| has moved before the start of the string.
+// -----------------------------------------------------------------------------
+char hb_utf16_script_run_prev(unsigned *num_code_points, HB_ScriptItem *output,
+                              const uint16_t *chars, size_t len, ssize_t *iter);
+
+#endif
diff --git a/contrib/tables/README b/contrib/tables/README
new file mode 100644
index 0000000..605d1c0
--- /dev/null
+++ b/contrib/tables/README
@@ -0,0 +1,17 @@
+This directory contains Python scripts that parse several of the Unicode tables
+that are downloadable from the web and generate C header files from them.
+
+These are the locations of the files which are parsed. You should download these
+files and put them in this directory.
+
+http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedGeneralCategory.txt
+http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedCombiningClass.txt
+http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
+http://www.unicode.org/Public/5.1.0/ucd/Scripts.txt
+
+Then you can run the following python scripts to generate the header files:
+
+python category-parse.py DerivedGeneralCategory.txt category-properties.h
+python combining-class-parse.py DerivedCombiningClass.txt combining-properties.h
+python grapheme-break-parse.py GraphemeBreakProperty.txt grapheme-break-properties.h
+python scripts-parse.py Scripts.txt script-properties.h
diff --git a/contrib/tables/category-parse.py b/contrib/tables/category-parse.py
new file mode 100644
index 0000000..6818c1d
--- /dev/null
+++ b/contrib/tables/category-parse.py
@@ -0,0 +1,70 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedGeneralCategory.txt
+
+# Maps the two-letter general category abbreviations used in the data file to
+# the names of the HB_CharCategory constants written into the generated table.
+category_to_harfbuzz = {
+  'Mn': 'HB_Mark_NonSpacing',
+  'Mc': 'HB_Mark_SpacingCombining',
+  'Me': 'HB_Mark_Enclosing',
+
+  'Nd': 'HB_Number_DecimalDigit',
+  'Nl': 'HB_Number_Letter',
+  'No': 'HB_Number_Other',
+
+  'Zs': 'HB_Separator_Space',
+  'Zl': 'HB_Separator_Line',
+  'Zp': 'HB_Separator_Paragraph',
+
+  'Cc': 'HB_Other_Control',
+  'Cf': 'HB_Other_Format',
+  'Cs': 'HB_Other_Surrogate',
+  'Co': 'HB_Other_PrivateUse',
+  'Cn': 'HB_Other_NotAssigned',
+
+  'Lu': 'HB_Letter_Uppercase',
+  'Ll': 'HB_Letter_Lowercase',
+  'Lt': 'HB_Letter_Titlecase',
+  'Lm': 'HB_Letter_Modifier',
+  'Lo': 'HB_Letter_Other',
+
+  'Pc': 'HB_Punctuation_Connector',
+  'Pd': 'HB_Punctuation_Dash',
+  'Ps': 'HB_Punctuation_Open',
+  'Pe': 'HB_Punctuation_Close',
+  'Pi': 'HB_Punctuation_InitialQuote',
+  'Pf': 'HB_Punctuation_FinalQuote',
+  'Po': 'HB_Punctuation_Other',
+
+  'Sm': 'HB_Symbol_Math',
+  'Sc': 'HB_Symbol_Currency',
+  'Sk': 'HB_Symbol_Modifier',
+  'So': 'HB_Symbol_Other',
+}
+
+def main(infile, outfile):
+  ranges = unicode_file_parse(infile, category_to_harfbuzz)
+  ranges = sort_and_merge(ranges)
+
+  print >>outfile, '// Generated from Unicode script tables\n'
+  print >>outfile, '#ifndef CATEGORY_PROPERTIES_H_'
+  print >>outfile, '#define CATEGORY_PROPERTIES_H_\n'
+  print >>outfile, '#include <stdint.h>'
+  print >>outfile, '#include "harfbuzz-external.h"\n'
+  print >>outfile, 'struct category_property {'
+  print >>outfile, '  uint32_t range_start;'
+  print >>outfile, '  uint32_t range_end;'
+  print >>outfile, '  HB_CharCategory category;'
+  print >>outfile, '};\n'
+  print >>outfile, 'static const struct category_property category_properties[] = {'
+  for (start, end, value) in ranges:
+    print >>outfile, '  {0x%x, 0x%x, %s},' % (start, end, value)
+  print >>outfile, '};\n'
+  print >>outfile, 'static const unsigned category_properties_count = %d;\n' % len(ranges)
+  print >>outfile, '#endif  // CATEGORY_PROPERTIES_H_'
+
+if __name__ == '__main__':
+  if len(sys.argv) != 3:
+    print 'Usage: %s <input .txt> <output .h>' % sys.argv[0]
+  else:
+    main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/combining-class-parse.py b/contrib/tables/combining-class-parse.py
new file mode 100644
index 0000000..c591ddd
--- /dev/null
+++ b/contrib/tables/combining-class-parse.py
@@ -0,0 +1,34 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedCombiningClass.txt
+
+class IdentityMap(object):
+  def __getitem__(_, key):
+    return key
+
+def main(infile, outfile):
+  ranges = unicode_file_parse(infile, IdentityMap(), '0')
+  ranges = sort_and_merge(ranges)
+
+  print >>outfile, '// Generated from Unicode tables\n'
+  print >>outfile, '#ifndef COMBINING_PROPERTIES_H_'
+  print >>outfile, '#define COMBINING_PROPERTIES_H_\n'
+  print >>outfile, '#include <stdint.h>'
+  print >>outfile, 'struct combining_property {'
+  print >>outfile, '  uint32_t range_start;'
+  print >>outfile, '  uint32_t range_end;'
+  print >>outfile, '  uint8_t klass;'
+  print >>outfile, '};\n'
+  print >>outfile, 'static const struct combining_property combining_properties[] = {'
+  for (start, end, value) in ranges:
+    print >>outfile, '  {0x%x, 0x%x, %s},' % (start, end, value)
+  print >>outfile, '};\n'
+  print >>outfile, 'static const unsigned combining_properties_count = %d;\n' % len(ranges)
+  print >>outfile, '#endif  // COMBINING_PROPERTIES_H_'
+
+if __name__ == '__main__':
+  if len(sys.argv) != 3:
+    print 'Usage: %s <input .txt> <output .h>' % sys.argv[0]
+  else:
+    main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/grapheme-break-parse.py b/contrib/tables/grapheme-break-parse.py
new file mode 100644
index 0000000..a4b3534
--- /dev/null
+++ b/contrib/tables/grapheme-break-parse.py
@@ -0,0 +1,45 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
+
+# Maps the grapheme break property names used in the data file to the names
+# of the HB_GraphemeClass constants written into the generated table. Prepend
+# and SpacingMark are both folded into HB_Grapheme_Other. A property name that
+# is not listed here makes the lookup raise KeyError.
+property_to_harfbuzz = {
+  'CR': 'HB_Grapheme_CR',
+  'LF': 'HB_Grapheme_LF',
+  'Control': 'HB_Grapheme_Control',
+  'Extend': 'HB_Grapheme_Extend',
+  'Prepend': 'HB_Grapheme_Other',
+  'SpacingMark': 'HB_Grapheme_Other',
+  'L': 'HB_Grapheme_L',
+  'V': 'HB_Grapheme_V',
+  'T': 'HB_Grapheme_T',
+  'LV': 'HB_Grapheme_LV',
+  'LVT': 'HB_Grapheme_LVT',
+}
+
+def main(infile, outfile):
+  ranges = unicode_file_parse(infile, property_to_harfbuzz)
+  ranges.sort()
+
+  print >>outfile, '// Generated from Unicode Grapheme break tables\n'
+  print >>outfile, '#ifndef GRAPHEME_BREAK_PROPERTY_H_'
+  print >>outfile, '#define GRAPHEME_BREAK_PROPERTY_H_\n'
+  print >>outfile, '#include <stdint.h>'
+  print >>outfile, '#include "harfbuzz-external.h"\n'
+  print >>outfile, 'struct grapheme_break_property {'
+  print >>outfile, '  uint32_t range_start;'
+  print >>outfile, '  uint32_t range_end;'
+  print >>outfile, '  HB_GraphemeClass klass;'
+  print >>outfile, '};\n'
+  print >>outfile, 'static const struct grapheme_break_property grapheme_break_properties[] = {'
+  for (start, end, value) in ranges:
+    print >>outfile, '  {0x%x, 0x%x, %s},' % (start, end, value)
+  print >>outfile, '};\n'
+  print >>outfile, 'static const unsigned grapheme_break_properties_count = %d;\n' % len(ranges)
+  print >>outfile, '#endif  // GRAPHEME_BREAK_PROPERTY_H_'
+
+if __name__ == '__main__':
+  if len(sys.argv) != 3:
+    print 'Usage: %s <input .txt> <output .h>' % sys.argv[0]
+  else:
+    main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/scripts-parse.py b/contrib/tables/scripts-parse.py
new file mode 100644
index 0000000..23bac10
--- /dev/null
+++ b/contrib/tables/scripts-parse.py
@@ -0,0 +1,75 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/5.1.0/ucd/Scripts.txt
+
+# Maps the script names used in the data file to the names of the HB_Script
+# constants written into the generated table. Scripts that are not listed are
+# handled by ScriptDict below, which reports them as HB_Script_Common.
+script_to_harfbuzz = {
+  # This is the list of HB_Script_* at the time of writing
+  'Common': 'HB_Script_Common',
+  'Greek': 'HB_Script_Greek',
+  'Cyrillic': 'HB_Script_Cyrillic',
+  'Armenian': 'HB_Script_Armenian',
+  'Hebrew': 'HB_Script_Hebrew',
+  'Arabic': 'HB_Script_Arabic',
+  'Syriac': 'HB_Script_Syriac',
+  'Thaana': 'HB_Script_Thaana',
+  'Devanagari': 'HB_Script_Devanagari',
+  'Bengali': 'HB_Script_Bengali',
+  'Gurmukhi': 'HB_Script_Gurmukhi',
+  'Gujarati': 'HB_Script_Gujarati',
+  'Oriya': 'HB_Script_Oriya',
+  'Tamil': 'HB_Script_Tamil',
+  'Telugu': 'HB_Script_Telugu',
+  'Kannada': 'HB_Script_Kannada',
+  'Malayalam': 'HB_Script_Malayalam',
+  'Sinhala': 'HB_Script_Sinhala',
+  'Thai': 'HB_Script_Thai',
+  'Lao': 'HB_Script_Lao',
+  'Tibetan': 'HB_Script_Tibetan',
+  'Myanmar': 'HB_Script_Myanmar',
+  'Georgian': 'HB_Script_Georgian',
+  'Hangul': 'HB_Script_Hangul',
+  'Ogham': 'HB_Script_Ogham',
+  'Runic': 'HB_Script_Runic',
+  'Khmer': 'HB_Script_Khmer',
+  'Inherited': 'HB_Script_Inherited',
+}
+
+class ScriptDict(object):
+  def __init__(self, base):
+    self.base = base
+
+  def __getitem__(self, key):
+    r = self.base.get(key, None)
+    if r is None:
+      return 'HB_Script_Common'
+    return r
+
+def main(infile, outfile):
+  ranges = unicode_file_parse(infile,
+                              ScriptDict(script_to_harfbuzz),
+                              'HB_Script_Common')
+  ranges = sort_and_merge(ranges)
+
+  print >>outfile, '// Generated from Unicode script tables\n'
+  print >>outfile, '#ifndef SCRIPT_PROPERTIES_H_'
+  print >>outfile, '#define SCRIPT_PROPERTIES_H_\n'
+  print >>outfile, '#include <stdint.h>'
+  print >>outfile, '#include "harfbuzz-shaper.h"\n'
+  print >>outfile, 'struct script_property {'
+  print >>outfile, '  uint32_t range_start;'
+  print >>outfile, '  uint32_t range_end;'
+  print >>outfile, '  HB_Script script;'
+  print >>outfile, '};\n'
+  print >>outfile, 'static const struct script_property script_properties[] = {'
+  for (start, end, value) in ranges:
+    print >>outfile, '  {0x%x, 0x%x, %s},' % (start, end, value)
+  print >>outfile, '};\n'
+  print >>outfile, 'static const unsigned script_properties_count = %d;\n' % len(ranges)
+  print >>outfile, '#endif  // SCRIPT_PROPERTIES_H_'
+
+if __name__ == '__main__':
+  if len(sys.argv) != 3:
+    print 'Usage: %s <input .txt> <output .h>' % sys.argv[0]
+  else:
+    main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/unicode_parse_common.py b/contrib/tables/unicode_parse_common.py
new file mode 100644
index 0000000..ac26eca
--- /dev/null
+++ b/contrib/tables/unicode_parse_common.py
@@ -0,0 +1,70 @@
+def lines_get(f):
+  '''Parse a file like object, removing comments and returning a list of
+     lines.'''
+  def cut_comment(line):
+    first_hash = line.find('#')
+    if first_hash == -1:
+      return line
+    return line[:first_hash]
+
+  return [x for x in [cut_comment(x[:-1]) for x in f.readlines()] if len(x)]
+
+def line_split(line):
+  '''Split a line based on a semicolon separator.'''
+  def normalise(word):
+    return word.lstrip().rstrip()
+  return [normalise(x) for x in line.split(';')]
+
+def codepoints_parse(token):
+  '''Parse a Unicode style code-point range. Return either a single value or a
+     tuple of (start, end) for a range of code-points.'''
+  def fromHex(token):
+    return int(token, 16)
+  parts = token.split('..')
+  if len(parts) == 2:
+    return (fromHex(parts[0]), fromHex(parts[1]))
+  elif len(parts) == 1:
+    return fromHex(parts[0])
+  else:
+    raise ValueError(token)
+
+def unicode_file_parse(input, map, default_value = None):
+  '''Parse a file like object, @input where the first column is a code-point
+     range and the second column is mapped via the given dict, @map.'''
+  ranges = []
+  tokens = [line_split(x) for x in lines_get(input)]
+  for line in tokens:
+    if len(line) == 2:
+      codepoints = codepoints_parse(line[0])
+      value = map[line[1]]
+      if value == default_value:
+        continue
+
+      if type(codepoints) == int:
+        codepoints = (codepoints, codepoints)
+
+      ranges.append((codepoints[0], codepoints[1], value))
+    else:
+      raise ValueError(line)
+
+  return ranges
+
+def sort_and_merge(ranges):
+  '''Given a list of (start, end, value), merge elements where the ranges are
+     continuous and the values are the same.'''
+  output = []
+  ranges.sort()
+  current = None
+  for v in ranges:
+    if current is None:
+      current = v
+      continue
+    if current[1] + 1 == v[0] and current[2] == v[2]:
+      current = (current[0], v[1], v[2])
+    else:
+      output.append(current)
+      current = v
+  if current is not None:
+    output.append(current)
+
+  return output