diff options
-rw-r--r-- | contrib/README | 9 | ||||
-rw-r--r-- | contrib/harfbuzz-freetype.c | 149 | ||||
-rw-r--r-- | contrib/harfbuzz-freetype.h | 9 | ||||
-rw-r--r-- | contrib/harfbuzz-unicode-glib.c | 169 | ||||
-rw-r--r-- | contrib/harfbuzz-unicode-tables.c | 84 | ||||
-rw-r--r-- | contrib/harfbuzz-unicode.c | 264 | ||||
-rw-r--r-- | contrib/harfbuzz-unicode.h | 54 | ||||
-rw-r--r-- | contrib/tables/README | 17 | ||||
-rw-r--r-- | contrib/tables/category-parse.py | 70 | ||||
-rw-r--r-- | contrib/tables/combining-class-parse.py | 34 | ||||
-rw-r--r-- | contrib/tables/grapheme-break-parse.py | 45 | ||||
-rw-r--r-- | contrib/tables/scripts-parse.py | 75 | ||||
-rw-r--r-- | contrib/tables/unicode_parse_common.py | 70 |
13 files changed, 1049 insertions, 0 deletions
diff --git a/contrib/README b/contrib/README new file mode 100644 index 0000000..074cc52 --- /dev/null +++ b/contrib/README @@ -0,0 +1,9 @@ +Harfbuzz requires several functions to be defined in order to work with the +platform's Unicode tables etc. + +If you are building on top of Qt4 you should look at the code in the tests/ +directory for examples of how to hook up Qt4 functions to Harfbuzz. + +Otherwise, this directory contains examples of using downloaded Unicode tables +and/or glib to host Harfbuzz. You should read the README file in tables/ for how +to build the header files for some of the Unicode tables. diff --git a/contrib/harfbuzz-freetype.c b/contrib/harfbuzz-freetype.c new file mode 100644 index 0000000..a2962df --- /dev/null +++ b/contrib/harfbuzz-freetype.c @@ -0,0 +1,149 @@ +#include <stdint.h> + +#include <ft2build.h> +#include FT_FREETYPE_H +#include FT_TRUETYPE_TABLES_H + +#if 0 +#include <freetype/freetype.h> +#include <freetype/tttables.h> +#endif + +#include <harfbuzz-shaper.h> +#include "harfbuzz-unicode.h" + +static HB_Bool +hb_freetype_string_to_glyphs(HB_Font font, + const HB_UChar16 *chars, hb_uint32 len, + HB_Glyph *glyphs, hb_uint32 *numGlyphs, + HB_Bool is_rtl) { + FT_Face face = (FT_Face) font->userData; + if (len > *numGlyphs) + return 0; + + size_t i = 0, j = 0; + while (i < len) { + const uint32_t cp = utf16_to_code_point(chars, len, &i); + glyphs[j++] = FT_Get_Char_Index(face, cp); + } + + *numGlyphs = j; + + return 1; +} + +static void +hb_freetype_advances_get(HB_Font font, const HB_Glyph *glyphs, hb_uint32 len, + HB_Fixed *advances, int flags) { + FT_Face face = (FT_Face) font->userData; + + hb_uint32 i; + for (i = 0; i < len; ++i) { + const FT_Error error = FT_Load_Glyph(face, glyphs[i], FT_LOAD_DEFAULT); + if (error) { + advances[i] = 0; + continue; + } + + advances[i] = face->glyph->advance.x; + } +} + +static HB_Bool +hb_freetype_can_render(HB_Font font, const HB_UChar16 *chars, hb_uint32 len) { + FT_Face face = (FT_Face)font->userData; + + size_t i = 0; + while (i < len) { + const uint32_t cp = utf16_to_code_point(chars, len, &i); + if (FT_Get_Char_Index(face, cp) == 0) + return 0; + } + + return 1; +} + +static HB_Error +hb_freetype_outline_point_get(HB_Font font, HB_Glyph glyph, int flags, + hb_uint32 point, HB_Fixed *xpos, HB_Fixed *ypos, + hb_uint32 *n_points) { + HB_Error error = HB_Err_Ok; + FT_Face face = (FT_Face) font->userData; + + int load_flags = (flags & HB_ShaperFlag_UseDesignMetrics) ? FT_LOAD_NO_HINTING : FT_LOAD_DEFAULT; + + if ((error = (HB_Error) FT_Load_Glyph(face, glyph, load_flags))) + return error; + + if (face->glyph->format != ft_glyph_format_outline) + return (HB_Error)HB_Err_Invalid_SubTable; + + *n_points = face->glyph->outline.n_points; + if (!(*n_points)) + return HB_Err_Ok; + + if (point > *n_points) + return (HB_Error)HB_Err_Invalid_SubTable; + + *xpos = face->glyph->outline.points[point].x; + *ypos = face->glyph->outline.points[point].y; + + return HB_Err_Ok; +} + +static void +hb_freetype_glyph_metrics_get(HB_Font font, HB_Glyph glyph, + HB_GlyphMetrics *metrics) { + FT_Face face = (FT_Face) font->userData; + + const FT_Error error = FT_Load_Glyph(face, glyph, FT_LOAD_DEFAULT); + if (error) { + metrics->x = metrics->y = metrics->width = metrics->height = 0; + metrics->xOffset = metrics->yOffset = 0; + return; + } + + const FT_Glyph_Metrics *ftmetrics = &face->glyph->metrics; + metrics->width = ftmetrics->width; + metrics->height = ftmetrics->height; + metrics->x = ftmetrics->horiAdvance; + metrics->y = 0; // unclear what this is + metrics->xOffset = ftmetrics->horiBearingX; + metrics->yOffset = ftmetrics->horiBearingY; +} + +static HB_Fixed +hb_freetype_font_metric_get(HB_Font font, HB_FontMetric metric) { + FT_Face face = (FT_Face) font->userData; + + switch (metric) { + case HB_FontAscent: + // Note that we aren't scanning the VDMX table which we probably would in + // an ideal world. + return face->ascender; + default: + return 0; + } +} + +const HB_FontClass hb_freetype_class = { + hb_freetype_string_to_glyphs, + hb_freetype_advances_get, + hb_freetype_can_render, + hb_freetype_outline_point_get, + hb_freetype_glyph_metrics_get, + hb_freetype_font_metric_get, +}; + +HB_Error +hb_freetype_table_sfnt_get(void *voidface, const HB_Tag tag, HB_Byte *buffer, HB_UInt *len) { + FT_Face face = (FT_Face) voidface; + FT_ULong ftlen = *len; + + if (!FT_IS_SFNT(face)) + return HB_Err_Invalid_Argument; + + const FT_Error error = FT_Load_Sfnt_Table(face, tag, 0, buffer, &ftlen); + *len = ftlen; + return (HB_Error) error; +} diff --git a/contrib/harfbuzz-freetype.h b/contrib/harfbuzz-freetype.h new file mode 100644 index 0000000..628be16 --- /dev/null +++ b/contrib/harfbuzz-freetype.h @@ -0,0 +1,9 @@ +#ifndef HB_FREETYPE_H_ +#define HB_FREETYPE_H_ + +extern const HB_FontClass hb_freetype_class; + +HB_Error hb_freetype_table_sfnt_get(void *voidface, const HB_Tag tag, + HB_Byte *buffer, HB_UInt *len); + +#endif // HB_FREETYPE_H_ diff --git a/contrib/harfbuzz-unicode-glib.c b/contrib/harfbuzz-unicode-glib.c new file mode 100644 index 0000000..6a13433 --- /dev/null +++ b/contrib/harfbuzz-unicode-glib.c @@ -0,0 +1,169 @@ +#include "harfbuzz-external.h" + +#include <glib.h> + +static int +hb_category_for_char(HB_UChar32 ch) { + switch (g_unichar_type(ch)) { + case G_UNICODE_CONTROL: + return HB_Other_Control; + case G_UNICODE_FORMAT: + return HB_Other_Format; + case G_UNICODE_UNASSIGNED: + return HB_Other_NotAssigned; + case G_UNICODE_PRIVATE_USE: + return HB_Other_PrivateUse; + case G_UNICODE_SURROGATE: + return HB_Other_Surrogate; + case G_UNICODE_LOWERCASE_LETTER: + return HB_Letter_Lowercase; + case G_UNICODE_MODIFIER_LETTER: + return HB_Letter_Modifier; + case G_UNICODE_OTHER_LETTER: + return HB_Letter_Other; + case G_UNICODE_TITLECASE_LETTER: + return HB_Letter_Titlecase; + case G_UNICODE_UPPERCASE_LETTER: + return HB_Letter_Uppercase; + case G_UNICODE_COMBINING_MARK: + return HB_Mark_SpacingCombining; + case G_UNICODE_ENCLOSING_MARK: + return HB_Mark_Enclosing; + case G_UNICODE_NON_SPACING_MARK: + return HB_Mark_NonSpacing; + case G_UNICODE_DECIMAL_NUMBER: + return HB_Number_DecimalDigit; + case G_UNICODE_LETTER_NUMBER: + return HB_Number_Letter; + case G_UNICODE_OTHER_NUMBER: + return HB_Number_Other; + case G_UNICODE_CONNECT_PUNCTUATION: + return HB_Punctuation_Connector; + case G_UNICODE_DASH_PUNCTUATION: + return HB_Punctuation_Dash; + case G_UNICODE_CLOSE_PUNCTUATION: + return HB_Punctuation_Close; + case G_UNICODE_FINAL_PUNCTUATION: + return HB_Punctuation_FinalQuote; + case G_UNICODE_INITIAL_PUNCTUATION: + return HB_Punctuation_InitialQuote; + case G_UNICODE_OTHER_PUNCTUATION: + return HB_Punctuation_Other; + case G_UNICODE_OPEN_PUNCTUATION: + return HB_Punctuation_Open; + case G_UNICODE_CURRENCY_SYMBOL: + return HB_Symbol_Currency; + case G_UNICODE_MODIFIER_SYMBOL: + return HB_Symbol_Modifier; + case G_UNICODE_MATH_SYMBOL: + return HB_Symbol_Math; + case G_UNICODE_OTHER_SYMBOL: + return HB_Symbol_Other; + case G_UNICODE_LINE_SEPARATOR: + return HB_Separator_Line; + case G_UNICODE_PARAGRAPH_SEPARATOR: + return HB_Separator_Paragraph; + case G_UNICODE_SPACE_SEPARATOR: + return HB_Separator_Space; + default: + return HB_Symbol_Other; + } +} + +HB_LineBreakClass +HB_GetLineBreakClass(HB_UChar32 ch) { + switch (g_unichar_break_type(ch)) { + case G_UNICODE_BREAK_MANDATORY: + return HB_LineBreak_BK; + case G_UNICODE_BREAK_CARRIAGE_RETURN: + return HB_LineBreak_CR; + case G_UNICODE_BREAK_LINE_FEED: + return HB_LineBreak_LF; + case G_UNICODE_BREAK_COMBINING_MARK: + return HB_LineBreak_CM; + case G_UNICODE_BREAK_SURROGATE: + return HB_LineBreak_SG; + case G_UNICODE_BREAK_ZERO_WIDTH_SPACE: + return HB_LineBreak_ZW; + case G_UNICODE_BREAK_INSEPARABLE: + return HB_LineBreak_IN; + case G_UNICODE_BREAK_NON_BREAKING_GLUE: + return HB_LineBreak_GL; + case G_UNICODE_BREAK_CONTINGENT: + return HB_LineBreak_AL; + case G_UNICODE_BREAK_SPACE: + return HB_LineBreak_SP; + case G_UNICODE_BREAK_AFTER: + return HB_LineBreak_BA; + case G_UNICODE_BREAK_BEFORE: + return HB_LineBreak_BB; + case G_UNICODE_BREAK_BEFORE_AND_AFTER: + return HB_LineBreak_B2; + case G_UNICODE_BREAK_HYPHEN: + return HB_LineBreak_HY; + case G_UNICODE_BREAK_NON_STARTER: + return HB_LineBreak_NS; + case G_UNICODE_BREAK_OPEN_PUNCTUATION: + return HB_LineBreak_OP; + case G_UNICODE_BREAK_CLOSE_PUNCTUATION: + return HB_LineBreak_CL; + case G_UNICODE_BREAK_QUOTATION: + return HB_LineBreak_QU; + case G_UNICODE_BREAK_EXCLAMATION: + return HB_LineBreak_EX; + case G_UNICODE_BREAK_IDEOGRAPHIC: + return HB_LineBreak_ID; + case G_UNICODE_BREAK_NUMERIC: + return HB_LineBreak_NU; + case G_UNICODE_BREAK_INFIX_SEPARATOR: + return HB_LineBreak_IS; + case G_UNICODE_BREAK_SYMBOL: + return HB_LineBreak_SY; + case G_UNICODE_BREAK_ALPHABETIC: + return HB_LineBreak_AL; + case G_UNICODE_BREAK_PREFIX: + return HB_LineBreak_PR; + case G_UNICODE_BREAK_POSTFIX: + return HB_LineBreak_PO; + case G_UNICODE_BREAK_COMPLEX_CONTEXT: + return HB_LineBreak_SA; + case G_UNICODE_BREAK_AMBIGUOUS: + return HB_LineBreak_AL; + case G_UNICODE_BREAK_UNKNOWN: + return HB_LineBreak_AL; + case G_UNICODE_BREAK_NEXT_LINE: + return HB_LineBreak_AL; + case G_UNICODE_BREAK_WORD_JOINER: + return HB_LineBreak_WJ; + case G_UNICODE_BREAK_HANGUL_L_JAMO: + return HB_LineBreak_JL; + case G_UNICODE_BREAK_HANGUL_V_JAMO: + return HB_LineBreak_JV; + case G_UNICODE_BREAK_HANGUL_T_JAMO: + return HB_LineBreak_JT; + case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE: + return HB_LineBreak_H2; + case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE: + return HB_LineBreak_H3; + default: + return HB_LineBreak_AL; + } +} + +int +HB_GetUnicodeCharCombiningClass(HB_UChar32 ch) { + return g_unichar_combining_class(ch); +} + +void +HB_GetUnicodeCharProperties(HB_UChar32 ch, + HB_CharCategory *category, + int *combiningClass) { + *category = hb_category_for_char(ch); + *combiningClass = g_unichar_combining_class(ch); +} + +HB_CharCategory +HB_GetUnicodeCharCategory(HB_UChar32 ch) { + return hb_category_for_char(ch); +} diff --git a/contrib/harfbuzz-unicode-tables.c b/contrib/harfbuzz-unicode-tables.c new file mode 100644 index 0000000..3c3fead --- /dev/null +++ b/contrib/harfbuzz-unicode-tables.c @@ -0,0 +1,84 @@ +#include <stdlib.h> +#include <stdint.h> + +#include <harfbuzz-external.h> + +#include "tables/category-properties.h" +#include "tables/combining-properties.h" + +HB_LineBreakClass +HB_GetLineBreakClass(HB_UChar32 ch) { + abort(); + return 0; +} + +static int +combining_property_cmp(const void *vkey, const void *vcandidate) { + const uint32_t key = (uint32_t) (intptr_t) vkey; + const struct combining_property *candidate = vcandidate; + + if (key < candidate->range_start) { + return -1; + } else if (key > candidate->range_end) { + return 1; + } else { + return 0; + } +} + +static int +code_point_to_combining_class(HB_UChar32 cp) { + const void *vprop = bsearch((void *) (intptr_t) cp, combining_properties, + combining_properties_count, + sizeof(struct combining_property), + combining_property_cmp); + if (!vprop) + return 0; + + return ((const struct combining_property *) vprop)->klass; +} + +int +HB_GetUnicodeCharCombiningClass(HB_UChar32 ch) { + return code_point_to_combining_class(ch); + return 0; +} + +static int +category_property_cmp(const void *vkey, const void *vcandidate) { + const uint32_t key = (uint32_t) (intptr_t) vkey; + const struct category_property *candidate = vcandidate; + + if (key < candidate->range_start) { + return -1; + } else if (key > candidate->range_end) { + return 1; + } else { + return 0; + } +} + +static HB_CharCategory +code_point_to_category(HB_UChar32 cp) { + const void *vprop = bsearch((void *) (intptr_t) cp, category_properties, + category_properties_count, + sizeof(struct category_property), + category_property_cmp); + if (!vprop) + return HB_NoCategory; + + return ((const struct category_property *) vprop)->category; +} + +void +HB_GetUnicodeCharProperties(HB_UChar32 ch, + HB_CharCategory *category, + int *combiningClass) { + *category = code_point_to_category(ch); + *combiningClass = code_point_to_combining_class(ch); +} + +HB_CharCategory +HB_GetUnicodeCharCategory(HB_UChar32 ch) { + return code_point_to_category(ch); +} diff --git a/contrib/harfbuzz-unicode.c b/contrib/harfbuzz-unicode.c new file mode 100644 index 0000000..9b3c43e --- /dev/null +++ b/contrib/harfbuzz-unicode.c @@ -0,0 +1,264 @@ +#include <stdint.h> +#include <stdlib.h> + +#include <harfbuzz-external.h> +#include <harfbuzz-impl.h> +#include <harfbuzz-shaper.h> +#include "harfbuzz-unicode.h" + +#include "tables/script-properties.h" +#include "tables/grapheme-break-properties.h" + +uint32_t +utf16_to_code_point(const uint16_t *chars, size_t len, ssize_t *iter) { + const uint16_t v = chars[(*iter)++]; + if (HB_IsHighSurrogate(v)) { + // surrogate pair + if (*iter >= len) { + // the surrogate is incomplete. + return HB_InvalidCodePoint; + } + const uint16_t v2 = chars[(*iter)++]; + if (!HB_IsLowSurrogate(v2)) { + // invalidate surrogate pair. + return HB_InvalidCodePoint; + } + + return HB_SurrogateToUcs4(v, v2); + } + + if (HB_IsLowSurrogate(v)) { + // this isn't a valid code point + return HB_InvalidCodePoint; + } + + return v; +} + +uint32_t +utf16_to_code_point_prev(const uint16_t *chars, size_t len, ssize_t *iter) { + const uint16_t v = chars[(*iter)--]; + if (HB_IsLowSurrogate(v)) { + // surrogate pair + if (*iter < 0) { + // the surrogate is incomplete. + return HB_InvalidCodePoint; + } + const uint16_t v2 = chars[(*iter)--]; + if (!HB_IsHighSurrogate(v2)) { + // invalidate surrogate pair. + return HB_InvalidCodePoint; + } + + return HB_SurrogateToUcs4(v2, v); + } + + if (HB_IsHighSurrogate(v)) { + // this isn't a valid code point + return HB_InvalidCodePoint; + } + + return v; +} + +static int +script_property_cmp(const void *vkey, const void *vcandidate) { + const uint32_t key = (uint32_t) (intptr_t) vkey; + const struct script_property *candidate = vcandidate; + + if (key < candidate->range_start) { + return -1; + } else if (key > candidate->range_end) { + return 1; + } else { + return 0; + } +} + +HB_Script +code_point_to_script(uint32_t cp) { + const void *vprop = bsearch((void *) (intptr_t) cp, script_properties, + script_properties_count, + sizeof(struct script_property), + script_property_cmp); + if (!vprop) + return HB_Script_Common; + + return ((const struct script_property *) vprop)->script; +} + +char +hb_utf16_script_run_next(unsigned *num_code_points, HB_ScriptItem *output, + const uint16_t *chars, size_t len, ssize_t *iter) { + if (*iter == len) + return 0; + + output->pos = *iter; + const uint32_t init_cp = utf16_to_code_point(chars, len, iter); + unsigned cps = 1; + if (init_cp == HB_InvalidCodePoint) + return 0; + const HB_Script init_script = code_point_to_script(init_cp); + HB_Script current_script = init_script; + output->script = init_script; + + for (;;) { + if (*iter == len) + break; + const ssize_t prev_iter = *iter; + const uint32_t cp = utf16_to_code_point(chars, len, iter); + if (cp == HB_InvalidCodePoint) + return 0; + cps++; + const HB_Script script = code_point_to_script(cp); + + if (script != current_script) { + if (current_script == init_script == HB_Script_Inherited) { + // If we started off as inherited, we take whatever we can find. + output->script = script; + current_script = script; + continue; + } else if (script == HB_Script_Inherited) { + current_script = script; + continue; + } else { + *iter = prev_iter; + cps--; + break; + } + } + } + + if (output->script == HB_Script_Inherited) + output->script = HB_Script_Common; + + output->length = *iter - output->pos; + if (num_code_points) + *num_code_points = cps; + return 1; +} + +char +hb_utf16_script_run_prev(unsigned *num_code_points, HB_ScriptItem *output, + const uint16_t *chars, size_t len, ssize_t *iter) { + if (*iter == (size_t) -1) + return 0; + + const size_t ending_index = *iter; + const uint32_t init_cp = utf16_to_code_point_prev(chars, len, iter); + unsigned cps = 1; + if (init_cp == HB_InvalidCodePoint) + return 0; + const HB_Script init_script = code_point_to_script(init_cp); + HB_Script current_script = init_script; + output->script = init_script; + + for (;;) { + if (*iter < 0) + break; + const ssize_t prev_iter = *iter; + const uint32_t cp = utf16_to_code_point_prev(chars, len, iter); + if (cp == HB_InvalidCodePoint) + return 0; + cps++; + const HB_Script script = code_point_to_script(cp); + + if (script != current_script) { + if (current_script == init_script == HB_Script_Inherited) { + // If we started off as inherited, we take whatever we can find. + output->script = script; + current_script = script; + continue; + } else if (script == HB_Script_Inherited) { + current_script = script; + continue; + } else { + *iter = prev_iter; + cps--; + break; + } + } + } + + if (output->script == HB_Script_Inherited) + output->script = HB_Script_Common; + + output->pos = *iter + 1; + output->length = ending_index - *iter; + if (num_code_points) + *num_code_points = cps; + return 1; +} + +static int +grapheme_break_property_cmp(const void *vkey, const void *vcandidate) { + const uint32_t key = (uint32_t) (intptr_t) vkey; + const struct grapheme_break_property *candidate = vcandidate; + + if (key < candidate->range_start) { + return -1; + } else if (key > candidate->range_end) { + return 1; + } else { + return 0; + } +} + +HB_GraphemeClass +HB_GetGraphemeClass(HB_UChar32 ch) { + const void *vprop = bsearch((void *) (intptr_t) ch, grapheme_break_properties, + grapheme_break_properties_count, + sizeof(struct grapheme_break_property), + grapheme_break_property_cmp); + if (!vprop) + return HB_Grapheme_Other; + + return ((const struct grapheme_break_property *) vprop)->klass; +} + +HB_WordClass +HB_GetWordClass(HB_UChar32 ch) { + abort(); + return 0; +} + +HB_SentenceClass +HB_GetSentenceClass(HB_UChar32 ch) { + abort(); + return 0; +} + +void +HB_GetGraphemeAndLineBreakClass(HB_UChar32 ch, HB_GraphemeClass *gclass, HB_LineBreakClass *breakclass) { + *gclass = HB_GetGraphemeClass(ch); + *breakclass = HB_GetLineBreakClass(ch); +} + +HB_UChar16 +HB_GetMirroredChar(HB_UChar16 ch) { + abort(); + return 0; +} + +void * +HB_Library_Resolve(const char *library, const char *symbol) { + abort(); + return NULL; +} + +void * +HB_TextCodecForMib(int mib) { + abort(); + return NULL; +} + +char * +HB_TextCodec_ConvertFromUnicode(void *codec, const HB_UChar16 *unicode, hb_uint32 length, hb_uint32 *outputLength) { + abort(); + return NULL; +} + +void +HB_TextCodec_FreeResult(char *v) { + abort(); +} diff --git a/contrib/harfbuzz-unicode.h b/contrib/harfbuzz-unicode.h new file mode 100644 index 0000000..f28b3c3 --- /dev/null +++ b/contrib/harfbuzz-unicode.h @@ -0,0 +1,54 @@ +#ifndef SCRIPT_IDENTIFY_H_ +#define SCRIPT_IDENTIFY_H_ + +#include <stdint.h> + +#include <harfbuzz-shaper.h> + +static const uint32_t HB_InvalidCodePoint = 0xffffffffu; + +// ----------------------------------------------------------------------------- +// Return the next Unicode code point from a UTF-16 vector +// chars: a pointer to @len words +// iter: (input/output) an index into @chars. This is updated. +// returns: HB_InvalidCodePoint on error and the code point otherwise. +// ----------------------------------------------------------------------------- +uint32_t utf16_to_code_point(const uint16_t *chars, size_t len, ssize_t *iter); + +// ----------------------------------------------------------------------------- +// Like the above, except that the code points are traversed backwards. Thus, +// on the first call, |iter| should be |len| - 1. +// ----------------------------------------------------------------------------- +uint32_t utf16_to_code_point(const uint16_t *chars, size_t len, ssize_t *iter); + +// ----------------------------------------------------------------------------- +// Return the script of the given code point +// ----------------------------------------------------------------------------- +HB_Script code_point_to_script(uint32_t cp); + +// ----------------------------------------------------------------------------- +// Find the next script run in a UTF-16 string. +// +// A script run is a subvector of codepoints, all of which are in the same +// script. A run will never cut a surrogate pair in half at either end. +// +// num_code_points: (output, maybe NULL) the number of code points in the run +// output: (output) the @pos, @length and @script fields are set on success +// chars: the UTF-16 string +// len: the length of @chars, in words +// iter: (in/out) the current index into the string. This should be 0 for the +// first call and is updated on exit. +// +// returns: non-zero if a script run was found and returned. +// ----------------------------------------------------------------------------- +char hb_utf16_script_run_next(unsigned *num_code_points, HB_ScriptItem *output, + const uint16_t *chars, size_t len, ssize_t *iter); + +// ----------------------------------------------------------------------------- +// This is the same as above, except that the input is traversed backwards. +// Thus, on the first call, |iter| should be |len| - 1. +// ----------------------------------------------------------------------------- +char hb_utf16_script_run_prev(unsigned *num_code_points, HB_ScriptItem *output, + const uint16_t *chars, size_t len, ssize_t *iter); + +#endif diff --git a/contrib/tables/README b/contrib/tables/README new file mode 100644 index 0000000..605d1c0 --- /dev/null +++ b/contrib/tables/README @@ -0,0 +1,17 @@ +This directory contains Python script to parse several of the Unicode tables +that are downloadable from the web and generate C header files from them. + +These are the locations of the files which are parsed. You should download these +files and put them in this directory. + +http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedGeneralCategory.txt +http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedCombiningClass.txt +http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt +http://www.unicode.org/Public/5.1.0/ucd/Scripts.txt + +Then you can run the following python scripts to generate the header files: + +python category-parse.py DerivedGeneralCategory.txt category-properties.h +python combining-class-parse.py DerivedCombiningClass.txt combining-properties.h +python grapheme-break-parse.py GraphemeBreakProperty.txt grapheme-break-properties.h +python scripts-parse.py Scripts.txt script-properties.h diff --git a/contrib/tables/category-parse.py b/contrib/tables/category-parse.py new file mode 100644 index 0000000..6818c1d --- /dev/null +++ b/contrib/tables/category-parse.py @@ -0,0 +1,70 @@ +import sys +from unicode_parse_common import * + +# http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedGeneralCategory.txt + +category_to_harfbuzz = { + 'Mn': 'HB_Mark_NonSpacing', + 'Mc': 'HB_Mark_SpacingCombining', + 'Me': 'HB_Mark_Enclosing', + + 'Nd': 'HB_Number_DecimalDigit', + 'Nl': 'HB_Number_Letter', + 'No': 'HB_Number_Other', + + 'Zs': 'HB_Separator_Space', + 'Zl': 'HB_Separator_Line', + 'Zp': 'HB_Separator_Paragraph', + + 'Cc': 'HB_Other_Control', + 'Cf': 'HB_Other_Format', + 'Cs': 'HB_Other_Surrogate', + 'Co': 'HB_Other_PrivateUse', + 'Cn': 'HB_Other_NotAssigned', + + 'Lu': 'HB_Letter_Uppercase', + 'Ll': 'HB_Letter_Lowercase', + 'Lt': 'HB_Letter_Titlecase', + 'Lm': 'HB_Letter_Modifier', + 'Lo': 'HB_Letter_Other', + + 'Pc': 'HB_Punctuation_Connector', + 'Pd': 'HB_Punctuation_Dash', + 'Ps': 'HB_Punctuation_Open', + 'Pe': 'HB_Punctuation_Close', + 'Pi': 'HB_Punctuation_InitialQuote', + 'Pf': 'HB_Punctuation_FinalQuote', + 'Po': 'HB_Punctuation_Other', + + 'Sm': 'HB_Symbol_Math', + 'Sc': 'HB_Symbol_Currency', + 'Sk': 'HB_Symbol_Modifier', + 'So': 'HB_Symbol_Other', +} + +def main(infile, outfile): + ranges = unicode_file_parse(infile, category_to_harfbuzz) + ranges = sort_and_merge(ranges) + + print >>outfile, '// Generated from Unicode script tables\n' + print >>outfile, '#ifndef CATEGORY_PROPERTIES_H_' + print >>outfile, '#define CATEGORY_PROPERTIES_H_\n' + print >>outfile, '#include <stdint.h>' + print >>outfile, '#include "harfbuzz-external.h"\n' + print >>outfile, 'struct category_property {' + print >>outfile, ' uint32_t range_start;' + print >>outfile, ' uint32_t range_end;' + print >>outfile, ' HB_CharCategory category;' + print >>outfile, '};\n' + print >>outfile, 'static const struct category_property category_properties[] = {' + for (start, end, value) in ranges: + print >>outfile, ' {0x%x, 0x%x, %s},' % (start, end, value) + print >>outfile, '};\n' + print >>outfile, 'static const unsigned category_properties_count = %d;\n' % len(ranges) + print >>outfile, '#endif // CATEGORY_PROPERTIES_H_' + +if __name__ == '__main__': + if len(sys.argv) != 3: + print 'Usage: %s <input .txt> <output .h>' % sys.argv[0] + else: + main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+')) diff --git a/contrib/tables/combining-class-parse.py b/contrib/tables/combining-class-parse.py new file mode 100644 index 0000000..c591ddd --- /dev/null +++ b/contrib/tables/combining-class-parse.py @@ -0,0 +1,34 @@ +import sys +from unicode_parse_common import * + +# http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedCombiningClass.txt + +class IdentityMap(object): + def __getitem__(_, key): + return key + +def main(infile, outfile): + ranges = unicode_file_parse(infile, IdentityMap(), '0') + ranges = sort_and_merge(ranges) + + print >>outfile, '// Generated from Unicode tables\n' + print >>outfile, '#ifndef COMBINING_PROPERTIES_H_' + print >>outfile, '#define COMBINING_PROPERTIES_H_\n' + print >>outfile, '#include <stdint.h>' + print >>outfile, 'struct combining_property {' + print >>outfile, ' uint32_t range_start;' + print >>outfile, ' uint32_t range_end;' + print >>outfile, ' uint8_t klass;' + print >>outfile, '};\n' + print >>outfile, 'static const struct combining_property combining_properties[] = {' + for (start, end, value) in ranges: + print >>outfile, ' {0x%x, 0x%x, %s},' % (start, end, value) + print >>outfile, '};\n' + print >>outfile, 'static const unsigned combining_properties_count = %d;\n' % len(ranges) + print >>outfile, '#endif // COMBINING_PROPERTIES_H_' + +if __name__ == '__main__': + if len(sys.argv) != 3: + print 'Usage: %s <input .txt> <output .h>' % sys.argv[0] + else: + main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+')) diff --git a/contrib/tables/grapheme-break-parse.py b/contrib/tables/grapheme-break-parse.py new file mode 100644 index 0000000..a4b3534 --- /dev/null +++ b/contrib/tables/grapheme-break-parse.py @@ -0,0 +1,45 @@ +import sys +from unicode_parse_common import * + +# http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt + +property_to_harfbuzz = { + 'CR': 'HB_Grapheme_CR', + 'LF': 'HB_Grapheme_LF', + 'Control': 'HB_Grapheme_Control', + 'Extend': 'HB_Grapheme_Extend', + 'Prepend': 'HB_Grapheme_Other', + 'SpacingMark': 'HB_Grapheme_Other', + 'L': 'HB_Grapheme_L', + 'V': 'HB_Grapheme_V', + 'T': 'HB_Grapheme_T', + 'LV': 'HB_Grapheme_LV', + 'LVT': 'HB_Grapheme_LVT', +} + +def main(infile, outfile): + ranges = unicode_file_parse(infile, property_to_harfbuzz) + ranges.sort() + + print >>outfile, '// Generated from Unicode Grapheme break tables\n' + print >>outfile, '#ifndef GRAPHEME_BREAK_PROPERTY_H_' + print >>outfile, '#define GRAPHEME_BREAK_PROPERTY_H_\n' + print >>outfile, '#include <stdint.h>' + print >>outfile, '#include "harfbuzz-external.h"\n' + print >>outfile, 'struct grapheme_break_property {' + print >>outfile, ' uint32_t range_start;' + print >>outfile, ' uint32_t range_end;' + print >>outfile, ' HB_GraphemeClass klass;' + print >>outfile, '};\n' + print >>outfile, 'static const struct grapheme_break_property grapheme_break_properties[] = {' + for (start, end, value) in ranges: + print >>outfile, ' {0x%x, 0x%x, %s},' % (start, end, value) + print >>outfile, '};\n' + print >>outfile, 'static const unsigned grapheme_break_properties_count = %d;\n' % len(ranges) + print >>outfile, '#endif // GRAPHEME_BREAK_PROPERTY_H_' + +if __name__ == '__main__': + if len(sys.argv) != 3: + print 'Usage: %s <input .txt> <output .h>' % sys.argv[0] + else: + main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+')) diff --git a/contrib/tables/scripts-parse.py b/contrib/tables/scripts-parse.py new file mode 100644 index 0000000..23bac10 --- /dev/null +++ b/contrib/tables/scripts-parse.py @@ -0,0 +1,75 @@ +import sys +from unicode_parse_common import * + +# http://www.unicode.org/Public/5.1.0/ucd/Scripts.txt + +script_to_harfbuzz = { + # This is the list of HB_Script_* at the time of writing + 'Common': 'HB_Script_Common', + 'Greek': 'HB_Script_Greek', + 'Cyrillic': 'HB_Script_Cyrillic', + 'Armenian': 'HB_Script_Armenian', + 'Hebrew': 'HB_Script_Hebrew', + 'Arabic': 'HB_Script_Arabic', + 'Syriac': 'HB_Script_Syriac', + 'Thaana': 'HB_Script_Thaana', + 'Devanagari': 'HB_Script_Devanagari', + 'Bengali': 'HB_Script_Bengali', + 'Gurmukhi': 'HB_Script_Gurmukhi', + 'Gujarati': 'HB_Script_Gujarati', + 'Oriya': 'HB_Script_Oriya', + 'Tamil': 'HB_Script_Tamil', + 'Telugu': 'HB_Script_Telugu', + 'Kannada': 'HB_Script_Kannada', + 'Malayalam': 'HB_Script_Malayalam', + 'Sinhala': 'HB_Script_Sinhala', + 'Thai': 'HB_Script_Thai', + 'Lao': 'HB_Script_Lao', + 'Tibetan': 'HB_Script_Tibetan', + 'Myanmar': 'HB_Script_Myanmar', + 'Georgian': 'HB_Script_Georgian', + 'Hangul': 'HB_Script_Hangul', + 'Ogham': 'HB_Script_Ogham', + 'Runic': 'HB_Script_Runic', + 'Khmer': 'HB_Script_Khmer', + 'Inherited': 'HB_Script_Inherited', +} + +class ScriptDict(object): + def __init__(self, base): + self.base = base + + def __getitem__(self, key): + r = self.base.get(key, None) + if r is None: + return 'HB_Script_Common' + return r + +def main(infile, outfile): + ranges = unicode_file_parse(infile, + ScriptDict(script_to_harfbuzz), + 'HB_Script_Common') + ranges = sort_and_merge(ranges) + + print >>outfile, '// Generated from Unicode script tables\n' + print >>outfile, '#ifndef SCRIPT_PROPERTIES_H_' + print >>outfile, '#define SCRIPT_PROPERTIES_H_\n' + print >>outfile, '#include <stdint.h>' + print >>outfile, '#include "harfbuzz-shaper.h"\n' + print >>outfile, 'struct script_property {' + print >>outfile, ' uint32_t range_start;' + print >>outfile, ' uint32_t range_end;' + print >>outfile, ' HB_Script script;' + print >>outfile, '};\n' + print >>outfile, 'static const struct script_property script_properties[] = {' + for (start, end, value) in ranges: + print >>outfile, ' {0x%x, 0x%x, %s},' % (start, end, value) + print >>outfile, '};\n' + print >>outfile, 'static const unsigned script_properties_count = %d;\n' % len(ranges) + print >>outfile, '#endif // SCRIPT_PROPERTIES_H_' + +if __name__ == '__main__': + if len(sys.argv) != 3: + print 'Usage: %s <input .txt> <output .h>' % sys.argv[0] + else: + main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+')) diff --git a/contrib/tables/unicode_parse_common.py b/contrib/tables/unicode_parse_common.py new file mode 100644 index 0000000..ac26eca --- /dev/null +++ b/contrib/tables/unicode_parse_common.py @@ -0,0 +1,70 @@ +def lines_get(f): + '''Parse a file like object, removing comments and returning a list of + lines.''' + def cut_comment(line): + first_hash = line.find('#') + if first_hash == -1: + return line + return line[:first_hash] + + return [x for x in [cut_comment(x[:-1]) for x in f.readlines()] if len(x)] + +def line_split(line): + '''Split a line based on a semicolon separator.''' + def normalise(word): + return word.lstrip().rstrip() + return [normalise(x) for x in line.split(';')] + +def codepoints_parse(token): + '''Parse a Unicode style code-point range. Return either a single value or a + tuple of (start, end) for a range of code-points.''' + def fromHex(token): + return int(token, 16) + parts = token.split('..') + if len(parts) == 2: + return (fromHex(parts[0]), fromHex(parts[1])) + elif len(parts) == 1: + return fromHex(parts[0]) + else: + raise ValueError(token) + +def unicode_file_parse(input, map, default_value = None): + '''Parse a file like object, @input where the first column is a code-point + range and the second column is mapped via the given dict, @map.''' + ranges = [] + tokens = [line_split(x) for x in lines_get(input)] + for line in tokens: + if len(line) == 2: + codepoints = codepoints_parse(line[0]) + value = map[line[1]] + if value == default_value: + continue + + if type(codepoints) == int: + codepoints = (codepoints, codepoints) + + ranges.append((codepoints[0], codepoints[1], value)) + else: + raise ValueError(line) + + return ranges + +def sort_and_merge(ranges): + '''Given a list of (start, end, value), merge elements where the ranges are + continuous and the values are the same.''' + output = [] + ranges.sort() + current = None + for v in ranges: + if current is None: + current = v + continue + if current[1] + 1 == v[0] and current[2] == v[2]: + current = (current[0], v[1], v[2]) + else: + output.append(current) + current = v + if current is not None: + output.append(current) + + return output |