summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Behdad Esfahbod <behdad@behdad.org> 2018-10-01 19:09:58 +0200
committer Behdad Esfahbod <behdad@behdad.org> 2018-10-01 19:23:47 +0200
commit 51436547162a18e88144e7125ad6ce4a69a08d4b (patch)
tree 86dc145119a9b8a58aef672a3354a3bcc89ecdb2
parent aaaa65baa7fcfb65ae814528bdd93cc5c4ea540d (diff)
[khmer] Rewrite grammar completely
Based on experimenting with Uniscribe to extract grammar and categories.

Failures down from 44 to 35:

KHMER: 299089 out of 299124 tests passed. 35 failed (0.0117008%).

We still don't enforce the one-matra rule pre-decomposition, but enforce an order and one-matra-per-position post-decomposition.

https://github.com/harfbuzz/harfbuzz/issues/667
-rw-r--r-- src/hb-ot-shape-complex-indic.hh 2
-rw-r--r-- src/hb-ot-shape-complex-khmer-machine.hh 300
-rw-r--r-- src/hb-ot-shape-complex-khmer-machine.rl 42
-rw-r--r-- src/hb-ot-shape-complex-khmer.cc 4
-rw-r--r-- src/hb-ot-shape-complex-khmer.hh 99
5 files changed, 270 insertions, 177 deletions
diff --git a/src/hb-ot-shape-complex-indic.hh b/src/hb-ot-shape-complex-indic.hh
index 9e597797..fe5595f8 100644
--- a/src/hb-ot-shape-complex-indic.hh
+++ b/src/hb-ot-shape-complex-indic.hh
@@ -125,7 +125,7 @@ enum indic_syllabic_category_t {
INDIC_SYLLABIC_CATEGORY_CONSONANT_PRECEDING_REPHA = OT_Repha,
INDIC_SYLLABIC_CATEGORY_CONSONANT_PREFIXED = OT_X, /* Don't care. */
INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED = OT_CM,
- INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA = OT_N,
+ INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA = OT_CM,
INDIC_SYLLABIC_CATEGORY_CONSONANT_WITH_STACKER = OT_CS,
INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK = OT_SM, /* https://github.com/harfbuzz/harfbuzz/issues/552 */
INDIC_SYLLABIC_CATEGORY_INVISIBLE_STACKER = OT_Coeng,
diff --git a/src/hb-ot-shape-complex-khmer-machine.hh b/src/hb-ot-shape-complex-khmer-machine.hh
index a7e1711e..d013456b 100644
--- a/src/hb-ot-shape-complex-khmer-machine.hh
+++ b/src/hb-ot-shape-complex-khmer-machine.hh
@@ -34,130 +34,200 @@
#line 36 "hb-ot-shape-complex-khmer-machine.hh"
static const unsigned char _khmer_syllable_machine_trans_keys[] = {
- 7u, 7u, 1u, 16u, 13u, 13u, 1u, 16u, 7u, 13u, 7u, 7u, 1u, 16u, 13u, 13u,
- 1u, 16u, 7u, 13u, 1u, 16u, 3u, 14u, 3u, 14u, 5u, 14u, 3u, 14u, 5u, 14u,
- 8u, 8u, 3u, 13u, 3u, 8u, 8u, 8u, 3u, 8u, 3u, 14u, 3u, 14u, 5u, 14u,
- 3u, 14u, 5u, 14u, 8u, 8u, 3u, 13u, 3u, 8u, 8u, 8u, 3u, 8u, 3u, 14u,
- 3u, 14u, 7u, 13u, 7u, 7u, 1u, 16u, 0
+ 5u, 26u, 5u, 21u, 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u, 5u, 26u, 5u, 21u,
+ 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u, 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u,
+ 5u, 26u, 5u, 21u, 5u, 26u, 5u, 21u, 5u, 26u, 1u, 16u, 1u, 29u, 5u, 29u,
+ 5u, 29u, 5u, 29u, 22u, 22u, 5u, 22u, 5u, 29u, 5u, 29u, 5u, 29u, 5u, 26u,
+ 5u, 29u, 5u, 29u, 22u, 22u, 5u, 22u, 5u, 29u, 5u, 29u, 1u, 16u, 5u, 29u,
+ 5u, 29u, 0
};
static const char _khmer_syllable_machine_key_spans[] = {
- 1, 16, 1, 16, 7, 1, 16, 1,
- 16, 7, 16, 12, 12, 10, 12, 10,
- 1, 11, 6, 1, 6, 12, 12, 10,
- 12, 10, 1, 11, 6, 1, 6, 12,
- 12, 7, 1, 16
+ 22, 17, 22, 17, 16, 17, 22, 17,
+ 22, 17, 16, 17, 22, 17, 16, 17,
+ 22, 17, 22, 17, 22, 16, 29, 25,
+ 25, 25, 1, 18, 25, 25, 25, 22,
+ 25, 25, 1, 18, 25, 25, 16, 25,
+ 25
};
static const short _khmer_syllable_machine_index_offsets[] = {
- 0, 2, 19, 21, 38, 46, 48, 65,
- 67, 84, 92, 109, 122, 135, 146, 159,
- 170, 172, 184, 191, 193, 200, 213, 226,
- 237, 250, 261, 263, 275, 282, 284, 291,
- 304, 317, 325, 327
+ 0, 23, 41, 64, 82, 99, 117, 140,
+ 158, 181, 199, 216, 234, 257, 275, 292,
+ 310, 333, 351, 374, 392, 415, 432, 462,
+ 488, 514, 540, 542, 561, 587, 613, 639,
+ 662, 688, 714, 716, 735, 761, 787, 804,
+ 830
};
static const char _khmer_syllable_machine_indicies[] = {
- 1, 0, 2, 2, 0, 0, 0, 0,
+ 1, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 2,
+ 3, 0, 0, 0, 0, 4, 0, 1,
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 3,
+ 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 2, 0, 3, 0, 4, 4, 0,
+ 0, 3, 0, 0, 0, 0, 4, 0,
+ 5, 5, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 4, 0, 1, 0,
- 0, 0, 0, 0, 5, 0, 7, 6,
- 8, 8, 6, 6, 6, 6, 6, 6,
- 6, 6, 6, 6, 6, 6, 6, 8,
- 6, 9, 6, 10, 10, 6, 6, 6,
- 6, 6, 6, 6, 6, 6, 6, 6,
- 6, 6, 10, 6, 7, 6, 6, 6,
- 6, 6, 11, 6, 4, 4, 13, 12,
- 14, 15, 7, 16, 12, 12, 4, 4,
- 11, 17, 12, 4, 12, 19, 18, 20,
- 21, 1, 22, 18, 18, 18, 18, 5,
- 23, 18, 24, 18, 21, 21, 1, 22,
- 18, 18, 18, 18, 18, 23, 18, 21,
- 21, 1, 22, 18, 18, 18, 18, 18,
- 23, 18, 25, 18, 21, 21, 1, 22,
- 18, 18, 18, 18, 18, 26, 18, 21,
- 21, 1, 22, 18, 18, 18, 18, 18,
- 26, 18, 27, 18, 28, 18, 29, 18,
- 18, 22, 18, 18, 18, 18, 3, 18,
- 30, 18, 18, 18, 18, 22, 18, 22,
- 18, 28, 18, 18, 18, 18, 22, 18,
- 19, 18, 21, 21, 1, 22, 18, 18,
- 18, 18, 18, 23, 18, 32, 31, 33,
- 33, 7, 16, 31, 31, 31, 31, 31,
- 34, 31, 33, 33, 7, 16, 31, 31,
- 31, 31, 31, 34, 31, 35, 31, 33,
- 33, 7, 16, 31, 31, 31, 31, 31,
- 36, 31, 33, 33, 7, 16, 31, 31,
- 31, 31, 31, 36, 31, 37, 31, 38,
- 31, 39, 31, 31, 16, 31, 31, 31,
- 31, 9, 31, 40, 31, 31, 31, 31,
- 16, 31, 16, 31, 38, 31, 31, 31,
- 31, 16, 31, 13, 31, 41, 33, 7,
- 16, 31, 31, 31, 31, 11, 34, 31,
- 13, 31, 33, 33, 7, 16, 31, 31,
- 31, 31, 31, 34, 31, 7, 42, 42,
- 42, 42, 42, 11, 42, 7, 42, 10,
- 10, 42, 42, 42, 42, 42, 42, 42,
- 42, 42, 42, 42, 42, 42, 10, 42,
+ 4, 0, 6, 6, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 6, 0, 7, 7, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 8, 0, 9, 9, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 10, 0, 0,
+ 0, 0, 4, 0, 9, 9, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 10, 0, 11, 11,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 12, 0,
+ 0, 0, 0, 4, 0, 11, 11, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 12, 0, 13,
+ 13, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 13, 0,
+ 15, 15, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14,
+ 16, 14, 15, 15, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 16, 17, 17, 17, 17, 18,
+ 17, 19, 19, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 18, 17, 20, 20, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 20, 17, 21, 21, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 22, 17, 23, 23,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 24, 17,
+ 17, 17, 17, 18, 17, 23, 23, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 24, 17, 25,
+ 25, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 26,
+ 17, 17, 17, 17, 18, 17, 25, 25,
+ 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 26, 17,
+ 15, 15, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 27,
+ 16, 17, 17, 17, 17, 18, 17, 28,
+ 28, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 28, 17,
+ 13, 13, 29, 29, 30, 30, 29, 29,
+ 29, 29, 2, 2, 29, 31, 29, 13,
+ 29, 29, 29, 29, 16, 20, 29, 29,
+ 29, 18, 24, 26, 22, 29, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 34,
+ 32, 32, 32, 32, 32, 2, 3, 6,
+ 32, 32, 32, 4, 10, 12, 8, 32,
+ 35, 35, 32, 32, 32, 32, 32, 32,
+ 32, 36, 32, 32, 32, 32, 32, 32,
+ 3, 6, 32, 32, 32, 4, 10, 12,
+ 8, 32, 5, 5, 32, 32, 32, 32,
+ 32, 32, 32, 36, 32, 32, 32, 32,
+ 32, 32, 4, 6, 32, 32, 32, 32,
+ 32, 32, 8, 32, 6, 32, 7, 7,
+ 32, 32, 32, 32, 32, 32, 32, 36,
+ 32, 32, 32, 32, 32, 32, 8, 6,
+ 32, 37, 37, 32, 32, 32, 32, 32,
+ 32, 32, 36, 32, 32, 32, 32, 32,
+ 32, 10, 6, 32, 32, 32, 4, 32,
+ 32, 8, 32, 38, 38, 32, 32, 32,
+ 32, 32, 32, 32, 36, 32, 32, 32,
+ 32, 32, 32, 12, 6, 32, 32, 32,
+ 4, 10, 32, 8, 32, 35, 35, 32,
+ 32, 32, 32, 32, 32, 32, 34, 32,
+ 32, 32, 32, 32, 32, 3, 6, 32,
+ 32, 32, 4, 10, 12, 8, 32, 15,
+ 15, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 16,
+ 39, 39, 39, 39, 18, 39, 41, 41,
+ 40, 40, 40, 40, 40, 40, 40, 42,
+ 40, 40, 40, 40, 40, 40, 16, 20,
+ 40, 40, 40, 18, 24, 26, 22, 40,
+ 19, 19, 40, 40, 40, 40, 40, 40,
+ 40, 42, 40, 40, 40, 40, 40, 40,
+ 18, 20, 40, 40, 40, 40, 40, 40,
+ 22, 40, 20, 40, 21, 21, 40, 40,
+ 40, 40, 40, 40, 40, 42, 40, 40,
+ 40, 40, 40, 40, 22, 20, 40, 43,
+ 43, 40, 40, 40, 40, 40, 40, 40,
+ 42, 40, 40, 40, 40, 40, 40, 24,
+ 20, 40, 40, 40, 18, 40, 40, 22,
+ 40, 44, 44, 40, 40, 40, 40, 40,
+ 40, 40, 42, 40, 40, 40, 40, 40,
+ 40, 26, 20, 40, 40, 40, 18, 24,
+ 40, 22, 40, 28, 28, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 28, 39, 45, 45, 40, 40,
+ 40, 40, 40, 40, 40, 46, 40, 40,
+ 40, 40, 40, 27, 16, 20, 40, 40,
+ 40, 18, 24, 26, 22, 40, 41, 41,
+ 40, 40, 40, 40, 40, 40, 40, 46,
+ 40, 40, 40, 40, 40, 40, 16, 20,
+ 40, 40, 40, 18, 24, 26, 22, 40,
0
};
static const char _khmer_syllable_machine_trans_targs[] = {
- 10, 14, 17, 20, 11, 21, 10, 24,
- 27, 30, 31, 32, 10, 22, 33, 34,
- 26, 35, 10, 12, 4, 0, 16, 3,
- 13, 15, 1, 10, 18, 2, 19, 10,
- 23, 5, 8, 25, 6, 10, 28, 7,
- 29, 9, 10
+ 22, 1, 30, 24, 25, 3, 26, 5,
+ 27, 7, 28, 9, 29, 23, 22, 11,
+ 32, 22, 33, 13, 34, 15, 35, 17,
+ 36, 19, 37, 40, 39, 22, 31, 38,
+ 22, 0, 10, 2, 4, 6, 8, 22,
+ 22, 12, 14, 16, 18, 20, 21
};
static const char _khmer_syllable_machine_trans_actions[] = {
- 1, 2, 2, 0, 2, 2, 3, 2,
- 2, 0, 2, 2, 6, 2, 0, 0,
- 0, 0, 7, 2, 0, 0, 0, 0,
- 2, 2, 0, 8, 0, 0, 0, 9,
- 2, 0, 0, 2, 0, 10, 0, 0,
- 0, 0, 11
+ 1, 0, 2, 2, 2, 0, 0, 0,
+ 2, 0, 2, 0, 2, 2, 3, 0,
+ 4, 5, 2, 0, 0, 0, 2, 0,
+ 2, 0, 2, 4, 4, 8, 9, 0,
+ 10, 0, 0, 0, 0, 0, 0, 11,
+ 12, 0, 0, 0, 0, 0, 0
};
static const char _khmer_syllable_machine_to_state_actions[] = {
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 4, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 6, 0,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0
};
static const char _khmer_syllable_machine_from_state_actions[] = {
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 5, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 7, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0
+ 0
};
static const unsigned char _khmer_syllable_machine_eof_trans[] = {
- 1, 1, 1, 1, 1, 7, 7, 7,
- 7, 7, 0, 19, 19, 19, 19, 19,
- 19, 19, 19, 19, 19, 19, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 43, 43, 43
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 15, 18, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 0, 33,
+ 33, 33, 33, 33, 33, 33, 33, 40,
+ 41, 41, 41, 41, 41, 41, 40, 41,
+ 41
};
-static const int khmer_syllable_machine_start = 10;
-static const int khmer_syllable_machine_first_final = 10;
+static const int khmer_syllable_machine_start = 22;
+static const int khmer_syllable_machine_first_final = 22;
static const int khmer_syllable_machine_error = -1;
-static const int khmer_syllable_machine_en_main = 10;
+static const int khmer_syllable_machine_en_main = 22;
#line 36 "hb-ot-shape-complex-khmer-machine.rl"
-#line 74 "hb-ot-shape-complex-khmer-machine.rl"
+#line 80 "hb-ot-shape-complex-khmer-machine.rl"
#define found_syllable(syllable_type) \
@@ -177,7 +247,7 @@ find_syllables (hb_buffer_t *buffer)
int cs;
hb_glyph_info_t *info = buffer->info;
-#line 181 "hb-ot-shape-complex-khmer-machine.hh"
+#line 251 "hb-ot-shape-complex-khmer-machine.hh"
{
cs = khmer_syllable_machine_start;
ts = 0;
@@ -185,7 +255,7 @@ find_syllables (hb_buffer_t *buffer)
act = 0;
}
-#line 95 "hb-ot-shape-complex-khmer-machine.rl"
+#line 101 "hb-ot-shape-complex-khmer-machine.rl"
p = 0;
@@ -194,7 +264,7 @@ find_syllables (hb_buffer_t *buffer)
unsigned int last = 0;
unsigned int syllable_serial = 1;
-#line 198 "hb-ot-shape-complex-khmer-machine.hh"
+#line 268 "hb-ot-shape-complex-khmer-machine.hh"
{
int _slen;
int _trans;
@@ -204,11 +274,11 @@ find_syllables (hb_buffer_t *buffer)
goto _test_eof;
_resume:
switch ( _khmer_syllable_machine_from_state_actions[cs] ) {
- case 5:
+ case 7:
#line 1 "NONE"
{ts = p;}
break;
-#line 212 "hb-ot-shape-complex-khmer-machine.hh"
+#line 282 "hb-ot-shape-complex-khmer-machine.hh"
}
_keys = _khmer_syllable_machine_trans_keys + (cs<<1);
@@ -231,47 +301,63 @@ _eof_trans:
{te = p+1;}
break;
case 8:
-#line 68 "hb-ot-shape-complex-khmer-machine.rl"
- {te = p+1;{ found_syllable (consonant_syllable); }}
- break;
- case 10:
-#line 69 "hb-ot-shape-complex-khmer-machine.rl"
- {te = p+1;{ found_syllable (broken_cluster); }}
- break;
- case 6:
-#line 70 "hb-ot-shape-complex-khmer-machine.rl"
+#line 76 "hb-ot-shape-complex-khmer-machine.rl"
{te = p+1;{ found_syllable (non_khmer_cluster); }}
break;
- case 7:
-#line 68 "hb-ot-shape-complex-khmer-machine.rl"
+ case 10:
+#line 74 "hb-ot-shape-complex-khmer-machine.rl"
{te = p;p--;{ found_syllable (consonant_syllable); }}
break;
- case 9:
-#line 69 "hb-ot-shape-complex-khmer-machine.rl"
+ case 12:
+#line 75 "hb-ot-shape-complex-khmer-machine.rl"
{te = p;p--;{ found_syllable (broken_cluster); }}
break;
case 11:
-#line 70 "hb-ot-shape-complex-khmer-machine.rl"
+#line 76 "hb-ot-shape-complex-khmer-machine.rl"
{te = p;p--;{ found_syllable (non_khmer_cluster); }}
break;
case 1:
-#line 68 "hb-ot-shape-complex-khmer-machine.rl"
+#line 74 "hb-ot-shape-complex-khmer-machine.rl"
{{p = ((te))-1;}{ found_syllable (consonant_syllable); }}
break;
- case 3:
-#line 69 "hb-ot-shape-complex-khmer-machine.rl"
+ case 5:
+#line 75 "hb-ot-shape-complex-khmer-machine.rl"
{{p = ((te))-1;}{ found_syllable (broken_cluster); }}
break;
-#line 266 "hb-ot-shape-complex-khmer-machine.hh"
+ case 3:
+#line 1 "NONE"
+ { switch( act ) {
+ case 2:
+ {{p = ((te))-1;} found_syllable (broken_cluster); }
+ break;
+ case 3:
+ {{p = ((te))-1;} found_syllable (non_khmer_cluster); }
+ break;
+ }
+ }
+ break;
+ case 4:
+#line 1 "NONE"
+ {te = p+1;}
+#line 75 "hb-ot-shape-complex-khmer-machine.rl"
+ {act = 2;}
+ break;
+ case 9:
+#line 1 "NONE"
+ {te = p+1;}
+#line 76 "hb-ot-shape-complex-khmer-machine.rl"
+ {act = 3;}
+ break;
+#line 352 "hb-ot-shape-complex-khmer-machine.hh"
}
_again:
switch ( _khmer_syllable_machine_to_state_actions[cs] ) {
- case 4:
+ case 6:
#line 1 "NONE"
{ts = 0;}
break;
-#line 275 "hb-ot-shape-complex-khmer-machine.hh"
+#line 361 "hb-ot-shape-complex-khmer-machine.hh"
}
if ( ++p != pe )
@@ -287,7 +373,7 @@ _again:
}
-#line 104 "hb-ot-shape-complex-khmer-machine.rl"
+#line 110 "hb-ot-shape-complex-khmer-machine.rl"
}
diff --git a/src/hb-ot-shape-complex-khmer-machine.rl b/src/hb-ot-shape-complex-khmer-machine.rl
index 7c795162..eb9f6988 100644
--- a/src/hb-ot-shape-complex-khmer-machine.rl
+++ b/src/hb-ot-shape-complex-khmer-machine.rl
@@ -40,28 +40,34 @@
# Same order as enum khmer_category_t. Not sure how to avoid duplication.
C = 1;
V = 2;
-N = 3;
ZWNJ = 5;
ZWJ = 6;
-M = 7;
-SM = 8;
PLACEHOLDER = 11;
DOTTEDCIRCLE = 12;
-RS = 13;
-Coeng = 14;
-Ra = 16;
-
-c = (C | Ra | V); # is_consonant
-n = ((ZWNJ?.RS)? (N.N?)?); # is_consonant_modifier
-z = ZWJ|ZWNJ; # is_joiner
-
-cn = c.n?;
-matra_group = z?.M.N?;
-syllable_tail = (SM.SM?)?;
-
-
-broken_cluster = n? (Coeng.cn)* matra_group* (Coeng.cn)? syllable_tail;
-consonant_syllable = (c|PLACEHOLDER|DOTTEDCIRCLE) broken_cluster;
+Coeng= 14;
+Ra = 16;
+Robatic = 20;
+Xgroup = 21;
+Ygroup = 22;
+VAbv = 26;
+VBlw = 27;
+VPre = 28;
+VPst = 29;
+
+c = (C | Ra | V);
+cn = c.((ZWJ|ZWNJ)?.Robatic)?;
+joiner = (ZWJ | ZWNJ);
+xgroup = (joiner*.Xgroup)*;
+ygroup = Ygroup*;
+
+# This grammar was experimentally extracted from what Uniscribe allows.
+
+matra_group = VPre? xgroup VBlw? xgroup (joiner?.VAbv)? xgroup VPst?;
+syllable_tail = xgroup matra_group xgroup (Coeng.c)? ygroup;
+
+
+broken_cluster = (Coeng.cn)* syllable_tail;
+consonant_syllable = (cn|PLACEHOLDER|DOTTEDCIRCLE) broken_cluster;
other = any;
main := |*
diff --git a/src/hb-ot-shape-complex-khmer.cc b/src/hb-ot-shape-complex-khmer.cc
index d46f0b3a..9c766be1 100644
--- a/src/hb-ot-shape-complex-khmer.cc
+++ b/src/hb-ot-shape-complex-khmer.cc
@@ -241,7 +241,6 @@ setup_masks_khmer (const hb_ot_shape_plan_t *plan HB_UNUSED,
hb_font_t *font HB_UNUSED)
{
HB_BUFFER_ALLOCATE_VAR (buffer, khmer_category);
- HB_BUFFER_ALLOCATE_VAR (buffer, khmer_position);
/* We cannot setup masks here. We save information about characters
* and setup masks later on in a pause-callback. */
@@ -330,7 +329,7 @@ reorder_consonant_syllable (const hb_ot_shape_plan_t *plan,
}
/* Reorder left matra piece. */
- else if (info[i].khmer_position() == POS_PRE_M)
+ else if (info[i].khmer_category() == OT_VPre)
{
/* Move to the start. */
buffer->merge_clusters (start, i + 1);
@@ -432,7 +431,6 @@ reorder (const hb_ot_shape_plan_t *plan,
initial_reordering_syllable (plan, font->face, buffer, start, end);
HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_category);
- HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_position);
}
static void
diff --git a/src/hb-ot-shape-complex-khmer.hh b/src/hb-ot-shape-complex-khmer.hh
index c86e7aad..4ee0b838 100644
--- a/src/hb-ot-shape-complex-khmer.hh
+++ b/src/hb-ot-shape-complex-khmer.hh
@@ -34,30 +34,22 @@
/* buffer var allocations */
#define khmer_category() indic_category() /* khmer_category_t */
-#define khmer_position() indic_position() /* khmer_position_t */
+#define khmer_position() indic_position() /* indic_position_t */
-typedef indic_category_t khmer_category_t;
-typedef indic_position_t khmer_position_t;
-
-
-static inline khmer_position_t
-matra_position_khmer (khmer_position_t side)
+/* Note: This enum is duplicated in the -machine.rl source file.
+ * Not sure how to avoid duplication. */
+enum khmer_category_t
{
- switch ((int) side)
- {
- case POS_PRE_C:
- return POS_PRE_M;
+ OT_Robatic = 20,
+ OT_Xgroup = 21,
+ OT_Ygroup = 22,
- case POS_POST_C:
- case POS_ABOVE_C:
- case POS_BELOW_C:
- return POS_AFTER_POST;
-
- default:
- return side;
- };
-}
+ OT_VAbv = 26,
+ OT_VBlw = 27,
+ OT_VPre = 28,
+ OT_VPst = 29,
+};
static inline void
set_khmer_properties (hb_glyph_info_t &info)
@@ -65,47 +57,58 @@ set_khmer_properties (hb_glyph_info_t &info)
hb_codepoint_t u = info.codepoint;
unsigned int type = hb_indic_get_categories (u);
khmer_category_t cat = (khmer_category_t) (type & 0x7Fu);
- khmer_position_t pos = (khmer_position_t) (type >> 8);
+ indic_position_t pos = (indic_position_t) (type >> 8);
/*
* Re-assign category
+ *
+ * These categories are experimentally extracted from what Uniscribe allows.
*/
-
- if (unlikely (u == 0x17C6u)) cat = OT_N; /* Khmer Bindu doesn't like to be repositioned. */
- else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x17CDu, 0x17D1u) ||
- u == 0x17CBu || u == 0x17D3u || u == 0x17DDu)) /* Khmer Various signs */
+ switch (u)
{
- /* These can occur mid-syllable (eg. before matras), even though Unicode marks them as Syllable_Modifier.
- * https://github.com/roozbehp/unicode-data/issues/5 */
- cat = OT_M;
- pos = POS_ABOVE_C;
+ case 0x179Au:
+ cat = (khmer_category_t) OT_Ra;
+ break;
+
+ case 0x17CCu:
+ case 0x17C9u:
+ case 0x17CAu:
+ cat = OT_Robatic;
+ break;
+
+ case 0x17C6u:
+ case 0x17CBu:
+ case 0x17CDu:
+ case 0x17CEu:
+ case 0x17CFu:
+ case 0x17D0u:
+ case 0x17D1u:
+ cat = OT_Xgroup;
+ break;
+
+ case 0x17C7u:
+ case 0x17C8u:
+ case 0x17DDu:
+ case 0x17D3u: /* Just guessing. Uniscribe doesn't categorize it. */
+ cat = OT_Ygroup;
+ break;
}
- else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x2010u, 0x2011u))) cat = OT_PLACEHOLDER;
- else if (unlikely (u == 0x25CCu)) cat = OT_DOTTEDCIRCLE;
-
/*
* Re-assign position.
*/
-
- if ((FLAG_UNSAFE (cat) & CONSONANT_FLAGS))
- {
- pos = POS_BASE_C;
- if (u == 0x179Au)
- cat = OT_Ra;
- }
- else if (cat == OT_M)
- {
- pos = matra_position_khmer (pos);
- }
- else if ((FLAG_UNSAFE (cat) & (FLAG (OT_SM) | FLAG (OT_A) | FLAG (OT_Symbol))))
- {
- pos = POS_SMVD;
- }
+ if (cat == (khmer_category_t) OT_M)
+ switch ((int) pos)
+ {
+ case POS_PRE_C: cat = OT_VPre; break;
+ case POS_BELOW_C: cat = OT_VBlw; break;
+ case POS_ABOVE_C: cat = OT_VAbv; break;
+ case POS_POST_C: cat = OT_VPst; break;
+ default: assert (0);
+ };
info.khmer_category() = cat;
- info.khmer_position() = pos;
}