diff options
author | Behdad Esfahbod <behdad@behdad.org> | 2018-10-01 19:09:58 +0200 |
---|---|---|
committer | Behdad Esfahbod <behdad@behdad.org> | 2018-10-01 19:23:47 +0200 |
commit | 51436547162a18e88144e7125ad6ce4a69a08d4b (patch) | |
tree | 86dc145119a9b8a58aef672a3354a3bcc89ecdb2 | |
parent | aaaa65baa7fcfb65ae814528bdd93cc5c4ea540d (diff) |
[khmer] Rewrite grammar completely
Based on experimenting with Uniscribe to extract grammar and categories.
Failures down from 44 to 35:
KHMER: 299089 out of 299124 tests passed. 35 failed (0.0117008%)
We still don't enforce the one-matra rule pre-decomposition, but enforce
an order and one-matra-per-position post-decomposition.
https://github.com/harfbuzz/harfbuzz/issues/667
-rw-r--r-- | src/hb-ot-shape-complex-indic.hh | 2 | ||||
-rw-r--r-- | src/hb-ot-shape-complex-khmer-machine.hh | 300 | ||||
-rw-r--r-- | src/hb-ot-shape-complex-khmer-machine.rl | 42 | ||||
-rw-r--r-- | src/hb-ot-shape-complex-khmer.cc | 4 | ||||
-rw-r--r-- | src/hb-ot-shape-complex-khmer.hh | 99 |
5 files changed, 270 insertions, 177 deletions
diff --git a/src/hb-ot-shape-complex-indic.hh b/src/hb-ot-shape-complex-indic.hh index 9e597797..fe5595f8 100644 --- a/src/hb-ot-shape-complex-indic.hh +++ b/src/hb-ot-shape-complex-indic.hh @@ -125,7 +125,7 @@ enum indic_syllabic_category_t { INDIC_SYLLABIC_CATEGORY_CONSONANT_PRECEDING_REPHA = OT_Repha, INDIC_SYLLABIC_CATEGORY_CONSONANT_PREFIXED = OT_X, /* Don't care. */ INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED = OT_CM, - INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA = OT_N, + INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA = OT_CM, INDIC_SYLLABIC_CATEGORY_CONSONANT_WITH_STACKER = OT_CS, INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK = OT_SM, /* https://github.com/harfbuzz/harfbuzz/issues/552 */ INDIC_SYLLABIC_CATEGORY_INVISIBLE_STACKER = OT_Coeng, diff --git a/src/hb-ot-shape-complex-khmer-machine.hh b/src/hb-ot-shape-complex-khmer-machine.hh index a7e1711e..d013456b 100644 --- a/src/hb-ot-shape-complex-khmer-machine.hh +++ b/src/hb-ot-shape-complex-khmer-machine.hh @@ -34,130 +34,200 @@ #line 36 "hb-ot-shape-complex-khmer-machine.hh" static const unsigned char _khmer_syllable_machine_trans_keys[] = { - 7u, 7u, 1u, 16u, 13u, 13u, 1u, 16u, 7u, 13u, 7u, 7u, 1u, 16u, 13u, 13u, - 1u, 16u, 7u, 13u, 1u, 16u, 3u, 14u, 3u, 14u, 5u, 14u, 3u, 14u, 5u, 14u, - 8u, 8u, 3u, 13u, 3u, 8u, 8u, 8u, 3u, 8u, 3u, 14u, 3u, 14u, 5u, 14u, - 3u, 14u, 5u, 14u, 8u, 8u, 3u, 13u, 3u, 8u, 8u, 8u, 3u, 8u, 3u, 14u, - 3u, 14u, 7u, 13u, 7u, 7u, 1u, 16u, 0 + 5u, 26u, 5u, 21u, 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u, 5u, 26u, 5u, 21u, + 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u, 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u, + 5u, 26u, 5u, 21u, 5u, 26u, 5u, 21u, 5u, 26u, 1u, 16u, 1u, 29u, 5u, 29u, + 5u, 29u, 5u, 29u, 22u, 22u, 5u, 22u, 5u, 29u, 5u, 29u, 5u, 29u, 5u, 26u, + 5u, 29u, 5u, 29u, 22u, 22u, 5u, 22u, 5u, 29u, 5u, 29u, 1u, 16u, 5u, 29u, + 5u, 29u, 0 }; static const char _khmer_syllable_machine_key_spans[] = { - 1, 16, 1, 16, 7, 1, 16, 1, - 16, 7, 16, 12, 12, 10, 12, 10, - 1, 11, 6, 1, 6, 12, 12, 10, - 12, 10, 1, 11, 6, 1, 6, 12, - 12, 7, 1, 16 + 22, 17, 22, 17, 16, 17, 22, 17, + 22, 17, 16, 17, 22, 17, 16, 17, + 22, 17, 22, 17, 22, 16, 29, 25, + 25, 25, 1, 18, 25, 25, 25, 22, + 25, 25, 1, 18, 25, 25, 16, 25, + 25 }; static const short _khmer_syllable_machine_index_offsets[] = { - 0, 2, 19, 21, 38, 46, 48, 65, - 67, 84, 92, 109, 122, 135, 146, 159, - 170, 172, 184, 191, 193, 200, 213, 226, - 237, 250, 261, 263, 275, 282, 284, 291, - 304, 317, 325, 327 + 0, 23, 41, 64, 82, 99, 117, 140, + 158, 181, 199, 216, 234, 257, 275, 292, + 310, 333, 351, 374, 392, 415, 432, 462, + 488, 514, 540, 542, 561, 587, 613, 639, + 662, 688, 714, 716, 735, 761, 787, 804, + 830 }; static const char _khmer_syllable_machine_indicies[] = { - 1, 0, 2, 2, 0, 0, 0, 0, + 1, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 3, 0, 0, 0, 0, 4, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, + 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 2, 0, 3, 0, 4, 4, 0, + 0, 3, 0, 0, 0, 0, 4, 0, + 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 4, 0, 1, 0, - 0, 0, 0, 0, 5, 0, 7, 6, - 8, 8, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 8, - 6, 9, 6, 10, 10, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 10, 6, 7, 6, 6, 6, - 6, 6, 11, 6, 4, 4, 13, 12, - 14, 15, 7, 16, 12, 12, 4, 4, - 11, 17, 12, 4, 12, 19, 18, 20, - 21, 1, 22, 18, 18, 18, 18, 5, - 23, 18, 24, 18, 21, 21, 1, 22, - 18, 18, 18, 18, 18, 23, 18, 21, - 21, 1, 22, 18, 18, 18, 18, 18, - 23, 18, 25, 18, 21, 21, 1, 22, - 18, 18, 18, 18, 18, 26, 18, 21, - 21, 1, 22, 18, 18, 18, 18, 18, - 26, 18, 27, 18, 28, 18, 29, 18, - 18, 22, 18, 18, 18, 18, 3, 18, - 30, 18, 18, 18, 18, 22, 18, 22, - 18, 28, 18, 18, 18, 18, 22, 18, - 19, 18, 21, 21, 1, 22, 18, 18, - 18, 18, 18, 23, 18, 32, 31, 33, - 33, 7, 16, 31, 31, 31, 31, 31, - 34, 31, 33, 33, 7, 16, 31, 31, - 31, 31, 31, 34, 31, 35, 31, 33, - 33, 7, 16, 31, 31, 31, 31, 31, - 36, 31, 33, 33, 7, 16, 31, 31, - 31, 31, 31, 36, 31, 37, 31, 38, - 31, 39, 31, 31, 16, 31, 31, 31, - 31, 9, 31, 40, 31, 31, 31, 31, - 16, 31, 16, 31, 38, 31, 31, 31, - 31, 16, 31, 13, 31, 41, 33, 7, - 16, 31, 31, 31, 31, 11, 34, 31, - 13, 31, 33, 33, 7, 16, 31, 31, - 31, 31, 31, 34, 31, 7, 42, 42, - 42, 42, 42, 11, 42, 7, 42, 10, - 10, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 10, 42, + 4, 0, 6, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 0, 7, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 0, 9, 9, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 10, 0, 0, + 0, 0, 4, 0, 9, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 10, 0, 11, 11, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 12, 0, + 0, 0, 0, 4, 0, 11, 11, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 12, 0, 13, + 13, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 13, 0, + 15, 15, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 16, 14, 15, 15, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 16, 17, 17, 17, 17, 18, + 17, 19, 19, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 18, 17, 20, 20, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 20, 17, 21, 21, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 22, 17, 23, 23, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 24, 17, + 17, 17, 17, 18, 17, 23, 23, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 24, 17, 25, + 25, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 26, + 17, 17, 17, 17, 18, 17, 25, 25, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 26, 17, + 15, 15, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 27, + 16, 17, 17, 17, 17, 18, 17, 28, + 28, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 28, 17, + 13, 13, 29, 29, 30, 30, 29, 29, + 29, 29, 2, 2, 29, 31, 29, 13, + 29, 29, 29, 29, 16, 20, 29, 29, + 29, 18, 24, 26, 22, 29, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 34, + 32, 32, 32, 32, 32, 2, 3, 6, + 32, 32, 32, 4, 10, 12, 8, 32, + 35, 35, 32, 32, 32, 32, 32, 32, + 32, 36, 32, 32, 32, 32, 32, 32, + 3, 6, 32, 32, 32, 4, 10, 12, + 8, 32, 5, 5, 32, 32, 32, 32, + 32, 32, 32, 36, 32, 32, 32, 32, + 32, 32, 4, 6, 32, 32, 32, 32, + 32, 32, 8, 32, 6, 32, 7, 7, + 32, 32, 32, 32, 32, 32, 32, 36, + 32, 32, 32, 32, 32, 32, 8, 6, + 32, 37, 37, 32, 32, 32, 32, 32, + 32, 32, 36, 32, 32, 32, 32, 32, + 32, 10, 6, 32, 32, 32, 4, 32, + 32, 8, 32, 38, 38, 32, 32, 32, + 32, 32, 32, 32, 36, 32, 32, 32, + 32, 32, 32, 12, 6, 32, 32, 32, + 4, 10, 32, 8, 32, 35, 35, 32, + 32, 32, 32, 32, 32, 32, 34, 32, + 32, 32, 32, 32, 32, 3, 6, 32, + 32, 32, 4, 10, 12, 8, 32, 15, + 15, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 16, + 39, 39, 39, 39, 18, 39, 41, 41, + 40, 40, 40, 40, 40, 40, 40, 42, + 40, 40, 40, 40, 40, 40, 16, 20, + 40, 40, 40, 18, 24, 26, 22, 40, + 19, 19, 40, 40, 40, 40, 40, 40, + 40, 42, 40, 40, 40, 40, 40, 40, + 18, 20, 40, 40, 40, 40, 40, 40, + 22, 40, 20, 40, 21, 21, 40, 40, + 40, 40, 40, 40, 40, 42, 40, 40, + 40, 40, 40, 40, 22, 20, 40, 43, + 43, 40, 40, 40, 40, 40, 40, 40, + 42, 40, 40, 40, 40, 40, 40, 24, + 20, 40, 40, 40, 18, 40, 40, 22, + 40, 44, 44, 40, 40, 40, 40, 40, + 40, 40, 42, 40, 40, 40, 40, 40, + 40, 26, 20, 40, 40, 40, 18, 24, + 40, 22, 40, 28, 28, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 28, 39, 45, 45, 40, 40, + 40, 40, 40, 40, 40, 46, 40, 40, + 40, 40, 40, 27, 16, 20, 40, 40, + 40, 18, 24, 26, 22, 40, 41, 41, + 40, 40, 40, 40, 40, 40, 40, 46, + 40, 40, 40, 40, 40, 40, 16, 20, + 40, 40, 40, 18, 24, 26, 22, 40, 0 }; static const char _khmer_syllable_machine_trans_targs[] = { - 10, 14, 17, 20, 11, 21, 10, 24, - 27, 30, 31, 32, 10, 22, 33, 34, - 26, 35, 10, 12, 4, 0, 16, 3, - 13, 15, 1, 10, 18, 2, 19, 10, - 23, 5, 8, 25, 6, 10, 28, 7, - 29, 9, 10 + 22, 1, 30, 24, 25, 3, 26, 5, + 27, 7, 28, 9, 29, 23, 22, 11, + 32, 22, 33, 13, 34, 15, 35, 17, + 36, 19, 37, 40, 39, 22, 31, 38, + 22, 0, 10, 2, 4, 6, 8, 22, + 22, 12, 14, 16, 18, 20, 21 }; static const char _khmer_syllable_machine_trans_actions[] = { - 1, 2, 2, 0, 2, 2, 3, 2, - 2, 0, 2, 2, 6, 2, 0, 0, - 0, 0, 7, 2, 0, 0, 0, 0, - 2, 2, 0, 8, 0, 0, 0, 9, - 2, 0, 0, 2, 0, 10, 0, 0, - 0, 0, 11 + 1, 0, 2, 2, 2, 0, 0, 0, + 2, 0, 2, 0, 2, 2, 3, 0, + 4, 5, 2, 0, 0, 0, 2, 0, + 2, 0, 2, 4, 4, 8, 9, 0, + 10, 0, 0, 0, 0, 0, 0, 11, + 12, 0, 0, 0, 0, 0, 0 }; static const char _khmer_syllable_machine_to_state_actions[] = { 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0 + 0, 0, 0, 0, 0, 0, 0, 0, + 0 }; static const char _khmer_syllable_machine_from_state_actions[] = { 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 5, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0 + 0 }; static const unsigned char _khmer_syllable_machine_eof_trans[] = { - 1, 1, 1, 1, 1, 7, 7, 7, - 7, 7, 0, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32, - 32, 43, 43, 43 + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 15, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 0, 33, + 33, 33, 33, 33, 33, 33, 33, 40, + 41, 41, 41, 41, 41, 41, 40, 41, + 41 }; -static const int khmer_syllable_machine_start = 10; -static const int khmer_syllable_machine_first_final = 10; +static const int khmer_syllable_machine_start = 22; +static const int khmer_syllable_machine_first_final = 22; static const int khmer_syllable_machine_error = -1; -static const int khmer_syllable_machine_en_main = 10; +static const int khmer_syllable_machine_en_main = 22; #line 36 "hb-ot-shape-complex-khmer-machine.rl" -#line 74 "hb-ot-shape-complex-khmer-machine.rl" +#line 80 "hb-ot-shape-complex-khmer-machine.rl" #define found_syllable(syllable_type) \ @@ -177,7 +247,7 @@ find_syllables (hb_buffer_t *buffer) int cs; hb_glyph_info_t *info = buffer->info; -#line 181 "hb-ot-shape-complex-khmer-machine.hh" +#line 251 "hb-ot-shape-complex-khmer-machine.hh" { cs = khmer_syllable_machine_start; ts = 0; @@ -185,7 +255,7 @@ find_syllables (hb_buffer_t *buffer) act = 0; } -#line 95 "hb-ot-shape-complex-khmer-machine.rl" +#line 101 "hb-ot-shape-complex-khmer-machine.rl" p = 0; @@ -194,7 +264,7 @@ find_syllables (hb_buffer_t *buffer) unsigned int last = 0; unsigned int syllable_serial = 1; -#line 198 "hb-ot-shape-complex-khmer-machine.hh" +#line 268 "hb-ot-shape-complex-khmer-machine.hh" { int _slen; int _trans; @@ -204,11 +274,11 @@ find_syllables (hb_buffer_t *buffer) goto _test_eof; _resume: switch ( _khmer_syllable_machine_from_state_actions[cs] ) { - case 5: + case 7: #line 1 "NONE" {ts = p;} break; -#line 212 "hb-ot-shape-complex-khmer-machine.hh" +#line 282 "hb-ot-shape-complex-khmer-machine.hh" } _keys = _khmer_syllable_machine_trans_keys + (cs<<1); @@ -231,47 +301,63 @@ _eof_trans: {te = p+1;} break; case 8: -#line 68 "hb-ot-shape-complex-khmer-machine.rl" - {te = p+1;{ found_syllable (consonant_syllable); }} - break; - case 10: -#line 69 "hb-ot-shape-complex-khmer-machine.rl" - {te = p+1;{ found_syllable (broken_cluster); }} - break; - case 6: -#line 70 "hb-ot-shape-complex-khmer-machine.rl" +#line 76 "hb-ot-shape-complex-khmer-machine.rl" {te = p+1;{ found_syllable (non_khmer_cluster); }} break; - case 7: -#line 68 "hb-ot-shape-complex-khmer-machine.rl" + case 10: +#line 74 "hb-ot-shape-complex-khmer-machine.rl" {te = p;p--;{ found_syllable (consonant_syllable); }} break; - case 9: -#line 69 "hb-ot-shape-complex-khmer-machine.rl" + case 12: +#line 75 "hb-ot-shape-complex-khmer-machine.rl" {te = p;p--;{ found_syllable (broken_cluster); }} break; case 11: -#line 70 "hb-ot-shape-complex-khmer-machine.rl" +#line 76 "hb-ot-shape-complex-khmer-machine.rl" {te = p;p--;{ found_syllable (non_khmer_cluster); }} break; case 1: -#line 68 "hb-ot-shape-complex-khmer-machine.rl" +#line 74 "hb-ot-shape-complex-khmer-machine.rl" {{p = ((te))-1;}{ found_syllable (consonant_syllable); }} break; - case 3: -#line 69 "hb-ot-shape-complex-khmer-machine.rl" + case 5: +#line 75 "hb-ot-shape-complex-khmer-machine.rl" {{p = ((te))-1;}{ found_syllable (broken_cluster); }} break; -#line 266 "hb-ot-shape-complex-khmer-machine.hh" + case 3: +#line 1 "NONE" + { switch( act ) { + case 2: + {{p = ((te))-1;} found_syllable (broken_cluster); } + break; + case 3: + {{p = ((te))-1;} found_syllable (non_khmer_cluster); } + break; + } + } + break; + case 4: +#line 1 "NONE" + {te = p+1;} +#line 75 "hb-ot-shape-complex-khmer-machine.rl" + {act = 2;} + break; + case 9: +#line 1 "NONE" + {te = p+1;} +#line 76 "hb-ot-shape-complex-khmer-machine.rl" + {act = 3;} + break; +#line 352 "hb-ot-shape-complex-khmer-machine.hh" } _again: switch ( _khmer_syllable_machine_to_state_actions[cs] ) { - case 4: + case 6: #line 1 "NONE" {ts = 0;} break; -#line 275 "hb-ot-shape-complex-khmer-machine.hh" +#line 361 "hb-ot-shape-complex-khmer-machine.hh" } if ( ++p != pe ) @@ -287,7 +373,7 @@ _again: } -#line 104 "hb-ot-shape-complex-khmer-machine.rl" +#line 110 "hb-ot-shape-complex-khmer-machine.rl" } diff --git a/src/hb-ot-shape-complex-khmer-machine.rl b/src/hb-ot-shape-complex-khmer-machine.rl index 7c795162..eb9f6988 100644 --- a/src/hb-ot-shape-complex-khmer-machine.rl +++ b/src/hb-ot-shape-complex-khmer-machine.rl @@ -40,28 +40,34 @@ # Same order as enum khmer_category_t. Not sure how to avoid duplication. C = 1; V = 2; -N = 3; ZWNJ = 5; ZWJ = 6; -M = 7; -SM = 8; PLACEHOLDER = 11; DOTTEDCIRCLE = 12; -RS = 13; -Coeng = 14; -Ra = 16; - -c = (C | Ra | V); # is_consonant -n = ((ZWNJ?.RS)? (N.N?)?); # is_consonant_modifier -z = ZWJ|ZWNJ; # is_joiner - -cn = c.n?; -matra_group = z?.M.N?; -syllable_tail = (SM.SM?)?; - - -broken_cluster = n? (Coeng.cn)* matra_group* (Coeng.cn)? syllable_tail; -consonant_syllable = (c|PLACEHOLDER|DOTTEDCIRCLE) broken_cluster; +Coeng= 14; +Ra = 16; +Robatic = 20; +Xgroup = 21; +Ygroup = 22; +VAbv = 26; +VBlw = 27; +VPre = 28; +VPst = 29; + +c = (C | Ra | V); +cn = c.((ZWJ|ZWNJ)?.Robatic)?; +joiner = (ZWJ | ZWNJ); +xgroup = (joiner*.Xgroup)*; +ygroup = Ygroup*; + +# This grammar was experimentally extracted from what Uniscribe allows. + +matra_group = VPre? xgroup VBlw? xgroup (joiner?.VAbv)? xgroup VPst?; +syllable_tail = xgroup matra_group xgroup (Coeng.c)? ygroup; + + +broken_cluster = (Coeng.cn)* syllable_tail; +consonant_syllable = (cn|PLACEHOLDER|DOTTEDCIRCLE) broken_cluster; other = any; main := |* diff --git a/src/hb-ot-shape-complex-khmer.cc b/src/hb-ot-shape-complex-khmer.cc index d46f0b3a..9c766be1 100644 --- a/src/hb-ot-shape-complex-khmer.cc +++ b/src/hb-ot-shape-complex-khmer.cc @@ -241,7 +241,6 @@ setup_masks_khmer (const hb_ot_shape_plan_t *plan HB_UNUSED, hb_font_t *font HB_UNUSED) { HB_BUFFER_ALLOCATE_VAR (buffer, khmer_category); - HB_BUFFER_ALLOCATE_VAR (buffer, khmer_position); /* We cannot setup masks here. We save information about characters * and setup masks later on in a pause-callback. */ @@ -330,7 +329,7 @@ reorder_consonant_syllable (const hb_ot_shape_plan_t *plan, } /* Reorder left matra piece. */ - else if (info[i].khmer_position() == POS_PRE_M) + else if (info[i].khmer_category() == OT_VPre) { /* Move to the start. */ buffer->merge_clusters (start, i + 1); @@ -432,7 +431,6 @@ reorder (const hb_ot_shape_plan_t *plan, initial_reordering_syllable (plan, font->face, buffer, start, end); HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_category); - HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_position); } static void diff --git a/src/hb-ot-shape-complex-khmer.hh b/src/hb-ot-shape-complex-khmer.hh index c86e7aad..4ee0b838 100644 --- a/src/hb-ot-shape-complex-khmer.hh +++ b/src/hb-ot-shape-complex-khmer.hh @@ -34,30 +34,22 @@ /* buffer var allocations */ #define khmer_category() indic_category() /* khmer_category_t */ -#define khmer_position() indic_position() /* khmer_position_t */ +#define khmer_position() indic_position() /* indic_position_t */ -typedef indic_category_t khmer_category_t; -typedef indic_position_t khmer_position_t; - - -static inline khmer_position_t -matra_position_khmer (khmer_position_t side) +/* Note: This enum is duplicated in the -machine.rl source file. + * Not sure how to avoid duplication. */ +enum khmer_category_t { - switch ((int) side) - { - case POS_PRE_C: - return POS_PRE_M; + OT_Robatic = 20, + OT_Xgroup = 21, + OT_Ygroup = 22, - case POS_POST_C: - case POS_ABOVE_C: - case POS_BELOW_C: - return POS_AFTER_POST; - - default: - return side; - }; -} + OT_VAbv = 26, + OT_VBlw = 27, + OT_VPre = 28, + OT_VPst = 29, +}; static inline void set_khmer_properties (hb_glyph_info_t &info) @@ -65,47 +57,58 @@ set_khmer_properties (hb_glyph_info_t &info) hb_codepoint_t u = info.codepoint; unsigned int type = hb_indic_get_categories (u); khmer_category_t cat = (khmer_category_t) (type & 0x7Fu); - khmer_position_t pos = (khmer_position_t) (type >> 8); + indic_position_t pos = (indic_position_t) (type >> 8); /* * Re-assign category + * + * These categories are experimentally extracted from what Uniscribe allows. */ - - if (unlikely (u == 0x17C6u)) cat = OT_N; /* Khmer Bindu doesn't like to be repositioned. */ - else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x17CDu, 0x17D1u) || - u == 0x17CBu || u == 0x17D3u || u == 0x17DDu)) /* Khmer Various signs */ + switch (u) { - /* These can occur mid-syllable (eg. before matras), even though Unicode marks them as Syllable_Modifier. - * https://github.com/roozbehp/unicode-data/issues/5 */ - cat = OT_M; - pos = POS_ABOVE_C; + case 0x179Au: + cat = (khmer_category_t) OT_Ra; + break; + + case 0x17CCu: + case 0x17C9u: + case 0x17CAu: + cat = OT_Robatic; + break; + + case 0x17C6u: + case 0x17CBu: + case 0x17CDu: + case 0x17CEu: + case 0x17CFu: + case 0x17D0u: + case 0x17D1u: + cat = OT_Xgroup; + break; + + case 0x17C7u: + case 0x17C8u: + case 0x17DDu: + case 0x17D3u: /* Just guessing. Uniscribe doesn't categorize it. */ + cat = OT_Ygroup; + break; } - else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x2010u, 0x2011u))) cat = OT_PLACEHOLDER; - else if (unlikely (u == 0x25CCu)) cat = OT_DOTTEDCIRCLE; - /* * Re-assign position. */ - - if ((FLAG_UNSAFE (cat) & CONSONANT_FLAGS)) - { - pos = POS_BASE_C; - if (u == 0x179Au) - cat = OT_Ra; - } - else if (cat == OT_M) - { - pos = matra_position_khmer (pos); - } - else if ((FLAG_UNSAFE (cat) & (FLAG (OT_SM) | FLAG (OT_A) | FLAG (OT_Symbol)))) - { - pos = POS_SMVD; - } + if (cat == (khmer_category_t) OT_M) + switch ((int) pos) + { + case POS_PRE_C: cat = OT_VPre; break; + case POS_BELOW_C: cat = OT_VBlw; break; + case POS_ABOVE_C: cat = OT_VAbv; break; + case POS_POST_C: cat = OT_VPst; break; + default: assert (0); + }; info.khmer_category() = cat; - info.khmer_position() = pos; } |