diff options
author | Jonathan Clark <jonathan@libreoffice.org> | 2024-07-15 15:30:16 -0600 |
---|---|---|
committer | Jonathan Clark <jonathan@libreoffice.org> | 2024-07-16 02:17:54 +0200 |
commit | 174aa6e980f973cea9b1c402d03bd6dba951f5ae (patch) | |
tree | bb547ca3964f595bbcb85a84da6aa08f38ef2518 /i18npool | |
parent | 07468c71ad47e027bdadbf2cdf35e903734a8078 (diff) |
tdf#46950 Allow intra-word right double quotation mark
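Hebrew text may use the character RIGHT DOUBLE QUOTATION MARK (U+201D) as a
substitute for HEBREW PUNCTUATION GERSHAYIM (U+05F4). This change customizes
the ICU word BreakIterator rules (dict_word.txt and edit_word.txt) to include
that character in $Double_Quote, so that it no longer splits a Hebrew word
when it appears between Hebrew letters.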
Change-Id: I03a48729de103505a2f68f9a1635c0f0cd7d126a
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/170536
Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
Tested-by: Jenkins
Diffstat (limited to 'i18npool')
-rw-r--r-- | i18npool/qa/cppunit/test_breakiterator.cxx | 105 | ||||
-rw-r--r-- | i18npool/source/breakiterator/data/dict_word.txt | 5 | ||||
-rw-r--r-- | i18npool/source/breakiterator/data/edit_word.txt | 5 |
3 files changed, 94 insertions, 21 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 7e9f47ad22f1..baf1d47603c7 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -48,7 +48,7 @@ public: void testLegacyDictWordPrepostDash_nds_DE(); void testLegacyDictWordPrepostDash_nl_NL(); void testLegacyDictWordPrepostDash_sv_SE(); - void testLegacyHebrewQuoteInsideWord(); + void testHebrewGereshGershaim(); void testLegacySurrogatePairs(); void testWordCount(); @@ -71,7 +71,7 @@ public: CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE); CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL); CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE); - CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord); + CPPUNIT_TEST(testHebrewGereshGershaim); CPPUNIT_TEST(testLegacySurrogatePairs); CPPUNIT_TEST(testWordCount); CPPUNIT_TEST_SUITE_END(); @@ -1708,41 +1708,108 @@ void TestBreakIterator::testLegacyDictWordPrepostDash_sv_SE() } } -void TestBreakIterator::testLegacyHebrewQuoteInsideWord() +void TestBreakIterator::testHebrewGereshGershaim() { + // In Hebrew documents, there are multiple valid ways to represent the geresh and gershaim + // intra-word punctuation marks. This test exhaustively exercises them. 
+ // + // See the following bugs: + // i#51661: Add quotation mark as middle letter for Hebrew + // tdf#46950: Spell-checking breaks Hebrew words at intra-word single and double quotes + lang::Locale aLocale; aLocale.Language = "he"; aLocale.Country = "IL"; - // i#51661: Add quotation mark as middle letter for Hebrew + // Unicode U+05F3 HEBREW PUNCTUATION GERESH { - auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; + auto aTest = u"ג׳ירפה"_ustr; - i18n::Boundary aBounds + auto aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - aBounds - = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); } - // i#51661: Add quotation mark as middle letter for Hebrew + // Apostrophe as geresh { - auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; + auto aTest = u"ג'ירפה"_ustr; - i18n::Boundary aBounds = m_xBreak->getWordBoundary( - aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + auto aBounds + = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, + aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(19), 
aBounds.endPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } + + // Right single quote as geresh + { + auto aTest = u"ג’ירפה"_ustr; + + auto aBounds + = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } + + // Unicode U+05F4 HEBREW PUNCTUATION GERSHAYIM + { + auto aTest = u"דו״ח"_ustr; + + auto aBounds + = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + } + + // Double quote as gershayim + { + auto aTest = u"דו\"ח"_ustr; + + auto aBounds + = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + } + + // Right double quote as gershayim + { + auto aTest = u"דו”ח"_ustr; + + auto aBounds + = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + + aBounds = 
m_xBreak->getWordBoundary(aTest, 2, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); } } diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt index deeec7dd659e..4a09af5cf1b2 100644 --- a/i18npool/source/breakiterator/data/dict_word.txt +++ b/i18npool/source/breakiterator/data/dict_word.txt @@ -50,7 +50,6 @@ $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; $ALetter = [\p{Word_Break = ALetter}]; $Single_Quote = [\p{Word_Break = Single_Quote}]; -$Double_Quote = [\p{Word_Break = Double_Quote}]; $MidNumLet = [\p{Word_Break = MidNumLet}]; $MidNum = [\p{Word_Break = MidNum}]; $Numeric = [\p{Word_Break = Numeric}]; @@ -74,6 +73,10 @@ $ExcludedML = [[:name = COLON:] # $MidLetter = [\p{Word_Break = MidLetter}]; $MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; +### tdf#46950: Right double-quotes are also used as substitutes for Hebrew gershaim +# $Double_Quote = [\p{Word_Break = Double_Quote}]; +$Double_Quote = [[\p{Word_Break = Double_Quote}][:name= RIGHT DOUBLE QUOTATION MARK:]]; + ### END CUSTOMIZATION $Hiragana = [:Hiragana:]; diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt index 1e3bcd15b20d..8db21d9b281a 100644 --- a/i18npool/source/breakiterator/data/edit_word.txt +++ b/i18npool/source/breakiterator/data/edit_word.txt @@ -50,7 +50,6 @@ $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; $ALetter = [\p{Word_Break = ALetter}]; $Single_Quote = [\p{Word_Break = Single_Quote}]; -$Double_Quote = [\p{Word_Break = Double_Quote}]; $MidLetter = [\p{Word_Break = MidLetter}]; $MidNum = [\p{Word_Break = MidNum}]; $Numeric = [\p{Word_Break = Numeric}]; @@ -67,6 +66,10 @@ $MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; # 
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]-[:name = NARROW NO-BREAK SPACE:]]; +### tdf#46950: Right double-quotes are also used as substitutes for Hebrew gershaim +# $Double_Quote = [\p{Word_Break = Double_Quote}]; +$Double_Quote = [[\p{Word_Break = Double_Quote}][:name= RIGHT DOUBLE QUOTATION MARK:]]; + ### END CUSTOMIZATION $Hiragana = [:Hiragana:]; |