summaryrefslogtreecommitdiff
path: root/i18npool
diff options
context:
space:
mode:
author Jonathan Clark <jonathan@libreoffice.org> 2024-07-15 15:30:16 -0600
committer Jonathan Clark <jonathan@libreoffice.org> 2024-07-16 02:17:54 +0200
commit 174aa6e980f973cea9b1c402d03bd6dba951f5ae (patch)
tree bb547ca3964f595bbcb85a84da6aa08f38ef2518 /i18npool
parent 07468c71ad47e027bdadbf2cdf35e903734a8078 (diff)
tdf#46950 Allow intra-word right double quotation mark
Hebrew text may use the character RIGHT DOUBLE QUOTATION MARK as a substitute for HEBREW PUNCTUATION GERSHAYIM. This change customizes the ICU word BreakIterator rules to that end.

Change-Id: I03a48729de103505a2f68f9a1635c0f0cd7d126a
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/170536
Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
Tested-by: Jenkins
Diffstat (limited to 'i18npool')
-rw-r--r-- i18npool/qa/cppunit/test_breakiterator.cxx 105
-rw-r--r-- i18npool/source/breakiterator/data/dict_word.txt 5
-rw-r--r-- i18npool/source/breakiterator/data/edit_word.txt 5
3 files changed, 94 insertions, 21 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 7e9f47ad22f1..baf1d47603c7 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -48,7 +48,7 @@ public:
void testLegacyDictWordPrepostDash_nds_DE();
void testLegacyDictWordPrepostDash_nl_NL();
void testLegacyDictWordPrepostDash_sv_SE();
- void testLegacyHebrewQuoteInsideWord();
+ void testHebrewGereshGershaim();
void testLegacySurrogatePairs();
void testWordCount();
@@ -71,7 +71,7 @@ public:
CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE);
CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL);
CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE);
- CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord);
+ CPPUNIT_TEST(testHebrewGereshGershaim);
CPPUNIT_TEST(testLegacySurrogatePairs);
CPPUNIT_TEST(testWordCount);
CPPUNIT_TEST_SUITE_END();
@@ -1708,41 +1708,108 @@ void TestBreakIterator::testLegacyDictWordPrepostDash_sv_SE()
}
}
-void TestBreakIterator::testLegacyHebrewQuoteInsideWord()
+void TestBreakIterator::testHebrewGereshGershaim()
{
+ // In Hebrew documents, there are multiple valid ways to represent the geresh and gershaim
+ // intra-word punctuation marks. This test exhaustively exercises them.
+ //
+ // See the following bugs:
+ // i#51661: Add quotation mark as middle letter for Hebrew
+ // tdf#46950: Spell-checking breaks Hebrew words at intra-word single and double quotes
+
lang::Locale aLocale;
aLocale.Language = "he";
aLocale.Country = "IL";
- // i#51661: Add quotation mark as middle letter for Hebrew
+ // Unicode U+05F3 HEBREW PUNCTUATION GERESH
{
- auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
+ auto aTest = u"ג׳ירפה"_ustr;
- i18n::Boundary aBounds
+ auto aBounds
= m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
- CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
- aBounds
- = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
- CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
- CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
+ aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
}
- // i#51661: Add quotation mark as middle letter for Hebrew
+ // Apostrophe as geresh
{
- auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
+ auto aTest = u"ג'ירפה"_ustr;
- i18n::Boundary aBounds = m_xBreak->getWordBoundary(
- aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+ auto aBounds
+ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
- CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
- aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale,
+ aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
- CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
- CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+ }
+
+ // Right single quote as geresh
+ {
+ auto aTest = u"ג’ירפה"_ustr;
+
+ auto aBounds
+ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+ }
+
+ // Unicode U+05F4 HEBREW PUNCTUATION GERSHAYIM
+ {
+ auto aTest = u"דו״ח"_ustr;
+
+ auto aBounds
+ = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+ }
+
+ // Double quote as gershayim
+ {
+ auto aTest = u"דו\"ח"_ustr;
+
+ auto aBounds
+ = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+ }
+
+ // Right double quote as gershayim
+ {
+ auto aTest = u"דו”ח"_ustr;
+
+ auto aBounds
+ = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
+ i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+ CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
}
}
diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt
index deeec7dd659e..4a09af5cf1b2 100644
--- a/i18npool/source/breakiterator/data/dict_word.txt
+++ b/i18npool/source/breakiterator/data/dict_word.txt
@@ -50,7 +50,6 @@ $Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
$Single_Quote = [\p{Word_Break = Single_Quote}];
-$Double_Quote = [\p{Word_Break = Double_Quote}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
@@ -74,6 +73,10 @@ $ExcludedML = [[:name = COLON:]
# $MidLetter = [\p{Word_Break = MidLetter}];
$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
+### tdf#46950: Right double-quotes are also used as substitutes for Hebrew gershaim
+# $Double_Quote = [\p{Word_Break = Double_Quote}];
+$Double_Quote = [[\p{Word_Break = Double_Quote}][:name= RIGHT DOUBLE QUOTATION MARK:]];
+
### END CUSTOMIZATION
$Hiragana = [:Hiragana:];
diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt
index 1e3bcd15b20d..8db21d9b281a 100644
--- a/i18npool/source/breakiterator/data/edit_word.txt
+++ b/i18npool/source/breakiterator/data/edit_word.txt
@@ -50,7 +50,6 @@ $Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
$Single_Quote = [\p{Word_Break = Single_Quote}];
-$Double_Quote = [\p{Word_Break = Double_Quote}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
@@ -67,6 +66,10 @@ $MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]];
# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]-[:name = NARROW NO-BREAK SPACE:]];
+### tdf#46950: Right double-quotes are also used as substitutes for Hebrew gershaim
+# $Double_Quote = [\p{Word_Break = Double_Quote}];
+$Double_Quote = [[\p{Word_Break = Double_Quote}][:name= RIGHT DOUBLE QUOTATION MARK:]];
+
### END CUSTOMIZATION
$Hiragana = [:Hiragana:];