Fix bug in Khmer line breaking and update dictionary

Change-Id: I2b776925c2c95cb56ccd592d036823c26054e059
Reviewed-on: https://gerrit.libreoffice.org/23316
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Martin Hosken <martin_hosken@sil.org>
author: Martin Hosken <martin_hosken@sil.org> 2016-03-17 09:57:35 +0700
committer: Martin Hosken <martin_hosken@sil.org> 2016-03-17 03:31:32 +0000
commit: a976a19ca82661d8b459b85f5514b0e4c9222d47 (patch)
tree: b85c4b330550132a75edcb4ebf9fd496062b4892 /external
parent: 1caac283894d0deeac564c67cd816cc2907f9ac7 (diff)
2 files changed, 17 insertions, 310 deletions
diff --git a/external/icu/khmerbreakengine.patch b/external/icu/khmerbreakengine.patch
index ba3e392a27f3..bc0d287929b0 100644
--- a/external/icu/khmerbreakengine.patch
+++ b/external/icu/khmerbreakengine.patch
@@ -2,7 +2,7 @@ diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp
 index f1c874d..3ad1b3f 100644
 --- misc/icu/source/common/dictbe.cpp
 +++ build/icu/source/common/dictbe.cpp
-@@ -27,8 +27,16 @@ U_NAMESPACE_BEGIN
+@@ -27,8 +27,17 @@ U_NAMESPACE_BEGIN
   ******************************************************************
   */
  
@@ -14,13 +14,14 @@ index f1c874d..3ad1b3f 100644
      fTypes = breakTypes;
 +    fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status);
 +
++    // note Skip Sets contain fIgnoreSet characters too.
 +    fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]]\\u200C\\u200D\\u2060"), status);
 +    fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]]\\u200C\\u200D\\u2060"), status);
 +    fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
  }
  
  DictionaryBreakEngine::~DictionaryBreakEngine() {
-@@ -90,7 +98,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
+@@ -90,7 +99,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
          result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
          utext_setNativeIndex(text, current);
      }
@@ -29,7 +30,7 @@ index f1c874d..3ad1b3f 100644
      return result;
  }
  
-@@ -101,6 +109,163 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
+@@ -101,6 +110,169 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
      fSet.compact();
  }
  
@@ -87,6 +88,8 @@ index f1c874d..3ad1b3f 100644
 +    }
 +    for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
 +        while (start > textStart) {
++            while (fIgnoreSet.contains(c))
++                c = utext_previous32(text);
 +            if (!fMarkSet.contains(c)) {
 +                if (fBaseSet.contains(c)) {
 +                    c = utext_previous32(text);
@@ -125,6 +128,10 @@ index f1c874d..3ad1b3f 100644
 +        ++end;
 +    }
 +    for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
++        while (fIgnoreSet.contains(c)) {
++            utext_next32(text);
++            c = utext_current32(text);
++        }
 +        if (fBaseSet.contains(c)) {
 +            while (end < textEnd) {
 +                utext_next32(text);
@@ -193,7 +200,7 @@ index f1c874d..3ad1b3f 100644
  /*
   ******************************************************************
   * PossibleWord
-@@ -128,35 +293,35 @@ private:
+@@ -128,35 +302,35 @@ private:
  public:
      PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
      ~PossibleWord() {};
@@ -238,242 +245,7 @@ index f1c874d..3ad1b3f 100644
          // Dictionary leaves text after longest prefix, not longest word. Back up.
          if (count <= 0) {
              utext_setNativeIndex(text, start);
-@@ -261,16 +426,16 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
-     int32_t current;
-     UErrorCode status = U_ZERO_ERROR;
-     PossibleWord words[THAI_LOOKAHEAD];
--    
-+
-     utext_setNativeIndex(text, rangeStart);
--    
-+
-     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
-         cpWordLength = 0;
-         cuWordLength = 0;
- 
-         // Look for candidate words at the current position
-         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
--        
-+
-         // If we found exactly one, use that
-         if (candidates == 1) {
-             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
-@@ -291,12 +456,12 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
-                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
-                         wordsMatched = 2;
-                     }
--                    
-+
-                     // If we're already at the end of the range, we're done
-                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
-                         goto foundBest;
-                     }
--                    
-+
-                     // See if any of the possible second words is followed by a third word
-                     do {
-                         // If we find a third word, stop right away
-@@ -315,13 +480,13 @@ foundBest:
-             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
-             wordsFound += 1;
-         }
--        
-+
-         // We come here after having either found a word or not. We look ahead to the
-         // next word. If it's not a dictionary word, we will combine it with the word we
-         // just found (if there is one), but only if the preceding word does not exceed
-         // the threshold.
-         // The text iterator should now be positioned at the end of the word we found.
--        
-+
-         UChar32 uc = 0;
-         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
-             // if it is a dictionary word, do nothing. If it isn't, then if there is
-@@ -357,12 +522,12 @@ foundBest:
-                         }
-                     }
-                 }
--                
-+
-                 // Bump the word count if there wasn't already one
-                 if (cuWordLength <= 0) {
-                     wordsFound += 1;
-                 }
--                
-+
-                 // Update the length with the passed-over characters
-                 cuWordLength += chars;
-             }
-@@ -371,14 +536,14 @@ foundBest:
-                 utext_setNativeIndex(text, current+cuWordLength);
-             }
-         }
--        
-+
-         // Never stop before a combining mark.
-         int32_t currPos;
-         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
-             utext_next32(text);
-             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
-         }
--        
-+
-         // Look ahead for possible suffixes if a dictionary word does not follow.
-         // We do this in code rather than using a rule so that the heuristic
-         // resynch continues to function. For example, one of the suffix characters
-@@ -496,16 +661,16 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
-     int32_t current;
-     UErrorCode status = U_ZERO_ERROR;
-     PossibleWord words[LAO_LOOKAHEAD];
--    
-+
-     utext_setNativeIndex(text, rangeStart);
--    
-+
-     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
-         cuWordLength = 0;
-         cpWordLength = 0;
- 
-         // Look for candidate words at the current position
-         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
--        
-+
-         // If we found exactly one, use that
-         if (candidates == 1) {
-             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
-@@ -526,12 +691,12 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
-                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
-                         wordsMatched = 2;
-                     }
--                    
-+
-                     // If we're already at the end of the range, we're done
-                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
-                         goto foundBest;
-                     }
--                    
-+
-                     // See if any of the possible second words is followed by a third word
-                     do {
-                         // If we find a third word, stop right away
-@@ -549,7 +714,7 @@ foundBest:
-             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
-             wordsFound += 1;
-         }
--        
-+
-         // We come here after having either found a word or not. We look ahead to the
-         // next word. If it's not a dictionary word, we will combine it withe the word we
-         // just found (if there is one), but only if the preceding word does not exceed
-@@ -587,12 +752,12 @@ foundBest:
-                         }
-                     }
-                 }
--                
-+
-                 // Bump the word count if there wasn't already one
-                 if (cuWordLength <= 0) {
-                     wordsFound += 1;
-                 }
--                
-+
-                 // Update the length with the passed-over characters
-                 cuWordLength += chars;
-             }
-@@ -601,14 +766,14 @@ foundBest:
-                 utext_setNativeIndex(text, current + cuWordLength);
-             }
-         }
--        
-+
-         // Never stop before a combining mark.
-         int32_t currPos;
-         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
-             utext_next32(text);
-             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
-         }
--        
-+
-         // Look ahead for possible suffixes if a dictionary word does not follow.
-         // We do this in code rather than using a rule so that the heuristic
-         // resynch continues to function. For example, one of the suffix characters
-@@ -689,16 +854,16 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
-     int32_t current;
-     UErrorCode status = U_ZERO_ERROR;
-     PossibleWord words[BURMESE_LOOKAHEAD];
--    
-+
-     utext_setNativeIndex(text, rangeStart);
--    
-+
-     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
-         cuWordLength = 0;
-         cpWordLength = 0;
- 
-         // Look for candidate words at the current position
-         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
--        
-+
-         // If we found exactly one, use that
-         if (candidates == 1) {
-             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
-@@ -719,12 +884,12 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
-                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
-                         wordsMatched = 2;
-                     }
--                    
-+
-                     // If we're already at the end of the range, we're done
-                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
-                         goto foundBest;
-                     }
--                    
-+
-                     // See if any of the possible second words is followed by a third word
-                     do {
-                         // If we find a third word, stop right away
-@@ -742,7 +907,7 @@ foundBest:
-             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
-             wordsFound += 1;
-         }
--        
-+
-         // We come here after having either found a word or not. We look ahead to the
-         // next word. If it's not a dictionary word, we will combine it withe the word we
-         // just found (if there is one), but only if the preceding word does not exceed
-@@ -780,12 +945,12 @@ foundBest:
-                         }
-                     }
-                 }
--                
-+
-                 // Bump the word count if there wasn't already one
-                 if (cuWordLength <= 0) {
-                     wordsFound += 1;
-                 }
--                
-+
-                 // Update the length with the passed-over characters
-                 cuWordLength += chars;
-             }
-@@ -794,14 +959,14 @@ foundBest:
-                 utext_setNativeIndex(text, current + cuWordLength);
-             }
-         }
--        
-+
-         // Never stop before a combining mark.
-         int32_t currPos;
-         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
-             utext_next32(text);
-             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
-         }
--        
-+
-         // Look ahead for possible suffixes if a dictionary word does not follow.
-         // We do this in code rather than using a rule so that the heuristic
-         // resynch continues to function. For example, one of the suffix characters
-@@ -828,51 +993,28 @@ foundBest:
+@@ -828,51 +1002,28 @@ foundBest:
   * KhmerBreakEngine
   */
  
@@ -536,7 +308,7 @@ index f1c874d..3ad1b3f 100644
  }
  
  KhmerBreakEngine::~KhmerBreakEngine() {
-@@ -884,180 +1027,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
+@@ -884,180 +1036,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                  int32_t rangeStart,
                                                  int32_t rangeEnd,
                                                  UStack &foundBreaks ) const {
@@ -560,10 +332,10 @@ index f1c874d..3ad1b3f 100644
 +        startZwsp = scanBeforeStart(text, scanStart, breakStart);
 +    }
 +    utext_setNativeIndex(text, rangeStart);
-+    scanFwdClusters(text, rangeEnd, initAfter);
++    scanFwdClusters(text, rangeStart, initAfter);
 +    bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
 +    utext_setNativeIndex(text, rangeEnd - 1);
-+    scanBackClusters(text, rangeStart, finalBefore);
++    scanBackClusters(text, rangeEnd, finalBefore);
 +    if (finalBefore < initAfter) {   // the whole run is tented so no breaks
 +        if (breakStart || fTypes < UBRK_LINE)
 +            foundBreaks.push(rangeStart, status);
@@ -715,7 +487,7 @@ index f1c874d..3ad1b3f 100644
 +        if (count == 0) {
 +            utext_setNativeIndex(text, ix);
 +            int32_t c = utext_current32(text);
-+            if (fPuncSet.contains(c) || c == ZWSP || c == WJ) {
++            if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
 +                values.setElementAt(0, count);
 +                lengths.setElementAt(1, count++);
 +            } else if (fBaseSet.contains(c)) {
@@ -767,7 +539,7 @@ index f1c874d..3ad1b3f 100644
 +            int32_t ln = lengths.elementAti(j);
 +            utext_setNativeIndex(text, ln+ix);
 +            int32_t c = utext_current32(text);
-+            while (fPuncSet.contains(c)) {
++            while (fPuncSet.contains(c) || fIgnoreSet.contains(c)) {
 +                ++ln;
 +                utext_next32(text);
 +                c = utext_current32(text);
@@ -887,71 +659,6 @@ index f1c874d..3ad1b3f 100644
  }
  
  #if !UCONFIG_NO_NORMALIZATION
-@@ -1121,7 +1288,7 @@ static inline int32_t utext_i32_flag(int32_t bitIndex) {
-     return (int32_t)1 << bitIndex;
- }
- 
--       
-+
- /*
-  * @param text A UText representing the text
-  * @param rangeStart The start of the range of dictionary characters
-@@ -1129,7 +1296,7 @@ static inline int32_t utext_i32_flag(int32_t bitIndex) {
-  * @param foundBreaks Output of C array of int32_t break positions, or 0
-  * @return The number of breaks found
-  */
--int32_t 
-+int32_t
- CjkBreakEngine::divideUpDictionaryRange( UText *inText,
-         int32_t rangeStart,
-         int32_t rangeEnd,
-@@ -1192,7 +1359,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
-         if (U_FAILURE(status)) {
-             return 0;
-         }
--        
-+
-         UnicodeString fragment;
-         UnicodeString normalizedFragment;
-         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
-@@ -1261,7 +1428,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
-             }
-         }
-     }
--                
-+
-     // bestSnlp[i] is the snlp of the best segmentation of the first i
-     // code points in the range to be matched.
-     UVector32 bestSnlp(numCodePts + 1, status);
-@@ -1271,7 +1438,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
-     }
- 
- 
--    // prev[i] is the index of the last CJK code point in the previous word in 
-+    // prev[i] is the index of the last CJK code point in the previous word in
-     // the best segmentation of the first i characters.
-     UVector32 prev(numCodePts + 1, status);
-     for(int32_t i = 0; i <= numCodePts; i++){
-@@ -1305,8 +1472,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
-                              // Note: lengths is filled with code point lengths
-                              //       The NULL parameter is the ignored code unit lengths.
- 
--        // if there are no single character matches found in the dictionary 
--        // starting with this charcter, treat character as a 1-character word 
-+        // if there are no single character matches found in the dictionary
-+        // starting with this charcter, treat character as a 1-character word
-         // with the highest value possible, i.e. the least likely to occur.
-         // Exclude Korean characters from this treatment, as they should be left
-         // together by default.
-@@ -1380,7 +1547,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
-         numBreaks++;
-     }
- 
--    // Now that we're done, convert positions in t_boundary[] (indices in 
-+    // Now that we're done, convert positions in t_boundary[] (indices in
-     // the normalized input string) back to indices in the original input UText
-     // while reversing t_boundary and pushing values to foundBreaks.
-     for (int32_t i = numBreaks-1; i >= 0; i--) {
 diff --git a/source/common/dictbe.h b/source/common/dictbe.h
 index d3488cd..26caa75 100644
 --- misc/icu/source/common/dictbe.h
diff --git a/external/icu/khmerdict.dict b/external/icu/khmerdict.dict
index c935cd088659..52605b65469d 100644
--- a/external/icu/khmerdict.dict
+++ b/external/icu/khmerdict.dict
author	Martin Hosken <martin_hosken@sil.org>	2016-03-17 09:57:35 +0700
committer	Martin Hosken <martin_hosken@sil.org>	2016-03-17 03:31:32 +0000
commit	a976a19ca82661d8b459b85f5514b0e4c9222d47 (patch)
tree	b85c4b330550132a75edcb4ebf9fd496062b4892 /external
parent	1caac283894d0deeac564c67cd816cc2907f9ac7 (diff)