Fix text search across lines between paragraphs

This commit fixes the "across lines" text search feature of TextPage::findText() when the match happens from the last line of a paragraph to the first line of next paragraph. Includes tests for this bug. Fixes #1475 Fixes https://gitlab.gnome.org/GNOME/evince/-/issues/2001
author: Nelson Benítez León <nbenitezl@gmail.com> 2024-03-12 21:37:46 +0000
committer: Albert Astals Cid <aacid@kde.org> 2024-03-30 10:38:54 +0000
commit: 9ace4f33e38fe24add87dc4e7c2a43e1441f2bec (patch)
tree: b32be374ea798a88651837334e17205bb9bd7ce2
parent: 7a435135a1bfb8c3f9f5984d88bbe5dd8977335a (diff)
3 files changed, 50 insertions, 24 deletions
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 6e6f55d6..034209eb 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -3860,7 +3860,8 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
     TextLine *line;
     Unicode *s2, *txt, *reordered;
     Unicode *p;
-    Unicode *nextline;
+    TextLine *nextline;
+    Unicode *nextline_txt;
     int nextline_len;
     bool nextlineAfterHyphen = false;
     int txtSize, m, i, j, k;
@@ -3969,11 +3970,22 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                 line->normalized = unicodeNormalizeNFKC(line->text, line->len, &line->normalized_len, &line->normalized_idx, true);
             }
 
-            if (matchAcrossLines && line->next && !line->next->normalized) {
-                line->next->normalized = unicodeNormalizeNFKC(line->next->text, line->next->len, &line->next->normalized_len, &line->next->normalized_idx, true);
-            }
             nextline = nullptr;
+            nextline_txt = nullptr;
             nextline_len = 0;
+            if (line->next) {
+                nextline = line->next;
+            } else {
+                // set nextline to first line of next block
+                int ind = i + (backward ? -1 : 1);
+                if ((backward && ind >= 0) || (!backward && ind < nBlocks)) {
+                    nextline = blocks[ind]->lines;
+                }
+            }
+
+            if (matchAcrossLines && nextline && !nextline->normalized) {
+                nextline->normalized = unicodeNormalizeNFKC(nextline->text, nextline->len, &nextline->normalized_len, &nextline->normalized_idx, true);
+            }
 
             // convert the line to uppercase
             m = line->normalized_len;
@@ -3988,8 +4000,8 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                     ignoreDiacritics = false;
                 }
 
-                if (matchAcrossLines && line->next && !line->next->ascii_translation) {
-                    unicodeToAscii7(line->next->normalized, line->next->normalized_len, &line->next->ascii_translation, &line->next->ascii_len, line->next->normalized_idx, &line->next->ascii_idx);
+                if (matchAcrossLines && nextline && !nextline->ascii_translation) {
+                    unicodeToAscii7(nextline->normalized, nextline->normalized_len, &nextline->ascii_translation, &nextline->ascii_len, nextline->normalized_idx, &nextline->ascii_idx);
                 }
             }
             if (!caseSensitive) {
@@ -4004,11 +4016,11 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                         txt[k] = unicodeToUpper(line->normalized[k]);
                     }
                 }
-                if (matchAcrossLines && line->next) {
-                    nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
-                    nextline = (Unicode *)gmallocn(nextline_len, sizeof(Unicode));
+                if (matchAcrossLines && nextline) {
+                    nextline_len = ignoreDiacritics ? nextline->ascii_len : nextline->normalized_len;
+                    nextline_txt = (Unicode *)gmallocn(nextline_len, sizeof(Unicode));
                     for (k = 0; k < nextline_len; ++k) {
-                        nextline[k] = ignoreDiacritics ? unicodeToUpper(line->next->ascii_translation[k]) : unicodeToUpper(line->next->normalized[k]);
+                        nextline_txt[k] = ignoreDiacritics ? unicodeToUpper(nextline->ascii_translation[k]) : unicodeToUpper(nextline->normalized[k]);
                     }
                 }
             } else {
@@ -4018,20 +4030,20 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                     txt = line->normalized;
                 }
 
-                if (matchAcrossLines && line->next) {
-                    nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
-                    nextline = ignoreDiacritics ? line->next->ascii_translation : line->next->normalized;
+                if (matchAcrossLines && nextline) {
+                    nextline_len = ignoreDiacritics ? nextline->ascii_len : nextline->normalized_len;
+                    nextline_txt = ignoreDiacritics ? nextline->ascii_translation : nextline->normalized;
                 }
             }
 
             // search each position in this line
             j = backward ? m - len : 0;
             p = txt + j;
-            while (backward ? j >= 0 : j <= m - (nextline ? 1 : len)) {
+            while (backward ? j >= 0 : j <= m - (nextline_txt ? 1 : len)) {
                 bool wholeWordStartIsOk, wholeWordEndIsOk;
                 if (wholeWord) {
                     wholeWordStartIsOk = j == 0 || !unicodeTypeAlphaNum(txt[j - 1]);
-                    if (nextline) {
+                    if (nextline_txt) {
                         wholeWordEndIsOk = true; // word end may be in next line, so we'll check it later
                     } else {
                         wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]);
@@ -4048,7 +4060,7 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                         bool last_char_of_search_term = k == len - 1;
                         bool match_started = (bool)k;
 
-                        if (p[k] != s2[k] || (nextline && last_char_of_line && !last_char_of_search_term)) {
+                        if (p[k] != s2[k] || (nextline_txt && last_char_of_line && !last_char_of_search_term)) {
                             // now check if the comparison failed at the end-of-line hyphen,
                             // and if so, keep on comparing at the next line
                             nextlineAfterHyphen = false;
@@ -4065,7 +4077,7 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                             }
 
                             for (; n < nextline_len && k < len; ++k, ++n) {
-                                if (nextline[n] != s2[k]) {
+                                if (nextline_txt[n] != s2[k]) {
                                     if (!spaceConsumedByNewline && !n && UnicodeIsWhitespace(s2[k])) {
                                         n = -1;
                                         spaceConsumedByNewline = true;
@@ -4079,9 +4091,9 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                     }
 
                     found_it = k == len;
-                    if (found_it && nextline && wholeWord) { // check word end for nextline case
+                    if (found_it && nextline_txt && wholeWord) { // check word end for nextline case
                         if (n) { // Match ended at next line
-                            wholeWordEndIsOk = n == nextline_len || !unicodeTypeAlphaNum(nextline[n]);
+                            wholeWordEndIsOk = n == nextline_len || !unicodeTypeAlphaNum(nextline_txt[n]);
                         } else { // Match ended on same line
                             wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]);
                         }
@@ -4102,14 +4114,14 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                         int normStart, normAfterEnd;
                         if (ignoreDiacritics) {
                             normStart = line->ascii_idx[j];
-                            if (nextline) {
+                            if (nextline_txt) {
                                 normAfterEnd = line->ascii_idx[j + k - n];
                             } else {
                                 normAfterEnd = line->ascii_idx[j + len - 1] + 1;
                             }
                         } else {
                             normStart = line->normalized_idx[j];
-                            if (nextline) {
+                            if (nextline_txt) {
                                 normAfterEnd = line->normalized_idx[j + k - n];
                             } else {
                                 normAfterEnd = line->normalized_idx[j + len - 1] + 1;
@@ -4142,7 +4154,7 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                                         }
 
                                         if (continueMatch) {
-                                            adjustRotation(line->next, 0, n, &xMin2, &xMax2, &yMin2, &yMax2);
+                                            adjustRotation(nextline, 0, n, &xMin2, &xMax2, &yMin2, &yMax2);
                                             continueMatch->x1 = xMin2;
                                             continueMatch->y1 = yMax2;
                                             continueMatch->x2 = xMax2;
@@ -4169,8 +4181,8 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
                 }
             }
 
-            if (nextline && nextline != line->next->ascii_translation && nextline != line->next->normalized) {
-                gfree(nextline);
+            if (nextline_txt && nextline_txt != nextline->ascii_translation && nextline_txt != nextline->normalized) {
+                gfree(nextline_txt);
             }
         }
     }
diff --git a/qt5/tests/check_search.cpp b/qt5/tests/check_search.cpp
index c9bb65e3..be2d6bcc 100644
--- a/qt5/tests/check_search.cpp
+++ b/qt5/tests/check_search.cpp
@@ -350,6 +350,13 @@ void TestSearch::testAcrossLinesSearch()
     QCOMPARE(page0->search(str6, l, t, r, b, direction, mode1), true);
     QCOMPARE(page0->search(str6, l, t, r, b, direction, mode2), true);
     QCOMPARE(page0->search(str6, l, t, r, b, direction, mode2W), true);
+    // Check for the case when next line falls in next paragraph. Issue #1475
+    const QString across_block = QString::fromUtf8("emacs jose"); // clazy:exclude=qstring-allocations
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, empty), false);
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, mode0), false);
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, mode1), false);
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, mode2), true);
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, mode2W), true);
 
     // Now for completeness, we will match the full text of two lines
     const QString full2lines = QString::fromUtf8("Las pruebas se practicarán en vista pública, si bien, excepcionalmente, el Tribunal podrá acordar, mediante providencia, que determinadas pruebas se celebren fuera del acto de juicio");
diff --git a/qt6/tests/check_search.cpp b/qt6/tests/check_search.cpp
index ede2d0c2..6242676b 100644
--- a/qt6/tests/check_search.cpp
+++ b/qt6/tests/check_search.cpp
@@ -348,6 +348,13 @@ void TestSearch::testAcrossLinesSearch()
     QCOMPARE(page0->search(str6, l, t, r, b, direction, mode1), true);
     QCOMPARE(page0->search(str6, l, t, r, b, direction, mode2), true);
     QCOMPARE(page0->search(str6, l, t, r, b, direction, mode2W), true);
+    // Check for the case when next line falls in next paragraph. Issue #1475
+    const QString across_block = QString::fromUtf8("emacs jose"); // clazy:exclude=qstring-allocations
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, empty), false);
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, mode0), false);
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, mode1), false);
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, mode2), true);
+    QCOMPARE(page0->search(across_block, l, t, r, b, direction, mode2W), true);
 
     // Now for completeness, we will match the full text of two lines
     const QString full2lines = QString::fromUtf8(
author	Nelson Benítez León <nbenitezl@gmail.com>	2024-03-12 21:37:46 +0000
committer	Albert Astals Cid <aacid@kde.org>	2024-03-30 10:38:54 +0000
commit	9ace4f33e38fe24add87dc4e7c2a43e1441f2bec (patch)
tree	b32be374ea798a88651837334e17205bb9bd7ce2
parent	7a435135a1bfb8c3f9f5984d88bbe5dd8977335a (diff)