From 835987362d9873cf98cc3f86959910ff2107a509 Mon Sep 17 00:00:00 2001 From: Stefan BrĂ¼ns Date: Sun, 24 Mar 2024 00:31:52 +0100 Subject: Reduce TextWord space and allocation overhead Currently, the word characters are allocated as a struct of arrays, e.g. text and charcode are allocated separately. This causes some space (6 pointers, 6 malloc chunk management words (size_t/flags), alignment, ...) and runtime overhead (6 allocs/ frees per word). Changing this to an array of struct reduces this overhead. It also allows to be more conservative with allocations, as resizing is less costly, i.e. starting with a single character allocation instead of 16. It is also more efficient, as most accesses affect multiple or all attributes, i.e. values in the same or neighboring CPU cache lines. Using a std::vector instead of separate raw arrays also reduces code and manual data management. The "charPos end index" and trailing "edge" attributes are no longer stored as an additional entry entry in the array, but as dedicated data members, `charPosEnd` and `edgeEnd`. The memory saving is most notably for short words, but even for words with 16 characters there are small savings, and still less allocations (1 + 4 allocations instead of 6. Growing is fairly cheap, as the CharInfo struct is trivially copyable.) See poppler#1173. --- poppler/TextOutputDev.cc | 378 ++++++++++++++++++++++------------------------- poppler/TextOutputDev.h | 42 +++--- 2 files changed, 197 insertions(+), 223 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 034209eb..03b68bc2 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -419,13 +419,6 @@ TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA) { rot = rotA; fontSize = fontSizeA; - text = nullptr; - charcode = nullptr; - edge = nullptr; - charPos = nullptr; - font = nullptr; - textMat = nullptr; - len = size = 0; spaceAfter = false; next = nullptr; invisible = state->getRender() == 3; @@ -447,27 +440,14 @@ TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA) link = nullptr; } -TextWord::~TextWord() -{ - gfree(text); - gfree(charcode); - gfree(edge); - gfree(charPos); - gfree(font); - gfree(textMat); -} +TextWord::~TextWord() { } void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA) { - ensureCapacity(len + 1); - text[len] = u; - charcode[len] = c; - charPos[len] = charPosA; - charPos[len + 1] = charPosA + charLen; - font[len] = fontA; - textMat[len] = textMatA; + chars.push_back(CharInfo { u, c, charPosA, 0.0, fontA, textMatA }); + charPosEnd = charPosA + charLen; - if (len == 0) { + if (len() == 1) { setInitialBounds(fontA, x, y); } @@ -476,43 +456,42 @@ void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, dou // TextPage::beginWord()) for vertical writing mode switch (rot) { case 0: - edge[len] = x - fontSize; - xMax = edge[len + 1] = x; + chars.back().edge = x - fontSize; + xMax = edgeEnd = x; break; case 1: - edge[len] = y - fontSize; - yMax = edge[len + 1] = y; + chars.back().edge = y - fontSize; + yMax = edgeEnd = y; break; case 2: - edge[len] = x + fontSize; - xMin = edge[len + 1] = x; + chars.back().edge = x + fontSize; + xMin = edgeEnd = x; break; case 3: - edge[len] = y + fontSize; - yMin = edge[len + 1] = y; + chars.back().edge = y + fontSize; + yMin = edgeEnd = y; break; } } else { // horizontal writing mode switch (rot) { case 0: - edge[len] = x; - xMax = edge[len + 1] = x + dx; + chars.back().edge = x; + xMax = edgeEnd = x + dx; break; case 1: - edge[len] = y; - yMax = edge[len + 1] = y + dy; + chars.back().edge = y; + yMax = edgeEnd = y + dy; break; case 2: - edge[len] = x; - xMin = edge[len + 1] = x + dx; + chars.back().edge = x; + xMin = edgeEnd = x + dx; break; case 3: - edge[len] = y; - yMin = edge[len + 1] = y + dy; + chars.back().edge = y; + yMin = edgeEnd = y + dy; break; } } - ++len; } void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y) @@ -604,19 +583,6 @@ void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y) } } -void TextWord::ensureCapacity(int capacity) -{ - if (capacity > size) { - size = std::max(size + 16, capacity); - text = (Unicode *)greallocn(text, size, sizeof(Unicode)); - charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode)); - edge = (double *)greallocn(edge, (size + 1), sizeof(double)); - charPos = (int *)greallocn(charPos, size + 1, sizeof(int)); - font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *)); - textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix)); - } -} - struct CombiningTable { Unicode base; @@ -651,19 +617,15 @@ static Unicode getCombiningChar(Unicode u) bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA) { - if (len == 0 || wMode != 0 || fontA->getWMode() != 0) { + if (chars.empty() || wMode != 0 || fontA->getWMode() != 0) { return false; } Unicode cCurrent = getCombiningChar(u); - Unicode cPrev = getCombiningChar(text[len - 1]); - double edgeMid = (edge[len - 1] + edge[len]) / 2; - double charMid, maxScaledMidDelta, charBase, maxScaledBaseDelta; - - if (cCurrent != 0 && unicodeTypeAlphaNum(text[len - 1])) { + if (cCurrent != 0 && unicodeTypeAlphaNum(chars.back().text)) { // Current is a combining character, previous is base character - maxScaledMidDelta = fabs(edge[len] - edge[len - 1]) * combMaxMidDelta; - charMid = charBase = maxScaledBaseDelta = 0; + double maxScaledMidDelta = fabs(edgeEnd - chars.back().edge) * combMaxMidDelta; + double charMid, charBase, maxScaledBaseDelta; // Test if characters overlap if (rot == 0 || rot == 2) { @@ -676,29 +638,24 @@ bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double f maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta; } + double edgeMid = (chars.back().edge + edgeEnd) / 2; if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta) { return false; } // Add character, but don't adjust edge / bounding box because // combining character's positioning could be odd. - ensureCapacity(len + 1); - text[len] = cCurrent; - charcode[len] = c; - charPos[len] = charPosA; - charPos[len + 1] = charPosA + charLen; - font[len] = fontA; - textMat[len] = textMatA; - edge[len + 1] = edge[len]; - edge[len] = (edge[len + 1] + edge[len - 1]) / 2; - ++len; + chars.emplace_back(CharInfo { cCurrent, c, charPosA, edgeMid, fontA, textMatA }); + charPosEnd = charPosA + charLen; + return true; } + Unicode cPrev = getCombiningChar(chars.back().text); if (cPrev != 0 && unicodeTypeAlphaNum(u)) { // Previous is a combining character, current is base character - maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta; - charMid = charBase = maxScaledMidDelta = 0; + double maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta; + double charMid, charBase, maxScaledMidDelta; // Test if characters overlap if (rot == 0 || rot == 2) { @@ -711,73 +668,71 @@ bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double f maxScaledMidDelta = fabs(dy * combMaxMidDelta); } + double edgeMid = (chars.back().edge + edgeEnd) / 2; if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta) { return false; } - // move combining character to after base character - ensureCapacity(len + 1); fontSize = fontSizeA; - text[len] = cPrev; - charcode[len] = charcode[len - 1]; - charPos[len] = charPosA; - charPos[len + 1] = charPosA + charLen; - font[len] = font[len - 1]; - textMat[len] = textMat[len - 1]; - - text[len - 1] = u; - charcode[len - 1] = c; - font[len - 1] = fontA; - textMat[len - 1] = textMatA; - - if (len == 1) { + // move combining character to after base character + chars.emplace_back(CharInfo { cPrev, chars.back().charcode, charPosA, edgeMid, chars.back().font, chars.back().textMat }); + + auto &lastChar = chars[chars.size() - 2]; + + charPosEnd = charPosA + charLen; + lastChar.text = u; + lastChar.charcode = c; + lastChar.font = fontA; + lastChar.textMat = textMatA; + + if (len() == 2) { setInitialBounds(fontA, x, y); } // Updated edges / bounding box because we changed the base // character. if (wMode) { + // FIXME unreachable, wMode == 0 switch (rot) { case 0: - edge[len - 1] = x - fontSize; - xMax = edge[len + 1] = x; + lastChar.edge = x - fontSize; + xMax = edgeEnd = x; break; case 1: - edge[len - 1] = y - fontSize; - yMax = edge[len + 1] = y; + lastChar.edge = y - fontSize; + yMax = edgeEnd = y; break; case 2: - edge[len - 1] = x + fontSize; - xMin = edge[len + 1] = x; + lastChar.edge = x + fontSize; + xMin = edgeEnd = x; break; case 3: - edge[len - 1] = y + fontSize; - yMin = edge[len + 1] = y; + lastChar.edge = y + fontSize; + yMin = edgeEnd = y; break; } } else { switch (rot) { case 0: - edge[len - 1] = x; - xMax = edge[len + 1] = x + dx; + lastChar.edge = x; + xMax = edgeEnd = x + dx; break; case 1: - edge[len - 1] = y; - yMax = edge[len + 1] = y + dy; + lastChar.edge = y; + yMax = edgeEnd = y + dy; break; case 2: - edge[len - 1] = x; - xMin = edge[len + 1] = x + dx; + lastChar.edge = x; + xMin = edgeEnd = x + dx; break; case 3: - edge[len - 1] = y; - yMin = edge[len + 1] = y + dy; + lastChar.edge = y; + yMin = edgeEnd = y + dy; break; } } - edge[len] = (edge[len + 1] + edge[len - 1]) / 2; - ++len; + chars.back().edge = (edgeEnd + lastChar.edge) / 2; return true; } return false; @@ -785,8 +740,6 @@ bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double f void TextWord::merge(TextWord *word) { - int i; - if (word->xMin < xMin) { xMin = word->xMin; } @@ -799,18 +752,9 @@ void TextWord::merge(TextWord *word) if (word->yMax > yMax) { yMax = word->yMax; } - ensureCapacity(len + word->len); - for (i = 0; i < word->len; ++i) { - text[len + i] = word->text[i]; - charcode[len + i] = word->charcode[i]; - edge[len + i] = word->edge[i]; - charPos[len + i] = word->charPos[i]; - font[len + i] = word->font[i]; - textMat[len + i] = word->textMat[i]; - } - edge[len + word->len] = word->edge[word->len]; - charPos[len + word->len] = word->charPos[word->len]; - len += word->len; + chars.insert(chars.end(), word->chars.begin(), word->chars.end()); + edgeEnd = word->edgeEnd; + charPosEnd = word->charPosEnd; } inline int TextWord::primaryCmp(const TextWord *word) const @@ -877,14 +821,13 @@ GooString *TextWord::getText() const GooString *s; const UnicodeMap *uMap; char buf[8]; - int n, i; s = new GooString(); if (!(uMap = globalParams->getTextEncoding())) { return s; } - for (i = 0; i < len; ++i) { - n = uMap->mapUnicode(text[i], buf, sizeof(buf)); + for (size_t i = 0; i < len(); ++i) { + auto n = uMap->mapUnicode(chars[i].text, buf, sizeof(buf)); s->append(buf, n); } return s; @@ -892,33 +835,39 @@ GooString *TextWord::getText() const void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const { - if (charIdx < 0 || charIdx >= len) { + if (charIdx < 0) { return; } + size_t uCharIdx = charIdx; + if (uCharIdx >= len()) { + return; + } + auto startingEdge = chars[uCharIdx].edge; + auto endingEdge = (uCharIdx + 1 == len()) ? edgeEnd : chars[charIdx + 1].edge; switch (rot) { case 0: - *xMinA = edge[charIdx]; - *xMaxA = edge[charIdx + 1]; + *xMinA = startingEdge; + *xMaxA = endingEdge; *yMinA = yMin; *yMaxA = yMax; break; case 1: *xMinA = xMin; *xMaxA = xMax; - *yMinA = edge[charIdx]; - *yMaxA = edge[charIdx + 1]; + *yMinA = startingEdge; + *yMaxA = endingEdge; break; case 2: - *xMinA = edge[charIdx + 1]; - *xMaxA = edge[charIdx]; + *xMinA = endingEdge; + *xMaxA = startingEdge; *yMinA = yMin; *yMaxA = yMax; break; case 3: *xMinA = xMin; *xMaxA = xMax; - *yMinA = edge[charIdx + 1]; - *yMaxA = edge[charIdx]; + *yMinA = endingEdge; + *yMaxA = startingEdge; break; } } @@ -1189,21 +1138,19 @@ int TextLine::cmpXY(const void *p1, const void *p2) void TextLine::coalesce(const UnicodeMap *uMap) { - TextWord *word0, *word1; double space, delta, minSpace; bool isUnicode; char buf[8]; - int i, j; if (words->next) { // compute the inter-word space threshold - if (words->len > 1 || words->next->len > 1) { + if (words->len() > 1 || words->next->len() > 1) { minSpace = 0; } else { minSpace = words->primaryDelta(words->next); - for (word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) { - if (word1->len > 1) { + for (auto word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) { + if (word1->len() > 1) { minSpace = 0; } delta = word0->primaryDelta(word1); @@ -1222,15 +1169,17 @@ void TextLine::coalesce(const UnicodeMap *uMap) } // merge words - word0 = words; - word1 = words->next; + auto word0 = words; + auto word1 = words->next; while (word1) { if (word0->primaryDelta(word1) >= space) { word0->spaceAfter = true; word0 = word1; word1 = word1->next; - } else if (word0->font[word0->len - 1] == word1->font[0] && word0->underlined == word1->underlined && fabs(word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize - && word1->charPos[0] == word0->charPos[word0->len]) { + } else if (word0->chars.back().font == word1->chars.front().font // + && word0->underlined == word1->underlined // + && fabs(word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize // + && word1->chars.front().charPos == word0->charPosEnd) { word0->merge(word1); word0->next = word1->next; delete word1; @@ -1245,22 +1194,22 @@ void TextLine::coalesce(const UnicodeMap *uMap) // build the line text isUnicode = uMap ? uMap->isUnicode() : false; len = 0; - for (word1 = words; word1; word1 = word1->next) { - len += word1->len; + for (auto word1 = words; word1; word1 = word1->next) { + len += word1->len(); if (word1->spaceAfter) { ++len; } } text = (Unicode *)gmallocn(len, sizeof(Unicode)); edge = (double *)gmallocn(len + 1, sizeof(double)); - i = 0; - for (word1 = words; word1; word1 = word1->next) { - for (j = 0; j < word1->len; ++j) { - text[i] = word1->text[j]; - edge[i] = word1->edge[j]; + size_t i = 0; + for (auto word1 = words; word1; word1 = word1->next) { + for (size_t j = 0; j < word1->len(); ++j) { + text[i] = word1->chars[j].text; + edge[i] = word1->chars[j].edge; ++i; } - edge[i] = word1->edge[word1->len]; + edge[i] = word1->edgeEnd; if (word1->spaceAfter) { text[i] = (Unicode)0x0020; ++i; @@ -1270,12 +1219,12 @@ void TextLine::coalesce(const UnicodeMap *uMap) // compute convertedLen and set up the col array col = (int *)gmallocn(len + 1, sizeof(int)); convertedLen = 0; - for (i = 0; i < len; ++i) { - col[i] = convertedLen; + for (int ci = 0; ci < len; ++ci) { + col[ci] = convertedLen; if (isUnicode) { ++convertedLen; } else if (uMap) { - convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf)); + convertedLen += uMap->mapUnicode(text[ci], buf, sizeof(buf)); } } col[len] = convertedLen; @@ -1690,8 +1639,14 @@ void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch) word1 = nullptr; word2 = pool->getPool(idx1); } + TextWord *word1 = prevWord->next; + + auto equalText = [](const TextWord &w1, const TextWord &w2) -> bool { // + return std::equal(w1.chars.begin(), w1.chars.end(), w2.chars.begin(), w2.chars.end(), // + [](auto c1, auto c2) { return c1.text == c2.text; }); + }; for (; word2; word1 = word2, word2 = word2->next) { - if (word2->len == word0->len && !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode))) { + if (equalText(*word0, *word2)) { switch (rot) { case 0: case 2: @@ -2713,28 +2668,28 @@ void TextPage::addChar(const GfxState *state, double x, double y, double dx, dou // character to be in a word by itself at this stage), // (4) the font size has changed // (5) the WMode changed - if (curWord && curWord->len > 0) { + if (curWord && curWord->len() > 0) { base = sp = delta = 0; // make gcc happy switch (curWord->rot) { case 0: base = y1; sp = x1 - curWord->xMax; - delta = x1 - curWord->edge[curWord->len - 1]; + delta = x1 - curWord->chars.back().edge; break; case 1: base = x1; sp = y1 - curWord->yMax; - delta = y1 - curWord->edge[curWord->len - 1]; + delta = y1 - curWord->chars.back().edge; break; case 2: base = y1; sp = curWord->xMin - x1; - delta = curWord->edge[curWord->len - 1] - x1; + delta = curWord->chars.back().edge - x1; break; case 3: base = x1; sp = curWord->yMin - y1; - delta = curWord->edge[curWord->len - 1] - y1; + delta = curWord->chars.back().edge - y1; break; } overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize && fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize; @@ -2813,7 +2768,7 @@ void TextPage::addWord(TextWord *word) { // throw away zero-length words -- they don't have valid xMin/xMax // values, and they're useless anyway - if (word->len == 0) { + if (word->len() == 0) { delete word; return; } @@ -3309,10 +3264,10 @@ void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML, double for (blk = blkList; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { for (word0 = line->words; word0; word0 = word0->next) { - for (int i = 0; i < word0->len; ++i) { - if (unicodeTypeL(word0->text[i])) { + for (size_t i = 0; i < word0->len(); ++i) { + if (unicodeTypeL(word0->chars[i].text)) { ++lrCount; - } else if (unicodeTypeR(word0->text[i])) { + } else if (unicodeTypeR(word0->chars[i].text)) { --lrCount; } } @@ -4587,12 +4542,16 @@ GooString *TextSelectionDumper::getText() spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); + std::vector uText; for (i = 0; i < nLines; i++) { std::vector *lineWords = lines[i]; for (std::size_t j = 0; j < lineWords->size(); j++) { TextWordSelection *sel = (*lineWords)[j]; - page->dumpFragment(sel->word->text + sel->begin, sel->end - sel->begin, uMap, text); + uText.resize(sel->end - sel->begin); + std::transform(sel->word->chars.begin() + sel->begin, sel->word->chars.begin() + sel->end, uText.begin(), [](auto &c) { return c.text; }); + page->dumpFragment(uText.data(), uText.size(), uMap, text); + if (j < lineWords->size() - 1 && sel->word->spaceAfter) { text->append(space, spaceLen); } @@ -4838,36 +4797,39 @@ void TextSelectionPainter::endPage() out->updateFillColor(state); + GooString string; for (const TextWordSelection *sel : *selectionList) { int begin = sel->begin; while (begin < sel->end) { - TextFontInfo *font = sel->word->font[begin]; - Matrix *mat = &sel->word->textMat[begin]; + TextFontInfo *font = sel->word->chars[begin].font; + const Matrix *mat = &sel->word->chars[begin].textMat; state->setTextMat(mat->m[0], mat->m[1], mat->m[2], mat->m[3], 0, 0); state->setFont(font->gfxFont, 1); out->updateFont(state); int fEnd = begin + 1; - while (fEnd < sel->end && font->matches(sel->word->font[fEnd]) && mat->m[0] == sel->word->textMat[fEnd].m[0] && mat->m[1] == sel->word->textMat[fEnd].m[1] && mat->m[2] == sel->word->textMat[fEnd].m[2] - && mat->m[3] == sel->word->textMat[fEnd].m[3]) { + while (fEnd < sel->end && font->matches(sel->word->chars[fEnd].font) // + && mat->m[0] == sel->word->chars[fEnd].textMat.m[0] && mat->m[1] == sel->word->chars[fEnd].textMat.m[1] // + && mat->m[2] == sel->word->chars[fEnd].textMat.m[2] && mat->m[3] == sel->word->chars[fEnd].textMat.m[3]) { fEnd++; } /* The only purpose of this string is to let the output device query * it's length. Might want to change this interface later. */ - GooString *string = new GooString((char *)sel->word->charcode, fEnd - begin); - out->beginString(state, string); + string.clear(); + std::for_each(sel->word->chars.begin() + begin, sel->word->chars.begin() + fEnd, [&string](const auto c) { string.append(c.charcode); }); + out->beginString(state, &string); for (int j = begin; j < fEnd; j++) { - if (j != begin && sel->word->charPos[j] == sel->word->charPos[j - 1]) { + const auto &charJ = sel->word->chars[j]; + if (j != begin && charJ.charPos == sel->word->chars[j - 1].charPos) { continue; } - out->drawChar(state, sel->word->textMat[j].m[4], sel->word->textMat[j].m[5], 0, 0, 0, 0, sel->word->charcode[j], 1, nullptr, 0); + out->drawChar(state, charJ.textMat.m[4], charJ.textMat.m[5], 0, 0, 0, 0, charJ.charcode, 1, nullptr, 0); } out->endString(state); - delete string; begin = fEnd; } } @@ -4878,7 +4840,6 @@ void TextSelectionPainter::endPage() void TextWord::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style) { - int i, begin, end; double mid, s1, s2; if (rot == 0 || rot == 2) { @@ -4889,10 +4850,14 @@ void TextWord::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle s2 = selection->y2; } - begin = len; - end = 0; - for (i = 0; i < len; i++) { - mid = (edge[i] + edge[i + 1]) / 2; + size_t begin = len(); + size_t end = 0; + for (size_t i = 0; i < len(); i++) { + if (i + 1 < len()) { + mid = (chars[i].edge + chars[i + 1].edge) / 2; + } else { + mid = (chars[i].edge + edgeEnd) / 2; + } if (XBetweenAB(mid, s1, s2)) { if (i < begin) { begin = i; @@ -5279,7 +5244,6 @@ bool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, do double xMin0, xMax0, yMin0, yMax0; double xMin1, xMax1, yMin1, yMax1; bool first; - int i, j0, j1; if (rawOrder) { return false; @@ -5291,41 +5255,44 @@ bool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, do first = true; xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy - for (i = 0; i < nBlocks; ++i) { + for (int i = 0; i < nBlocks; ++i) { blk = blocks[i]; for (line = blk->lines; line; line = line->next) { for (word = line->words; word; word = word->next) { - if (pos < word->charPos[word->len] && pos + length > word->charPos[0]) { - for (j0 = 0; j0 < word->len && pos >= word->charPos[j0 + 1]; ++j0) { + if (pos < word->charPosEnd && pos + length > word->chars.front().charPos) { + size_t j0, j1; + for (j0 = 0; (j0 + 1) < word->len() && pos >= word->chars[j0 + 1].charPos; ++j0) { ; } - for (j1 = word->len - 1; j1 > j0 && pos + length <= word->charPos[j1]; --j1) { + for (j1 = word->len(); j1 > j0 && pos + length <= word->chars[j1].charPos; --j1) { ; } + auto startingEdge = word->chars[j0].edge; + auto endingEdge = (j1 + 1 == word->len()) ? word->edgeEnd : word->chars[j1 + 1].edge; switch (line->rot) { case 0: - xMin1 = word->edge[j0]; - xMax1 = word->edge[j1 + 1]; + xMin1 = startingEdge; + xMax1 = endingEdge; yMin1 = word->yMin; yMax1 = word->yMax; break; case 1: xMin1 = word->xMin; xMax1 = word->xMax; - yMin1 = word->edge[j0]; - yMax1 = word->edge[j1 + 1]; + yMin1 = startingEdge; + yMax1 = endingEdge; break; case 2: - xMin1 = word->edge[j1 + 1]; - xMax1 = word->edge[j0]; + xMin1 = endingEdge; + xMax1 = startingEdge; yMin1 = word->yMin; yMax1 = word->yMax; break; case 3: xMin1 = word->xMin; xMax1 = word->xMax; - yMin1 = word->edge[j1 + 1]; - yMax1 = word->edge[j0]; + yMin1 = endingEdge; + yMax1 = startingEdge; break; } if (first || xMin1 < xMin0) { @@ -5367,7 +5334,6 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo TextLineFrag *frag; char space[8], eol[16], eop[8]; int spaceLen, eolLen, eopLen; - GooString *s; double delta; int col, i, j, d, n; @@ -5396,11 +5362,16 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo // output the page in raw (content stream) order if (rawOrder) { + GooString s; + std::vector uText; + for (word = rawWords; word; word = word->next) { - s = new GooString(); - dumpFragment(word->text, word->len, uMap, s); - (*outputFunc)(outputStream, s->c_str(), s->getLength()); - delete s; + s.clear(); + uText.resize(word->len()); + std::transform(word->chars.begin(), word->chars.end(), uText.begin(), [](auto &c) { return c.text; }); + dumpFragment(uText.data(), uText.size(), uMap, &s); + (*outputFunc)(outputStream, s.c_str(), s.getLength()); + if (word->next && fabs(word->next->base - word->base) < maxIntraLineDelta * word->fontSize && word->next->xMin > word->xMax - minDupBreakOverlap * word->fontSize) { if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) { (*outputFunc)(outputStream, space, spaceLen); @@ -5454,6 +5425,7 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo printf("\n"); #endif + GooString s; // generate output col = 0; for (i = 0; i < nFrags; ++i) { @@ -5465,10 +5437,9 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo } // print the line - s = new GooString(); - col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s); - (*outputFunc)(outputStream, s->c_str(), s->getLength()); - delete s; + s.clear(); + col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, &s); + (*outputFunc)(outputStream, s.c_str(), s.getLength()); // print one or more returns if necessary if (i == nFrags - 1 || frags[i + 1].col < col || fabs(frags[i + 1].base - frag->base) > maxIntraLineDelta * frag->line->words->fontSize) { @@ -5500,10 +5471,9 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo if (line->hyphenated && (line->next || blk->next)) { --n; } - s = new GooString(); - dumpFragment(line->text, n, uMap, s); - (*outputFunc)(outputStream, s->c_str(), s->getLength()); - delete s; + GooString s; + dumpFragment(line->text, n, uMap, &s); + (*outputFunc)(outputStream, s.c_str(), s.getLength()); // output a newline when a hyphen is not suppressed if (n == line->len) { (*outputFunc)(outputStream, eol, eolLen); diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index af007fa0..98e1df18 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -171,16 +171,16 @@ public: void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); // Get the TextFontInfo object associated with a character. - const TextFontInfo *getFontInfo(int idx) const { return font[idx]; } + const TextFontInfo *getFontInfo(int idx) const { return chars[idx].font; } // Get the next TextWord on the linked list. const TextWord *getNext() const { return next; } #ifdef TEXTOUT_WORD_LIST - int getLength() const { return len; } - const Unicode *getChar(int idx) const { return &text[idx]; } + int getLength() const { return chars.size(); } + const Unicode *getChar(int idx) const { return &chars[idx].text; } GooString *getText() const; - const GooString *getFontName(int idx) const { return font[idx]->fontName; } + const GooString *getFontName(int idx) const { return chars[idx].font->fontName; } void getColor(double *r, double *g, double *b) const { *r = colorR; @@ -197,19 +197,19 @@ public: void getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const; double getFontSize() const { return fontSize; } int getRotation() const { return rot; } - int getCharPos() const { return charPos[0]; } - int getCharLen() const { return charPos[len] - charPos[0]; } + int getCharPos() const { return chars.empty() ? 0 : chars.front().charPos; } + int getCharLen() const { return chars.empty() ? 0 : chars.back().charPos - chars.front().charPos; } bool getSpaceAfter() const { return spaceAfter; } #endif bool isUnderlined() const { return underlined; } const AnnotLink *getLink() const { return link; } - double getEdge(int i) const { return edge[i]; } + double getEdge(int i) const { return chars[i].edge; } double getBaseline() const { return base; } bool hasSpaceAfter() const { return spaceAfter; } const TextWord *nextWord() const { return next; }; + auto len() const { return chars.size(); } private: - void ensureCapacity(int capacity); void setInitialBounds(TextFontInfo *fontA, double x, double y); int rot; // rotation, multiple of 90 degrees @@ -218,18 +218,22 @@ private: double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double base; // baseline x or y coordinate - Unicode *text; // the text - CharCode *charcode; // glyph indices - double *edge; // "near" edge x or y coord of each char - // (plus one extra entry for the last char) - int *charPos; // character position (within content stream) - // of each char (plus one extra entry for - // the last char) - int len; // length of text/edge/charPos/font arrays - int size; // size of text/edge/charPos/font arrays - TextFontInfo **font; // font information for each char - Matrix *textMat; // transformation matrix for each char + double fontSize; // font size + + struct CharInfo + { + Unicode text; + CharCode charcode; + int charPos; + double edge; + TextFontInfo *font; + Matrix textMat; + }; + std::vector chars; + int charPosEnd = 0; + double edgeEnd = 0; + bool spaceAfter; // set if there is a space between this // word and the next word on the line bool underlined; -- cgit v1.2.3