summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStefan Brüns <stefan.bruens@rwth-aachen.de>2024-03-24 00:31:52 +0100
committerStefan Brüns <stefan.bruens@rwth-aachen.de>2024-03-30 16:21:08 +0100
commit835987362d9873cf98cc3f86959910ff2107a509 (patch)
tree7e6dd320be7d911508a68e656814c8acfc26e17a
parente803b3714a44001ac1e001d948ae505b24086b66 (diff)
Reduce TextWord space and allocation overhead
Currently, the word characters are allocated as a struct of arrays, e.g. text and charcode are allocated separately. This causes some space overhead (6 pointers, 6 malloc chunk management words (size_t/flags), alignment, ...) and runtime overhead (6 allocs/frees per word). Changing this to an array of structs reduces this overhead. It also allows being more conservative with allocations, as resizing is less costly, i.e. starting with a single character allocation instead of 16. It is also more efficient, as most accesses affect multiple or all attributes, i.e. values in the same or neighboring CPU cache lines. Using a std::vector instead of separate raw arrays also reduces code and manual data management. The "charPos end index" and trailing "edge" attributes are no longer stored as an additional entry in the array, but as dedicated data members, `charPosEnd` and `edgeEnd`. The memory saving is most notable for short words, but even for words with 16 characters there are small savings, and still fewer allocations (1 + 4 allocations instead of 6; growing is fairly cheap, as the CharInfo struct is trivially copyable). See poppler#1173.
-rw-r--r--poppler/TextOutputDev.cc378
-rw-r--r--poppler/TextOutputDev.h42
2 files changed, 197 insertions, 223 deletions
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 034209eb..03b68bc2 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -419,13 +419,6 @@ TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA)
{
rot = rotA;
fontSize = fontSizeA;
- text = nullptr;
- charcode = nullptr;
- edge = nullptr;
- charPos = nullptr;
- font = nullptr;
- textMat = nullptr;
- len = size = 0;
spaceAfter = false;
next = nullptr;
invisible = state->getRender() == 3;
@@ -447,27 +440,14 @@ TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA)
link = nullptr;
}
-TextWord::~TextWord()
-{
- gfree(text);
- gfree(charcode);
- gfree(edge);
- gfree(charPos);
- gfree(font);
- gfree(textMat);
-}
+TextWord::~TextWord() { }
void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
{
- ensureCapacity(len + 1);
- text[len] = u;
- charcode[len] = c;
- charPos[len] = charPosA;
- charPos[len + 1] = charPosA + charLen;
- font[len] = fontA;
- textMat[len] = textMatA;
+ chars.push_back(CharInfo { u, c, charPosA, 0.0, fontA, textMatA });
+ charPosEnd = charPosA + charLen;
- if (len == 0) {
+ if (len() == 1) {
setInitialBounds(fontA, x, y);
}
@@ -476,43 +456,42 @@ void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, dou
// TextPage::beginWord()) for vertical writing mode
switch (rot) {
case 0:
- edge[len] = x - fontSize;
- xMax = edge[len + 1] = x;
+ chars.back().edge = x - fontSize;
+ xMax = edgeEnd = x;
break;
case 1:
- edge[len] = y - fontSize;
- yMax = edge[len + 1] = y;
+ chars.back().edge = y - fontSize;
+ yMax = edgeEnd = y;
break;
case 2:
- edge[len] = x + fontSize;
- xMin = edge[len + 1] = x;
+ chars.back().edge = x + fontSize;
+ xMin = edgeEnd = x;
break;
case 3:
- edge[len] = y + fontSize;
- yMin = edge[len + 1] = y;
+ chars.back().edge = y + fontSize;
+ yMin = edgeEnd = y;
break;
}
} else { // horizontal writing mode
switch (rot) {
case 0:
- edge[len] = x;
- xMax = edge[len + 1] = x + dx;
+ chars.back().edge = x;
+ xMax = edgeEnd = x + dx;
break;
case 1:
- edge[len] = y;
- yMax = edge[len + 1] = y + dy;
+ chars.back().edge = y;
+ yMax = edgeEnd = y + dy;
break;
case 2:
- edge[len] = x;
- xMin = edge[len + 1] = x + dx;
+ chars.back().edge = x;
+ xMin = edgeEnd = x + dx;
break;
case 3:
- edge[len] = y;
- yMin = edge[len + 1] = y + dy;
+ chars.back().edge = y;
+ yMin = edgeEnd = y + dy;
break;
}
}
- ++len;
}
void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y)
@@ -604,19 +583,6 @@ void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y)
}
}
-void TextWord::ensureCapacity(int capacity)
-{
- if (capacity > size) {
- size = std::max(size + 16, capacity);
- text = (Unicode *)greallocn(text, size, sizeof(Unicode));
- charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
- edge = (double *)greallocn(edge, (size + 1), sizeof(double));
- charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
- font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
- textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix));
- }
-}
-
struct CombiningTable
{
Unicode base;
@@ -651,19 +617,15 @@ static Unicode getCombiningChar(Unicode u)
bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
{
- if (len == 0 || wMode != 0 || fontA->getWMode() != 0) {
+ if (chars.empty() || wMode != 0 || fontA->getWMode() != 0) {
return false;
}
Unicode cCurrent = getCombiningChar(u);
- Unicode cPrev = getCombiningChar(text[len - 1]);
- double edgeMid = (edge[len - 1] + edge[len]) / 2;
- double charMid, maxScaledMidDelta, charBase, maxScaledBaseDelta;
-
- if (cCurrent != 0 && unicodeTypeAlphaNum(text[len - 1])) {
+ if (cCurrent != 0 && unicodeTypeAlphaNum(chars.back().text)) {
// Current is a combining character, previous is base character
- maxScaledMidDelta = fabs(edge[len] - edge[len - 1]) * combMaxMidDelta;
- charMid = charBase = maxScaledBaseDelta = 0;
+ double maxScaledMidDelta = fabs(edgeEnd - chars.back().edge) * combMaxMidDelta;
+ double charMid, charBase, maxScaledBaseDelta;
// Test if characters overlap
if (rot == 0 || rot == 2) {
@@ -676,29 +638,24 @@ bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double f
maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
}
+ double edgeMid = (chars.back().edge + edgeEnd) / 2;
if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta) {
return false;
}
// Add character, but don't adjust edge / bounding box because
// combining character's positioning could be odd.
- ensureCapacity(len + 1);
- text[len] = cCurrent;
- charcode[len] = c;
- charPos[len] = charPosA;
- charPos[len + 1] = charPosA + charLen;
- font[len] = fontA;
- textMat[len] = textMatA;
- edge[len + 1] = edge[len];
- edge[len] = (edge[len + 1] + edge[len - 1]) / 2;
- ++len;
+ chars.emplace_back(CharInfo { cCurrent, c, charPosA, edgeMid, fontA, textMatA });
+ charPosEnd = charPosA + charLen;
+
return true;
}
+ Unicode cPrev = getCombiningChar(chars.back().text);
if (cPrev != 0 && unicodeTypeAlphaNum(u)) {
// Previous is a combining character, current is base character
- maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
- charMid = charBase = maxScaledMidDelta = 0;
+ double maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
+ double charMid, charBase, maxScaledMidDelta;
// Test if characters overlap
if (rot == 0 || rot == 2) {
@@ -711,73 +668,71 @@ bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double f
maxScaledMidDelta = fabs(dy * combMaxMidDelta);
}
+ double edgeMid = (chars.back().edge + edgeEnd) / 2;
if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta) {
return false;
}
- // move combining character to after base character
- ensureCapacity(len + 1);
fontSize = fontSizeA;
- text[len] = cPrev;
- charcode[len] = charcode[len - 1];
- charPos[len] = charPosA;
- charPos[len + 1] = charPosA + charLen;
- font[len] = font[len - 1];
- textMat[len] = textMat[len - 1];
-
- text[len - 1] = u;
- charcode[len - 1] = c;
- font[len - 1] = fontA;
- textMat[len - 1] = textMatA;
-
- if (len == 1) {
+ // move combining character to after base character
+ chars.emplace_back(CharInfo { cPrev, chars.back().charcode, charPosA, edgeMid, chars.back().font, chars.back().textMat });
+
+ auto &lastChar = chars[chars.size() - 2];
+
+ charPosEnd = charPosA + charLen;
+ lastChar.text = u;
+ lastChar.charcode = c;
+ lastChar.font = fontA;
+ lastChar.textMat = textMatA;
+
+ if (len() == 2) {
setInitialBounds(fontA, x, y);
}
// Updated edges / bounding box because we changed the base
// character.
if (wMode) {
+ // FIXME unreachable, wMode == 0
switch (rot) {
case 0:
- edge[len - 1] = x - fontSize;
- xMax = edge[len + 1] = x;
+ lastChar.edge = x - fontSize;
+ xMax = edgeEnd = x;
break;
case 1:
- edge[len - 1] = y - fontSize;
- yMax = edge[len + 1] = y;
+ lastChar.edge = y - fontSize;
+ yMax = edgeEnd = y;
break;
case 2:
- edge[len - 1] = x + fontSize;
- xMin = edge[len + 1] = x;
+ lastChar.edge = x + fontSize;
+ xMin = edgeEnd = x;
break;
case 3:
- edge[len - 1] = y + fontSize;
- yMin = edge[len + 1] = y;
+ lastChar.edge = y + fontSize;
+ yMin = edgeEnd = y;
break;
}
} else {
switch (rot) {
case 0:
- edge[len - 1] = x;
- xMax = edge[len + 1] = x + dx;
+ lastChar.edge = x;
+ xMax = edgeEnd = x + dx;
break;
case 1:
- edge[len - 1] = y;
- yMax = edge[len + 1] = y + dy;
+ lastChar.edge = y;
+ yMax = edgeEnd = y + dy;
break;
case 2:
- edge[len - 1] = x;
- xMin = edge[len + 1] = x + dx;
+ lastChar.edge = x;
+ xMin = edgeEnd = x + dx;
break;
case 3:
- edge[len - 1] = y;
- yMin = edge[len + 1] = y + dy;
+ lastChar.edge = y;
+ yMin = edgeEnd = y + dy;
break;
}
}
- edge[len] = (edge[len + 1] + edge[len - 1]) / 2;
- ++len;
+ chars.back().edge = (edgeEnd + lastChar.edge) / 2;
return true;
}
return false;
@@ -785,8 +740,6 @@ bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double f
void TextWord::merge(TextWord *word)
{
- int i;
-
if (word->xMin < xMin) {
xMin = word->xMin;
}
@@ -799,18 +752,9 @@ void TextWord::merge(TextWord *word)
if (word->yMax > yMax) {
yMax = word->yMax;
}
- ensureCapacity(len + word->len);
- for (i = 0; i < word->len; ++i) {
- text[len + i] = word->text[i];
- charcode[len + i] = word->charcode[i];
- edge[len + i] = word->edge[i];
- charPos[len + i] = word->charPos[i];
- font[len + i] = word->font[i];
- textMat[len + i] = word->textMat[i];
- }
- edge[len + word->len] = word->edge[word->len];
- charPos[len + word->len] = word->charPos[word->len];
- len += word->len;
+ chars.insert(chars.end(), word->chars.begin(), word->chars.end());
+ edgeEnd = word->edgeEnd;
+ charPosEnd = word->charPosEnd;
}
inline int TextWord::primaryCmp(const TextWord *word) const
@@ -877,14 +821,13 @@ GooString *TextWord::getText() const
GooString *s;
const UnicodeMap *uMap;
char buf[8];
- int n, i;
s = new GooString();
if (!(uMap = globalParams->getTextEncoding())) {
return s;
}
- for (i = 0; i < len; ++i) {
- n = uMap->mapUnicode(text[i], buf, sizeof(buf));
+ for (size_t i = 0; i < len(); ++i) {
+ auto n = uMap->mapUnicode(chars[i].text, buf, sizeof(buf));
s->append(buf, n);
}
return s;
@@ -892,33 +835,39 @@ GooString *TextWord::getText() const
void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
{
- if (charIdx < 0 || charIdx >= len) {
+ if (charIdx < 0) {
return;
}
+ size_t uCharIdx = charIdx;
+ if (uCharIdx >= len()) {
+ return;
+ }
+ auto startingEdge = chars[uCharIdx].edge;
+ auto endingEdge = (uCharIdx + 1 == len()) ? edgeEnd : chars[charIdx + 1].edge;
switch (rot) {
case 0:
- *xMinA = edge[charIdx];
- *xMaxA = edge[charIdx + 1];
+ *xMinA = startingEdge;
+ *xMaxA = endingEdge;
*yMinA = yMin;
*yMaxA = yMax;
break;
case 1:
*xMinA = xMin;
*xMaxA = xMax;
- *yMinA = edge[charIdx];
- *yMaxA = edge[charIdx + 1];
+ *yMinA = startingEdge;
+ *yMaxA = endingEdge;
break;
case 2:
- *xMinA = edge[charIdx + 1];
- *xMaxA = edge[charIdx];
+ *xMinA = endingEdge;
+ *xMaxA = startingEdge;
*yMinA = yMin;
*yMaxA = yMax;
break;
case 3:
*xMinA = xMin;
*xMaxA = xMax;
- *yMinA = edge[charIdx + 1];
- *yMaxA = edge[charIdx];
+ *yMinA = endingEdge;
+ *yMaxA = startingEdge;
break;
}
}
@@ -1189,21 +1138,19 @@ int TextLine::cmpXY(const void *p1, const void *p2)
void TextLine::coalesce(const UnicodeMap *uMap)
{
- TextWord *word0, *word1;
double space, delta, minSpace;
bool isUnicode;
char buf[8];
- int i, j;
if (words->next) {
// compute the inter-word space threshold
- if (words->len > 1 || words->next->len > 1) {
+ if (words->len() > 1 || words->next->len() > 1) {
minSpace = 0;
} else {
minSpace = words->primaryDelta(words->next);
- for (word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) {
- if (word1->len > 1) {
+ for (auto word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) {
+ if (word1->len() > 1) {
minSpace = 0;
}
delta = word0->primaryDelta(word1);
@@ -1222,15 +1169,17 @@ void TextLine::coalesce(const UnicodeMap *uMap)
}
// merge words
- word0 = words;
- word1 = words->next;
+ auto word0 = words;
+ auto word1 = words->next;
while (word1) {
if (word0->primaryDelta(word1) >= space) {
word0->spaceAfter = true;
word0 = word1;
word1 = word1->next;
- } else if (word0->font[word0->len - 1] == word1->font[0] && word0->underlined == word1->underlined && fabs(word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize
- && word1->charPos[0] == word0->charPos[word0->len]) {
+ } else if (word0->chars.back().font == word1->chars.front().font //
+ && word0->underlined == word1->underlined //
+ && fabs(word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize //
+ && word1->chars.front().charPos == word0->charPosEnd) {
word0->merge(word1);
word0->next = word1->next;
delete word1;
@@ -1245,22 +1194,22 @@ void TextLine::coalesce(const UnicodeMap *uMap)
// build the line text
isUnicode = uMap ? uMap->isUnicode() : false;
len = 0;
- for (word1 = words; word1; word1 = word1->next) {
- len += word1->len;
+ for (auto word1 = words; word1; word1 = word1->next) {
+ len += word1->len();
if (word1->spaceAfter) {
++len;
}
}
text = (Unicode *)gmallocn(len, sizeof(Unicode));
edge = (double *)gmallocn(len + 1, sizeof(double));
- i = 0;
- for (word1 = words; word1; word1 = word1->next) {
- for (j = 0; j < word1->len; ++j) {
- text[i] = word1->text[j];
- edge[i] = word1->edge[j];
+ size_t i = 0;
+ for (auto word1 = words; word1; word1 = word1->next) {
+ for (size_t j = 0; j < word1->len(); ++j) {
+ text[i] = word1->chars[j].text;
+ edge[i] = word1->chars[j].edge;
++i;
}
- edge[i] = word1->edge[word1->len];
+ edge[i] = word1->edgeEnd;
if (word1->spaceAfter) {
text[i] = (Unicode)0x0020;
++i;
@@ -1270,12 +1219,12 @@ void TextLine::coalesce(const UnicodeMap *uMap)
// compute convertedLen and set up the col array
col = (int *)gmallocn(len + 1, sizeof(int));
convertedLen = 0;
- for (i = 0; i < len; ++i) {
- col[i] = convertedLen;
+ for (int ci = 0; ci < len; ++ci) {
+ col[ci] = convertedLen;
if (isUnicode) {
++convertedLen;
} else if (uMap) {
- convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
+ convertedLen += uMap->mapUnicode(text[ci], buf, sizeof(buf));
}
}
col[len] = convertedLen;
@@ -1690,8 +1639,14 @@ void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch)
word1 = nullptr;
word2 = pool->getPool(idx1);
}
+ TextWord *word1 = prevWord->next;
+
+ auto equalText = [](const TextWord &w1, const TextWord &w2) -> bool { //
+ return std::equal(w1.chars.begin(), w1.chars.end(), w2.chars.begin(), w2.chars.end(), //
+ [](auto c1, auto c2) { return c1.text == c2.text; });
+ };
for (; word2; word1 = word2, word2 = word2->next) {
- if (word2->len == word0->len && !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode))) {
+ if (equalText(*word0, *word2)) {
switch (rot) {
case 0:
case 2:
@@ -2713,28 +2668,28 @@ void TextPage::addChar(const GfxState *state, double x, double y, double dx, dou
// character to be in a word by itself at this stage),
// (4) the font size has changed
// (5) the WMode changed
- if (curWord && curWord->len > 0) {
+ if (curWord && curWord->len() > 0) {
base = sp = delta = 0; // make gcc happy
switch (curWord->rot) {
case 0:
base = y1;
sp = x1 - curWord->xMax;
- delta = x1 - curWord->edge[curWord->len - 1];
+ delta = x1 - curWord->chars.back().edge;
break;
case 1:
base = x1;
sp = y1 - curWord->yMax;
- delta = y1 - curWord->edge[curWord->len - 1];
+ delta = y1 - curWord->chars.back().edge;
break;
case 2:
base = y1;
sp = curWord->xMin - x1;
- delta = curWord->edge[curWord->len - 1] - x1;
+ delta = curWord->chars.back().edge - x1;
break;
case 3:
base = x1;
sp = curWord->yMin - y1;
- delta = curWord->edge[curWord->len - 1] - y1;
+ delta = curWord->chars.back().edge - y1;
break;
}
overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize && fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
@@ -2813,7 +2768,7 @@ void TextPage::addWord(TextWord *word)
{
// throw away zero-length words -- they don't have valid xMin/xMax
// values, and they're useless anyway
- if (word->len == 0) {
+ if (word->len() == 0) {
delete word;
return;
}
@@ -3309,10 +3264,10 @@ void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML, double
for (blk = blkList; blk; blk = blk->next) {
for (line = blk->lines; line; line = line->next) {
for (word0 = line->words; word0; word0 = word0->next) {
- for (int i = 0; i < word0->len; ++i) {
- if (unicodeTypeL(word0->text[i])) {
+ for (size_t i = 0; i < word0->len(); ++i) {
+ if (unicodeTypeL(word0->chars[i].text)) {
++lrCount;
- } else if (unicodeTypeR(word0->text[i])) {
+ } else if (unicodeTypeR(word0->chars[i].text)) {
--lrCount;
}
}
@@ -4587,12 +4542,16 @@ GooString *TextSelectionDumper::getText()
spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
+ std::vector<Unicode> uText;
for (i = 0; i < nLines; i++) {
std::vector<TextWordSelection *> *lineWords = lines[i];
for (std::size_t j = 0; j < lineWords->size(); j++) {
TextWordSelection *sel = (*lineWords)[j];
- page->dumpFragment(sel->word->text + sel->begin, sel->end - sel->begin, uMap, text);
+ uText.resize(sel->end - sel->begin);
+ std::transform(sel->word->chars.begin() + sel->begin, sel->word->chars.begin() + sel->end, uText.begin(), [](auto &c) { return c.text; });
+ page->dumpFragment(uText.data(), uText.size(), uMap, text);
+
if (j < lineWords->size() - 1 && sel->word->spaceAfter) {
text->append(space, spaceLen);
}
@@ -4838,36 +4797,39 @@ void TextSelectionPainter::endPage()
out->updateFillColor(state);
+ GooString string;
for (const TextWordSelection *sel : *selectionList) {
int begin = sel->begin;
while (begin < sel->end) {
- TextFontInfo *font = sel->word->font[begin];
- Matrix *mat = &sel->word->textMat[begin];
+ TextFontInfo *font = sel->word->chars[begin].font;
+ const Matrix *mat = &sel->word->chars[begin].textMat;
state->setTextMat(mat->m[0], mat->m[1], mat->m[2], mat->m[3], 0, 0);
state->setFont(font->gfxFont, 1);
out->updateFont(state);
int fEnd = begin + 1;
- while (fEnd < sel->end && font->matches(sel->word->font[fEnd]) && mat->m[0] == sel->word->textMat[fEnd].m[0] && mat->m[1] == sel->word->textMat[fEnd].m[1] && mat->m[2] == sel->word->textMat[fEnd].m[2]
- && mat->m[3] == sel->word->textMat[fEnd].m[3]) {
+ while (fEnd < sel->end && font->matches(sel->word->chars[fEnd].font) //
+ && mat->m[0] == sel->word->chars[fEnd].textMat.m[0] && mat->m[1] == sel->word->chars[fEnd].textMat.m[1] //
+ && mat->m[2] == sel->word->chars[fEnd].textMat.m[2] && mat->m[3] == sel->word->chars[fEnd].textMat.m[3]) {
fEnd++;
}
/* The only purpose of this string is to let the output device query
* it's length. Might want to change this interface later. */
- GooString *string = new GooString((char *)sel->word->charcode, fEnd - begin);
- out->beginString(state, string);
+ string.clear();
+ std::for_each(sel->word->chars.begin() + begin, sel->word->chars.begin() + fEnd, [&string](const auto c) { string.append(c.charcode); });
+ out->beginString(state, &string);
for (int j = begin; j < fEnd; j++) {
- if (j != begin && sel->word->charPos[j] == sel->word->charPos[j - 1]) {
+ const auto &charJ = sel->word->chars[j];
+ if (j != begin && charJ.charPos == sel->word->chars[j - 1].charPos) {
continue;
}
- out->drawChar(state, sel->word->textMat[j].m[4], sel->word->textMat[j].m[5], 0, 0, 0, 0, sel->word->charcode[j], 1, nullptr, 0);
+ out->drawChar(state, charJ.textMat.m[4], charJ.textMat.m[5], 0, 0, 0, 0, charJ.charcode, 1, nullptr, 0);
}
out->endString(state);
- delete string;
begin = fEnd;
}
}
@@ -4878,7 +4840,6 @@ void TextSelectionPainter::endPage()
void TextWord::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
{
- int i, begin, end;
double mid, s1, s2;
if (rot == 0 || rot == 2) {
@@ -4889,10 +4850,14 @@ void TextWord::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle
s2 = selection->y2;
}
- begin = len;
- end = 0;
- for (i = 0; i < len; i++) {
- mid = (edge[i] + edge[i + 1]) / 2;
+ size_t begin = len();
+ size_t end = 0;
+ for (size_t i = 0; i < len(); i++) {
+ if (i + 1 < len()) {
+ mid = (chars[i].edge + chars[i + 1].edge) / 2;
+ } else {
+ mid = (chars[i].edge + edgeEnd) / 2;
+ }
if (XBetweenAB(mid, s1, s2)) {
if (i < begin) {
begin = i;
@@ -5279,7 +5244,6 @@ bool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, do
double xMin0, xMax0, yMin0, yMax0;
double xMin1, xMax1, yMin1, yMax1;
bool first;
- int i, j0, j1;
if (rawOrder) {
return false;
@@ -5291,41 +5255,44 @@ bool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, do
first = true;
xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
- for (i = 0; i < nBlocks; ++i) {
+ for (int i = 0; i < nBlocks; ++i) {
blk = blocks[i];
for (line = blk->lines; line; line = line->next) {
for (word = line->words; word; word = word->next) {
- if (pos < word->charPos[word->len] && pos + length > word->charPos[0]) {
- for (j0 = 0; j0 < word->len && pos >= word->charPos[j0 + 1]; ++j0) {
+ if (pos < word->charPosEnd && pos + length > word->chars.front().charPos) {
+ size_t j0, j1;
+ for (j0 = 0; (j0 + 1) < word->len() && pos >= word->chars[j0 + 1].charPos; ++j0) {
;
}
- for (j1 = word->len - 1; j1 > j0 && pos + length <= word->charPos[j1]; --j1) {
+ for (j1 = word->len(); j1 > j0 && pos + length <= word->chars[j1].charPos; --j1) {
;
}
+ auto startingEdge = word->chars[j0].edge;
+ auto endingEdge = (j1 + 1 == word->len()) ? word->edgeEnd : word->chars[j1 + 1].edge;
switch (line->rot) {
case 0:
- xMin1 = word->edge[j0];
- xMax1 = word->edge[j1 + 1];
+ xMin1 = startingEdge;
+ xMax1 = endingEdge;
yMin1 = word->yMin;
yMax1 = word->yMax;
break;
case 1:
xMin1 = word->xMin;
xMax1 = word->xMax;
- yMin1 = word->edge[j0];
- yMax1 = word->edge[j1 + 1];
+ yMin1 = startingEdge;
+ yMax1 = endingEdge;
break;
case 2:
- xMin1 = word->edge[j1 + 1];
- xMax1 = word->edge[j0];
+ xMin1 = endingEdge;
+ xMax1 = startingEdge;
yMin1 = word->yMin;
yMax1 = word->yMax;
break;
case 3:
xMin1 = word->xMin;
xMax1 = word->xMax;
- yMin1 = word->edge[j1 + 1];
- yMax1 = word->edge[j0];
+ yMin1 = endingEdge;
+ yMax1 = startingEdge;
break;
}
if (first || xMin1 < xMin0) {
@@ -5367,7 +5334,6 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo
TextLineFrag *frag;
char space[8], eol[16], eop[8];
int spaceLen, eolLen, eopLen;
- GooString *s;
double delta;
int col, i, j, d, n;
@@ -5396,11 +5362,16 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo
// output the page in raw (content stream) order
if (rawOrder) {
+ GooString s;
+ std::vector<Unicode> uText;
+
for (word = rawWords; word; word = word->next) {
- s = new GooString();
- dumpFragment(word->text, word->len, uMap, s);
- (*outputFunc)(outputStream, s->c_str(), s->getLength());
- delete s;
+ s.clear();
+ uText.resize(word->len());
+ std::transform(word->chars.begin(), word->chars.end(), uText.begin(), [](auto &c) { return c.text; });
+ dumpFragment(uText.data(), uText.size(), uMap, &s);
+ (*outputFunc)(outputStream, s.c_str(), s.getLength());
+
if (word->next && fabs(word->next->base - word->base) < maxIntraLineDelta * word->fontSize && word->next->xMin > word->xMax - minDupBreakOverlap * word->fontSize) {
if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
(*outputFunc)(outputStream, space, spaceLen);
@@ -5454,6 +5425,7 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo
printf("\n");
#endif
+ GooString s;
// generate output
col = 0;
for (i = 0; i < nFrags; ++i) {
@@ -5465,10 +5437,9 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo
}
// print the line
- s = new GooString();
- col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
- (*outputFunc)(outputStream, s->c_str(), s->getLength());
- delete s;
+ s.clear();
+ col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, &s);
+ (*outputFunc)(outputStream, s.c_str(), s.getLength());
// print one or more returns if necessary
if (i == nFrags - 1 || frags[i + 1].col < col || fabs(frags[i + 1].base - frag->base) > maxIntraLineDelta * frag->line->words->fontSize) {
@@ -5500,10 +5471,9 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayo
if (line->hyphenated && (line->next || blk->next)) {
--n;
}
- s = new GooString();
- dumpFragment(line->text, n, uMap, s);
- (*outputFunc)(outputStream, s->c_str(), s->getLength());
- delete s;
+ GooString s;
+ dumpFragment(line->text, n, uMap, &s);
+ (*outputFunc)(outputStream, s.c_str(), s.getLength());
// output a newline when a hyphen is not suppressed
if (n == line->len) {
(*outputFunc)(outputStream, eol, eolLen);
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index af007fa0..98e1df18 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -171,16 +171,16 @@ public:
void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
// Get the TextFontInfo object associated with a character.
- const TextFontInfo *getFontInfo(int idx) const { return font[idx]; }
+ const TextFontInfo *getFontInfo(int idx) const { return chars[idx].font; }
// Get the next TextWord on the linked list.
const TextWord *getNext() const { return next; }
#ifdef TEXTOUT_WORD_LIST
- int getLength() const { return len; }
- const Unicode *getChar(int idx) const { return &text[idx]; }
+ int getLength() const { return chars.size(); }
+ const Unicode *getChar(int idx) const { return &chars[idx].text; }
GooString *getText() const;
- const GooString *getFontName(int idx) const { return font[idx]->fontName; }
+ const GooString *getFontName(int idx) const { return chars[idx].font->fontName; }
void getColor(double *r, double *g, double *b) const
{
*r = colorR;
@@ -197,19 +197,19 @@ public:
void getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const;
double getFontSize() const { return fontSize; }
int getRotation() const { return rot; }
- int getCharPos() const { return charPos[0]; }
- int getCharLen() const { return charPos[len] - charPos[0]; }
+ int getCharPos() const { return chars.empty() ? 0 : chars.front().charPos; }
+ int getCharLen() const { return chars.empty() ? 0 : charPosEnd - chars.front().charPos; } // span from the first char's start to the end of the last char (charPosEnd), matching the former charPos[len] - charPos[0]
bool getSpaceAfter() const { return spaceAfter; }
#endif
bool isUnderlined() const { return underlined; }
const AnnotLink *getLink() const { return link; }
- double getEdge(int i) const { return edge[i]; }
+ double getEdge(int i) const { return (i >= 0 && static_cast<size_t>(i) < chars.size()) ? chars[i].edge : edgeEnd; } // i == length yields the trailing edge, as the former edge array held len + 1 entries; never indexes past chars
double getBaseline() const { return base; }
bool hasSpaceAfter() const { return spaceAfter; }
const TextWord *nextWord() const { return next; };
+ auto len() const { return chars.size(); }
private:
- void ensureCapacity(int capacity);
void setInitialBounds(TextFontInfo *fontA, double x, double y);
int rot; // rotation, multiple of 90 degrees
@@ -218,18 +218,22 @@ private:
double xMin, xMax; // bounding box x coordinates
double yMin, yMax; // bounding box y coordinates
double base; // baseline x or y coordinate
- Unicode *text; // the text
- CharCode *charcode; // glyph indices
- double *edge; // "near" edge x or y coord of each char
- // (plus one extra entry for the last char)
- int *charPos; // character position (within content stream)
- // of each char (plus one extra entry for
- // the last char)
- int len; // length of text/edge/charPos/font arrays
- int size; // size of text/edge/charPos/font arrays
- TextFontInfo **font; // font information for each char
- Matrix *textMat; // transformation matrix for each char
+
double fontSize; // font size
+
+ struct CharInfo // attributes of one character of the word, kept together in a single record
+ {
+ Unicode text; // the text (Unicode value of this char)
+ CharCode charcode; // glyph index
+ int charPos; // character position (within content stream); the end position of the last char is kept in charPosEnd
+ double edge; // "near" edge x or y coord of this char; the trailing edge of the last char is kept in edgeEnd
+ TextFontInfo *font; // font information for this char
+ Matrix textMat; // transformation matrix for this char
+ };
+ std::vector<CharInfo> chars;
+ int charPosEnd = 0;
+ double edgeEnd = 0;
+
bool spaceAfter; // set if there is a space between this
// word and the next word on the line
bool underlined;