Files
Yajbir Singh f1b860b25c
Some checks failed
check / markdownlint (push) Has been cancelled
check / spellchecker (push) Has been cancelled
updated
2025-12-11 19:03:17 +05:30

6349 lines
164 KiB
C++

//========================================================================
//
// TextOutputDev.cc
//
// Copyright 1997-2014 Glyph & Cog, LLC
//
//========================================================================
#include <aconf.h>
#ifdef USE_GCC_PRAGMAS
#pragma implementation
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <math.h>
#include <limits.h>
#include <ctype.h>
#ifdef _WIN32
#include <fcntl.h> // for O_BINARY
#include <io.h> // for setmode
#endif
#include "gmem.h"
#include "gmempp.h"
#include "GString.h"
#include "GList.h"
#include "gfile.h"
#include "config.h"
#include "Error.h"
#include "GlobalParams.h"
#include "UnicodeMap.h"
#include "UnicodeRemapping.h"
#include "UnicodeTypeTable.h"
#include "GfxState.h"
#include "Link.h"
#include "TextOutputDev.h"
//------------------------------------------------------------------------
// parameters
//------------------------------------------------------------------------
// Size of bins used for horizontal and vertical profiles is
// splitPrecisionMul * minFontSize.
#define splitPrecisionMul 0.05
// Minimum allowed split precision.
#define minSplitPrecision 0.01
// yMin and yMax (or xMin and xMax for rot=1,3) are adjusted by this
// fraction of the text height, to allow for slightly overlapping
// lines (or large ascent/descent values).
#define ascentAdjustFactor 0
#define descentAdjustFactor 0.35
// Gaps larger than max{gap} - splitGapSlack * avgFontSize are
// considered to be equivalent.
#define splitGapSlack 0.2
// The vertical gap threshold (minimum gap required to split
// vertically) depends on the (approximate) number of lines in the
// block:
// threshold = (max + slope * nLines) * avgFontSize
// with a min value of vertGapThresholdMin * avgFontSize.
#define vertGapThresholdMin 0.8
#define vertGapThresholdMax 3
#define vertGapThresholdSlope -0.5
// Vertical gap threshold for table mode.
#define vertGapThresholdTableMin 0.2
#define vertGapThresholdTableMax 0.5
#define vertGapThresholdTableSlope -0.02
// A large character has a font size larger than
// largeCharThreshold * avgFontSize.
#define largeCharThreshold 1.5
// A block will be split vertically only if the resulting chunk
// widths are greater than vertSplitChunkThreshold * avgFontSize.
#define vertSplitChunkThreshold 2
// Max difference in primary,secondary coordinates (as a fraction of
// the font size) allowed for duplicated text (fake boldface, drop
// shadows) which is to be discarded.
#define dupMaxPriDelta 0.1
#define dupMaxSecDelta 0.2
// Inter-character spacing that varies by less than this multiple of
// font size is assumed to be equivalent.
#define uniformSpacing 0.07
#define tableModeUniformSpacing 0.14
// Typical word spacing, as a fraction of font size. This will be
// added to the minimum inter-character spacing, to account for wide
// character spacing.
#define wordSpacing 0.1
#define tableModeWordSpacing 0.2
// Minimum paragraph indent from left margin, as a fraction of font
// size.
#define minParagraphIndent 0.5
// If the space between two lines is greater than
// paragraphSpacingThreshold * avgLineSpacing, start a new paragraph.
#define paragraphSpacingThreshold 1.25
// If font size changes by at least this much (measured in points)
// between lines, start a new paragraph.
#define paragraphFontSizeDelta 1
// Spaces at the start of a line in physical layout mode are this wide
// (as a multiple of font size).
#define physLayoutSpaceWidth 0.33
// In simple layout mode, lines are broken at gaps larger than this
// value multiplied by font size.
#define simpleLayoutGapThreshold 0.7
// Minimum overlap in simple2 mode.
#define simple2MinOverlap 0.2
// Table cells (TextColumns) are allowed to overlap by this much
// in table layout mode (as a fraction of cell width or height).
#define tableCellOverlapSlack 0.05
// Primary axis delta which will cause a line break in raw mode
// (as a fraction of font size).
#define rawModeLineDelta 0.5
// Secondary axis delta which will cause a word break in raw mode
// (as a fraction of font size).
#define rawModeWordSpacing 0.15
// Secondary axis overlap which will cause a line break in raw mode
// (as a fraction of font size).
#define rawModeCharOverlap 0.2
// Max spacing (as a multiple of font size) allowed between the end of
// a line and a clipped character to be included in that line.
#define clippedTextMaxWordSpace 0.5
// Max width of underlines (in points).
#define maxUnderlineWidth 3
// Max horizontal distance between edge of word and start of underline
// (as a fraction of font size).
#define underlineSlack 0.2
// Max vertical distance between baseline of word and start of
// underline (as a fraction of font size).
#define underlineBaselineSlack 0.2
// Max distance between edge of text and edge of link border (as a
// fraction of font size).
#define hyperlinkSlack 0.2
// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
// (Or 1/tan(angle) for 90/270 degrees.)
#define diagonalThreshold 0.1
// This value is used as the ascent when computing selection
// rectangles, in order to work around flakey ascent values in fonts.
#define selectionAscent 0.8
// Grid size used to bin sort characters for overlap detection.
#define overlapGridWidth 20
#define overlapGridHeight 20
// Minimum character bbox overlap (horizontal and vertical) as a
// fraction of character bbox width/height for a character to be
// treated as overlapping.
#define minCharOverlap 0.3
#define maxUnicodeLen 16
//------------------------------------------------------------------------
static inline double dmin(double x, double y) {
return x < y ? x : y;
}
static inline double dmax(double x, double y) {
return x > y ? x : y;
}
//------------------------------------------------------------------------
// TextChar
//------------------------------------------------------------------------
class TextChar {
public:
TextChar(Unicode cA, int charPosA, int charLenA,
double xMinA, double yMinA, double xMaxA, double yMaxA,
int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA,
TextFontInfo *fontA, double fontSizeA,
double colorRA, double colorGA, double colorBA);
static int cmpX(const void *p1, const void *p2);
static int cmpY(const void *p1, const void *p2);
static int cmpCharPos(const void *p1, const void *p2);
Unicode c;
int charPos;
int charLen;
double xMin, yMin, xMax, yMax;
TextFontInfo *font;
double fontSize;
double colorR,
colorG,
colorB;
// group the byte-size fields to minimize object size
Guchar rot;
char rotated;
char clipped;
char invisible;
char spaceAfter;
char overlap;
};
TextChar::TextChar(Unicode cA, int charPosA, int charLenA,
double xMinA, double yMinA, double xMaxA, double yMaxA,
int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA,
TextFontInfo *fontA, double fontSizeA,
double colorRA, double colorGA, double colorBA) {
double t;
c = cA;
charPos = charPosA;
charLen = charLenA;
xMin = xMinA;
yMin = yMinA;
xMax = xMaxA;
yMax = yMaxA;
// this can happen with vertical writing mode, or with odd values
// for the char/word spacing parameters
if (xMin > xMax) {
t = xMin; xMin = xMax; xMax = t;
}
if (yMin > yMax) {
t = yMin; yMin = yMax; yMax = t;
}
// TextPage::findGaps uses integer coordinates, so clip the char
// bbox to fit in a 32-bit int (this is generally only a problem in
// damaged PDF files)
if (xMin < -1e8) {
xMin = -1e8;
}
if (xMax > 1e8) {
xMax = 1e8;
}
if (yMin < -1e8) {
yMin = -1e8;
}
if (yMax > 1e8) {
yMax = 1e8;
}
rot = (Guchar)rotA;
rotated = (char)rotatedA;
clipped = (char)clippedA;
invisible = (char)invisibleA;
spaceAfter = (char)gFalse;
font = fontA;
fontSize = fontSizeA;
colorR = colorRA;
colorG = colorGA;
colorB = colorBA;
overlap = gFalse;
}
int TextChar::cmpX(const void *p1, const void *p2) {
const TextChar *ch1 = *(const TextChar **)p1;
const TextChar *ch2 = *(const TextChar **)p2;
if (ch1->xMin < ch2->xMin) {
return -1;
} else if (ch1->xMin > ch2->xMin) {
return 1;
} else {
return ch1->charPos - ch2->charPos;
}
}
int TextChar::cmpY(const void *p1, const void *p2) {
const TextChar *ch1 = *(const TextChar **)p1;
const TextChar *ch2 = *(const TextChar **)p2;
if (ch1->yMin < ch2->yMin) {
return -1;
} else if (ch1->yMin > ch2->yMin) {
return 1;
} else {
return ch1->charPos - ch2->charPos;
}
}
int TextChar::cmpCharPos(const void *p1, const void *p2) {
const TextChar *ch1 = *(const TextChar **)p1;
const TextChar *ch2 = *(const TextChar **)p2;
return ch1->charPos - ch2->charPos;
}
//------------------------------------------------------------------------
// TextBlock
//------------------------------------------------------------------------
enum TextBlockType {
blkVertSplit,
blkHorizSplit,
blkLeaf
};
enum TextBlockTag {
blkTagMulticolumn,
blkTagColumn,
blkTagSuperLine,
blkTagLine
};
class TextBlock {
public:
TextBlock(TextBlockType typeA, int rotA);
~TextBlock();
void addChild(TextBlock *child);
void addChild(TextChar *child, GBool updateBox);
void prependChild(TextChar *child);
void updateBounds(int childIdx);
TextBlockType type;
TextBlockTag tag;
int rot;
double xMin, yMin, xMax, yMax;
GBool smallSplit; // true for blkVertSplit/blkHorizSplit
// where the gap size is small
GList *children; // for blkLeaf, children are TextWord;
// for others, children are TextBlock
};
TextBlock::TextBlock(TextBlockType typeA, int rotA) {
type = typeA;
tag = blkTagMulticolumn;
rot = rotA;
xMin = yMin = xMax = yMax = 0;
smallSplit = gFalse;
children = new GList();
}
TextBlock::~TextBlock() {
if (type == blkLeaf) {
delete children;
} else {
deleteGList(children, TextBlock);
}
}
void TextBlock::addChild(TextBlock *child) {
if (children->getLength() == 0) {
xMin = child->xMin;
yMin = child->yMin;
xMax = child->xMax;
yMax = child->yMax;
} else {
if (child->xMin < xMin) {
xMin = child->xMin;
}
if (child->yMin < yMin) {
yMin = child->yMin;
}
if (child->xMax > xMax) {
xMax = child->xMax;
}
if (child->yMax > yMax) {
yMax = child->yMax;
}
}
children->append(child);
}
void TextBlock::addChild(TextChar *child, GBool updateBox) {
if (updateBox) {
if (children->getLength() == 0) {
xMin = child->xMin;
yMin = child->yMin;
xMax = child->xMax;
yMax = child->yMax;
} else {
if (child->xMin < xMin) {
xMin = child->xMin;
}
if (child->yMin < yMin) {
yMin = child->yMin;
}
if (child->xMax > xMax) {
xMax = child->xMax;
}
if (child->yMax > yMax) {
yMax = child->yMax;
}
}
}
children->append(child);
}
void TextBlock::prependChild(TextChar *child) {
if (children->getLength() == 0) {
xMin = child->xMin;
yMin = child->yMin;
xMax = child->xMax;
yMax = child->yMax;
} else {
if (child->xMin < xMin) {
xMin = child->xMin;
}
if (child->yMin < yMin) {
yMin = child->yMin;
}
if (child->xMax > xMax) {
xMax = child->xMax;
}
if (child->yMax > yMax) {
yMax = child->yMax;
}
}
children->insert(0, child);
}
void TextBlock::updateBounds(int childIdx) {
TextBlock *child;
child = (TextBlock *)children->get(childIdx);
if (child->xMin < xMin) {
xMin = child->xMin;
}
if (child->yMin < yMin) {
yMin = child->yMin;
}
if (child->xMax > xMax) {
xMax = child->xMax;
}
if (child->yMax > yMax) {
yMax = child->yMax;
}
}
//------------------------------------------------------------------------
// TextCharLine
//------------------------------------------------------------------------
class TextCharLine {
public:
TextCharLine(int rotA);
~TextCharLine();
void add(TextChar *ch);
GList *chars;
double yMin, yMax;
int rot;
TextCharLine *next, *prev;
};
TextCharLine::TextCharLine(int rotA) {
chars = new GList();
yMin = yMax = 0;
rot = rotA;
next = prev = NULL;
}
TextCharLine::~TextCharLine() {
delete chars;
}
void TextCharLine::add(TextChar *ch) {
chars->append(ch);
yMin = ch->yMin;
yMax = ch->yMax;
}
//------------------------------------------------------------------------
// TextGaps
//------------------------------------------------------------------------
struct TextGap {
double x; // center of gap: x for vertical gaps,
// y for horizontal gaps
double w; // width/height of gap
};
class TextGaps {
public:
TextGaps();
~TextGaps();
void addGap(double x, double w);
int getLength() { return length; }
double getX(int idx) { return gaps[idx].x; }
double getW(int idx) { return gaps[idx].w; }
private:
int length;
int size;
TextGap *gaps;
};
TextGaps::TextGaps() {
length = 0;
size = 16;
gaps = (TextGap *)gmallocn(size, sizeof(TextGap));
}
TextGaps::~TextGaps() {
gfree(gaps);
}
void TextGaps::addGap(double x, double w) {
if (length == size) {
size *= 2;
gaps = (TextGap *)greallocn(gaps, size, sizeof(TextGap));
}
gaps[length].x = x;
gaps[length].w = w;
++length;
}
//------------------------------------------------------------------------
// TextSuperLine
//------------------------------------------------------------------------
class TextSuperLine {
public:
TextSuperLine(GList *linesA);
~TextSuperLine();
GList *lines; // [TextLine]
double yMin, yMax;
double fontSize;
};
TextSuperLine::TextSuperLine(GList *linesA) {
TextLine *line;
int i;
lines = linesA;
yMin = yMax = 0;
fontSize = ((TextLine *)lines->get(0))->fontSize;
for (i = 0; i < lines->getLength(); ++i) {
line = (TextLine *)lines->get(i);
if (i == 0 || line->yMin < yMin) {
yMin = line->yMin;
}
if (i == 0 || line->yMax > yMax) {
yMax = line->yMax;
}
}
}
TextSuperLine::~TextSuperLine() {
deleteGList(lines, TextLine);
}
//------------------------------------------------------------------------
// TextUnderline
//------------------------------------------------------------------------
class TextUnderline {
public:
TextUnderline(double x0A, double y0A, double x1A, double y1A)
{ x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
~TextUnderline() {}
double x0, y0, x1, y1;
GBool horiz;
};
//------------------------------------------------------------------------
// TextLink
//------------------------------------------------------------------------
class TextLink {
public:
TextLink(double xMinA, double yMinA, double xMaxA, double yMaxA,
GString *uriA)
{ xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; uri = uriA; }
~TextLink();
double xMin, yMin, xMax, yMax;
GString *uri;
};
TextLink::~TextLink() {
if (uri) {
delete uri;
}
}
//------------------------------------------------------------------------
// TextOutputControl
//------------------------------------------------------------------------
TextOutputControl::TextOutputControl() {
mode = textOutReadingOrder;
fixedPitch = 0;
fixedLineSpacing = 0;
html = gFalse;
clipText = gFalse;
discardDiagonalText = gFalse;
discardRotatedText = gFalse;
discardInvisibleText = gFalse;
discardClippedText = gFalse;
splitRotatedWords = gFalse;
overlapHandling = textOutIgnoreOverlaps;
separateLargeChars = gTrue;
insertBOM = gFalse;
marginLeft = 0;
marginRight = 0;
marginTop = 0;
marginBottom = 0;
}
//------------------------------------------------------------------------
// TextFontInfo
//------------------------------------------------------------------------
TextFontInfo::TextFontInfo(GfxState *state) {
GfxFont *gfxFont;
gfxFont = state->getFont();
if (gfxFont) {
fontID = *gfxFont->getID();
ascent = gfxFont->getAscent();
descent = gfxFont->getDescent();
// "odd" ascent/descent values cause trouble more often than not
// (in theory these could be legitimate values for oddly designed
// fonts -- but they are more often due to buggy PDF generators)
// (values that are too small are a different issue -- those seem
// to be more commonly legitimate)
if (ascent > 1) {
ascent = 0.75;
}
if (descent < -0.5) {
descent = -0.25;
}
} else {
fontID.num = -1;
fontID.gen = -1;
ascent = 0.75;
descent = -0.25;
}
fontName = (gfxFont && gfxFont->getName()) ? gfxFont->getName()->copy()
: (GString *)NULL;
flags = gfxFont ? gfxFont->getFlags() : 0;
mWidth = 0;
if (gfxFont && !gfxFont->isCIDFont()) {
char *name;
int code;
for (code = 0; code < 256; ++code) {
if ((name = ((Gfx8BitFont *)gfxFont)->getCharName(code)) &&
name[0] == 'm' && name[1] == '\0') {
mWidth = ((Gfx8BitFont *)gfxFont)->getWidth((Guchar)code);
break;
}
}
}
}
TextFontInfo::~TextFontInfo() {
if (fontName) {
delete fontName;
}
}
GBool TextFontInfo::matches(GfxState *state) {
Ref id;
if (state->getFont()) {
id = *state->getFont()->getID();
} else {
id.num = -1;
id.gen = -1;
}
return id.num == fontID.num && id.gen == fontID.gen;
}
//------------------------------------------------------------------------
// TextWord
//------------------------------------------------------------------------
// Build a TextWord object, using chars[start .. start+len-1].
// (If rot >= 2, the chars list is in reverse order.)
TextWord::TextWord(GList *chars, int start, int lenA,
int rotA, GBool rotatedA, int dirA, GBool spaceAfterA) {
TextChar *ch;
int i;
rot = (char)rotA;
rotated = (char)rotatedA;
len = lenA;
text = (Unicode *)gmallocn(len, sizeof(Unicode));
edge = (double *)gmallocn(len + 1, sizeof(double));
charPos = (int *)gmallocn(len + 1, sizeof(int));
if (rot & 1) {
ch = (TextChar *)chars->get(start);
xMin = ch->xMin;
xMax = ch->xMax;
yMin = ch->yMin;
ch = (TextChar *)chars->get(start + len - 1);
yMax = ch->yMax;
} else {
ch = (TextChar *)chars->get(start);
xMin = ch->xMin;
yMin = ch->yMin;
yMax = ch->yMax;
ch = (TextChar *)chars->get(start + len - 1);
xMax = ch->xMax;
}
for (i = 0; i < len; ++i) {
ch = (TextChar *)chars->get(rot >= 2 ? start + len - 1 - i : start + i);
text[i] = ch->c;
charPos[i] = ch->charPos;
if (i == len - 1) {
charPos[len] = ch->charPos + ch->charLen;
}
switch (rot) {
case 0:
default:
edge[i] = ch->xMin;
if (i == len - 1) {
edge[len] = ch->xMax;
}
break;
case 1:
edge[i] = ch->yMin;
if (i == len - 1) {
edge[len] = ch->yMax;
}
break;
case 2:
edge[i] = ch->xMax;
if (i == len - 1) {
edge[len] = ch->xMin;
}
break;
case 3:
edge[i] = ch->yMax;
if (i == len - 1) {
edge[len] = ch->yMin;
}
break;
}
}
ch = (TextChar *)chars->get(start);
font = ch->font;
fontSize = ch->fontSize;
dir = (char)dirA;
spaceAfter = (char)spaceAfterA;
underlined = gFalse;
link = NULL;
colorR = ch->colorR;
colorG = ch->colorG;
colorB = ch->colorB;
invisible = ch->invisible;
}
TextWord::TextWord(TextWord *word) {
*this = *word;
text = (Unicode *)gmallocn(len, sizeof(Unicode));
memcpy(text, word->text, len * sizeof(Unicode));
edge = (double *)gmallocn(len + 1, sizeof(double));
memcpy(edge, word->edge, (len + 1) * sizeof(double));
charPos = (int *)gmallocn(len + 1, sizeof(int));
memcpy(charPos, word->charPos, (len + 1) * sizeof(int));
}
TextWord::~TextWord() {
gfree(text);
gfree(edge);
gfree(charPos);
}
int TextWord::cmpYX(const void *p1, const void *p2) {
const TextWord *word1 = *(const TextWord **)p1;
const TextWord *word2 = *(const TextWord **)p2;
double cmp;
if ((cmp = word1->yMin - word2->yMin) == 0) {
cmp = word1->xMin - word2->xMin;
}
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
int TextWord::cmpCharPos(const void *p1, const void *p2) {
const TextWord *word1 = *(const TextWord **)p1;
const TextWord *word2 = *(const TextWord **)p2;
return word1->charPos[0] - word2->charPos[0];
}
GString *TextWord::getText() {
GString *s;
UnicodeMap *uMap;
char buf[8];
int n, i;
s = new GString();
if (!(uMap = globalParams->getTextEncoding())) {
return s;
}
for (i = 0; i < len; ++i) {
n = uMap->mapUnicode(text[i], buf, sizeof(buf));
s->append(buf, n);
}
uMap->decRefCnt();
return s;
}
void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
double *xMaxA, double *yMaxA) {
if (charIdx < 0 || charIdx >= len) {
return;
}
switch (rot) {
case 0:
*xMinA = edge[charIdx];
*xMaxA = edge[charIdx + 1];
*yMinA = yMin;
*yMaxA = yMax;
break;
case 1:
*xMinA = xMin;
*xMaxA = xMax;
*yMinA = edge[charIdx];
*yMaxA = edge[charIdx + 1];
break;
case 2:
*xMinA = edge[charIdx + 1];
*xMaxA = edge[charIdx];
*yMinA = yMin;
*yMaxA = yMax;
break;
case 3:
*xMinA = xMin;
*xMaxA = xMax;
*yMinA = edge[charIdx + 1];
*yMaxA = edge[charIdx];
break;
}
}
double TextWord::getBaseline() {
switch (rot) {
case 0:
default:
return yMax + fontSize * font->descent;
case 1:
return xMin - fontSize * font->descent;
case 2:
return yMin - fontSize * font->descent;
case 3:
return xMax + fontSize * font->descent;
}
}
GString *TextWord::getLinkURI() {
return link ? link->uri : (GString *)NULL;
}
//------------------------------------------------------------------------
// TextLine
//------------------------------------------------------------------------
TextLine::TextLine(GList *wordsA, double xMinA, double yMinA,
double xMaxA, double yMaxA, double fontSizeA) {
TextWord *word;
int i, j, k;
words = wordsA;
rot = 0;
xMin = xMinA;
yMin = yMinA;
xMax = xMaxA;
yMax = yMaxA;
fontSize = fontSizeA;
px = 0;
pw = 0;
// build the text
len = 0;
for (i = 0; i < words->getLength(); ++i) {
word = (TextWord *)words->get(i);
len += word->len;
if (word->spaceAfter) {
++len;
}
}
text = (Unicode *)gmallocn(len, sizeof(Unicode));
edge = (double *)gmallocn(len + 1, sizeof(double));
j = 0;
for (i = 0; i < words->getLength(); ++i) {
word = (TextWord *)words->get(i);
if (i == 0) {
rot = word->rot;
}
for (k = 0; k < word->len; ++k) {
text[j] = word->text[k];
edge[j] = word->edge[k];
++j;
}
edge[j] = word->edge[word->len];
if (word->spaceAfter) {
text[j] = (Unicode)0x0020;
++j;
edge[j] = edge[j - 1];
}
}
//~ need to check for other Unicode chars used as hyphens
hyphenated = text[len - 1] == (Unicode)'-';
}
TextLine::~TextLine() {
deleteGList(words, TextWord);
gfree(text);
gfree(edge);
}
double TextLine::getBaseline() {
TextWord *word0;
word0 = (TextWord *)words->get(0);
switch (rot) {
case 0:
default:
return yMax + fontSize * word0->font->descent;
case 1:
return xMin - fontSize * word0->font->descent;
case 2:
return yMin - fontSize * word0->font->descent;
case 3:
return xMax + fontSize * word0->font->descent;
}
}
int TextLine::cmpX(const void *p1, const void *p2) {
const TextLine *line1 = *(const TextLine **)p1;
const TextLine *line2 = *(const TextLine **)p2;
if (line1->xMin < line2->xMin) {
return -1;
} else if (line1->xMin > line2->xMin) {
return 1;
} else {
return 0;
}
}
//------------------------------------------------------------------------
// TextParagraph
//------------------------------------------------------------------------
TextParagraph::TextParagraph(GList *linesA, GBool dropCapA) {
TextLine *line;
int i;
lines = linesA;
dropCap = dropCapA;
xMin = yMin = xMax = yMax = 0;
for (i = 0; i < lines->getLength(); ++i) {
line = (TextLine *)lines->get(i);
if (i == 0 || line->xMin < xMin) {
xMin = line->xMin;
}
if (i == 0 || line->yMin < yMin) {
yMin = line->yMin;
}
if (i == 0 || line->xMax > xMax) {
xMax = line->xMax;
}
if (i == 0 || line->yMax > yMax) {
yMax = line->yMax;
}
}
}
TextParagraph::~TextParagraph() {
deleteGList(lines, TextLine);
}
//------------------------------------------------------------------------
// TextColumn
//------------------------------------------------------------------------
TextColumn::TextColumn(GList *paragraphsA, double xMinA, double yMinA,
double xMaxA, double yMaxA) {
paragraphs = paragraphsA;
xMin = xMinA;
yMin = yMinA;
xMax = xMaxA;
yMax = yMaxA;
px = py = 0;
pw = ph = 0;
}
TextColumn::~TextColumn() {
deleteGList(paragraphs, TextParagraph);
}
int TextColumn::getRotation() {
TextParagraph *par;
TextLine *line;
par = (TextParagraph *)paragraphs->get(0);
line = (TextLine *)par->getLines()->get(0);
return line->getRotation();
}
int TextColumn::cmpX(const void *p1, const void *p2) {
const TextColumn *col1 = *(const TextColumn **)p1;
const TextColumn *col2 = *(const TextColumn **)p2;
if (col1->xMin < col2->xMin) {
return -1;
} else if (col1->xMin > col2->xMin) {
return 1;
} else {
return 0;
}
}
int TextColumn::cmpY(const void *p1, const void *p2) {
const TextColumn *col1 = *(const TextColumn **)p1;
const TextColumn *col2 = *(const TextColumn **)p2;
if (col1->yMin < col2->yMin) {
return -1;
} else if (col1->yMin > col2->yMin) {
return 1;
} else {
return 0;
}
}
int TextColumn::cmpPX(const void *p1, const void *p2) {
const TextColumn *col1 = *(const TextColumn **)p1;
const TextColumn *col2 = *(const TextColumn **)p2;
if (col1->px < col2->px) {
return -1;
} else if (col1->px > col2->px) {
return 1;
} else {
return 0;
}
}
//------------------------------------------------------------------------
// TextWordList
//------------------------------------------------------------------------
TextWordList::TextWordList(GList *wordsA, GBool primaryLRA) {
words = wordsA;
primaryLR = primaryLRA;
}
TextWordList::~TextWordList() {
deleteGList(words, TextWord);
}
int TextWordList::getLength() {
return words->getLength();
}
TextWord *TextWordList::get(int idx) {
if (idx < 0 || idx >= words->getLength()) {
return NULL;
}
return (TextWord *)words->get(idx);
}
//------------------------------------------------------------------------
// TextPosition
//------------------------------------------------------------------------
int TextPosition::operator==(TextPosition pos) {
return colIdx == pos.colIdx &&
parIdx == pos.parIdx &&
lineIdx == pos.lineIdx &&
charIdx == pos.charIdx;
}
int TextPosition::operator!=(TextPosition pos) {
return colIdx != pos.colIdx ||
parIdx != pos.parIdx ||
lineIdx != pos.lineIdx ||
charIdx != pos.charIdx;
}
int TextPosition::operator<(TextPosition pos) {
return colIdx < pos.colIdx ||
(colIdx == pos.colIdx &&
(parIdx < pos.parIdx ||
(parIdx == pos.parIdx &&
(lineIdx < pos.lineIdx ||
(lineIdx == pos.lineIdx &&
charIdx < pos.charIdx)))));
}
int TextPosition::operator>(TextPosition pos) {
return colIdx > pos.colIdx ||
(colIdx == pos.colIdx &&
(parIdx > pos.parIdx ||
(parIdx == pos.parIdx &&
(lineIdx > pos.lineIdx ||
(lineIdx == pos.lineIdx &&
charIdx > pos.charIdx)))));
}
//------------------------------------------------------------------------
// TextPage
//------------------------------------------------------------------------
TextPage::TextPage(TextOutputControl *controlA) {
control = *controlA;
remapping = globalParams->getUnicodeRemapping();
uBufSize = 16;
uBuf = (Unicode *)gmallocn(uBufSize, sizeof(Unicode));
pageWidth = pageHeight = 0;
charPos = 0;
curFont = NULL;
curFontSize = 0;
curRot = 0;
diagonal = gFalse;
rotated = gFalse;
nTinyChars = 0;
actualText = NULL;
actualTextLen = 0;
actualTextX0 = 0;
actualTextY0 = 0;
actualTextX1 = 0;
actualTextY1 = 0;
actualTextNBytes = 0;
chars = new GList();
fonts = new GList();
underlines = new GList();
links = new GList();
findCols = NULL;
lastFindXMin = lastFindYMin = 0;
haveLastFind = gFalse;
problematic = gFalse;
}
TextPage::~TextPage() {
clear();
deleteGList(chars, TextChar);
deleteGList(fonts, TextFontInfo);
deleteGList(underlines, TextUnderline);
deleteGList(links, TextLink);
if (findCols) {
deleteGList(findCols, TextColumn);
}
gfree(uBuf);
}
void TextPage::startPage(GfxState *state) {
clear();
if (state) {
pageWidth = state->getPageWidth();
pageHeight = state->getPageHeight();
} else {
pageWidth = pageHeight = 0;
}
}
void TextPage::clear() {
pageWidth = pageHeight = 0;
charPos = 0;
curFont = NULL;
curFontSize = 0;
curRot = 0;
diagonal = gFalse;
rotated = gFalse;
nTinyChars = 0;
gfree(actualText);
actualText = NULL;
actualTextLen = 0;
actualTextNBytes = 0;
deleteGList(chars, TextChar);
chars = new GList();
deleteGList(fonts, TextFontInfo);
fonts = new GList();
deleteGList(underlines, TextUnderline);
underlines = new GList();
deleteGList(links, TextLink);
links = new GList();
if (findCols) {
deleteGList(findCols, TextColumn);
findCols = NULL;
}
lastFindXMin = lastFindYMin = 0;
haveLastFind = gFalse;
problematic = gFalse;
}
void TextPage::updateFont(GfxState *state) {
GfxFont *gfxFont;
double *fm;
char *name;
int code, mCode, letterCode, anyCode;
double w;
double m[4], m2[4];
int i;
// get the font info object
curFont = NULL;
for (i = 0; i < fonts->getLength(); ++i) {
curFont = (TextFontInfo *)fonts->get(i);
if (curFont->matches(state)) {
break;
}
curFont = NULL;
}
if (!curFont) {
curFont = new TextFontInfo(state);
fonts->append(curFont);
if (state->getFont() && state->getFont()->problematicForUnicode()) {
problematic = gTrue;
}
}
// adjust the font size
gfxFont = state->getFont();
curFontSize = state->getTransformedFontSize();
if (gfxFont && gfxFont->getType() == fontType3) {
// This is a hack which makes it possible to deal with some Type 3
// fonts. The problem is that it's impossible to know what the
// base coordinate system used in the font is without actually
// rendering the font. This code tries to guess by looking at the
// width of the character 'm' (which breaks if the font is a
// subset that doesn't contain 'm').
mCode = letterCode = anyCode = -1;
for (code = 0; code < 256; ++code) {
name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
if (name && name[0] == 'm' && name[1] == '\0') {
mCode = code;
}
if (letterCode < 0 &&
name &&
((name[0] >= 'A' && name[0] <= 'Z') ||
(name[0] >= 'a' && name[0] <= 'z')) &&
name[1] == '\0') {
letterCode = code;
}
if (anyCode < 0 && name &&
((Gfx8BitFont *)gfxFont)->getWidth((Guchar)code) > 0) {
anyCode = code;
}
}
if (mCode >= 0 &&
(w = ((Gfx8BitFont *)gfxFont)->getWidth((Guchar)mCode)) > 0) {
// 0.6 is a generic average 'm' width -- yes, this is a hack
curFontSize *= w / 0.6;
} else if (letterCode >= 0 &&
(w = ((Gfx8BitFont *)gfxFont)->getWidth((Guchar)letterCode))
> 0) {
// even more of a hack: 0.5 is a generic letter width
curFontSize *= w / 0.5;
} else if (anyCode >= 0 &&
(w = ((Gfx8BitFont *)gfxFont)->getWidth((Guchar)anyCode)) > 0) {
// better than nothing: 0.5 is a generic character width
curFontSize *= w / 0.5;
}
fm = gfxFont->getFontMatrix();
if (fm[0] != 0) {
curFontSize *= fabs(fm[3] / fm[0]);
}
}
// compute the rotation
state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
if (gfxFont && gfxFont->getType() == fontType3) {
fm = gfxFont->getFontMatrix();
m2[0] = fm[0] * m[0] + fm[1] * m[2];
m2[1] = fm[0] * m[1] + fm[1] * m[3];
m2[2] = fm[2] * m[0] + fm[3] * m[2];
m2[3] = fm[2] * m[1] + fm[3] * m[3];
m[0] = m2[0];
m[1] = m2[1];
m[2] = m2[2];
m[3] = m2[3];
}
if (curFontSize == 0) {
// special case - if the font size is zero, just assume plain
// horizontal text
curRot = 0;
diagonal = gFalse;
} else if (fabs(m[0]) >= fabs(m[1])) {
if (m[0] > 0) {
curRot = 0;
} else {
curRot = 2;
}
diagonal = fabs(m[1]) > diagonalThreshold * fabs(m[0]);
} else {
if (m[1] > 0) {
curRot = 1;
} else {
curRot = 3;
}
diagonal = fabs(m[0]) > diagonalThreshold * fabs(m[1]);
}
// this matches the 'horiz' test in SplashOutputDev::drawChar()
rotated = !(m[0] > 0 && fabs(m[1]) < 0.001 &&
fabs(m[2]) < 0.001 && m[3] < 0);
}
void TextPage::addChar(GfxState *state, double x, double y,
double dx, double dy,
CharCode c, int nBytes, Unicode *u, int uLen) {
double x1, y1, x2, y2, w1, h1, dx2, dy2, ascent, descent, sp;
double xMin, yMin, xMax, yMax, xMid, yMid;
double clipXMin, clipYMin, clipXMax, clipYMax;
GfxRGB rgb;
double alpha;
GBool clipped, rtl;
int uBufLen, i, j;
// if we're in an ActualText span, save the position info (the
// ActualText chars will be added by TextPage::endActualText()).
if (actualText) {
if (!actualTextNBytes) {
actualTextX0 = x;
actualTextY0 = y;
}
actualTextX1 = x + dx;
actualTextY1 = y + dy;
actualTextNBytes += nBytes;
return;
}
// throw away diagonal/rotated chars
if ((control.discardDiagonalText && diagonal) ||
(control.discardRotatedText && rotated)) {
charPos += nBytes;
return;
}
// subtract char and word spacing from the dx,dy values
sp = state->getCharSpace();
if (c == (CharCode)0x20) {
sp += state->getWordSpace();
}
state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
dx -= dx2;
dy -= dy2;
state->transformDelta(dx, dy, &w1, &h1);
// throw away chars that aren't inside the page bounds
// (and also do a sanity check on the character size)
state->transform(x, y, &x1, &y1);
if (x1 + w1 < control.marginLeft ||
x1 > pageWidth - control.marginRight ||
y1 + h1 < control.marginTop ||
y1 > pageHeight - control.marginBottom ||
w1 > pageWidth ||
h1 > pageHeight) {
charPos += nBytes;
return;
}
// check the tiny chars limit
if (!globalParams->getTextKeepTinyChars() &&
fabs(w1) < 3 && fabs(h1) < 3) {
if (++nTinyChars > 50000) {
charPos += nBytes;
return;
}
}
// skip space, tab, and non-breaking space characters
// (ActualText spans can result in multiple space chars)
for (i = 0; i < uLen; ++i) {
if (u[i] != (Unicode)0x20 &&
u[i] != (Unicode)0x09 &&
u[i] != (Unicode)0xa0) {
break;
}
}
if (i == uLen && uLen >= 1) {
charPos += nBytes;
if (chars->getLength() > 0) {
((TextChar *)chars->get(chars->getLength() - 1))->spaceAfter =
(char)gTrue;
}
return;
}
// remap Unicode
uBufLen = 0;
for (i = 0; i < uLen; ++i) {
if (uBufSize - uBufLen < 8 && uBufSize < 20000) {
uBufSize *= 2;
uBuf = (Unicode *)greallocn(uBuf, uBufSize, sizeof(Unicode));
}
uBufLen += remapping->map(u[i], uBuf + uBufLen, uBufSize - uBufLen);
}
// add the characters
if (uBufLen > 0) {
// handle right-to-left ligatures: if there are multiple Unicode
// characters, and they're all right-to-left, insert them in
// right-to-left order
if (uBufLen > 1) {
rtl = gTrue;
for (i = 0; i < uBufLen; ++i) {
if (!unicodeTypeR(uBuf[i])) {
rtl = gFalse;
break;
}
}
} else {
rtl = gFalse;
}
// compute the bounding box
w1 /= uBufLen;
h1 /= uBufLen;
ascent = curFont->ascent * curFontSize;
descent = curFont->descent * curFontSize;
for (i = 0; i < uBufLen; ++i) {
x2 = x1 + i * w1;
y2 = y1 + i * h1;
switch (curRot) {
case 0:
default:
xMin = x2;
xMax = x2 + w1;
yMin = y2 - ascent;
yMax = y2 - descent;
break;
case 1:
xMin = x2 + descent;
xMax = x2 + ascent;
yMin = y2;
yMax = y2 + h1;
break;
case 2:
xMin = x2 + w1;
xMax = x2;
yMin = y2 + descent;
yMax = y2 + ascent;
break;
case 3:
xMin = x2 - ascent;
xMax = x2 - descent;
yMin = y2 + h1;
yMax = y2;
break;
}
// check for clipping
clipped = gFalse;
if (control.clipText || control.discardClippedText) {
state->getClipBBox(&clipXMin, &clipYMin, &clipXMax, &clipYMax);
xMid = 0.5 * (xMin + xMax);
yMid = 0.5 * (yMin + yMax);
if (xMid < clipXMin || xMid > clipXMax ||
yMid < clipYMin || yMid > clipYMax) {
clipped = gTrue;
}
}
if ((state->getRender() & 3) == 1) {
state->getStrokeRGB(&rgb);
alpha = state->getStrokeOpacity();
} else {
state->getFillRGB(&rgb);
alpha = state->getFillOpacity();
}
if (rtl) {
j = uBufLen - 1 - i;
} else {
j = i;
}
chars->append(new TextChar(uBuf[j], charPos, nBytes,
xMin, yMin, xMax, yMax,
curRot, rotated, clipped,
state->getRender() == 3 || alpha < 0.001,
curFont, curFontSize,
colToDbl(rgb.r), colToDbl(rgb.g),
colToDbl(rgb.b)));
}
}
charPos += nBytes;
}
void TextPage::incCharCount(int nChars) {
charPos += nChars;
}
void TextPage::beginActualText(GfxState *state, Unicode *u, int uLen) {
if (actualText) {
gfree(actualText);
}
actualText = (Unicode *)gmallocn(uLen, sizeof(Unicode));
memcpy(actualText, u, uLen * sizeof(Unicode));
actualTextLen = uLen;
actualTextNBytes = 0;
}
void TextPage::endActualText(GfxState *state) {
Unicode *u;
u = actualText;
actualText = NULL; // so we can call TextPage::addChar()
if (actualTextNBytes) {
// now that we have the position info for all of the text inside
// the marked content span, we feed the "ActualText" back through
// addChar()
addChar(state, actualTextX0, actualTextY0,
actualTextX1 - actualTextX0, actualTextY1 - actualTextY0,
0, actualTextNBytes, u, actualTextLen);
}
gfree(u);
actualText = NULL;
actualTextLen = 0;
actualTextNBytes = gFalse;
}
void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
underlines->append(new TextUnderline(x0, y0, x1, y1));
}
void TextPage::addLink(double xMin, double yMin, double xMax, double yMax,
Link *link) {
GString *uri;
if (link && link->getAction() && link->getAction()->getKind() == actionURI) {
uri = ((LinkURI *)link->getAction())->getURI()->copy();
links->append(new TextLink(xMin, yMin, xMax, yMax, uri));
}
}
//------------------------------------------------------------------------
// TextPage: output
//------------------------------------------------------------------------
void TextPage::write(void *outputStream, TextOutputFunc outputFunc) {
UnicodeMap *uMap;
char space[8], eol[16], eop[8];
int spaceLen, eolLen, eopLen;
GBool pageBreaks;
// get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
return;
}
spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
eolLen = 0; // make gcc happy
switch (globalParams->getTextEOL()) {
case eolUnix:
eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
break;
case eolDOS:
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
eolLen += uMap->mapUnicode(0x0a, eol + eolLen, (int)sizeof(eol) - eolLen);
break;
case eolMac:
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
break;
}
eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
pageBreaks = globalParams->getTextPageBreaks();
switch (control.mode) {
case textOutReadingOrder:
writeReadingOrder(outputStream, outputFunc, uMap, space, spaceLen,
eol, eolLen);
break;
case textOutPhysLayout:
case textOutTableLayout:
writePhysLayout(outputStream, outputFunc, uMap, space, spaceLen,
eol, eolLen);
break;
case textOutSimpleLayout:
writeSimpleLayout(outputStream, outputFunc, uMap, space, spaceLen,
eol, eolLen);
break;
case textOutSimple2Layout:
writeSimple2Layout(outputStream, outputFunc, uMap, space, spaceLen,
eol, eolLen);
break;
case textOutLinePrinter:
writeLinePrinter(outputStream, outputFunc, uMap, space, spaceLen,
eol, eolLen);
break;
case textOutRawOrder:
writeRaw(outputStream, outputFunc, uMap, space, spaceLen,
eol, eolLen);
break;
}
// end of page
if (pageBreaks) {
(*outputFunc)(outputStream, eop, eopLen);
}
uMap->decRefCnt();
}
void TextPage::writeReadingOrder(void *outputStream,
TextOutputFunc outputFunc,
UnicodeMap *uMap,
char *space, int spaceLen,
char *eol, int eolLen) {
TextBlock *tree;
TextColumn *col;
TextParagraph *par;
TextLine *line;
GList *overlappingChars;
GList *columns;
GBool primaryLR;
GString *s;
int colIdx, parIdx, lineIdx, rot, n;
#if 0 //~debug
dumpChars(chars);
#endif
if (control.overlapHandling != textOutIgnoreOverlaps) {
overlappingChars = separateOverlappingText(chars);
} else {
overlappingChars = NULL;
}
rot = rotateChars(chars);
primaryLR = checkPrimaryLR(chars);
tree = splitChars(chars);
#if 0 //~debug
dumpTree(tree);
#endif
if (!tree) {
// no text
unrotateChars(chars, rot);
return;
}
columns = buildColumns(tree, primaryLR);
delete tree;
unrotateChars(chars, rot);
if (control.html) {
rotateUnderlinesAndLinks(rot);
generateUnderlinesAndLinks(columns);
}
if (overlappingChars) {
if (overlappingChars->getLength() > 0) {
columns->append(buildOverlappingTextColumn(overlappingChars));
}
deleteGList(overlappingChars, TextChar);
}
#if 0 //~debug
dumpColumns(columns);
#endif
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
n = line->len;
if (line->hyphenated && lineIdx + 1 < par->lines->getLength()) {
--n;
}
s = new GString();
encodeFragment(line->text, n, uMap, primaryLR, s);
if (lineIdx + 1 < par->lines->getLength() && !line->hyphenated) {
s->append(space, spaceLen);
}
(*outputFunc)(outputStream, s->getCString(), s->getLength());
delete s;
}
(*outputFunc)(outputStream, eol, eolLen);
}
(*outputFunc)(outputStream, eol, eolLen);
}
deleteGList(columns, TextColumn);
}
GList *TextPage::makeColumns() {
TextBlock *tree;
GList *overlappingChars;
GList *columns;
GBool primaryLR;
int rot;
if (control.mode == textOutSimple2Layout) {
primaryLR = checkPrimaryLR(chars);
rotateCharsToZero(chars);
columns = buildSimple2Columns(chars);
unrotateCharsFromZero(chars);
unrotateColumnsFromZero(columns);
} else {
if (control.overlapHandling != textOutIgnoreOverlaps) {
overlappingChars = separateOverlappingText(chars);
} else {
overlappingChars = NULL;
}
rot = rotateChars(chars);
primaryLR = checkPrimaryLR(chars);
if ((tree = splitChars(chars))) {
columns = buildColumns(tree, primaryLR);
delete tree;
} else {
// no text
columns = new GList();
}
unrotateChars(chars, rot);
unrotateColumns(columns, rot);
if (control.html) {
generateUnderlinesAndLinks(columns);
}
if (overlappingChars) {
if (overlappingChars->getLength() > 0) {
columns->append(buildOverlappingTextColumn(overlappingChars));
}
deleteGList(overlappingChars, TextChar);
}
}
return columns;
}
// This handles both physical layout and table layout modes.
void TextPage::writePhysLayout(void *outputStream,
TextOutputFunc outputFunc,
UnicodeMap *uMap,
char *space, int spaceLen,
char *eol, int eolLen) {
TextBlock *tree;
GString **out;
int *outLen;
TextColumn *col;
TextParagraph *par;
TextLine *line;
GList *overlappingChars;
GList *columns;
GBool primaryLR;
int ph, colIdx, parIdx, lineIdx, rot, y, i;
#if 0 //~debug
dumpChars(chars);
#endif
#if 0 //~debug
dumpUnderlines();
#endif
if (control.overlapHandling != textOutIgnoreOverlaps) {
overlappingChars = separateOverlappingText(chars);
} else {
overlappingChars = NULL;
}
rot = rotateChars(chars);
primaryLR = checkPrimaryLR(chars);
tree = splitChars(chars);
#if 0 //~debug
dumpTree(tree);
#endif
if (!tree) {
// no text
unrotateChars(chars, rot);
return;
}
//~ this doesn't correctly handle the right-to-left case
columns = buildColumns(tree, gTrue);
delete tree;
unrotateChars(chars, rot);
if (control.html) {
rotateUnderlinesAndLinks(rot);
generateUnderlinesAndLinks(columns);
}
ph = assignPhysLayoutPositions(columns);
#if 0 //~debug
dumpColumns(columns);
#endif
out = (GString **)gmallocn(ph, sizeof(GString *));
outLen = (int *)gmallocn(ph, sizeof(int));
for (i = 0; i < ph; ++i) {
out[i] = NULL;
outLen[i] = 0;
}
columns->sort(&TextColumn::cmpPX);
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
y = col->py;
for (parIdx = 0;
parIdx < col->paragraphs->getLength() && y < ph;
++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
for (lineIdx = 0;
lineIdx < par->lines->getLength() && y < ph;
++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
if (!out[y]) {
out[y] = new GString();
}
while (outLen[y] < col->px + line->px) {
out[y]->append(space, spaceLen);
++outLen[y];
}
encodeFragment(line->text, line->len, uMap, primaryLR, out[y]);
outLen[y] += line->pw;
++y;
}
if (parIdx + 1 < col->paragraphs->getLength()) {
++y;
}
}
}
for (i = 0; i < ph; ++i) {
if (out[i]) {
(*outputFunc)(outputStream, out[i]->getCString(), out[i]->getLength());
delete out[i];
}
(*outputFunc)(outputStream, eol, eolLen);
}
gfree(out);
gfree(outLen);
deleteGList(columns, TextColumn);
if (overlappingChars) {
if (overlappingChars->getLength() > 0) {
TextColumn *col = buildOverlappingTextColumn(overlappingChars);
(*outputFunc)(outputStream, eol, eolLen);
for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
GString *s = new GString();
encodeFragment(line->text, line->len, uMap, primaryLR, s);
s->append(eol, eolLen);
(*outputFunc)(outputStream, s->getCString(), s->getLength());
delete s;
}
if (parIdx < col->paragraphs->getLength() - 1) {
(*outputFunc)(outputStream, eol, eolLen);
}
}
delete col;
}
deleteGList(overlappingChars, TextChar);
}
}
void TextPage::writeSimpleLayout(void *outputStream,
TextOutputFunc outputFunc,
UnicodeMap *uMap,
char *space, int spaceLen,
char *eol, int eolLen) {
TextBlock *tree;
TextSuperLine *superLine0, *superLine1;
TextLine *line;
GList *superLines;
GString *out;
GBool primaryLR;
int rot, x, i, j;
#if 0 //~debug
dumpChars(chars);
#endif
rot = rotateChars(chars);
primaryLR = checkPrimaryLR(chars);
tree = splitChars(chars);
#if 0 //~debug
dumpTree(tree);
#endif
if (!tree) {
// no text
unrotateChars(chars, rot);
return;
}
superLines = new GList();
buildSuperLines(tree, superLines);
delete tree;
unrotateChars(chars, rot);
assignSimpleLayoutPositions(superLines, uMap);
for (i = 0; i < superLines->getLength(); ++i) {
superLine0 = (TextSuperLine *)superLines->get(i);
out = new GString();
x = 0;
for (j = 0; j < superLine0->lines->getLength(); ++j) {
line = (TextLine *)superLine0->lines->get(j);
while (x < line->px) {
out->append(space, spaceLen);
++x;
}
encodeFragment(line->text, line->len, uMap, primaryLR, out);
x += line->pw;
}
(*outputFunc)(outputStream, out->getCString(), out->getLength());
delete out;
(*outputFunc)(outputStream, eol, eolLen);
if (i + 1 < superLines->getLength()) {
superLine1 = (TextSuperLine *)superLines->get(i + 1);
if (superLine1->yMin - superLine0->yMax > 1.0 * superLine0->fontSize) {
(*outputFunc)(outputStream, eol, eolLen);
}
}
}
deleteGList(superLines, TextSuperLine);
}
void TextPage::writeSimple2Layout(void *outputStream,
TextOutputFunc outputFunc,
UnicodeMap *uMap,
char *space, int spaceLen,
char *eol, int eolLen) {
GList *columns;
TextColumn *col;
TextParagraph *par;
TextLine *line;
GString *out;
GBool primaryLR;
int colIdx, parIdx, lineIdx;
primaryLR = checkPrimaryLR(chars);
rotateCharsToZero(chars);
#if 0 //~debug
dumpChars(chars);
#endif
columns = buildSimple2Columns(chars);
unrotateCharsFromZero(chars);
unrotateColumnsFromZero(columns);
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
out = new GString();
encodeFragment(line->text, line->len, uMap, primaryLR, out);
(*outputFunc)(outputStream, out->getCString(), out->getLength());
delete out;
(*outputFunc)(outputStream, eol, eolLen);
}
}
}
deleteGList(columns, TextColumn);
}
void TextPage::writeLinePrinter(void *outputStream,
TextOutputFunc outputFunc,
UnicodeMap *uMap,
char *space, int spaceLen,
char *eol, int eolLen) {
TextChar *ch, *ch2;
GList *line;
GString *s;
char buf[8];
double pitch, lineSpacing, delta;
double yMin0, yShift, xMin0, xShift;
double y, x;
int rot, n, i, j, k;
rot = rotateChars(chars);
chars->sort(&TextChar::cmpX);
// don't call removeDuplicates here, because it expects to be
// working on a secondary list that doesn't own the TextChar objects
chars->sort(&TextChar::cmpY);
// get character pitch
if (control.fixedPitch > 0) {
pitch = control.fixedPitch;
} else {
// compute (approximate) character pitch
pitch = pageWidth;
for (i = 0; i < chars->getLength(); ++i) {
ch = (TextChar *)chars->get(i);
for (j = i + 1; j < chars->getLength(); ++j) {
ch2 = (TextChar *)chars->get(j);
if (ch2->yMin + ascentAdjustFactor * (ch2->yMax - ch2->yMin) <
ch->yMax - descentAdjustFactor * (ch->yMax - ch->yMin) &&
ch->yMin + ascentAdjustFactor * (ch->yMax - ch->yMin) <
ch2->yMax - descentAdjustFactor * (ch2->yMax - ch2->yMin)) {
delta = fabs(ch2->xMin - ch->xMin);
if (delta > 0.01 && delta < pitch) {
pitch = delta;
}
}
}
}
}
// get line spacing
if (control.fixedLineSpacing > 0) {
lineSpacing = control.fixedLineSpacing;
} else {
// compute (approximate) line spacing
lineSpacing = pageHeight;
i = 0;
while (i < chars->getLength()) {
ch = (TextChar *)chars->get(i);
// look for the first char that does not (substantially)
// vertically overlap this one
delta = 0;
for (++i; delta == 0 && i < chars->getLength(); ++i) {
ch2 = (TextChar *)chars->get(i);
if (ch2->yMin + ascentAdjustFactor * (ch2->yMax - ch2->yMin) >
ch->yMax - descentAdjustFactor * (ch->yMax - ch->yMin)) {
delta = ch2->yMin - ch->yMin;
}
}
if (delta > 0 && delta < lineSpacing) {
lineSpacing = delta;
}
}
}
// shift the grid to avoid problems with floating point accuracy --
// for fixed line spacing, this avoids problems with
// dropping/inserting blank lines
if (chars->getLength()) {
yMin0 = ((TextChar *)chars->get(0))->yMin;
yShift = yMin0 - (int)(yMin0 / lineSpacing + 0.5) * lineSpacing
- 0.5 * lineSpacing;
} else {
yShift = 0;
}
// for each line...
i = 0;
j = chars->getLength() - 1;
for (y = yShift; y < pageHeight; y += lineSpacing) {
// get the characters in this line
line = new GList;
while (i < chars->getLength() &&
((TextChar *)chars->get(i))->yMin < y + lineSpacing) {
line->append(chars->get(i++));
}
line->sort(&TextChar::cmpX);
// shift the grid to avoid problems with floating point accuracy
// -- for fixed char spacing, this avoids problems with
// dropping/inserting spaces
if (line->getLength()) {
xMin0 = ((TextChar *)line->get(0))->xMin;
xShift = xMin0 - (int)(xMin0 / pitch + 0.5) * pitch - 0.5 * pitch;
} else {
xShift = 0;
}
// write the line
s = new GString();
x = xShift;
k = 0;
while (k < line->getLength()) {
ch = (TextChar *)line->get(k);
if (ch->xMin < x + pitch) {
n = uMap->mapUnicode(ch->c, buf, sizeof(buf));
s->append(buf, n);
++k;
} else {
s->append(space, spaceLen);
n = spaceLen;
}
x += (uMap->isUnicode() ? 1 : n) * pitch;
}
s->append(eol, eolLen);
(*outputFunc)(outputStream, s->getCString(), s->getLength());
delete s;
delete line;
}
unrotateChars(chars, rot);
}
void TextPage::writeRaw(void *outputStream,
TextOutputFunc outputFunc,
UnicodeMap *uMap,
char *space, int spaceLen,
char *eol, int eolLen) {
TextChar *ch, *ch2;
GString *s;
char buf[8];
int n, i;
s = new GString();
for (i = 0; i < chars->getLength(); ++i) {
// process one char
ch = (TextChar *)chars->get(i);
n = uMap->mapUnicode(ch->c, buf, sizeof(buf));
s->append(buf, n);
// check for space or eol
if (i+1 < chars->getLength()) {
ch2 = (TextChar *)chars->get(i+1);
if (ch2->rot != ch->rot) {
s->append(eol, eolLen);
} else {
switch (ch->rot) {
case 0:
default:
if (fabs(ch2->yMin - ch->yMin) > rawModeLineDelta * ch->fontSize ||
ch2->xMin - ch->xMax < -rawModeCharOverlap * ch->fontSize) {
s->append(eol, eolLen);
} else if (ch->spaceAfter ||
ch2->xMin - ch->xMax >
rawModeWordSpacing * ch->fontSize) {
s->append(space, spaceLen);
}
break;
case 1:
if (fabs(ch->xMax - ch2->xMax) > rawModeLineDelta * ch->fontSize ||
ch2->yMin - ch->yMax < -rawModeCharOverlap * ch->fontSize) {
s->append(eol, eolLen);
} else if (ch->spaceAfter ||
ch2->yMin - ch->yMax >
rawModeWordSpacing * ch->fontSize) {
s->append(space, spaceLen);
}
break;
case 2:
if (fabs(ch->yMax - ch2->yMax) > rawModeLineDelta * ch->fontSize ||
ch->xMin - ch2->xMax < -rawModeCharOverlap * ch->fontSize) {
s->append(eol, eolLen);
} else if (ch->spaceAfter ||
ch->xMin - ch2->xMax >
rawModeWordSpacing * ch->fontSize) {
s->append(space, spaceLen);
}
break;
case 3:
if (fabs(ch2->xMin - ch->xMin) > rawModeLineDelta * ch->fontSize ||
ch->yMin - ch2->yMax < -rawModeCharOverlap * ch->fontSize) {
s->append(eol, eolLen);
} else if (ch->spaceAfter ||
ch->yMin - ch2->yMax >
rawModeWordSpacing * ch->fontSize) {
s->append(space, spaceLen);
}
break;
}
}
} else {
s->append(eol, eolLen);
}
if (s->getLength() > 1000) {
(*outputFunc)(outputStream, s->getCString(), s->getLength());
s->clear();
}
}
if (s->getLength() > 0) {
(*outputFunc)(outputStream, s->getCString(), s->getLength());
}
delete s;
}
void TextPage::encodeFragment(Unicode *text, int len, UnicodeMap *uMap,
GBool primaryLR, GString *s) {
char lre[8], rle[8], popdf[8], buf[8];
int lreLen, rleLen, popdfLen, n;
int i, j, k;
if (uMap->isUnicode()) {
lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
if (primaryLR) {
i = 0;
while (i < len) {
// output a left-to-right section
for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
for (k = i; k < j; ++k) {
n = uMap->mapUnicode(text[k], buf, sizeof(buf));
s->append(buf, n);
}
i = j;
// output a right-to-left section
for (j = i;
j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
++j) ;
if (j > i) {
s->append(rle, rleLen);
for (k = j - 1; k >= i; --k) {
n = uMap->mapUnicode(text[k], buf, sizeof(buf));
s->append(buf, n);
}
s->append(popdf, popdfLen);
i = j;
}
}
} else {
// Note: This code treats numeric characters (European and
// Arabic/Indic) as left-to-right, which isn't strictly correct
// (incurs extra LRE/POPDF pairs), but does produce correct
// visual formatting.
s->append(rle, rleLen);
i = len - 1;
while (i >= 0) {
// output a right-to-left section
for (j = i;
j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
--j) ;
for (k = i; k > j; --k) {
n = uMap->mapUnicode(text[k], buf, sizeof(buf));
s->append(buf, n);
}
i = j;
// output a left-to-right section
for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
if (j < i) {
s->append(lre, lreLen);
for (k = j + 1; k <= i; ++k) {
n = uMap->mapUnicode(text[k], buf, sizeof(buf));
s->append(buf, n);
}
s->append(popdf, popdfLen);
i = j;
}
}
s->append(popdf, popdfLen);
}
} else {
for (i = 0; i < len; ++i) {
n = uMap->mapUnicode(text[i], buf, sizeof(buf));
s->append(buf, n);
}
}
}
//------------------------------------------------------------------------
// TextPage: layout analysis
//------------------------------------------------------------------------
// Determine primary (most common) rotation value. Rotate all chars
// to that primary rotation.
int TextPage::rotateChars(GList *charsA) {
TextChar *ch;
int nChars[4];
double xMin, yMin, xMax, yMax, t;
int rot, i;
// determine primary rotation
nChars[0] = nChars[1] = nChars[2] = nChars[3] = 0;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
++nChars[ch->rot];
}
rot = 0;
for (i = 1; i < 4; ++i) {
if (nChars[i] > nChars[rot]) {
rot = i;
}
}
// rotate
switch (rot) {
case 0:
default:
break;
case 1:
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
xMin = ch->yMin;
xMax = ch->yMax;
yMin = pageWidth - ch->xMax;
yMax = pageWidth - ch->xMin;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
ch->rot = (ch->rot + 3) & 3;
}
t = pageWidth;
pageWidth = pageHeight;
pageHeight = t;
break;
case 2:
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
xMin = pageWidth - ch->xMax;
xMax = pageWidth - ch->xMin;
yMin = pageHeight - ch->yMax;
yMax = pageHeight - ch->yMin;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
ch->rot = (ch->rot + 2) & 3;
}
break;
case 3:
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
xMin = pageHeight - ch->yMax;
xMax = pageHeight - ch->yMin;
yMin = ch->xMin;
yMax = ch->xMax;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
ch->rot = (ch->rot + 1) & 3;
}
t = pageWidth;
pageWidth = pageHeight;
pageHeight = t;
break;
}
return rot;
}
// Rotate all chars to zero rotation. This leaves the TextChar.rot
// fields unchanged.
void TextPage::rotateCharsToZero(GList *charsA) {
TextChar *ch;
double xMin, yMin, xMax, yMax;
int i;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
switch (ch->rot) {
case 0:
default:
break;
case 1:
xMin = ch->yMin;
xMax = ch->yMax;
yMin = pageWidth - ch->xMax;
yMax = pageWidth - ch->xMin;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
break;
case 2:
xMin = pageWidth - ch->xMax;
xMax = pageWidth - ch->xMin;
yMin = pageHeight - ch->yMax;
yMax = pageHeight - ch->yMin;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
break;
case 3:
xMin = pageHeight - ch->yMax;
xMax = pageHeight - ch->yMin;
yMin = ch->xMin;
yMax = ch->xMax;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
break;
}
}
}
// Rotate the TextUnderlines and TextLinks to match the transform
// performed by rotateChars().
void TextPage::rotateUnderlinesAndLinks(int rot) {
TextUnderline *underline;
TextLink *link;
double xMin, yMin, xMax, yMax;
int i;
switch (rot) {
case 0:
default:
break;
case 1:
for (i = 0; i < underlines->getLength(); ++i) {
underline = (TextUnderline *)underlines->get(i);
xMin = underline->y0;
xMax = underline->y1;
yMin = pageWidth - underline->x1;
yMax = pageWidth - underline->x0;
underline->x0 = xMin;
underline->x1 = xMax;
underline->y0 = yMin;
underline->y1 = yMax;
underline->horiz = !underline->horiz;
}
for (i = 0; i < links->getLength(); ++i) {
link = (TextLink *)links->get(i);
xMin = link->yMin;
xMax = link->yMax;
yMin = pageWidth - link->xMax;
yMax = pageWidth - link->xMin;
link->xMin = xMin;
link->xMax = xMax;
link->yMin = yMin;
link->yMax = yMax;
}
break;
case 2:
for (i = 0; i < underlines->getLength(); ++i) {
underline = (TextUnderline *)underlines->get(i);
xMin = pageWidth - underline->x1;
xMax = pageWidth - underline->x0;
yMin = pageHeight - underline->y1;
yMax = pageHeight - underline->y0;
underline->x0 = xMin;
underline->x1 = xMax;
underline->y0 = yMin;
underline->y1 = yMax;
}
for (i = 0; i < links->getLength(); ++i) {
link = (TextLink *)links->get(i);
xMin = pageWidth - link->xMax;
xMax = pageWidth - link->xMin;
yMin = pageHeight - link->yMax;
yMax = pageHeight - link->yMin;
link->xMin = xMin;
link->xMax = xMax;
link->yMin = yMin;
link->yMax = yMax;
}
break;
case 3:
for (i = 0; i < underlines->getLength(); ++i) {
underline = (TextUnderline *)underlines->get(i);
xMin = pageHeight - underline->y1;
xMax = pageHeight - underline->y0;
yMin = underline->x0;
yMax = underline->x1;
underline->x0 = xMin;
underline->x1 = xMax;
underline->y0 = yMin;
underline->y1 = yMax;
underline->horiz = !underline->horiz;
}
for (i = 0; i < links->getLength(); ++i) {
link = (TextLink *)links->get(i);
xMin = pageHeight - link->yMax;
xMax = pageHeight - link->yMin;
yMin = link->xMin;
yMax = link->xMax;
link->xMin = xMin;
link->xMax = xMax;
link->yMin = yMin;
link->yMax = yMax;
}
break;
}
}
// Undo the coordinate transform performed by rotateChars().
void TextPage::unrotateChars(GList *charsA, int rot) {
TextChar *ch;
double xMin, yMin, xMax, yMax, t;
int i;
switch (rot) {
case 0:
default:
// no transform
break;
case 1:
t = pageWidth;
pageWidth = pageHeight;
pageHeight = t;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
xMin = pageWidth - ch->yMax;
xMax = pageWidth - ch->yMin;
yMin = ch->xMin;
yMax = ch->xMax;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
ch->rot = (ch->rot + 1) & 3;
}
break;
case 2:
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
xMin = pageWidth - ch->xMax;
xMax = pageWidth - ch->xMin;
yMin = pageHeight - ch->yMax;
yMax = pageHeight - ch->yMin;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
ch->rot = (ch->rot + 2) & 3;
}
break;
case 3:
t = pageWidth;
pageWidth = pageHeight;
pageHeight = t;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
xMin = ch->yMin;
xMax = ch->yMax;
yMin = pageHeight - ch->xMax;
yMax = pageHeight - ch->xMin;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
ch->rot = (ch->rot + 3) & 3;
}
break;
}
}
// Undo the coordinate transform performed by rotateCharsToZero().
void TextPage::unrotateCharsFromZero(GList *charsA) {
TextChar *ch;
double xMin, yMin, xMax, yMax;
int i;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
switch (ch->rot) {
case 0:
default:
break;
case 1:
xMin = pageWidth - ch->yMax;
xMax = pageWidth - ch->yMin;
yMin = ch->xMin;
yMax = ch->xMax;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
break;
case 2:
xMin = pageWidth - ch->xMax;
xMax = pageWidth - ch->xMin;
yMin = pageHeight - ch->yMax;
yMax = pageHeight - ch->yMin;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
break;
case 3:
xMin = ch->yMin;
xMax = ch->yMax;
yMin = pageHeight - ch->xMax;
yMax = pageHeight - ch->xMin;
ch->xMin = xMin;
ch->xMax = xMax;
ch->yMin = yMin;
ch->yMax = yMax;
break;
}
}
}
// Undo the coordinate transform performed by rotateCharsToZero().
void TextPage::unrotateColumnsFromZero(GList *columns) {
TextColumn *col;
TextParagraph *par;
TextLine *line;
TextWord *word;
double xMin, yMin, xMax, yMax;
int colIdx, parIdx, lineIdx, wordIdx, i;
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
switch (col->getRotation()) {
case 0:
default:
break;
case 1:
xMin = pageWidth - col->yMax;
xMax = pageWidth - col->yMin;
yMin = col->xMin;
yMax = col->xMax;
col->xMin = xMin;
col->xMax = xMax;
col->yMin = yMin;
col->yMax = yMax;
for (parIdx = 0;
parIdx < col->paragraphs->getLength();
++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
xMin = pageWidth - par->yMax;
xMax = pageWidth - par->yMin;
yMin = par->xMin;
yMax = par->xMax;
par->xMin = xMin;
par->xMax = xMax;
par->yMin = yMin;
par->yMax = yMax;
for (lineIdx = 0;
lineIdx < par->lines->getLength();
++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
xMin = pageWidth - line->yMax;
xMax = pageWidth - line->yMin;
yMin = line->xMin;
yMax = line->xMax;
line->xMin = xMin;
line->xMax = xMax;
line->yMin = yMin;
line->yMax = yMax;
for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
word = (TextWord *)line->words->get(wordIdx);
xMin = pageWidth - word->yMax;
xMax = pageWidth - word->yMin;
yMin = word->xMin;
yMax = word->xMax;
word->xMin = xMin;
word->xMax = xMax;
word->yMin = yMin;
word->yMax = yMax;
}
}
}
break;
case 2:
xMin = pageWidth - col->xMax;
xMax = pageWidth - col->xMin;
yMin = pageHeight - col->yMax;
yMax = pageHeight - col->yMin;
col->xMin = xMin;
col->xMax = xMax;
col->yMin = yMin;
col->yMax = yMax;
for (parIdx = 0;
parIdx < col->paragraphs->getLength();
++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
xMin = pageWidth - par->xMax;
xMax = pageWidth - par->xMin;
yMin = pageHeight - par->yMax;
yMax = pageHeight - par->yMin;
par->xMin = xMin;
par->xMax = xMax;
par->yMin = yMin;
par->yMax = yMax;
for (lineIdx = 0;
lineIdx < par->lines->getLength();
++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
xMin = pageWidth - line->xMax;
xMax = pageWidth - line->xMin;
yMin = pageHeight - line->yMax;
yMax = pageHeight - line->yMin;
line->xMin = xMin;
line->xMax = xMax;
line->yMin = yMin;
line->yMax = yMax;
for (i = 0; i <= line->len; ++i) {
line->edge[i] = pageWidth - line->edge[i];
}
for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
word = (TextWord *)line->words->get(wordIdx);
xMin = pageWidth - word->xMax;
xMax = pageWidth - word->xMin;
yMin = pageHeight - word->yMax;
yMax = pageHeight - word->yMin;
word->xMin = xMin;
word->xMax = xMax;
word->yMin = yMin;
word->yMax = yMax;
for (i = 0; i <= word->len; ++i) {
word->edge[i] = pageWidth - word->edge[i];
}
}
}
}
break;
case 3:
xMin = col->yMin;
xMax = col->yMax;
yMin = pageHeight - col->xMax;
yMax = pageHeight - col->xMin;
col->xMin = xMin;
col->xMax = xMax;
col->yMin = yMin;
col->yMax = yMax;
for (parIdx = 0;
parIdx < col->paragraphs->getLength();
++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
xMin = par->yMin;
xMax = par->yMax;
yMin = pageHeight - par->xMax;
yMax = pageHeight - par->xMin;
par->xMin = xMin;
par->xMax = xMax;
par->yMin = yMin;
par->yMax = yMax;
for (lineIdx = 0;
lineIdx < par->lines->getLength();
++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
xMin = line->yMin;
xMax = line->yMax;
yMin = pageHeight - line->xMax;
yMax = pageHeight - line->xMin;
line->xMin = xMin;
line->xMax = xMax;
line->yMin = yMin;
line->yMax = yMax;
for (i = 0; i <= line->len; ++i) {
line->edge[i] = pageHeight - line->edge[i];
}
for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
word = (TextWord *)line->words->get(wordIdx);
xMin = word->yMin;
xMax = word->yMax;
yMin = pageHeight - word->xMax;
yMax = pageHeight - word->xMin;
word->xMin = xMin;
word->xMax = xMax;
word->yMin = yMin;
word->yMax = yMax;
for (i = 0; i <= word->len; ++i) {
word->edge[i] = pageHeight - word->edge[i];
}
}
}
}
break;
}
}
}
// Undo the coordinate transform performed by rotateChars().
void TextPage::unrotateColumns(GList *columns, int rot) {
TextColumn *col;
TextParagraph *par;
TextLine *line;
TextWord *word;
double xMin, yMin, xMax, yMax;
int colIdx, parIdx, lineIdx, wordIdx, i;
switch (rot) {
case 0:
default:
// no transform
break;
case 1:
// NB: this is called after unrotateChars(), which will have
// swapped pageWidth and pageHeight already.
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
xMin = pageWidth - col->yMax;
xMax = pageWidth - col->yMin;
yMin = col->xMin;
yMax = col->xMax;
col->xMin = xMin;
col->xMax = xMax;
col->yMin = yMin;
col->yMax = yMax;
for (parIdx = 0;
parIdx < col->paragraphs->getLength();
++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
xMin = pageWidth - par->yMax;
xMax = pageWidth - par->yMin;
yMin = par->xMin;
yMax = par->xMax;
par->xMin = xMin;
par->xMax = xMax;
par->yMin = yMin;
par->yMax = yMax;
for (lineIdx = 0;
lineIdx < par->lines->getLength();
++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
xMin = pageWidth - line->yMax;
xMax = pageWidth - line->yMin;
yMin = line->xMin;
yMax = line->xMax;
line->xMin = xMin;
line->xMax = xMax;
line->yMin = yMin;
line->yMax = yMax;
line->rot = (line->rot + 1) & 3;
if (!(line->rot & 1)) {
for (i = 0; i <= line->len; ++i) {
line->edge[i] = pageWidth - line->edge[i];
}
}
for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
word = (TextWord *)line->words->get(wordIdx);
xMin = pageWidth - word->yMax;
xMax = pageWidth - word->yMin;
yMin = word->xMin;
yMax = word->xMax;
word->xMin = xMin;
word->xMax = xMax;
word->yMin = yMin;
word->yMax = yMax;
word->rot = (word->rot + 1) & 3;
if (!(word->rot & 1)) {
for (i = 0; i <= word->len; ++i) {
word->edge[i] = pageWidth - word->edge[i];
}
}
}
}
}
}
break;
case 2:
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
xMin = pageWidth - col->xMax;
xMax = pageWidth - col->xMin;
yMin = pageHeight - col->yMax;
yMax = pageHeight - col->yMin;
col->xMin = xMin;
col->xMax = xMax;
col->yMin = yMin;
col->yMax = yMax;
for (parIdx = 0;
parIdx < col->paragraphs->getLength();
++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
xMin = pageWidth - par->xMax;
xMax = pageWidth - par->xMin;
yMin = pageHeight - par->yMax;
yMax = pageHeight - par->yMin;
par->xMin = xMin;
par->xMax = xMax;
par->yMin = yMin;
par->yMax = yMax;
for (lineIdx = 0;
lineIdx < par->lines->getLength();
++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
xMin = pageWidth - line->xMax;
xMax = pageWidth - line->xMin;
yMin = pageHeight - line->yMax;
yMax = pageHeight - line->yMin;
line->xMin = xMin;
line->xMax = xMax;
line->yMin = yMin;
line->yMax = yMax;
line->rot = (line->rot + 2) & 3;
if (line->rot & 1) {
for (i = 0; i <= line->len; ++i) {
line->edge[i] = pageHeight - line->edge[i];
}
} else {
for (i = 0; i <= line->len; ++i) {
line->edge[i] = pageWidth - line->edge[i];
}
}
for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
word = (TextWord *)line->words->get(wordIdx);
xMin = pageWidth - word->xMax;
xMax = pageWidth - word->xMin;
yMin = pageHeight - word->yMax;
yMax = pageHeight - word->yMin;
word->xMin = xMin;
word->xMax = xMax;
word->yMin = yMin;
word->yMax = yMax;
word->rot = (word->rot + 2) & 3;
if (word->rot & 1) {
for (i = 0; i <= word->len; ++i) {
word->edge[i] = pageHeight - word->edge[i];
}
} else {
for (i = 0; i <= word->len; ++i) {
word->edge[i] = pageWidth - word->edge[i];
}
}
}
}
}
}
break;
case 3:
// NB: this is called after unrotateChars(), which will have
// swapped pageWidth and pageHeight already.
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
xMin = col->yMin;
xMax = col->yMax;
yMin = pageHeight - col->xMax;
yMax = pageHeight - col->xMin;
col->xMin = xMin;
col->xMax = xMax;
col->yMin = yMin;
col->yMax = yMax;
for (parIdx = 0;
parIdx < col->paragraphs->getLength();
++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
xMin = par->yMin;
xMax = par->yMax;
yMin = pageHeight - par->xMax;
yMax = pageHeight - par->xMin;
par->xMin = xMin;
par->xMax = xMax;
par->yMin = yMin;
par->yMax = yMax;
for (lineIdx = 0;
lineIdx < par->lines->getLength();
++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
xMin = line->yMin;
xMax = line->yMax;
yMin = pageHeight - line->xMax;
yMax = pageHeight - line->xMin;
line->xMin = xMin;
line->xMax = xMax;
line->yMin = yMin;
line->yMax = yMax;
line->rot = (line->rot + 3) & 3;
if (line->rot & 1) {
for (i = 0; i <= line->len; ++i) {
line->edge[i] = pageHeight - line->edge[i];
}
}
for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
word = (TextWord *)line->words->get(wordIdx);
xMin = word->yMin;
xMax = word->yMax;
yMin = pageHeight - word->xMax;
yMax = pageHeight - word->xMin;
word->xMin = xMin;
word->xMax = xMax;
word->yMin = yMin;
word->yMax = yMax;
word->rot = (word->rot + 3) & 3;
if (word->rot & 1) {
for (i = 0; i <= word->len; ++i) {
word->edge[i] = pageHeight - word->edge[i];
}
}
}
}
}
}
break;
}
}
void TextPage::unrotateWords(GList *words, int rot) {
TextWord *word;
double xMin, yMin, xMax, yMax;
int i, j;
switch (rot) {
case 0:
default:
// no transform
break;
case 1:
for (i = 0; i < words->getLength(); ++i) {
word = (TextWord *)words->get(i);
xMin = pageWidth - word->yMax;
xMax = pageWidth - word->yMin;
yMin = word->xMin;
yMax = word->xMax;
word->xMin = xMin;
word->xMax = xMax;
word->yMin = yMin;
word->yMax = yMax;
word->rot = (word->rot + 1) & 3;
if (!(word->rot & 1)) {
for (j = 0; j <= word->len; ++j) {
word->edge[j] = pageWidth - word->edge[j];
}
}
}
break;
case 2:
for (i = 0; i < words->getLength(); ++i) {
word = (TextWord *)words->get(i);
xMin = pageWidth - word->xMax;
xMax = pageWidth - word->xMin;
yMin = pageHeight - word->yMax;
yMax = pageHeight - word->yMin;
word->xMin = xMin;
word->xMax = xMax;
word->yMin = yMin;
word->yMax = yMax;
word->rot = (word->rot + 2) & 3;
if (word->rot & 1) {
for (j = 0; j <= word->len; ++j) {
word->edge[j] = pageHeight - word->edge[j];
}
} else {
for (j = 0; j <= word->len; ++j) {
word->edge[j] = pageWidth - word->edge[j];
}
}
}
break;
case 3:
for (i = 0; i < words->getLength(); ++i) {
word = (TextWord *)words->get(i);
xMin = word->yMin;
xMax = word->yMax;
yMin = pageHeight - word->xMax;
yMax = pageHeight - word->xMin;
word->xMin = xMin;
word->xMax = xMax;
word->yMin = yMin;
word->yMax = yMax;
word->rot = (word->rot + 3) & 3;
if (word->rot & 1) {
for (j = 0; j <= word->len; ++j) {
word->edge[j] = pageHeight - word->edge[j];
}
}
}
break;
}
}
// Determine the primary text direction (LR vs RL). Returns true for
// LR, false for RL.
GBool TextPage::checkPrimaryLR(GList *charsA) {
TextChar *ch;
int i, lrCount;
lrCount = 0;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
if (unicodeTypeL(ch->c)) {
++lrCount;
} else if (unicodeTypeR(ch->c)) {
--lrCount;
}
}
return lrCount >= 0;
}
// Remove duplicate characters. The list of chars has been sorted --
// by x for rot=0,2; by y for rot=1,3.
void TextPage::removeDuplicates(GList *charsA, int rot) {
TextChar *ch, *ch2;
double xDelta, yDelta;
int i, j;
if (rot & 1) {
i = 0;
while (i < charsA->getLength()) {
ch = (TextChar *)charsA->get(i);
xDelta = dupMaxSecDelta * ch->fontSize;
yDelta = dupMaxPriDelta * ch->fontSize;
j = i + 1;
while (j < charsA->getLength()) {
ch2 = (TextChar *)charsA->get(j);
if (ch2->yMin - ch->yMin >= yDelta) {
break;
}
if (ch2->c == ch->c &&
fabs(ch2->xMin - ch->xMin) < xDelta &&
fabs(ch2->xMax - ch->xMax) < xDelta &&
fabs(ch2->yMax - ch->yMax) < yDelta) {
if (ch->invisible && !ch2->invisible) {
charsA->del(i);
--i;
break;
}
if (ch2->spaceAfter) {
ch->spaceAfter = (char)gTrue;
}
charsA->del(j);
} else {
++j;
}
}
++i;
}
} else {
i = 0;
while (i < charsA->getLength()) {
ch = (TextChar *)charsA->get(i);
xDelta = dupMaxPriDelta * ch->fontSize;
yDelta = dupMaxSecDelta * ch->fontSize;
j = i + 1;
while (j < charsA->getLength()) {
ch2 = (TextChar *)charsA->get(j);
if (ch2->xMin - ch->xMin >= xDelta) {
break;
}
if (ch2->c == ch->c &&
fabs(ch2->xMax - ch->xMax) < xDelta &&
fabs(ch2->yMin - ch->yMin) < yDelta &&
fabs(ch2->yMax - ch->yMax) < yDelta) {
if (ch->invisible && !ch2->invisible) {
charsA->del(i);
--i;
break;
}
if (ch2->spaceAfter) {
ch->spaceAfter = (char)gTrue;
}
charsA->del(j);
} else {
++j;
}
}
++i;
}
}
}
struct TextCharNode {
TextCharNode(TextChar *chA, TextCharNode *nextA): ch(chA), next(nextA) {}
TextChar *ch;
TextCharNode *next;
};
// Separate out any overlapping text. If handling is
// textOutAppendOverlaps, return a list of the overlapping chars; else
// delete them and return NULL.
GList *TextPage::separateOverlappingText(GList *charsA) {
// bin-sort the TextChars
TextCharNode *grid[overlapGridHeight][overlapGridWidth];
for (int y = 0; y < overlapGridHeight; ++y) {
for (int x = 0; x < overlapGridWidth; ++x) {
grid[y][x] = NULL;
}
}
for (int i = 0; i < charsA->getLength(); ++i) {
TextChar *ch = (TextChar *)charsA->get(i);
int y0 = (int)floor(overlapGridHeight * ch->yMin / pageHeight);
int y1 = (int)ceil(overlapGridHeight * ch->yMax / pageHeight);
int x0 = (int)floor(overlapGridWidth * ch->xMin / pageWidth);
int x1 = (int)ceil(overlapGridWidth * ch->yMin / pageWidth);
if (y0 < 0) {
y0 = 0;
}
if (y1 >= overlapGridHeight) {
y1 = overlapGridHeight - 1;
}
if (x0 < 0) {
x0 = 0;
}
if (x1 >= overlapGridWidth) {
x1 = overlapGridWidth - 1;
}
for (int y = y0; y <= y1; ++y) {
for (int x = x0; x <= x1; ++x) {
grid[y][x] = new TextCharNode(ch, grid[y][x]);
}
}
}
// look for overlaps in each cell
GBool foundOverlaps = gFalse;
for (int y = 0; y < overlapGridHeight; ++y) {
for (int x = 0; x < overlapGridWidth; ++x) {
for (TextCharNode *p0 = grid[y][x]; p0; p0 = p0->next) {
for (TextCharNode *p1 = p0->next; p1; p1 = p1->next) {
if (p0->ch->colorR != p1->ch->colorR ||
p0->ch->colorG != p1->ch->colorG ||
p0->ch->colorB != p1->ch->colorB) {
double ovx = (dmin(p0->ch->xMax, p1->ch->xMax)
- dmax(p0->ch->xMin, p1->ch->xMin))
/ dmin(p0->ch->xMax - p0->ch->xMin,
p1->ch->xMax - p1->ch->xMin);
double ovy = (dmin(p0->ch->yMax, p1->ch->yMax)
- dmax(p0->ch->yMin, p1->ch->yMin))
/ dmin(p0->ch->yMax - p0->ch->yMin,
p1->ch->yMax - p1->ch->yMin);
if (ovx > minCharOverlap && ovy > minCharOverlap) {
// assume the lighter colored text is extraneous
if (p0->ch->colorR + p0->ch->colorG + p0->ch->colorB
< p1->ch->colorR + p1->ch->colorG + p1->ch->colorB) {
p1->ch->overlap = gTrue;
} else {
p0->ch->overlap = gTrue;
}
foundOverlaps = gTrue;
}
}
}
}
}
}
// find overlapped strings
GList *overlapChars = NULL;
if (control.overlapHandling == textOutAppendOverlaps) {
overlapChars = new GList();
}
if (foundOverlaps) {
charsA->sort(&TextChar::cmpCharPos);
int i = 0;
while (i < charsA->getLength()) {
TextChar *ch0 = (TextChar *)charsA->get(i);
if (ch0->overlap) {
int j0, j1;
for (j0 = i - 1; j0 >= 0; --j0) {
TextChar *ch1 = (TextChar *)charsA->get(j0);
if (ch1->colorR != ch0->colorR ||
ch1->colorG != ch0->colorG ||
ch1->colorB != ch0->colorB ||
ch1->rot != ch0->rot) {
break;
}
}
++j0;
for (j1 = i + 1; j1 < charsA->getLength(); ++j1) {
TextChar *ch1 = (TextChar *)charsA->get(j1);
if (ch1->colorR != ch0->colorR ||
ch1->colorG != ch0->colorG ||
ch1->colorB != ch0->colorB ||
ch1->rot != ch0->rot) {
break;
}
}
--j1;
for (int j = j0; j <= j1; ++j) {
if (overlapChars) {
overlapChars->append(charsA->get(j0));
} else {
delete (TextChar *)charsA->get(j0);
}
charsA->del(j0);
}
i = j0;
} else {
++i;
}
}
}
// free memory
for (int y = 0; y < overlapGridHeight; ++y) {
for (int x = 0; x < overlapGridWidth; ++x) {
TextCharNode *p0 = grid[y][x];
while (p0) {
TextCharNode *p1 = p0->next;
delete p0;
p0 = p1;
}
}
}
return overlapChars;
}
// Construct a TextColumn from the list of separated overlapping
// chars.
TextColumn *TextPage::buildOverlappingTextColumn(GList *overlappingChars) {
GList *pars = new GList();
GList *lines = new GList();
GList *words = new GList();
int wordStart = 0;
double lineXMin = 0, lineYMin = 0, lineXMax = 0, lineYMax = 0;
double colXMin = 0, colYMin = 0, colXMax = 0, colYMax = 0;
for (int i = 0; i < overlappingChars->getLength(); ++i) {
TextChar *ch = (TextChar *)overlappingChars->get(i);
TextChar *chNext = NULL;
if (i + 1 < overlappingChars->getLength()) {
chNext = (TextChar *)overlappingChars->get(i + 1);
}
double sp = 0;
double dy = 0;
if (chNext) {
switch (ch->rot) {
case 0:
default:
sp = chNext->xMin - ch->xMax;
dy = chNext->yMin - ch->yMin;
break;
case 1:
sp = chNext->yMin - ch->yMax;
dy = chNext->xMax - ch->xMax;
break;
case 2:
sp = ch->xMin - chNext->xMax;
dy = ch->yMax - chNext->yMax;
break;
case 3:
sp = ch->yMin - chNext->yMax;
dy = ch->xMin - chNext->xMin;
break;
}
}
// the +1 here allows for a space character after ch
GBool parBreak = !chNext ||
chNext->rot != ch->rot ||
chNext->charPos > ch->charPos + ch->charLen + 1;
GBool lineBreak = parBreak ||
sp < -rawModeCharOverlap * ch->fontSize ||
fabs(dy) > rawModeLineDelta * ch->fontSize;
GBool wordBreak = lineBreak ||
ch->spaceAfter ||
sp > rawModeWordSpacing * ch->fontSize;
if (!wordBreak) {
continue;
}
TextWord *word = new TextWord(overlappingChars, wordStart,
i - wordStart + 1, ch->rot, ch->rotated,
getCharDirection(ch), !lineBreak);
words->append(word);
if (words->getLength() == 0) {
lineXMin = word->xMin;
lineYMin = word->yMin;
lineXMax = word->xMax;
lineYMax = word->yMax;
} else {
lineXMin = dmin(lineXMin, word->xMin);
lineYMin = dmin(lineYMin, word->yMin);
lineXMax = dmax(lineXMax, word->xMax);
lineYMax = dmax(lineYMax, word->yMax);
}
wordStart = i + 1;
if (!lineBreak) {
continue;
}
lines->append(new TextLine(words, lineXMin, lineYMin, lineXMax, lineYMax,
((TextWord *)words->get(0))->fontSize));
words = new GList();
if (!parBreak) {
continue;
}
TextParagraph *par = new TextParagraph(lines, gFalse);
pars->append(par);
if (pars->getLength() == 0) {
colXMin = par->xMin;
colYMin = par->yMin;
colXMax = par->xMax;
colYMax = par->yMax;
} else {
colXMin = dmin(colXMin, par->xMin);
colYMin = dmin(colYMin, par->yMin);
colXMax = dmax(colXMax, par->xMax);
colYMax = dmax(colYMax, par->yMax);
}
lines = new GList();
}
delete words;
delete lines;
return new TextColumn(pars, colXMin, colYMin, colXMax, colYMax);
}
// Split the characters into trees of TextBlocks, one tree for each
// rotation. Merge into a single tree (with the primary rotation).
TextBlock *TextPage::splitChars(GList *charsA) {
TextBlock *tree[4];
TextBlock *blk;
GList *chars2, *clippedChars;
TextChar *ch;
int rot, i;
// split: build a tree of TextBlocks for each rotation
clippedChars = new GList();
for (rot = 0; rot < 4; ++rot) {
chars2 = new GList();
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
if (ch->rot == rot &&
!(control.discardInvisibleText && ch->invisible) &&
!(control.discardClippedText && ch->clipped)) {
chars2->append(ch);
}
}
tree[rot] = NULL;
if (chars2->getLength() > 0) {
chars2->sort((rot & 1) ? &TextChar::cmpY : &TextChar::cmpX);
removeDuplicates(chars2, rot);
if (control.clipText) {
i = 0;
while (i < chars2->getLength()) {
ch = (TextChar *)chars2->get(i);
if (ch->clipped) {
ch = (TextChar *)chars2->del(i);
clippedChars->append(ch);
} else {
++i;
}
}
}
if (chars2->getLength() > 0) {
tree[rot] = split(chars2, rot);
}
}
delete chars2;
}
// if the page contains no (unclipped) text, just leave an empty
// column list
if (!tree[0]) {
// normally tree[0] is empty only if there is no text at all, but
// if the caller didn't do rotation, the rotated trees may be
// non-empty, so we need to free them
for (rot = 1; rot < 4; ++rot) {
if (tree[rot]) {
delete tree[rot];
}
}
delete clippedChars;
return NULL;
}
// if the main tree is not a multicolumn node, insert one so that
// rotated text has somewhere to go
if (tree[0]->tag != blkTagMulticolumn) {
blk = new TextBlock(blkHorizSplit, 0);
blk->addChild(tree[0]);
blk->tag = blkTagMulticolumn;
tree[0] = blk;
}
// merge non-primary-rotation text into the primary-rotation tree
for (rot = 1; rot < 4; ++rot) {
if (tree[rot]) {
insertIntoTree(tree[rot], tree[0]);
tree[rot] = NULL;
}
}
if (clippedChars->getLength()) {
insertClippedChars(clippedChars, tree[0]);
}
delete clippedChars;
#if 0 //~debug
dumpTree(tree[0]);
#endif
return tree[0];
}
// Generate a tree of TextBlocks, marked as columns, lines, and words.
TextBlock *TextPage::split(GList *charsA, int rot) {
TextBlock *blk;
GList *chars2, *chars3;
TextGaps *horizGaps, *vertGaps;
TextChar *ch;
double xMin, yMin, xMax, yMax, avgFontSize;
double horizGapSize, vertGapSize, minHorizChunkWidth, minVertChunkWidth;
double gap, nLines, vertGapThreshold, minChunk;
double largeCharSize;
double x0, x1, y0, y1;
int nHorizGaps, nVertGaps, nLargeChars;
int i;
GBool doHorizSplit, doVertSplit, smallSplit;
//----- find all horizontal and vertical gaps
horizGaps = new TextGaps();
vertGaps = new TextGaps();
findGaps(charsA, rot, &xMin, &yMin, &xMax, &yMax, &avgFontSize,
horizGaps, vertGaps);
//----- find the largest horizontal and vertical gaps
horizGapSize = 0;
for (i = 0; i < horizGaps->getLength(); ++i) {
gap = horizGaps->getW(i);
if (gap > horizGapSize) {
horizGapSize = gap;
}
}
vertGapSize = 0;
for (i = 0; i < vertGaps->getLength(); ++i) {
gap = vertGaps->getW(i);
if (gap > vertGapSize) {
vertGapSize = gap;
}
}
//----- count horiz/vert gaps equivalent to largest gaps
minHorizChunkWidth = yMax - yMin;
nHorizGaps = 0;
if (horizGaps->getLength() > 0) {
y0 = yMin;
for (i = 0; i < horizGaps->getLength(); ++i) {
gap = horizGaps->getW(i);
if (gap > horizGapSize - splitGapSlack * avgFontSize) {
++nHorizGaps;
y1 = horizGaps->getX(i) - 0.5 * gap;
if (y1 - y0 < minHorizChunkWidth) {
minHorizChunkWidth = y1 - y0;
}
y0 = y1 + gap;
}
}
y1 = yMax;
if (y1 - y0 < minHorizChunkWidth) {
minHorizChunkWidth = y1 - y0;
}
}
minVertChunkWidth = xMax - xMin;
nVertGaps = 0;
if (vertGaps->getLength() > 0) {
x0 = xMin;
for (i = 0; i < vertGaps->getLength(); ++i) {
gap = vertGaps->getW(i);
if (gap > vertGapSize - splitGapSlack * avgFontSize) {
++nVertGaps;
x1 = vertGaps->getX(i) - 0.5 * gap;
if (x1 - x0 < minVertChunkWidth) {
minVertChunkWidth = x1 - x0;
}
x0 = x1 + gap;
}
}
x1 = xMax;
if (x1 - x0 < minVertChunkWidth) {
minVertChunkWidth = x1 - x0;
}
}
//----- compute splitting parameters
// approximation of number of lines in block
if (fabs(avgFontSize) < 0.001) {
nLines = 1;
} else if (rot & 1) {
nLines = (xMax - xMin) / avgFontSize;
} else {
nLines = (yMax - yMin) / avgFontSize;
}
// compute the minimum allowed vertical gap size
// (this is a horizontal gap threshold for rot=1,3
if (control.mode == textOutTableLayout) {
vertGapThreshold = vertGapThresholdTableMax
+ vertGapThresholdTableSlope * nLines;
if (vertGapThreshold < vertGapThresholdTableMin) {
vertGapThreshold = vertGapThresholdTableMin;
}
} else if (control.mode == textOutSimpleLayout) {
vertGapThreshold = simpleLayoutGapThreshold;
} else {
vertGapThreshold = vertGapThresholdMax + vertGapThresholdSlope * nLines;
if (vertGapThreshold < vertGapThresholdMin) {
vertGapThreshold = vertGapThresholdMin;
}
}
vertGapThreshold = vertGapThreshold * avgFontSize;
// compute the minimum allowed chunk width
if (control.mode == textOutTableLayout) {
minChunk = 0;
} else {
minChunk = vertSplitChunkThreshold * avgFontSize;
}
// look for large chars
// -- this kludge (multiply by 256, convert to int, divide by 256.0)
// prevents floating point stability issues on x86 with gcc, where
// largeCharSize could otherwise have slightly different values
// here and where it's used below to do the large char partition
// (because it gets truncated from 80 to 64 bits when spilled)
nLargeChars = 0;
largeCharSize = 0;
if (control.separateLargeChars) {
largeCharSize = (int)(largeCharThreshold * avgFontSize * 256) / 256.0;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
if (ch->fontSize > largeCharSize) {
++nLargeChars;
}
}
}
// figure out which type of split to do
doHorizSplit = doVertSplit = gFalse;
smallSplit = gFalse;
if (rot & 1) {
if (control.mode == textOutSimpleLayout) {
if (nVertGaps > 0) {
doVertSplit = gTrue;
} else if (nHorizGaps > 0) {
doHorizSplit = gTrue;
smallSplit = horizGapSize <= vertGapThreshold;
}
} else if (nHorizGaps > 0 &&
(horizGapSize > vertGapSize ||
control.mode == textOutTableLayout) &&
horizGapSize > vertGapThreshold &&
(minHorizChunkWidth > minChunk ||
nVertGaps == 0)) {
doHorizSplit = gTrue;
} else if (nVertGaps > 0) {
doVertSplit = gTrue;
} else if (nLargeChars == 0 && nHorizGaps > 0) {
doHorizSplit = gTrue;
smallSplit = gTrue;
}
} else {
if (control.mode == textOutSimpleLayout) {
if (nHorizGaps > 0) {
doHorizSplit = gTrue;
} else if (nVertGaps > 0) {
doVertSplit = gTrue;
smallSplit = vertGapSize <= vertGapThreshold;
}
} else if (nVertGaps > 0 &&
(vertGapSize > horizGapSize ||
control.mode == textOutTableLayout) &&
vertGapSize > vertGapThreshold &&
(minVertChunkWidth > minChunk ||
nHorizGaps == 0)) {
doVertSplit = gTrue;
} else if (nHorizGaps > 0) {
doHorizSplit = gTrue;
} else if (nLargeChars == 0 && nVertGaps > 0) {
doVertSplit = gTrue;
smallSplit = gTrue;
}
}
//----- split the block
//~ this could use "other content" (vector graphics, rotated text) --
//~ presence of other content in a gap means we should definitely split
// split vertically
if (doVertSplit) {
#if 0 //~debug
printf("vert split xMin=%g yMin=%g xMax=%g yMax=%g small=%d\n",
xMin, pageHeight - yMax, xMax, pageHeight - yMin, smallSplit);
for (i = 0; i < vertGaps->getLength(); ++i) {
if (vertGaps->getW(i) > vertGapSize - splitGapSlack * avgFontSize) {
printf(" x=%g\n", vertGaps->getX(i));
}
}
#endif
blk = new TextBlock(blkVertSplit, rot);
blk->smallSplit = smallSplit;
x0 = xMin - 1;
for (i = 0; i < vertGaps->getLength(); ++i) {
if (vertGaps->getW(i) > vertGapSize - splitGapSlack * avgFontSize) {
x1 = vertGaps->getX(i);
chars2 = getChars(charsA, x0, yMin - 1, x1, yMax + 1);
blk->addChild(split(chars2, rot));
delete chars2;
x0 = x1;
}
}
chars2 = getChars(charsA, x0, yMin - 1, xMax + 1, yMax + 1);
blk->addChild(split(chars2, rot));
delete chars2;
// split horizontally
} else if (doHorizSplit) {
#if 0 //~debug
printf("horiz split xMin=%g yMin=%g xMax=%g yMax=%g small=%d\n",
xMin, pageHeight - yMax, xMax, pageHeight - yMin, smallSplit);
for (i = 0; i < horizGaps->getLength(); ++i) {
if (horizGaps->getW(i) > horizGapSize - splitGapSlack * avgFontSize) {
printf(" y=%g\n", pageHeight - horizGaps->getX(i));
}
}
#endif
blk = new TextBlock(blkHorizSplit, rot);
blk->smallSplit = smallSplit;
y0 = yMin - 1;
for (i = 0; i < horizGaps->getLength(); ++i) {
if (horizGaps->getW(i) > horizGapSize - splitGapSlack * avgFontSize) {
y1 = horizGaps->getX(i);
chars2 = getChars(charsA, xMin - 1, y0, xMax + 1, y1);
blk->addChild(split(chars2, rot));
delete chars2;
y0 = y1;
}
}
chars2 = getChars(charsA, xMin - 1, y0, xMax + 1, yMax + 1);
blk->addChild(split(chars2, rot));
delete chars2;
// split into larger and smaller chars
} else if (nLargeChars > 0) {
#if 0 //~debug
printf("large char split xMin=%g yMin=%g xMax=%g yMax=%g\n",
xMin, pageHeight - yMax, xMax, pageHeight - yMin);
#endif
chars2 = new GList();
chars3 = new GList();
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
if (ch->fontSize > largeCharSize) {
chars2->append(ch);
} else {
chars3->append(ch);
}
}
blk = split(chars3, rot);
insertLargeChars(chars2, blk);
delete chars2;
delete chars3;
// create a leaf node
} else {
#if 0 //~debug
printf("leaf xMin=%g yMin=%g xMax=%g yMax=%g\n",
xMin, pageHeight - yMax, xMax, pageHeight - yMin);
#endif
blk = new TextBlock(blkLeaf, rot);
for (i = 0; i < charsA->getLength(); ++i) {
blk->addChild((TextChar *)charsA->get(i), gTrue);
}
}
delete horizGaps;
delete vertGaps;
tagBlock(blk);
return blk;
}
// Return the subset of chars inside a rectangle.
GList *TextPage::getChars(GList *charsA, double xMin, double yMin,
double xMax, double yMax) {
GList *ret;
TextChar *ch;
double x, y;
int i;
ret = new GList();
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
// because of {ascent,descent}AdjustFactor, the y coords (or x
// coords for rot 1,3) for the gaps will be a little bit tight --
// so we use the center of the character here
x = 0.5 * (ch->xMin + ch->xMax);
y = 0.5 * (ch->yMin + ch->yMax);
if (x > xMin && x < xMax && y > yMin && y < yMax) {
ret->append(ch);
}
}
return ret;
}
void TextPage::findGaps(GList *charsA, int rot,
double *xMinOut, double *yMinOut,
double *xMaxOut, double *yMaxOut,
double *avgFontSizeOut,
TextGaps *horizGaps, TextGaps *vertGaps) {
TextChar *ch;
char *horizProfile, *vertProfile;
double xMin, yMin, xMax, yMax, w;
double minFontSize, avgFontSize, splitPrecision, invSplitPrecision;
double ascentAdjust, descentAdjust;
int xMinI, yMinI, xMaxI, yMaxI, xMinI2, yMinI2, xMaxI2, yMaxI2;
int start, x, y, i;
//----- compute bbox, min font size, average font size, and split precision
xMin = yMin = xMax = yMax = 0; // make gcc happy
minFontSize = avgFontSize = 0;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
if (i == 0 || ch->xMin < xMin) {
xMin = ch->xMin;
}
if (i == 0 || ch->yMin < yMin) {
yMin = ch->yMin;
}
if (i == 0 || ch->xMax > xMax) {
xMax = ch->xMax;
}
if (i == 0 || ch->yMax > yMax) {
yMax = ch->yMax;
}
avgFontSize += ch->fontSize;
if (i == 0 || ch->fontSize < minFontSize) {
minFontSize = ch->fontSize;
}
}
avgFontSize /= charsA->getLength();
splitPrecision = splitPrecisionMul * minFontSize;
if (splitPrecision < minSplitPrecision) {
splitPrecision = minSplitPrecision;
}
invSplitPrecision = 1 / splitPrecision;
*xMinOut = xMin;
*yMinOut = yMin;
*xMaxOut = xMax;
*yMaxOut = yMax;
*avgFontSizeOut = avgFontSize;
//----- compute the horizontal and vertical profiles
if (xMin * invSplitPrecision < 0.5 * INT_MIN ||
xMax * invSplitPrecision > 0.5 * INT_MAX ||
yMin * invSplitPrecision < 0.5 * INT_MIN ||
yMax * invSplitPrecision > 0.5 * INT_MAX) {
return;
}
// add some slack to the array bounds to avoid floating point
// precision problems
xMinI = (int)floor(xMin * invSplitPrecision) - 1;
yMinI = (int)floor(yMin * invSplitPrecision) - 1;
xMaxI = (int)floor(xMax * invSplitPrecision) + 1;
yMaxI = (int)floor(yMax * invSplitPrecision) + 1;
horizProfile = (char *)gmalloc(yMaxI - yMinI + 1);
vertProfile = (char *)gmalloc(xMaxI - xMinI + 1);
memset(horizProfile, 0, yMaxI - yMinI + 1);
memset(vertProfile, 0, xMaxI - xMinI + 1);
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
// yMinI2 and yMaxI2 are adjusted to allow for slightly overlapping lines
switch (rot) {
case 0:
default:
xMinI2 = (int)floor(ch->xMin * invSplitPrecision);
xMaxI2 = (int)floor(ch->xMax * invSplitPrecision);
ascentAdjust = ascentAdjustFactor * (ch->yMax - ch->yMin);
yMinI2 = (int)floor((ch->yMin + ascentAdjust) * invSplitPrecision);
descentAdjust = descentAdjustFactor * (ch->yMax - ch->yMin);
yMaxI2 = (int)floor((ch->yMax - descentAdjust) * invSplitPrecision);
break;
case 1:
descentAdjust = descentAdjustFactor * (ch->xMax - ch->xMin);
xMinI2 = (int)floor((ch->xMin + descentAdjust) * invSplitPrecision);
ascentAdjust = ascentAdjustFactor * (ch->xMax - ch->xMin);
xMaxI2 = (int)floor((ch->xMax - ascentAdjust) * invSplitPrecision);
yMinI2 = (int)floor(ch->yMin * invSplitPrecision);
yMaxI2 = (int)floor(ch->yMax * invSplitPrecision);
break;
case 2:
xMinI2 = (int)floor(ch->xMin * invSplitPrecision);
xMaxI2 = (int)floor(ch->xMax * invSplitPrecision);
descentAdjust = descentAdjustFactor * (ch->yMax - ch->yMin);
yMinI2 = (int)floor((ch->yMin + descentAdjust) * invSplitPrecision);
ascentAdjust = ascentAdjustFactor * (ch->yMax - ch->yMin);
yMaxI2 = (int)floor((ch->yMax - ascentAdjust) * invSplitPrecision);
break;
case 3:
ascentAdjust = ascentAdjustFactor * (ch->xMax - ch->xMin);
xMinI2 = (int)floor((ch->xMin + ascentAdjust) * invSplitPrecision);
descentAdjust = descentAdjustFactor * (ch->xMax - ch->xMin);
xMaxI2 = (int)floor((ch->xMax - descentAdjust) * invSplitPrecision);
yMinI2 = (int)floor(ch->yMin * invSplitPrecision);
yMaxI2 = (int)floor(ch->yMax * invSplitPrecision);
break;
}
for (y = yMinI2; y <= yMaxI2; ++y) {
horizProfile[y - yMinI] = 1;
}
for (x = xMinI2; x <= xMaxI2; ++x) {
vertProfile[x - xMinI] = 1;
}
}
//----- build the list of horizontal gaps
for (start = yMinI; start < yMaxI && !horizProfile[start - yMinI]; ++start) ;
for (y = start; y < yMaxI; ++y) {
if (horizProfile[y - yMinI]) {
if (!horizProfile[y + 1 - yMinI]) {
start = y;
}
} else {
if (horizProfile[y + 1 - yMinI]) {
w = (y - start) * splitPrecision;
horizGaps->addGap((start + 1) * splitPrecision + 0.5 * w, w);
}
}
}
//----- build the list of vertical gaps
for (start = xMinI; start < xMaxI && !vertProfile[start - xMinI]; ++start) ;
for (x = start; x < xMaxI; ++x) {
if (vertProfile[x - xMinI]) {
if (!vertProfile[x + 1 - xMinI]) {
start = x;
}
} else {
if (vertProfile[x + 1 - xMinI]) {
w = (x - start) * splitPrecision;
vertGaps->addGap((start + 1) * splitPrecision + 0.5 * w, w);
}
}
}
gfree(horizProfile);
gfree(vertProfile);
}
// Decide whether this block is a line, column, or multiple columns:
// - all leaf nodes are lines
// - horiz split nodes whose children are lines or columns are columns
// - other horiz split nodes are multiple columns
// - vert split nodes, with small gaps, whose children are lines are lines
// - other vert split nodes are multiple columns
// (for rot=1,3: the horiz and vert splits are swapped)
// In table layout mode:
// - all leaf nodes are lines
// - vert split nodes, with small gaps, whose children are lines are lines
// - everything else is multiple columns
// In simple layout mode:
// - all leaf nodes are lines
// - vert split nodes with small gaps are lines
// - vert split nodes with large gaps are super-lines
// - horiz split nodes are columns
void TextPage::tagBlock(TextBlock *blk) {
TextBlock *child;
int i;
if (control.mode == textOutTableLayout) {
if (blk->type == blkLeaf) {
blk->tag = blkTagLine;
} else if (blk->type == ((blk->rot & 1) ? blkHorizSplit : blkVertSplit) &&
blk->smallSplit) {
blk->tag = blkTagLine;
for (i = 0; i < blk->children->getLength(); ++i) {
child = (TextBlock *)blk->children->get(i);
if (child->tag != blkTagLine) {
blk->tag = blkTagMulticolumn;
break;
}
}
} else {
blk->tag = blkTagMulticolumn;
}
return;
}
if (control.mode == textOutSimpleLayout) {
if (blk->type == blkLeaf) {
blk->tag = blkTagLine;
} else if (blk->type == ((blk->rot & 1) ? blkHorizSplit : blkVertSplit)) {
blk->tag = blk->smallSplit ? blkTagLine : blkTagSuperLine;
} else {
blk->tag = blkTagColumn;
}
return;
}
if (blk->type == blkLeaf) {
blk->tag = blkTagLine;
} else {
if (blk->type == ((blk->rot & 1) ? blkVertSplit : blkHorizSplit)) {
blk->tag = blkTagColumn;
for (i = 0; i < blk->children->getLength(); ++i) {
child = (TextBlock *)blk->children->get(i);
if (child->tag != blkTagColumn && child->tag != blkTagLine) {
blk->tag = blkTagMulticolumn;
break;
}
}
} else {
if (blk->smallSplit) {
blk->tag = blkTagLine;
for (i = 0; i < blk->children->getLength(); ++i) {
child = (TextBlock *)blk->children->get(i);
if (child->tag != blkTagLine) {
blk->tag = blkTagMulticolumn;
break;
}
}
} else {
blk->tag = blkTagMulticolumn;
}
}
}
}
// Insert a list of large characters into a tree.
void TextPage::insertLargeChars(GList *largeChars, TextBlock *blk) {
TextChar *ch, *ch2;
GBool singleLine;
double minOverlap;
int i;
//~ this currently works only for characters in the primary rotation
// check to see if the large chars are a single line
singleLine = gTrue;
for (i = 1; i < largeChars->getLength(); ++i) {
ch = (TextChar *)largeChars->get(i-1);
ch2 = (TextChar *)largeChars->get(i);
minOverlap = 0.5 * (ch->fontSize < ch2->fontSize ? ch->fontSize
: ch2->fontSize);
if (ch->yMax - ch2->yMin < minOverlap ||
ch2->yMax - ch->yMin < minOverlap) {
singleLine = gFalse;
break;
}
}
if (singleLine) {
// if the large chars are a single line, prepend them to the first
// leaf node in blk
insertLargeCharsInFirstLeaf(largeChars, blk);
} else {
// if the large chars are not a single line, prepend each one to
// the appropriate leaf node -- this handles cases like bullets
// drawn in a large font, on the left edge of a column
for (i = largeChars->getLength() - 1; i >= 0; --i) {
ch = (TextChar *)largeChars->get(i);
insertLargeCharInLeaf(ch, blk);
}
}
}
// Find the first leaf (in depth-first order) in blk, and prepend a
// list of large chars.
void TextPage::insertLargeCharsInFirstLeaf(GList *largeChars, TextBlock *blk) {
TextChar *ch;
int i;
if (blk->type == blkLeaf) {
for (i = largeChars->getLength() - 1; i >= 0; --i) {
ch = (TextChar *)largeChars->get(i);
blk->prependChild(ch);
}
} else {
insertLargeCharsInFirstLeaf(largeChars, (TextBlock *)blk->children->get(0));
blk->updateBounds(0);
}
}
// Find the leaf in <blk> where large char <ch> belongs, and prepend
// it.
void TextPage::insertLargeCharInLeaf(TextChar *ch, TextBlock *blk) {
TextBlock *child;
double y;
int i;
//~ this currently works only for characters in the primary rotation
//~ this currently just looks down the left edge of blk
//~ -- it could be extended to do more
// estimate the baseline of ch
y = ch->yMin + 0.75 * (ch->yMax - ch->yMin);
if (blk->type == blkLeaf) {
blk->prependChild(ch);
} else if (blk->type == blkHorizSplit) {
for (i = 0; i < blk->children->getLength(); ++i) {
child = (TextBlock *)blk->children->get(i);
if (y < child->yMax || i == blk->children->getLength() - 1) {
insertLargeCharInLeaf(ch, child);
blk->updateBounds(i);
break;
}
}
} else {
insertLargeCharInLeaf(ch, (TextBlock *)blk->children->get(0));
blk->updateBounds(0);
}
}
// Merge blk (rot != 0) into primaryTree (rot == 0).
void TextPage::insertIntoTree(TextBlock *blk, TextBlock *primaryTree) {
TextBlock *child;
// we insert a whole column at a time - so call insertIntoTree
// recursively until we get to a column (or line)
if (blk->tag == blkTagMulticolumn) {
while (blk->children->getLength()) {
child = (TextBlock *)blk->children->del(0);
insertIntoTree(child, primaryTree);
}
delete blk;
} else {
insertColumnIntoTree(blk, primaryTree);
}
}
// Insert a column (as an atomic subtree) into tree.
// Requirement: tree is not a leaf node.
void TextPage::insertColumnIntoTree(TextBlock *column, TextBlock *tree) {
TextBlock *child;
int i;
for (i = 0; i < tree->children->getLength(); ++i) {
child = (TextBlock *)tree->children->get(i);
if (child->tag == blkTagMulticolumn &&
column->xMin >= child->xMin &&
column->yMin >= child->yMin &&
column->xMax <= child->xMax &&
column->yMax <= child->yMax) {
insertColumnIntoTree(column, child);
tree->tag = blkTagMulticolumn;
return;
}
}
if (tree->type == blkVertSplit) {
if (tree->rot == 1 || tree->rot == 2) {
for (i = 0; i < tree->children->getLength(); ++i) {
child = (TextBlock *)tree->children->get(i);
if (column->xMax > 0.5 * (child->xMin + child->xMax)) {
break;
}
}
} else {
for (i = 0; i < tree->children->getLength(); ++i) {
child = (TextBlock *)tree->children->get(i);
if (column->xMin < 0.5 * (child->xMin + child->xMax)) {
break;
}
}
}
} else if (tree->type == blkHorizSplit) {
if (tree->rot >= 2) {
for (i = 0; i < tree->children->getLength(); ++i) {
child = (TextBlock *)tree->children->get(i);
if (column->yMax > 0.5 * (child->yMin + child->yMax)) {
break;
}
}
} else {
for (i = 0; i < tree->children->getLength(); ++i) {
child = (TextBlock *)tree->children->get(i);
if (column->yMin < 0.5 * (child->yMin + child->yMax)) {
break;
}
}
}
} else {
// this should never happen
return;
}
tree->children->insert(i, column);
tree->tag = blkTagMulticolumn;
}
// Insert clipped characters back into the TextBlock tree.
void TextPage::insertClippedChars(GList *clippedChars, TextBlock *tree) {
TextChar *ch, *ch2;
TextBlock *leaf;
double y;
int i;
//~ this currently works only for characters in the primary rotation
clippedChars->sort(TextChar::cmpX);
while (clippedChars->getLength()) {
ch = (TextChar *)clippedChars->del(0);
if (ch->rot != 0) {
continue;
}
if (!(leaf = findClippedCharLeaf(ch, tree))) {
continue;
}
leaf->addChild(ch, gFalse);
i = 0;
while (i < clippedChars->getLength()) {
ch2 = (TextChar *)clippedChars->get(i);
if (ch2->xMin > ch->xMax + clippedTextMaxWordSpace * ch->fontSize) {
break;
}
y = 0.5 * (ch2->yMin + ch2->yMax);
if (y > leaf->yMin && y < leaf->yMax) {
ch2 = (TextChar *)clippedChars->del(i);
leaf->addChild(ch2, gFalse);
ch = ch2;
} else {
++i;
}
}
}
}
// Find the leaf in <tree> to which clipped char <ch> can be appended.
// Returns NULL if there is no appropriate append point.
TextBlock *TextPage::findClippedCharLeaf(TextChar *ch, TextBlock *tree) {
TextBlock *ret, *child;
double y;
int i;
//~ this currently works only for characters in the primary rotation
y = 0.5 * (ch->yMin + ch->yMax);
if (tree->type == blkLeaf) {
if (tree->rot == 0) {
if (y > tree->yMin && y < tree->yMax &&
ch->xMin <= tree->xMax + clippedTextMaxWordSpace * ch->fontSize) {
return tree;
}
}
} else {
for (i = 0; i < tree->children->getLength(); ++i) {
child = (TextBlock *)tree->children->get(i);
if ((ret = findClippedCharLeaf(ch, child))) {
return ret;
}
}
}
return NULL;
}
// Convert the tree of TextBlocks into a list of TextColumns.
GList *TextPage::buildColumns(TextBlock *tree, GBool primaryLR) {
GList *columns;
columns = new GList();
buildColumns2(tree, columns, primaryLR);
return columns;
}
void TextPage::buildColumns2(TextBlock *blk, GList *columns, GBool primaryLR) {
TextColumn *col;
int i;
switch (blk->tag) {
case blkTagSuperLine: // should never happen
case blkTagLine:
case blkTagColumn:
col = buildColumn(blk);
columns->append(col);
break;
case blkTagMulticolumn:
#if 0 //~tmp
if (!primaryLR && blk->type == blkVertSplit) {
for (i = blk->children->getLength() - 1; i >= 0; --i) {
buildColumns2((TextBlock *)blk->children->get(i), columns, primaryLR);
}
} else {
#endif
for (i = 0; i < blk->children->getLength(); ++i) {
buildColumns2((TextBlock *)blk->children->get(i), columns, primaryLR);
}
#if 0 //~tmp
}
#endif
break;
}
}
TextColumn *TextPage::buildColumn(TextBlock *blk) {
GList *lines, *parLines;
GList *paragraphs;
TextLine *line0, *line1;
GBool dropCap;
double spaceThresh, indent0, indent1, fontSize0, fontSize1;
int i;
lines = new GList();
buildLines(blk, lines, gFalse);
spaceThresh = paragraphSpacingThreshold * getAverageLineSpacing(lines);
//~ could look for bulleted lists here: look for the case where
//~ all out-dented lines start with the same char
//~ this doesn't handle right-to-left scripts (need to look for indents
//~ on the right instead of left, etc.)
// build the paragraphs
paragraphs = new GList();
i = 0;
while (i < lines->getLength()) {
// get the first line of the paragraph
parLines = new GList();
dropCap = gFalse;
line0 = (TextLine *)lines->get(i);
parLines->append(line0);
++i;
if (i < lines->getLength()) {
line1 = (TextLine *)lines->get(i);
indent0 = getLineIndent(line0, blk);
indent1 = getLineIndent(line1, blk);
fontSize0 = line0->fontSize;
fontSize1 = line1->fontSize;
// inverted indent
if (indent1 - indent0 > minParagraphIndent * fontSize0 &&
fabs(fontSize0 - fontSize1) <= paragraphFontSizeDelta &&
getLineSpacing(line0, line1) <= spaceThresh) {
parLines->append(line1);
indent0 = indent1;
for (++i; i < lines->getLength(); ++i) {
line1 = (TextLine *)lines->get(i);
indent1 = getLineIndent(line1, blk);
fontSize1 = line1->fontSize;
if (indent0 - indent1 > minParagraphIndent * fontSize0) {
break;
}
if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
break;
}
if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
> spaceThresh) {
break;
}
parLines->append(line1);
}
// drop cap
} else if (fontSize0 > largeCharThreshold * fontSize1 &&
indent1 - indent0 > minParagraphIndent * fontSize1 &&
getLineSpacing(line0, line1) < 0) {
dropCap = gTrue;
parLines->append(line1);
fontSize0 = fontSize1;
for (++i; i < lines->getLength(); ++i) {
line1 = (TextLine *)lines->get(i);
indent1 = getLineIndent(line1, blk);
if (indent1 - indent0 <= minParagraphIndent * fontSize0) {
break;
}
if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
> spaceThresh) {
break;
}
parLines->append(line1);
}
for (; i < lines->getLength(); ++i) {
line1 = (TextLine *)lines->get(i);
indent1 = getLineIndent(line1, blk);
fontSize1 = line1->fontSize;
if (indent1 - indent0 > minParagraphIndent * fontSize0) {
break;
}
if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
break;
}
if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
> spaceThresh) {
break;
}
parLines->append(line1);
}
// regular indent or no indent
} else if (fabs(fontSize0 - fontSize1) <= paragraphFontSizeDelta &&
getLineSpacing(line0, line1) <= spaceThresh) {
parLines->append(line1);
indent0 = indent1;
for (++i; i < lines->getLength(); ++i) {
line1 = (TextLine *)lines->get(i);
indent1 = getLineIndent(line1, blk);
fontSize1 = line1->fontSize;
if (indent1 - indent0 > minParagraphIndent * fontSize0) {
break;
}
if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
break;
}
if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
> spaceThresh) {
break;
}
parLines->append(line1);
}
}
}
paragraphs->append(new TextParagraph(parLines, dropCap));
}
delete lines;
return new TextColumn(paragraphs, blk->xMin, blk->yMin,
blk->xMax, blk->yMax);
}
double TextPage::getLineIndent(TextLine *line, TextBlock *blk) {
double indent;
switch (line->rot) {
case 0:
default: indent = line->xMin - blk->xMin; break;
case 1: indent = line->yMin - blk->yMin; break;
case 2: indent = blk->xMax - line->xMax; break;
case 3: indent = blk->yMax - line->yMax; break;
}
return indent;
}
// Compute average line spacing in column.
double TextPage::getAverageLineSpacing(GList *lines) {
double avg, sp;
int n, i;
avg = 0;
n = 0;
for (i = 1; i < lines->getLength(); ++i) {
sp = getLineSpacing((TextLine *)lines->get(i - 1),
(TextLine *)lines->get(i));
if (sp > 0) {
avg += sp;
++n;
}
}
if (n > 0) {
avg /= n;
}
return avg;
}
// Compute the space between two lines.
double TextPage::getLineSpacing(TextLine *line0, TextLine *line1) {
double sp;
switch (line0->rot) {
case 0:
default: sp = line1->yMin - line0->yMax; break;
case 1: sp = line0->xMin - line1->xMax; break;
case 2: sp = line0->yMin - line1->yMin; break;
case 3: sp = line1->xMin - line1->xMax; break;
}
return sp;
}
void TextPage::buildLines(TextBlock *blk, GList *lines,
GBool splitSuperLines) {
TextLine *line;
int i;
if (blk->tag == blkTagLine ||
(blk->tag == blkTagSuperLine && !splitSuperLines)) {
line = buildLine(blk);
if (blk->rot == 1 || blk->rot == 2) {
lines->insert(0, line);
} else {
lines->append(line);
}
} else {
for (i = 0; i < blk->children->getLength(); ++i) {
buildLines((TextBlock *)blk->children->get(i), lines, splitSuperLines);
}
}
}
GList *TextPage::buildSimple2Columns(GList *charsA) {
GList *columns, *paragraphs, *lines;
TextParagraph *paragraph;
int rot;
charsA->sort(&TextChar::cmpX);
columns = new GList();
for (rot = 0; rot < 4; ++rot) {
lines = buildSimple2Lines(charsA, rot);
if (lines->getLength() == 0) {
delete lines;
continue;
}
paragraph = new TextParagraph(lines, gFalse);
paragraphs = new GList();
paragraphs->append(paragraph);
columns->append(new TextColumn(paragraphs,
paragraph->xMin, paragraph->yMin,
paragraph->xMax, paragraph->yMax));
}
return columns;
}
GList *TextPage::buildSimple2Lines(GList *charsA, int rot) {
GList *openCharLines, *lines;
TextCharLine *firstCharLine, *lastCharLine, *charLine, *p;
TextChar *ch;
TextLine *line;
double bestOverlap, overlap, xMin, yMin, xMax, yMax;
int bestLine, i, j, k, m;
firstCharLine = lastCharLine = NULL;
openCharLines = new GList();
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
if (ch->rot != rot) {
continue;
}
// find the first open line with line.yMax > ch.yMin
j = -1;
k = openCharLines->getLength();
while (j < k - 1) {
// invariants: openLines[j].yMax <= ch.yMin (or j = -1)
// openLines[k].yMax > ch.yMin (or k = nOpenLines)
// j < k - 1
m = j + (k - j) / 2;
charLine = (TextCharLine *)openCharLines->get(m);
if (charLine->yMax <= ch->yMin) {
j = m;
} else {
k = m;
}
}
// check overlap for all overlapping lines
// i.e., all lines with line.yMin < ch.yMax and line.yMax > ch.yMin
bestLine = -1;
bestOverlap = 0;
for (; k < openCharLines->getLength(); ++k) {
charLine = (TextCharLine *)openCharLines->get(k);
if (charLine->yMin >= ch->yMax) {
break;
}
overlap = ((ch->yMax < charLine->yMax ? ch->yMax : charLine->yMax)
- (ch->yMin > charLine->yMin ? ch->yMin : charLine->yMin))
/ (ch->yMax - ch->yMin);
if (overlap > bestOverlap) {
bestLine = k;
bestOverlap = overlap;
}
}
// found an overlapping line
if (bestLine >= 0 && bestOverlap > simple2MinOverlap) {
k = bestLine;
charLine = (TextCharLine *)openCharLines->get(k);
// else insert a new line immediately before line k
} else {
charLine = new TextCharLine(ch->rot);
if (k < openCharLines->getLength()) {
p = (TextCharLine *)openCharLines->get(k);
if (p->prev) {
p->prev->next = charLine;
charLine->prev = p->prev;
} else {
firstCharLine = charLine;
}
p->prev = charLine;
charLine->next = p;
} else {
if (lastCharLine) {
lastCharLine->next = charLine;
charLine->prev = lastCharLine;
} else {
firstCharLine = charLine;
}
lastCharLine = charLine;
}
openCharLines->insert(k, charLine);
}
// add the char to the line
charLine->add(ch);
charLine->yMin = ch->yMin;
charLine->yMax = ch->yMax;
// update open lines before k
j = k - 1;
while (j >= 0) {
charLine = (TextCharLine *)openCharLines->get(j);
if (charLine->yMax <= ch->yMin) {
break;
}
charLine->yMax = ch->yMin;
if (charLine->yMin < charLine->yMax) {
break;
}
openCharLines->del(j);
--j;
}
// update open lines after k
j = k + 1;
while (j < openCharLines->getLength()) {
charLine = (TextCharLine *)openCharLines->get(j);
if (charLine->yMin >= ch->yMax) {
break;
}
charLine->yMin = ch->yMax;
if (charLine->yMin < charLine->yMax) {
break;
}
openCharLines->del(j);
}
}
// build TextLine objects
lines = new GList();
for (charLine = firstCharLine; charLine; charLine = p) {
xMin = yMin = xMax = yMax = 0;
for (j = 0; j < charLine->chars->getLength(); ++j) {
ch = (TextChar *)charLine->chars->get(j);
if (j == 0) {
xMin = ch->xMin;
yMin = ch->yMin;
xMax = ch->xMax;
yMax = ch->yMax;
} else {
if (ch->xMin < xMin) {
xMin = ch->xMin;
}
if (ch->yMin < yMin) {
yMin = ch->yMin;
}
if (ch->xMax < xMax) {
xMax = ch->xMax;
}
if (ch->yMax < yMax) {
yMax = ch->yMax;
}
}
}
// the chars have been rotated to 0, without changing the
// TextChar.rot values, so we need to tell buildLine to use rot=0,
// and then set the word and line rotation correctly afterward
line = buildLine(charLine->chars, 0, xMin, yMin, xMax, yMax);
line->rot = charLine->rot;
for (i = 0; i < line->words->getLength(); ++i) {
((TextWord *)line->words->get(i))->rot = (char)charLine->rot;
}
lines->append(line);
p = charLine->next;
delete charLine;
}
delete openCharLines;
return lines;
}
TextLine *TextPage::buildLine(TextBlock *blk) {
GList *charsA;
charsA = new GList();
getLineChars(blk, charsA);
TextLine *line = buildLine(charsA, blk->rot,
blk->xMin, blk->yMin, blk->xMax, blk->yMax);
delete charsA;
return line;
}
TextLine *TextPage::buildLine(GList *charsA, int rot,
double xMin, double yMin,
double xMax, double yMax) {
GList *words;
TextChar *ch, *ch2;
TextWord *word;
double wordSp, lineFontSize, sp;
int dir, dir2;
GBool rotated, spaceAfter, spaceBefore;
int i, j;
wordSp = computeWordSpacingThreshold(charsA, rot);
words = new GList();
lineFontSize = 0;
spaceBefore = gFalse;
i = 0;
while (i < charsA->getLength()) {
sp = wordSp - 1;
spaceAfter = gFalse;
dir = getCharDirection((TextChar *)charsA->get(i));
rotated = ((TextChar *)charsA->get(i))->rotated;
for (j = i+1; j < charsA->getLength(); ++j) {
ch = (TextChar *)charsA->get(j-1);
ch2 = (TextChar *)charsA->get(j);
sp = (rot & 1) ? (ch2->yMin - ch->yMax) : (ch2->xMin - ch->xMax);
if (sp > wordSp) {
spaceAfter = gTrue;
break;
}
// look for significant overlaps, which can happen with clipped
// characters (among other things)
if (sp < -ch->fontSize) {
spaceAfter = gTrue;
break;
}
dir2 = getCharDirection(ch2);
if (ch->font != ch2->font ||
fabs(ch->fontSize - ch2->fontSize) > 0.01 ||
(control.splitRotatedWords && ch2->rotated != rotated) ||
(dir && dir2 && dir2 != dir) ||
(control.mode == textOutRawOrder &&
ch2->charPos != ch->charPos + ch->charLen)) {
break;
}
if (!dir && dir2) {
dir = dir2;
}
sp = wordSp - 1;
}
word = new TextWord(charsA, i, j - i, rot, rotated, dir,
(rot >= 2) ? spaceBefore : spaceAfter);
spaceBefore = spaceAfter;
i = j;
if (rot >= 2) {
words->insert(0, word);
} else {
words->append(word);
}
if (i == 0 || word->fontSize > lineFontSize) {
lineFontSize = word->fontSize;
}
}
return new TextLine(words, xMin, yMin, xMax, yMax, lineFontSize);
}
void TextPage::getLineChars(TextBlock *blk, GList *charsA) {
int i;
if (blk->type == blkLeaf) {
charsA->append(blk->children);
} else {
for (i = 0; i < blk->children->getLength(); ++i) {
getLineChars((TextBlock *)blk->children->get(i), charsA);
}
}
}
// Compute the inter-word spacing threshold for a line of chars.
// Spaces greater than this threshold will be considered inter-word
// spaces.
double TextPage::computeWordSpacingThreshold(GList *charsA, int rot) {
TextChar *ch, *ch2;
double uniformSp, wordSp;
double avgFontSize;
double minAdjGap, maxAdjGap, minSpGap, maxSpGap, minGap, maxGap, gap, gap2;
int i;
if (control.mode == textOutTableLayout) {
uniformSp = tableModeUniformSpacing;
wordSp = tableModeWordSpacing;
} else {
uniformSp = uniformSpacing;
wordSp = wordSpacing;
}
avgFontSize = 0;
minGap = maxGap = 0;
minAdjGap = minSpGap = 1;
maxAdjGap = maxSpGap = 0;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
avgFontSize += ch->fontSize;
if (i < charsA->getLength() - 1) {
ch2 = (TextChar *)charsA->get(i+1);
gap = (rot & 1) ? (ch2->yMin - ch->yMax) : (ch2->xMin - ch->xMax);
if (ch->spaceAfter) {
if (minSpGap > maxSpGap) {
minSpGap = maxSpGap = gap;
} else if (gap < minSpGap) {
minSpGap = gap;
} else if (gap > maxSpGap) {
maxSpGap = gap;
}
} else {
if (minAdjGap > maxAdjGap) {
minAdjGap = maxAdjGap = gap;
} else if (gap < minAdjGap) {
minAdjGap = gap;
} else if (gap > maxAdjGap) {
maxAdjGap = gap;
}
}
if (i == 0 || gap < minGap) {
minGap = gap;
}
if (gap > maxGap) {
maxGap = gap;
}
}
}
avgFontSize /= charsA->getLength();
if (minGap < 0) {
minGap = 0;
}
// if spacing is nearly uniform (minGap is close to maxGap), there
// are three cases:
// (1) if the SpGap and AdjGap values are both available and
// sensible, use them
// (2) if only the SpGap values are available, meaning that every
// character in the line had a space after it, split after every
// character
// (3) otherwise assume it's a single word (technically it could be
// either "ABC" or "A B C", but it's essentially impossible to
// tell)
if (maxGap - minGap < uniformSp * avgFontSize) {
if (minSpGap <= maxSpGap) {
if (minAdjGap <= maxAdjGap &&
minSpGap - maxAdjGap > 0.01) {
return 0.5 * (maxAdjGap + minSpGap);
} else if (minAdjGap > maxAdjGap &&
maxSpGap - minSpGap < uniformSp * avgFontSize) {
return minSpGap - 1;
}
}
return maxGap + 1;
// if there is some variation in spacing, but it's small, assume
// there are some inter-word spaces
} else if (maxGap - minGap < wordSp * avgFontSize) {
return 0.5 * (minGap + maxGap);
// if there is a large variation in spacing, use the SpGap/AdjGap
// values if they look reasonable, otherwise, assume a reasonable
// threshold for inter-word spacing (we can't use something like
// 0.5*(minGap+maxGap) here because there can be outliers at the
// high end)
} else {
if (minAdjGap <= maxAdjGap &&
minSpGap <= maxSpGap &&
minSpGap - maxAdjGap > uniformSp * avgFontSize) {
gap = wordSp * avgFontSize;
gap2 = 0.5 * (minSpGap - minGap);
return minGap + (gap < gap2 ? gap : gap2);
} else {
return minGap + wordSp * avgFontSize;
}
}
}
// Check the characters direction: returns 1 for L or Num; -1 for R; 0
// for others.
int TextPage::getCharDirection(TextChar *ch) {
if (unicodeTypeL(ch->c) || unicodeTypeNum(ch->c)) {
return 1;
}
if (unicodeTypeR(ch->c)) {
return -1;
}
return 0;
}
int TextPage::assignPhysLayoutPositions(GList *columns) {
assignLinePhysPositions(columns);
return assignColumnPhysPositions(columns);
}
// Assign a physical x coordinate for each TextLine (relative to the
// containing TextColumn). This also computes TextColumn width and
// height.
void TextPage::assignLinePhysPositions(GList *columns) {
TextColumn *col;
TextParagraph *par;
TextLine *line;
UnicodeMap *uMap;
int colIdx, parIdx, lineIdx;
if (!(uMap = globalParams->getTextEncoding())) {
return;
}
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
col->pw = col->ph = 0;
for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
computeLinePhysWidth(line, uMap);
if (control.fixedPitch > 0) {
line->px = (int)((line->xMin - col->xMin) / control.fixedPitch);
} else if (fabs(line->fontSize) < 0.001) {
line->px = 0;
} else {
line->px = (int)((line->xMin - col->xMin) /
(physLayoutSpaceWidth * line->fontSize));
}
if (line->px + line->pw > col->pw) {
col->pw = line->px + line->pw;
}
}
col->ph += par->lines->getLength();
}
col->ph += col->paragraphs->getLength() - 1;
}
uMap->decRefCnt();
}
void TextPage::computeLinePhysWidth(TextLine *line, UnicodeMap *uMap) {
char buf[8];
int n, i;
if (uMap->isUnicode()) {
line->pw = line->len;
} else {
line->pw = 0;
for (i = 0; i < line->len; ++i) {
n = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
line->pw += n;
}
}
}
// Assign physical x and y coordinates for each TextColumn. Returns
// the text height (max physical y + 1).
int TextPage::assignColumnPhysPositions(GList *columns) {
TextColumn *col, *col2;
double slack, xOverlap, yOverlap;
int ph, i, j;
if (control.mode == textOutTableLayout) {
slack = tableCellOverlapSlack;
} else {
slack = 0;
}
// assign x positions
columns->sort(&TextColumn::cmpX);
for (i = 0; i < columns->getLength(); ++i) {
col = (TextColumn *)columns->get(i);
if (control.fixedPitch) {
col->px = (int)(col->xMin / control.fixedPitch);
} else {
col->px = 0;
for (j = 0; j < i; ++j) {
col2 = (TextColumn *)columns->get(j);
xOverlap = col2->xMax - col->xMin;
if (xOverlap < slack * (col2->xMax - col2->xMin)) {
if (col2->px + col2->pw + 2 > col->px) {
col->px = col2->px + col2->pw + 2;
}
} else {
yOverlap = (col->yMax < col2->yMax ? col->yMax : col2->yMax) -
(col->yMin > col2->yMin ? col->yMin : col2->yMin);
if (yOverlap > 0 && xOverlap < yOverlap) {
if (col2->px + col2->pw > col->px) {
col->px = col2->px + col2->pw;
}
} else {
if (col2->px > col->px) {
col->px = col2->px;
}
}
}
}
}
}
// assign y positions
ph = 0;
columns->sort(&TextColumn::cmpY);
for (i = 0; i < columns->getLength(); ++i) {
col = (TextColumn *)columns->get(i);
col->py = 0;
for (j = 0; j < i; ++j) {
col2 = (TextColumn *)columns->get(j);
yOverlap = col2->yMax - col->yMin;
if (yOverlap < slack * (col2->yMax - col2->yMin)) {
if (col2->py + col2->ph + 1 > col->py) {
col->py = col2->py + col2->ph + 1;
}
} else {
xOverlap = (col->xMax < col2->xMax ? col->xMax : col2->xMax) -
(col->xMin > col2->xMin ? col->xMin : col2->xMin);
if (xOverlap > 0 && yOverlap < xOverlap) {
if (col2->py + col2->ph > col->py) {
col->py = col2->py + col2->ph;
}
} else {
if (col2->py > col->py) {
col->py = col2->py;
}
}
}
}
if (col->py + col->ph > ph) {
ph = col->py + col->ph;
}
}
return ph;
}
void TextPage::buildSuperLines(TextBlock *blk, GList *superLines) {
GList *lines;
int i;
if (blk->tag == blkTagLine || blk->tag == blkTagSuperLine) {
lines = new GList();
buildLines(blk, lines, gTrue);
superLines->append(new TextSuperLine(lines));
} else {
for (i = 0; i < blk->children->getLength(); ++i) {
buildSuperLines((TextBlock *)blk->children->get(i), superLines);
}
}
}
void TextPage::assignSimpleLayoutPositions(GList *superLines,
UnicodeMap *uMap) {
GList *lines;
TextLine *line0, *line1;
double xMin, xMax;
int px, px2, sp, i, j;
// build a list of lines and sort by x
lines = new GList();
for (i = 0; i < superLines->getLength(); ++i) {
lines->append(((TextSuperLine *)superLines->get(i))->lines);
}
lines->sort(&TextLine::cmpX);
// assign positions
xMin = ((TextLine *)lines->get(0))->xMin;
for (i = 0; i < lines->getLength(); ++i) {
line0 = (TextLine *)lines->get(i);
computeLinePhysWidth(line0, uMap);
px = 0;
xMax = xMin;
for (j = 0; j < i; ++j) {
line1 = (TextLine *)lines->get(j);
if (line0->xMin > line1->xMax) {
if (line1->xMax > xMax) {
xMax = line1->xMax;
}
px2 = line1->px + line1->pw;
if (px2 > px) {
px = px2;
}
}
}
sp = (int)((line0->xMin - xMax) / (0.5 * line0->fontSize) + 0.5);
if (sp < 1 && xMax > xMin) {
sp = 1;
}
line0->px = px + sp;
}
delete lines;
}
void TextPage::generateUnderlinesAndLinks(GList *columns) {
TextColumn *col;
TextParagraph *par;
TextLine *line;
TextWord *word;
TextUnderline *underline;
TextLink *link;
double base, uSlack, ubSlack, hSlack;
int colIdx, parIdx, lineIdx, wordIdx, i;
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
word = (TextWord *)line->words->get(wordIdx);
base = word->getBaseline();
uSlack = underlineSlack * word->fontSize;
ubSlack = underlineBaselineSlack * word->fontSize;
hSlack = hyperlinkSlack * word->fontSize;
//----- handle underlining
for (i = 0; i < underlines->getLength(); ++i) {
underline = (TextUnderline *)underlines->get(i);
if (underline->horiz) {
if (word->rot == 0 || word->rot == 2) {
if (fabs(underline->y0 - base) < ubSlack &&
underline->x0 < word->xMin + uSlack &&
word->xMax - uSlack < underline->x1) {
word->underlined = gTrue;
}
}
} else {
if (word->rot == 1 || word->rot == 3) {
if (fabs(underline->x0 - base) < ubSlack &&
underline->y0 < word->yMin + uSlack &&
word->yMax - uSlack < underline->y1) {
word->underlined = gTrue;
}
}
}
}
//----- handle links
for (i = 0; i < links->getLength(); ++i) {
link = (TextLink *)links->get(i);
if (link->xMin < word->xMin + hSlack &&
word->xMax - hSlack < link->xMax &&
link->yMin < word->yMin + hSlack &&
word->yMax - hSlack < link->yMax) {
word->link = link;
}
}
}
}
}
}
}
//------------------------------------------------------------------------
// TextPage: access
//------------------------------------------------------------------------
GBool TextPage::findText(Unicode *s, int len,
GBool startAtTop, GBool stopAtBottom,
GBool startAtLast, GBool stopAtLast,
GBool caseSensitive, GBool backward,
GBool wholeWord,
double *xMin, double *yMin,
double *xMax, double *yMax) {
TextColumn *column;
TextParagraph *par;
TextLine *line;
Unicode *s2, *txt;
Unicode *p;
double xStart, yStart, xStop, yStop;
double xMin0, yMin0, xMax0, yMax0;
double xMin1, yMin1, xMax1, yMax1;
GBool found;
int txtSize, m, colIdx, parIdx, lineIdx, i, j, k;
//~ need to handle right-to-left text
//~ - pass primaryLR to buildColumns
buildFindCols();
// convert the search string to uppercase
if (!caseSensitive) {
s2 = (Unicode *)gmallocn(len, sizeof(Unicode));
for (i = 0; i < len; ++i) {
s2[i] = unicodeToUpper(s[i]);
}
} else {
s2 = s;
}
txt = NULL;
txtSize = 0;
xStart = yStart = xStop = yStop = 0;
if (startAtLast && haveLastFind) {
xStart = lastFindXMin;
yStart = lastFindYMin;
} else if (!startAtTop) {
xStart = *xMin;
yStart = *yMin;
}
if (stopAtLast && haveLastFind) {
xStop = lastFindXMin;
yStop = lastFindYMin;
} else if (!stopAtBottom) {
xStop = *xMax;
yStop = *yMax;
}
found = gFalse;
xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
for (colIdx = backward ? findCols->getLength() - 1 : 0;
backward ? colIdx >= 0 : colIdx < findCols->getLength();
colIdx += backward ? -1 : 1) {
column = (TextColumn *)findCols->get(colIdx);
// check: is the column above the top limit?
if (!startAtTop && (backward ? column->yMin > yStart
: column->yMax < yStart)) {
continue;
}
// check: is the column below the bottom limit?
if (!stopAtBottom && (backward ? column->yMax < yStop
: column->yMin > yStop)) {
continue;
}
for (parIdx = backward ? column->paragraphs->getLength() - 1 : 0;
backward ? parIdx >= 0 : parIdx < column->paragraphs->getLength();
parIdx += backward ? -1 : 1) {
par = (TextParagraph *)column->paragraphs->get(parIdx);
// check: is the paragraph above the top limit?
if (!startAtTop && (backward ? par->yMin > yStart
: par->yMax < yStart)) {
continue;
}
// check: is the paragraph below the bottom limit?
if (!stopAtBottom && (backward ? par->yMax < yStop
: par->yMin > yStop)) {
continue;
}
for (lineIdx = backward ? par->lines->getLength() - 1 : 0;
backward ? lineIdx >= 0 : lineIdx < par->lines->getLength();
lineIdx += backward ? -1 : 1) {
line = (TextLine *)par->lines->get(lineIdx);
// check: is the line above the top limit?
if (!startAtTop && (backward ? line->yMin > yStart
: line->yMax < yStart)) {
continue;
}
// check: is the line below the bottom limit?
if (!stopAtBottom && (backward ? line->yMax < yStop
: line->yMin > yStop)) {
continue;
}
// convert the line to uppercase
m = line->len;
if (!caseSensitive) {
if (m > txtSize) {
txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
txtSize = m;
}
for (k = 0; k < m; ++k) {
txt[k] = unicodeToUpper(line->text[k]);
}
} else {
txt = line->text;
}
// search each position in this line
j = backward ? m - len : 0;
p = txt + j;
while (backward ? j >= 0 : j <= m - len) {
if (!wholeWord ||
((j == 0 || !unicodeTypeWord(txt[j - 1])) &&
(j + len == m || !unicodeTypeWord(txt[j + len])))) {
// compare the strings
for (k = 0; k < len; ++k) {
if (p[k] != s2[k]) {
break;
}
}
// found it
if (k == len) {
switch (line->rot) {
case 0:
xMin1 = line->edge[j];
xMax1 = line->edge[j + len];
yMin1 = line->yMin;
yMax1 = line->yMax;
break;
case 1:
xMin1 = line->xMin;
xMax1 = line->xMax;
yMin1 = line->edge[j];
yMax1 = line->edge[j + len];
break;
case 2:
xMin1 = line->edge[j + len];
xMax1 = line->edge[j];
yMin1 = line->yMin;
yMax1 = line->yMax;
break;
case 3:
xMin1 = line->xMin;
xMax1 = line->xMax;
yMin1 = line->edge[j + len];
yMax1 = line->edge[j];
break;
}
if (backward) {
if ((startAtTop ||
yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
(stopAtBottom ||
yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
if (!found ||
yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
xMin0 = xMin1;
xMax0 = xMax1;
yMin0 = yMin1;
yMax0 = yMax1;
found = gTrue;
}
}
} else {
if ((startAtTop ||
yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
(stopAtBottom ||
yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
if (!found ||
yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
xMin0 = xMin1;
xMax0 = xMax1;
yMin0 = yMin1;
yMax0 = yMax1;
found = gTrue;
}
}
}
}
}
if (backward) {
--j;
--p;
} else {
++j;
++p;
}
}
}
}
}
if (!caseSensitive) {
gfree(s2);
gfree(txt);
}
if (found) {
*xMin = xMin0;
*xMax = xMax0;
*yMin = yMin0;
*yMax = yMax0;
lastFindXMin = xMin0;
lastFindYMin = yMin0;
haveLastFind = gTrue;
return gTrue;
}
return gFalse;
}
GString *TextPage::getText(double xMin, double yMin,
double xMax, double yMax, GBool forceEOL) {
UnicodeMap *uMap;
char space[8], eol[16];
int spaceLen, eolLen;
GList *chars2;
GString **out;
int *outLen;
TextColumn *col;
TextParagraph *par;
TextLine *line;
TextChar *ch;
GBool primaryLR;
TextBlock *tree;
GList *columns;
GString *ret;
double xx, yy;
int rot, colIdx, parIdx, lineIdx, ph, y, i;
// get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
return NULL;
}
spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
eolLen = 0; // make gcc happy
switch (globalParams->getTextEOL()) {
case eolUnix:
eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
break;
case eolDOS:
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
eolLen += uMap->mapUnicode(0x0a, eol + eolLen, (int)sizeof(eol) - eolLen);
break;
case eolMac:
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
break;
}
// get all chars in the rectangle
// (i.e., all chars whose center lies inside the rectangle)
chars2 = new GList();
for (i = 0; i < chars->getLength(); ++i) {
ch = (TextChar *)chars->get(i);
xx = 0.5 * (ch->xMin + ch->xMax);
yy = 0.5 * (ch->yMin + ch->yMax);
if (xx > xMin && xx < xMax && yy > yMin && yy < yMax) {
chars2->append(ch);
}
}
#if 0 //~debug
dumpChars(chars2);
#endif
rot = rotateChars(chars2);
primaryLR = checkPrimaryLR(chars2);
tree = splitChars(chars2);
if (!tree) {
unrotateChars(chars2, rot);
delete chars2;
return new GString();
}
#if 0 //~debug
dumpTree(tree);
#endif
columns = buildColumns(tree, primaryLR);
delete tree;
ph = assignPhysLayoutPositions(columns);
#if 0 //~debug
dumpColumns(columns);
#endif
unrotateChars(chars2, rot);
delete chars2;
out = (GString **)gmallocn(ph, sizeof(GString *));
outLen = (int *)gmallocn(ph, sizeof(int));
for (i = 0; i < ph; ++i) {
out[i] = NULL;
outLen[i] = 0;
}
columns->sort(&TextColumn::cmpPX);
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
y = col->py;
for (parIdx = 0;
parIdx < col->paragraphs->getLength() && y < ph;
++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
for (lineIdx = 0;
lineIdx < par->lines->getLength() && y < ph;
++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
if (!out[y]) {
out[y] = new GString();
}
while (outLen[y] < col->px + line->px) {
out[y]->append(space, spaceLen);
++outLen[y];
}
encodeFragment(line->text, line->len, uMap, primaryLR, out[y]);
outLen[y] += line->pw;
++y;
}
if (parIdx + 1 < col->paragraphs->getLength()) {
++y;
}
}
}
ret = new GString();
for (i = 0; i < ph; ++i) {
if (out[i]) {
ret->append(out[i]);
delete out[i];
}
if (ph > 1 || forceEOL) {
ret->append(eol, eolLen);
}
}
gfree(out);
gfree(outLen);
deleteGList(columns, TextColumn);
uMap->decRefCnt();
return ret;
}
GBool TextPage::findCharRange(int pos, int length,
double *xMin, double *yMin,
double *xMax, double *yMax) {
TextChar *ch;
double xMin2, yMin2, xMax2, yMax2;
GBool first;
int i;
//~ this doesn't correctly handle ranges split across multiple lines
//~ (the highlighted region is the bounding box of all the parts of
//~ the range)
xMin2 = yMin2 = xMax2 = yMax2 = 0;
first = gTrue;
for (i = 0; i < chars->getLength(); ++i) {
ch = (TextChar *)chars->get(i);
if (ch->charPos >= pos && ch->charPos < pos + length) {
if (first || ch->xMin < xMin2) {
xMin2 = ch->xMin;
}
if (first || ch->yMin < yMin2) {
yMin2 = ch->yMin;
}
if (first || ch->xMax > xMax2) {
xMax2 = ch->xMax;
}
if (first || ch->yMax > yMax2) {
yMax2 = ch->yMax;
}
first = gFalse;
}
}
if (first) {
return gFalse;
}
*xMin = xMin2;
*yMin = yMin2;
*xMax = xMax2;
*yMax = yMax2;
return gTrue;
}
GBool TextPage::checkPointInside(double x, double y) {
TextColumn *col;
int colIdx;
buildFindCols();
//~ this doesn't handle RtL, vertical, or rotated text
//~ this doesn't handle drop caps
for (colIdx = 0; colIdx < findCols->getLength(); ++colIdx) {
col = (TextColumn *)findCols->get(colIdx);
if (col->getRotation() != 0) {
continue;
}
if (x >= col->getXMin() && x <= col->getXMax() &&
y >= col->getYMin() && y <= col->getYMax()) {
return gTrue;
}
}
return gFalse;
}
GBool TextPage::findPointInside(double x, double y, TextPosition *pos) {
TextColumn *col;
int colIdx;
buildFindCols();
//~ this doesn't handle RtL, vertical, or rotated text
//~ this doesn't handle drop caps
for (colIdx = 0; colIdx < findCols->getLength(); ++colIdx) {
col = (TextColumn *)findCols->get(colIdx);
if (col->getRotation() != 0) {
continue;
}
if (x >= col->getXMin() && x <= col->getXMax() &&
y >= col->getYMin() && y <= col->getYMax()) {
pos->colIdx = colIdx;
findPointInColumn(col, x, y, pos);
return gTrue;
}
}
return gFalse;
}
GBool TextPage::findPointNear(double x, double y, TextPosition *pos) {
TextColumn *col;
double nearestDist, dx, dy;
int nearestColIdx, colIdx;
buildFindCols();
//~ this doesn't handle RtL, vertical, or rotated text
//~ this doesn't handle drop caps
nearestColIdx = -1;
nearestDist = 0;
for (colIdx = 0; colIdx < findCols->getLength(); ++colIdx) {
col = (TextColumn *)findCols->get(colIdx);
if (col->getRotation() != 0) {
continue;
}
if (x < col->getXMin()) {
dx = col->getXMin() - x;
} else if (x > col->getXMax()) {
dx = x - col->getXMax();
} else {
dx = 0;
}
if (y < col->getYMin()) {
dy = col->getYMin() - y;
} else if (y > col->getYMax()) {
dy = y - col->getYMax();
} else {
dy = 0;
}
if (nearestColIdx < 0 || dx + dy < nearestDist) {
nearestColIdx = colIdx;
nearestDist = dx + dy;
}
}
if (nearestColIdx < 0) {
return gFalse;
}
pos->colIdx = nearestColIdx;
col = (TextColumn *)findCols->get(nearestColIdx);
findPointInColumn(col, x, y, pos);
return gTrue;
}
GBool TextPage::findWordPoints(double x, double y,
TextPosition *startPos, TextPosition *endPos) {
TextPosition pos;
TextColumn *col;
TextParagraph *par;
TextLine *line;
int startCharIdx, endCharIdx;
if (!findPointInside(x, y, &pos)) {
return gFalse;
}
col = (TextColumn *)findCols->get(pos.colIdx);
par = (TextParagraph *)col->getParagraphs()->get(pos.parIdx);
line = (TextLine *)par->getLines()->get(pos.lineIdx);
for (startCharIdx = pos.charIdx;
startCharIdx > 0 && line->text[startCharIdx - 1] != 0x20;
--startCharIdx) ;
*startPos = pos;
startPos->charIdx = startCharIdx;
for (endCharIdx = pos.charIdx;
endCharIdx < line->len && line->text[endCharIdx] != 0x20;
++endCharIdx) ;
*endPos = pos;
endPos->charIdx = endCharIdx;
return gTrue;
}
GBool TextPage::findLinePoints(double x, double y,
TextPosition *startPos, TextPosition *endPos) {
TextPosition pos;
TextColumn *col;
TextParagraph *par;
TextLine *line;
if (!findPointInside(x, y, &pos)) {
return gFalse;
}
col = (TextColumn *)findCols->get(pos.colIdx);
par = (TextParagraph *)col->getParagraphs()->get(pos.parIdx);
line = (TextLine *)par->getLines()->get(pos.lineIdx);
*startPos = pos;
startPos->charIdx = 0;
*endPos = pos;
endPos->charIdx = line->len;
return gTrue;
}
// Find the position in [col] corresponding to [x],[y]. The column,
// [col], was found by findPointInside() or findPointNear().
void TextPage::findPointInColumn(TextColumn *col, double x, double y,
TextPosition *pos) {
TextParagraph *par;
TextLine *line;
GList *pars, *lines;
int parIdx, lineIdx, charIdx;
//~ this doesn't handle RtL, vertical, or rotated text
//~ this doesn't handle drop caps
pars = col->getParagraphs();
//~ could use a binary search here
for (parIdx = 0; parIdx < pars->getLength() - 1; ++parIdx) {
par = (TextParagraph *)pars->get(parIdx);
if (y <= par->getYMax()) {
break;
}
}
par = (TextParagraph *)pars->get(parIdx);
lines = par->getLines();
//~ could use a binary search here
for (lineIdx = 0; lineIdx < lines->getLength() - 1; ++lineIdx) {
line = (TextLine *)lines->get(lineIdx);
if (y <= line->getYMax()) {
break;
}
}
line = (TextLine *)lines->get(lineIdx);
//~ could use a binary search here
for (charIdx = 0; charIdx < line->getLength(); ++charIdx) {
if (x <= 0.5 * (line->getEdge(charIdx) + line->getEdge(charIdx + 1))) {
break;
}
}
pos->parIdx = parIdx;
pos->lineIdx = lineIdx;
pos->charIdx = charIdx;
}
void TextPage::convertPosToPointUpper(TextPosition *pos,
double *x, double *y) {
TextColumn *col;
TextParagraph *par;
TextLine *line;
buildFindCols();
col = (TextColumn *)findCols->get(pos->colIdx);
par = (TextParagraph *)col->getParagraphs()->get(pos->parIdx);
line = (TextLine *)par->getLines()->get(pos->lineIdx);
*x = line->getEdge(pos->charIdx);
*y = line->getBaseline() - selectionAscent * line->fontSize;
}
void TextPage::convertPosToPointLower(TextPosition *pos,
double *x, double *y) {
TextColumn *col;
TextParagraph *par;
TextLine *line;
buildFindCols();
col = (TextColumn *)findCols->get(pos->colIdx);
par = (TextParagraph *)col->getParagraphs()->get(pos->parIdx);
line = (TextLine *)par->getLines()->get(pos->lineIdx);
*x = line->getEdge(pos->charIdx);
*y = line->getYMax();
}
void TextPage::convertPosToPointLeftEdge(TextPosition *pos,
double *x, double *y) {
TextColumn *col;
TextParagraph *par;
TextLine *line;
buildFindCols();
col = (TextColumn *)findCols->get(pos->colIdx);
par = (TextParagraph *)col->getParagraphs()->get(pos->parIdx);
line = (TextLine *)par->getLines()->get(pos->lineIdx);
*x = col->getXMin();
*y = line->getBaseline() - selectionAscent * line->fontSize;
}
void TextPage::convertPosToPointRightEdge(TextPosition *pos,
double *x, double *y) {
TextColumn *col;
TextParagraph *par;
TextLine *line;
buildFindCols();
col = (TextColumn *)findCols->get(pos->colIdx);
par = (TextParagraph *)col->getParagraphs()->get(pos->parIdx);
line = (TextLine *)par->getLines()->get(pos->lineIdx);
*x = col->getXMax();
*y = line->getYMax();
}
void TextPage::getColumnUpperRight(int colIdx, double *x, double *y) {
TextColumn *col;
TextParagraph *par0;
TextLine *line0;
buildFindCols();
col = (TextColumn *)findCols->get(colIdx);
*x = col->getXMax();
par0 = (TextParagraph *)col->paragraphs->get(0);
line0 = (TextLine *)par0->lines->get(0);
*y = line0->getBaseline() - selectionAscent * line0->fontSize;
}
void TextPage::getColumnLowerLeft(int colIdx, double *x, double *y) {
TextColumn *col;
buildFindCols();
col = (TextColumn *)findCols->get(colIdx);
*x = col->getXMin();
*y = col->getYMax();
}
void TextPage::buildFindCols() {
TextBlock *tree;
int rot;
if (findCols) {
return;
}
rot = rotateChars(chars);
if ((tree = splitChars(chars))) {
findCols = buildColumns(tree, gFalse);
delete tree;
} else {
// no text
findCols = new GList();
}
unrotateChars(chars, rot);
unrotateColumns(findCols, rot);
}
TextWordList *TextPage::makeWordList() {
return makeWordListForChars(chars);
}
TextWordList *TextPage::makeWordListForRect(double xMin, double yMin,
double xMax, double yMax) {
TextWordList *words;
GList *chars2;
TextChar *ch;
double xx, yy;
int i;
// get all chars in the rectangle
// (i.e., all chars whose center lies inside the rectangle)
chars2 = new GList();
for (i = 0; i < chars->getLength(); ++i) {
ch = (TextChar *)chars->get(i);
xx = 0.5 * (ch->xMin + ch->xMax);
yy = 0.5 * (ch->yMin + ch->yMax);
if (xx > xMin && xx < xMax && yy > yMin && yy < yMax) {
chars2->append(ch);
}
}
words = makeWordListForChars(chars2);
delete chars2;
return words;
}
TextWordList *TextPage::makeWordListForChars(GList *charList) {
TextBlock *tree;
GList *columns;
TextColumn *col;
TextParagraph *par;
TextLine *line;
TextWord *word;
GList *overlappingChars;
GList *words;
GBool primaryLR;
int rot, colIdx, parIdx, lineIdx, wordIdx;
#if 0 //~debug
dumpChars(charList);
#endif
if (control.mode == textOutSimple2Layout) {
rot = 0;
primaryLR = checkPrimaryLR(chars);
rotateCharsToZero(chars);
columns = buildSimple2Columns(chars);
unrotateCharsFromZero(chars);
unrotateColumnsFromZero(columns);
} else {
if (control.overlapHandling != textOutIgnoreOverlaps) {
overlappingChars = separateOverlappingText(chars);
} else {
overlappingChars = NULL;
}
rot = rotateChars(charList);
primaryLR = checkPrimaryLR(charList);
tree = splitChars(charList);
#if 0 //~debug
dumpTree(tree);
#endif
if (!tree) {
// no text
unrotateChars(charList, rot);
return new TextWordList(new GList(), gTrue);
}
columns = buildColumns(tree, primaryLR);
#if 0 //~debug
dumpColumns(columns, gTrue);
#endif
delete tree;
unrotateChars(charList, rot);
if (control.html) {
rotateUnderlinesAndLinks(rot);
generateUnderlinesAndLinks(columns);
}
if (overlappingChars) {
if (overlappingChars->getLength() > 0) {
columns->append(buildOverlappingTextColumn(overlappingChars));
}
deleteGList(overlappingChars, TextChar);
}
}
words = new GList();
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
word = ((TextWord *)line->words->get(wordIdx))->copy();
if (wordIdx == line->words->getLength() - 1 &&
!line->getHyphenated()) {
word->spaceAfter = gTrue;
}
words->append(word);
}
}
}
}
switch (control.mode) {
case textOutReadingOrder:
case textOutSimple2Layout:
// already in reading order
break;
case textOutPhysLayout:
case textOutSimpleLayout:
case textOutTableLayout:
case textOutLinePrinter:
words->sort(&TextWord::cmpYX);
break;
case textOutRawOrder:
words->sort(&TextWord::cmpCharPos);
break;
}
// this has to be done after sorting with cmpYX
unrotateWords(words, rot);
deleteGList(columns, TextColumn);
return new TextWordList(words, primaryLR);
}
GBool TextPage::primaryDirectionIsLR() {
return checkPrimaryLR(chars);
}
//------------------------------------------------------------------------
// TextPage: debug
//------------------------------------------------------------------------
#if 0 //~debug
void TextPage::dumpChars(GList *charsA) {
TextChar *ch;
int i;
for (i = 0; i < charsA->getLength(); ++i) {
ch = (TextChar *)charsA->get(i);
printf("char: U+%04x '%c' xMin=%g yMin=%g xMax=%g yMax=%g fontSize=%g rot=%d charPos=%d charLen=%d spaceAfter=%d\n",
ch->c, ch->c & 0xff, ch->xMin, ch->yMin, ch->xMax, ch->yMax,
ch->fontSize, ch->rot, ch->charPos, ch->charLen, ch->spaceAfter);
}
}
void TextPage::dumpTree(TextBlock *tree, int indent) {
TextChar *ch;
int i;
printf("%*sblock: type=%s tag=%s small=%d rot=%d xMin=%g yMin=%g xMax=%g yMax=%g\n",
indent, "",
tree->type == blkLeaf ? "leaf" :
tree->type == blkHorizSplit ? "horiz" : "vert",
tree->tag == blkTagMulticolumn ? "multicolumn" :
tree->tag == blkTagColumn ? "column" :
tree->tag == blkTagSuperLine ? "superline" : "line",
tree->smallSplit,
tree->rot, tree->xMin, tree->yMin, tree->xMax, tree->yMax);
if (tree->type == blkLeaf) {
for (i = 0; i < tree->children->getLength(); ++i) {
ch = (TextChar *)tree->children->get(i);
printf("%*schar: '%c' xMin=%g yMin=%g xMax=%g yMax=%g font=%d.%d\n",
indent + 2, "", ch->c & 0xff,
ch->xMin, ch->yMin, ch->xMax, ch->yMax,
ch->font->fontID.num, ch->font->fontID.gen);
}
} else {
for (i = 0; i < tree->children->getLength(); ++i) {
dumpTree((TextBlock *)tree->children->get(i), indent + 2);
}
}
}
void TextPage::dumpColumns(GList *columns, GBool dumpWords) {
TextColumn *col;
TextParagraph *par;
TextLine *line;
TextWord *word;
int colIdx, parIdx, lineIdx, wordIdx, i;
for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
col = (TextColumn *)columns->get(colIdx);
printf("column: xMin=%g yMin=%g xMax=%g yMax=%g px=%d py=%d pw=%d ph=%d\n",
col->xMin, col->yMin, col->xMax, col->yMax,
col->px, col->py, col->pw, col->ph);
for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
par = (TextParagraph *)col->paragraphs->get(parIdx);
printf(" paragraph:\n");
for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
line = (TextLine *)par->lines->get(lineIdx);
printf(" line: xMin=%g yMin=%g xMax=%g yMax=%g px=%d pw=%d rot=%d\n",
line->xMin, line->yMin, line->xMax, line->yMax,
line->px, line->pw, line->rot);
if (dumpWords) {
for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
word = (TextWord *)line->words->get(wordIdx);
printf(" word: xMin=%g yMin=%g xMax=%g yMax=%g\n",
word->xMin, word->yMin, word->xMax, word->yMax);
printf(" '");
for (i = 0; i < word->len; ++i) {
printf("%c", word->text[i] & 0xff);
}
printf("'\n");
}
} else {
printf(" '");
for (i = 0; i < line->len; ++i) {
printf("%c", line->text[i] & 0xff);
}
printf("'\n");
}
}
}
}
}
void TextPage::dumpUnderlines() {
TextUnderline *u;
int i;
printf("underlines:\n");
for (i = 0; i < underlines->getLength(); ++i) {
u = (TextUnderline *)underlines->get(i);
printf(" horiz=%d x0=%g y0=%g x1=%g y1=%g\n",
u->horiz, u->x0, u->y0, u->x1, u->y1);
}
}
#endif //~debug
//------------------------------------------------------------------------
// TextOutputDev
//------------------------------------------------------------------------
static void outputToFile(void *stream, const char *text, int len) {
fwrite(text, 1, len, (FILE *)stream);
}
TextOutputDev::TextOutputDev(char *fileName, TextOutputControl *controlA,
GBool append, GBool fileNameIsUTF8) {
text = NULL;
control = *controlA;
ok = gTrue;
// open file
needClose = gFalse;
if (fileName) {
if (!strcmp(fileName, "-")) {
outputStream = stdout;
#ifdef WIN32
// keep DOS from munging the end-of-line characters
setmode(fileno(stdout), O_BINARY);
#endif
} else {
if (fileNameIsUTF8) {
outputStream = openFile(fileName, append ? "ab" : "wb");
} else {
outputStream = fopen(fileName, append ? "ab" : "wb");
}
if (!outputStream) {
error(errIO, -1, "Couldn't open text file '{0:s}'", fileName);
ok = gFalse;
return;
}
needClose = gTrue;
}
outputFunc = &outputToFile;
} else {
outputFunc = NULL;
outputStream = NULL;
}
// set up text object
text = new TextPage(&control);
generateBOM();
}
TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
TextOutputControl *controlA) {
outputFunc = func;
outputStream = stream;
needClose = gFalse;
control = *controlA;
text = new TextPage(&control);
generateBOM();
ok = gTrue;
}
TextOutputDev::~TextOutputDev() {
if (needClose) {
fclose((FILE *)outputStream);
}
if (text) {
delete text;
}
}
void TextOutputDev::generateBOM() {
UnicodeMap *uMap;
char bom[8];
int bomLen;
// insert Unicode BOM
if (control.insertBOM && outputStream) {
if (!(uMap = globalParams->getTextEncoding())) {
return;
}
bomLen = uMap->mapUnicode(0xfeff, bom, sizeof(bom));
uMap->decRefCnt();
(*outputFunc)(outputStream, bom, bomLen);
}
}
void TextOutputDev::startPage(int pageNum, GfxState *state) {
text->startPage(state);
}
void TextOutputDev::endPage() {
if (outputStream) {
text->write(outputStream, outputFunc);
}
}
void TextOutputDev::restoreState(GfxState *state) {
text->updateFont(state);
}
void TextOutputDev::updateFont(GfxState *state) {
text->updateFont(state);
}
void TextOutputDev::beginString(GfxState *state, GString *s) {
}
void TextOutputDev::endString(GfxState *state) {
}
void TextOutputDev::drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode c, int nBytes, Unicode *u, int uLen) {
text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
}
void TextOutputDev::incCharCount(int nChars) {
text->incCharCount(nChars);
}
void TextOutputDev::beginActualText(GfxState *state, Unicode *u, int uLen) {
text->beginActualText(state, u, uLen);
}
void TextOutputDev::endActualText(GfxState *state) {
text->endActualText(state);
}
void TextOutputDev::stroke(GfxState *state) {
GfxPath *path;
GfxSubpath *subpath;
double x[2], y[2], t;
if (!control.html) {
return;
}
path = state->getPath();
if (path->getNumSubpaths() != 1) {
return;
}
subpath = path->getSubpath(0);
if (subpath->getNumPoints() != 2) {
return;
}
state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
// look for a vertical or horizontal line
if (x[0] == x[1] || y[0] == y[1]) {
if (x[0] > x[1]) {
t = x[0]; x[0] = x[1]; x[1] = t;
}
if (y[0] > y[1]) {
t = y[0]; y[0] = y[1]; y[1] = t;
}
text->addUnderline(x[0], y[0], x[1], y[1]);
}
}
void TextOutputDev::fill(GfxState *state) {
GfxPath *path;
GfxSubpath *subpath;
double x[5], y[5];
double rx0, ry0, rx1, ry1, t;
int i;
if (!control.html) {
return;
}
path = state->getPath();
if (path->getNumSubpaths() != 1) {
return;
}
subpath = path->getSubpath(0);
if (subpath->getNumPoints() != 5) {
return;
}
for (i = 0; i < 5; ++i) {
if (subpath->getCurve(i)) {
return;
}
state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
}
// look for a rectangle
if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] &&
x[0] == x[4] && y[0] == y[4]) {
rx0 = x[0];
ry0 = y[0];
rx1 = x[2];
ry1 = y[1];
} else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] &&
x[0] == x[4] && y[0] == y[4]) {
rx0 = x[0];
ry0 = y[0];
rx1 = x[1];
ry1 = y[2];
} else {
return;
}
if (rx1 < rx0) {
t = rx0;
rx0 = rx1;
rx1 = t;
}
if (ry1 < ry0) {
t = ry0;
ry0 = ry1;
ry1 = t;
}
// skinny horizontal rectangle
if (ry1 - ry0 < rx1 - rx0) {
if (ry1 - ry0 < maxUnderlineWidth) {
ry0 = 0.5 * (ry0 + ry1);
text->addUnderline(rx0, ry0, rx1, ry0);
}
// skinny vertical rectangle
} else {
if (rx1 - rx0 < maxUnderlineWidth) {
rx0 = 0.5 * (rx0 + rx1);
text->addUnderline(rx0, ry0, rx0, ry1);
}
}
}
void TextOutputDev::eoFill(GfxState *state) {
if (!control.html) {
return;
}
fill(state);
}
void TextOutputDev::processLink(Link *link) {
double x1, y1, x2, y2;
int xMin, yMin, xMax, yMax, x, y;
if (!control.html) {
return;
}
link->getRect(&x1, &y1, &x2, &y2);
cvtUserToDev(x1, y1, &x, &y);
xMin = xMax = x;
yMin = yMax = y;
cvtUserToDev(x1, y2, &x, &y);
if (x < xMin) {
xMin = x;
} else if (x > xMax) {
xMax = x;
}
if (y < yMin) {
yMin = y;
} else if (y > yMax) {
yMax = y;
}
cvtUserToDev(x2, y1, &x, &y);
if (x < xMin) {
xMin = x;
} else if (x > xMax) {
xMax = x;
}
if (y < yMin) {
yMin = y;
} else if (y > yMax) {
yMax = y;
}
cvtUserToDev(x2, y2, &x, &y);
if (x < xMin) {
xMin = x;
} else if (x > xMax) {
xMax = x;
}
if (y < yMin) {
yMin = y;
} else if (y > yMax) {
yMax = y;
}
text->addLink(xMin, yMin, xMax, yMax, link);
}
GBool TextOutputDev::findText(Unicode *s, int len,
GBool startAtTop, GBool stopAtBottom,
GBool startAtLast, GBool stopAtLast,
GBool caseSensitive, GBool backward,
GBool wholeWord,
double *xMin, double *yMin,
double *xMax, double *yMax) {
return text->findText(s, len, startAtTop, stopAtBottom,
startAtLast, stopAtLast,
caseSensitive, backward, wholeWord,
xMin, yMin, xMax, yMax);
}
GString *TextOutputDev::getText(double xMin, double yMin,
double xMax, double yMax) {
return text->getText(xMin, yMin, xMax, yMax);
}
GBool TextOutputDev::findCharRange(int pos, int length,
double *xMin, double *yMin,
double *xMax, double *yMax) {
return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
}
TextWordList *TextOutputDev::makeWordList() {
return text->makeWordList();
}
TextWordList *TextOutputDev::makeWordListForRect(double xMin, double yMin,
double xMax, double yMax) {
return text->makeWordListForRect(xMin, yMin, xMax, yMax);
}
TextPage *TextOutputDev::takeText() {
TextPage *ret;
ret = text;
text = new TextPage(&control);
return ret;
}