316 lines
9.1 KiB
C++
316 lines
9.1 KiB
C++
//========================================================================
|
|
//
|
|
// pdftotext.cc
|
|
//
|
|
// Copyright 1997-2013 Glyph & Cog, LLC
|
|
//
|
|
//========================================================================
|
|
|
|
#include <aconf.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
#ifdef DEBUG_FP_LINUX
|
|
# include <fenv.h>
|
|
# include <fpu_control.h>
|
|
#endif
|
|
#include "gmem.h"
|
|
#include "gmempp.h"
|
|
#include "parseargs.h"
|
|
#include "GString.h"
|
|
#include "GList.h"
|
|
#include "GlobalParams.h"
|
|
#include "Object.h"
|
|
#include "Stream.h"
|
|
#include "Array.h"
|
|
#include "Dict.h"
|
|
#include "XRef.h"
|
|
#include "Catalog.h"
|
|
#include "Page.h"
|
|
#include "PDFDoc.h"
|
|
#include "TextOutputDev.h"
|
|
#include "CharTypes.h"
|
|
#include "UnicodeMap.h"
|
|
#include "TextString.h"
|
|
#include "Error.h"
|
|
#include "config.h"
|
|
|
|
static int firstPage = 1;
|
|
static int lastPage = 0;
|
|
static GBool physLayout = gFalse;
|
|
static GBool simpleLayout = gFalse;
|
|
static GBool simple2Layout = gFalse;
|
|
static GBool tableLayout = gFalse;
|
|
static GBool linePrinter = gFalse;
|
|
static GBool rawOrder = gFalse;
|
|
static double fixedPitch = 0;
|
|
static double fixedLineSpacing = 0;
|
|
static GBool clipText = gFalse;
|
|
static GBool discardDiag = gFalse;
|
|
static char textEncName[128] = "";
|
|
static char textEOL[16] = "";
|
|
static GBool noPageBreaks = gFalse;
|
|
static GBool insertBOM = gFalse;
|
|
static double marginLeft = 0;
|
|
static double marginRight = 0;
|
|
static double marginTop = 0;
|
|
static double marginBottom = 0;
|
|
static char ownerPassword[33] = "\001";
|
|
static char userPassword[33] = "\001";
|
|
static GBool quiet = gFalse;
|
|
static char cfgFileName[256] = "";
|
|
static GBool listEncodings = gFalse;
|
|
static GBool printVersion = gFalse;
|
|
static GBool printHelp = gFalse;
|
|
|
|
static ArgDesc argDesc[] = {
|
|
{"-f", argInt, &firstPage, 0,
|
|
"first page to convert"},
|
|
{"-l", argInt, &lastPage, 0,
|
|
"last page to convert"},
|
|
{"-layout", argFlag, &physLayout, 0,
|
|
"maintain original physical layout"},
|
|
{"-simple", argFlag, &simpleLayout, 0,
|
|
"simple one-column page layout"},
|
|
{"-simple2", argFlag, &simple2Layout, 0,
|
|
"simple one-column page layout, version 2"},
|
|
{"-table", argFlag, &tableLayout, 0,
|
|
"similar to -layout, but optimized for tables"},
|
|
{"-lineprinter", argFlag, &linePrinter, 0,
|
|
"use strict fixed-pitch/height layout"},
|
|
{"-raw", argFlag, &rawOrder, 0,
|
|
"keep strings in content stream order"},
|
|
{"-fixed", argFP, &fixedPitch, 0,
|
|
"assume fixed-pitch (or tabular) text"},
|
|
{"-linespacing", argFP, &fixedLineSpacing, 0,
|
|
"fixed line spacing for LinePrinter mode"},
|
|
{"-clip", argFlag, &clipText, 0,
|
|
"separate clipped text"},
|
|
{"-nodiag", argFlag, &discardDiag, 0,
|
|
"discard diagonal text"},
|
|
{"-enc", argString, textEncName, sizeof(textEncName),
|
|
"output text encoding name"},
|
|
{"-eol", argString, textEOL, sizeof(textEOL),
|
|
"output end-of-line convention (unix, dos, or mac)"},
|
|
{"-nopgbrk", argFlag, &noPageBreaks, 0,
|
|
"don't insert page breaks between pages"},
|
|
{"-bom", argFlag, &insertBOM, 0,
|
|
"insert a Unicode BOM at the start of the text file"},
|
|
{"-marginl", argFP, &marginLeft, 0,
|
|
"left page margin"},
|
|
{"-marginr", argFP, &marginRight, 0,
|
|
"right page margin"},
|
|
{"-margint", argFP, &marginTop, 0,
|
|
"top page margin"},
|
|
{"-marginb", argFP, &marginBottom, 0,
|
|
"bottom page margin"},
|
|
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
|
|
"owner password (for encrypted files)"},
|
|
{"-upw", argString, userPassword, sizeof(userPassword),
|
|
"user password (for encrypted files)"},
|
|
{"-q", argFlag, &quiet, 0,
|
|
"don't print any messages or errors"},
|
|
{"-cfg", argString, cfgFileName, sizeof(cfgFileName),
|
|
"configuration file to use in place of .xpdfrc"},
|
|
{"-listencodings", argFlag, &listEncodings, 0,
|
|
"list all available output text encodings"},
|
|
{"-v", argFlag, &printVersion, 0,
|
|
"print copyright and version info"},
|
|
{"-h", argFlag, &printHelp, 0,
|
|
"print usage information"},
|
|
{"-help", argFlag, &printHelp, 0,
|
|
"print usage information"},
|
|
{"--help", argFlag, &printHelp, 0,
|
|
"print usage information"},
|
|
{"-?", argFlag, &printHelp, 0,
|
|
"print usage information"},
|
|
{NULL}
|
|
};
|
|
|
|
int main(int argc, char *argv[]) {
|
|
PDFDoc *doc;
|
|
char *fileName;
|
|
GString *textFileName;
|
|
GString *ownerPW, *userPW;
|
|
TextOutputControl textOutControl;
|
|
TextOutputDev *textOut;
|
|
UnicodeMap *uMap;
|
|
GBool ok;
|
|
char *p;
|
|
int exitCode;
|
|
|
|
#ifdef DEBUG_FP_LINUX
|
|
// enable exceptions on floating point div-by-zero
|
|
feenableexcept(FE_DIVBYZERO);
|
|
// force 64-bit rounding: this avoids changes in output when minor
|
|
// code changes result in spills of x87 registers; it also avoids
|
|
// differences in output with valgrind's 64-bit floating point
|
|
// emulation (yes, this is a kludge; but it's pretty much
|
|
// unavoidable given the x87 instruction set; see gcc bug 323 for
|
|
// more info)
|
|
fpu_control_t cw;
|
|
_FPU_GETCW(cw);
|
|
cw = (fpu_control_t)((cw & ~_FPU_EXTENDED) | _FPU_DOUBLE);
|
|
_FPU_SETCW(cw);
|
|
#endif
|
|
|
|
exitCode = 99;
|
|
|
|
// parse args
|
|
fixCommandLine(&argc, &argv);
|
|
ok = parseArgs(argDesc, &argc, argv);
|
|
if (ok && listEncodings) {
|
|
// list available encodings
|
|
globalParams = new GlobalParams(cfgFileName);
|
|
GList *encs = globalParams->getAvailableTextEncodings();
|
|
for (int i = 0; i < encs->getLength(); ++i) {
|
|
printf("%s\n", ((GString *)encs->get(i))->getCString());
|
|
}
|
|
deleteGList(encs, GString);
|
|
delete globalParams;
|
|
goto err0;
|
|
}
|
|
if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
|
|
fprintf(stderr, "pdftotext version %s [www.xpdfreader.com]\n", xpdfVersion);
|
|
fprintf(stderr, "%s\n", xpdfCopyright);
|
|
if (!printVersion) {
|
|
printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
|
|
}
|
|
goto err0;
|
|
}
|
|
fileName = argv[1];
|
|
|
|
// read config file
|
|
globalParams = new GlobalParams(cfgFileName);
|
|
if (textEncName[0]) {
|
|
globalParams->setTextEncoding(textEncName);
|
|
}
|
|
if (textEOL[0]) {
|
|
if (!globalParams->setTextEOL(textEOL)) {
|
|
fprintf(stderr, "Bad '-eol' value on command line\n");
|
|
}
|
|
}
|
|
if (noPageBreaks) {
|
|
globalParams->setTextPageBreaks(gFalse);
|
|
}
|
|
if (quiet) {
|
|
globalParams->setErrQuiet(quiet);
|
|
}
|
|
|
|
// get mapping to output encoding
|
|
if (!(uMap = globalParams->getTextEncoding())) {
|
|
error(errConfig, -1, "Couldn't get text encoding");
|
|
goto err1;
|
|
}
|
|
|
|
// open PDF file
|
|
if (ownerPassword[0] != '\001') {
|
|
ownerPW = new GString(ownerPassword);
|
|
} else {
|
|
ownerPW = NULL;
|
|
}
|
|
if (userPassword[0] != '\001') {
|
|
userPW = new GString(userPassword);
|
|
} else {
|
|
userPW = NULL;
|
|
}
|
|
doc = new PDFDoc(fileName, ownerPW, userPW);
|
|
if (userPW) {
|
|
delete userPW;
|
|
}
|
|
if (ownerPW) {
|
|
delete ownerPW;
|
|
}
|
|
if (!doc->isOk()) {
|
|
exitCode = 1;
|
|
goto err2;
|
|
}
|
|
|
|
// check for copy permission
|
|
if (!doc->okToCopy()) {
|
|
error(errNotAllowed, -1,
|
|
"Copying of text from this document is not allowed.");
|
|
exitCode = 3;
|
|
goto err2;
|
|
}
|
|
|
|
// construct text file name
|
|
if (argc == 3) {
|
|
textFileName = new GString(argv[2]);
|
|
} else {
|
|
p = fileName + strlen(fileName) - 4;
|
|
if (strlen(fileName) > 4 && (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))) {
|
|
textFileName = new GString(fileName, (int)strlen(fileName) - 4);
|
|
} else {
|
|
textFileName = new GString(fileName);
|
|
}
|
|
textFileName->append(".txt");
|
|
}
|
|
|
|
// get page range
|
|
if (firstPage < 1) {
|
|
firstPage = 1;
|
|
}
|
|
if (lastPage < 1 || lastPage > doc->getNumPages()) {
|
|
lastPage = doc->getNumPages();
|
|
}
|
|
|
|
// write text file
|
|
if (tableLayout) {
|
|
textOutControl.mode = textOutTableLayout;
|
|
textOutControl.fixedPitch = fixedPitch;
|
|
} else if (physLayout) {
|
|
textOutControl.mode = textOutPhysLayout;
|
|
textOutControl.fixedPitch = fixedPitch;
|
|
} else if (simpleLayout) {
|
|
textOutControl.mode = textOutSimpleLayout;
|
|
} else if (simple2Layout) {
|
|
textOutControl.mode = textOutSimple2Layout;
|
|
} else if (linePrinter) {
|
|
textOutControl.mode = textOutLinePrinter;
|
|
textOutControl.fixedPitch = fixedPitch;
|
|
textOutControl.fixedLineSpacing = fixedLineSpacing;
|
|
} else if (rawOrder) {
|
|
textOutControl.mode = textOutRawOrder;
|
|
} else {
|
|
textOutControl.mode = textOutReadingOrder;
|
|
}
|
|
textOutControl.clipText = clipText;
|
|
textOutControl.discardDiagonalText = discardDiag;
|
|
textOutControl.insertBOM = insertBOM;
|
|
textOutControl.marginLeft = marginLeft;
|
|
textOutControl.marginRight = marginRight;
|
|
textOutControl.marginTop = marginTop;
|
|
textOutControl.marginBottom = marginBottom;
|
|
textOut = new TextOutputDev(textFileName->getCString(), &textOutControl,
|
|
gFalse, gTrue);
|
|
if (textOut->isOk()) {
|
|
doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0,
|
|
gFalse, gTrue, gFalse);
|
|
} else {
|
|
delete textOut;
|
|
exitCode = 2;
|
|
goto err3;
|
|
}
|
|
delete textOut;
|
|
|
|
exitCode = 0;
|
|
|
|
// clean up
|
|
err3:
|
|
delete textFileName;
|
|
err2:
|
|
delete doc;
|
|
uMap->decRefCnt();
|
|
err1:
|
|
delete globalParams;
|
|
err0:
|
|
|
|
// check for memory leaks
|
|
Object::memCheck(stderr);
|
|
gMemReport(stderr);
|
|
|
|
return exitCode;
|
|
}
|