#! /usr/bin/env python
#
# A dubious script that extracts all the visible text portion from a
# PDF file and spits it onto stdout
#
"""Dump the literal text strings found in PDF page content streams.

Usage: pass one or more PDF file names as command-line arguments.  For
each file the script prints the document's ``Security`` attribute, then
one line per page containing every parenthesised string of that page's
content stream, concatenated.
"""

from re import findall, sub
from sys import stdout, argv

# Number of bytes requested from each page's content stream.
# NOTE(review): longer streams are presumably truncated at this size --
# confirm against the behaviour of the stream's read() in pdffile.
MAX_STREAM_BYTES = 65536

# A parenthesised literal string: "(" body ")".  The body is a run of
# escape pairs (backslash + any character) or characters that are neither
# ")" nor a backslash.  Keeping the backslash out of the second
# alternative makes the alternation unambiguous, so an escaped "\)" can
# never be taken for the closing parenthesis and an unterminated string
# cannot trigger exponential backtracking.  (?s) lets an escape pair span
# a backslash followed by a newline.
LITERAL_STRING = r'(?s)\(((?:\\.|[^)\\])*)\)'

# A backslash followed by one (non-newline) character.
ESCAPE_PAIR = r'\\(.)'


def extract_text(data):
    """Return the text of every parenthesised string in *data*, joined.

    Each backslash escape pair is replaced by its second character (so
    ``\\(`` becomes ``(``); no other decoding is performed.  Returns an
    empty string when *data* contains no complete literal string.
    """
    return ''.join(
        sub(ESCAPE_PAIR, r'\1', literal)
        for literal in findall(LITERAL_STRING, data)
    )


def main(args):
    """Write the extracted text of each PDF file named in *args* to stdout."""
    # Imported here rather than at module level so that extract_text()
    # can be imported and tested without the pdffile package installed.
    from pdffile import PDFFile

    for filename in args:
        f = PDFFile()
        f.load_file(filename)
        # Parenthesised so the line parses under both Python 2 and 3.
        print(f.Security)
        pages = f.Root['Pages']
        count = pages['Count'].value()
        # NOTE(review): indexes Kids with range(Count), i.e. assumes a flat
        # page tree; nested Pages nodes are not descended into.
        for page in range(count):
            kid = pages['Kids'][page]
            data = kid['Contents'].value()._stream.read(MAX_STREAM_BYTES)
            stdout.write(extract_text(data))
            # NOTE(review): the source layout was lost; one newline per
            # page is assumed here -- confirm against the intended output.
            stdout.write('\n')


if __name__ == '__main__':
    main(argv[1:])