/// <summary>
/// Extracts the text of the marked content identified by <paramref name="obj"/>
/// and writes it, XML-escaped through <c>XMLUtil.EscapeXML</c>, to <c>outp</c>.
/// </summary>
/// <remarks>
/// Three kinds of identifier are handled; any other object (including
/// <c>null</c>) is ignored:
/// <list type="bullet">
/// <item><description>a <c>PdfNumber</c> is used as a marked-content id (MCID) and the
/// matching text is extracted from the content stream of <paramref name="page"/>;</description></item>
/// <item><description>a <c>PdfArray</c> is processed element by element, with a line break
/// written between consecutive elements;</description></item>
/// <item><description>a <c>PdfDictionary</c> is read as a marked-content reference: its
/// <c>MCID</c> entry is processed against its own <c>PG</c> page, or against
/// <paramref name="page"/> when the reference has no <c>PG</c> entry.</description></item>
/// </list>
/// </remarks>
/// <param name="tag">Name of the structure tag being processed; passed along on
/// recursion but not otherwise used by this method.</param>
/// <param name="obj">The identifier to resolve: a number, an array or a dictionary.</param>
/// <param name="page">The page dictionary whose content stream and resources are
/// searched for the marked content.</param>
public void ParseTag(String tag, PdfObject obj, PdfDictionary page)
{
    // A number is an MCID: the content can be extracted right away.
    if (obj is PdfNumber) {
        PdfNumber mcid = (PdfNumber) obj;
        // Only render operations inside the marked-content sequence with this
        // id reach the text extraction strategy.
        RenderFilter filter = new MarkedContentRenderFilter(mcid.IntValue);
        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
        FilteredTextRenderListener listener =
            new FilteredTextRenderListener(strategy, new RenderFilter[]{filter});
        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
        processor.ProcessContent(PdfReader.GetPageContent(page),
            page.GetAsDict(PdfName.RESOURCES));
        outp.Write(XMLUtil.EscapeXML(listener.GetResultantText(), true));
    }
    // An array is handled by processing each of its elements recursively.
    else if (obj is PdfArray) {
        PdfArray arr = (PdfArray) obj;
        int n = arr.Size;
        for (int i = 0; i < n; i++) {
            ParseTag(tag, arr[i], page);
            // Separate elements with a line break, but do not add one after the last.
            if (i < n - 1) {
                outp.WriteLine();
            }
        }
    }
    // A dictionary is a marked-content reference carrying its own MCID and,
    // optionally, its own page.
    else if (obj is PdfDictionary) {
        PdfDictionary mcr = (PdfDictionary) obj;
        PdfObject mcid = mcr.GetDirectObject(PdfName.MCID);
        // The PG entry of a marked-content reference is optional. When it is
        // missing, keep using the page handed in by the caller instead of
        // recursing with a null page (which would fail during extraction).
        PdfDictionary mcrPage = mcr.GetAsDict(PdfName.PG);
        if (mcrPage == null) {
            mcrPage = page;
        }
        ParseTag(tag, mcid, mcrPage);
    }
}