/**
* Searches for a tag in a page and writes the text of the matching marked
* content, XML-escaped, to the output writer.
*
* @param tag
* the name of the tag (not inspected here; only forwarded on recursion)
* @param obj
* an identifier to find the marked content: a number (MCID), an array
* of identifiers, or a marked-content reference dictionary
* @param page
* a page dictionary; may be null when obj is (or contains only)
* reference dictionaries that carry their own /Pg entry
* @throws IOException
* may be raised while the page content streams are being read
*/
public void ParseTag(String tag, PdfObject obj, PdfDictionary page)
{
    // if the identifier is a number, we can extract the content right away
    if (obj is PdfNumber)
    {
        // Without a page there is no content stream to search in.
        if (page == null)
        {
            return;
        }
        PdfNumber mcid = (PdfNumber)obj;
        RenderFilter filter = new MarkedContentRenderFilter(mcid.IntValue);
        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
        FilteredTextRenderListener listener = new FilteredTextRenderListener(strategy, new RenderFilter[] { filter });
        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
        // The content is only read in this branch; arrays and dictionaries
        // are resolved down to a number (and a page) first.
        processor.ProcessContent(GetPageContentBytes(page), page.GetAsDict(PdfName.RESOURCES));
        outp.Write(SimpleXMLParser.EscapeXML(listener.GetResultantText(), true));
    }
    // if the identifier is an array, we call the ParseTag method
    // recursively, separating the results with line breaks
    else if (obj is PdfArray)
    {
        PdfArray arr = (PdfArray)obj;
        int n = arr.Size;
        for (int i = 0; i < n; i++)
        {
            ParseTag(tag, arr[i], page);
            if (i < n - 1)
            {
                outp.WriteLine();
            }
        }
    }
    // if the identifier is a dictionary, we take the MCID from it and, when
    // present, the page it refers to
    else if (obj is PdfDictionary)
    {
        PdfDictionary mcr = (PdfDictionary)obj;
        // /Pg is optional in a reference dictionary; when it is missing the
        // content lives on the page we were already given.
        PdfDictionary target = mcr.GetAsDict(PdfName.PG) ?? page;
        ParseTag(tag, mcr.GetDirectObject(PdfName.MCID), target);
    }
}

/**
* Collects the content bytes of a page, whether /Contents is a single
* stream or an array of streams.
*
* @param page
* a page dictionary (must not be null)
* @return the concatenated content bytes; an empty array when the page
* has no readable content stream
*/
private static byte[] GetPageContentBytes(PdfDictionary page)
{
    PdfObject contents = PdfReader.GetPdfObject(page.Get(PdfName.CONTENTS));
    if (contents is PRStream)
    {
        return PdfReader.GetStreamBytes((PRStream)contents);
    }
    System.IO.MemoryStream buffer = new System.IO.MemoryStream();
    if (contents is PdfArray)
    {
        PdfArray parts = (PdfArray)contents;
        for (int i = 0; i < parts.Size; i++)
        {
            PRStream part = PdfReader.GetPdfObject(parts[i]) as PRStream;
            if (part == null)
            {
                continue;
            }
            byte[] bytes = PdfReader.GetStreamBytes(part);
            buffer.Write(bytes, 0, bytes.Length);
            // Keep tokens of adjacent streams from running into each other.
            buffer.WriteByte((byte)'\n');
        }
    }
    return buffer.ToArray();
}