/// <summary>
/// Core parse routine: reads markup from <paramref name="input"/>, builds the
/// node tree, runs the configured clean-up passes and, when the lexer recorded
/// no errors, either bursts the document into slides or pretty-prints it to
/// <paramref name="output"/>.
/// </summary>
/// <param name="input">Stream to parse. When null, nothing is parsed and null is returned.</param>
/// <param name="output">
/// Stream that receives the pretty-printed tree. Only used when slide bursting
/// is disabled; when null, nothing is printed.
/// </param>
/// <param name="messages">
/// Collection assigned to the lexer; its error count decides whether any
/// output is produced.
/// </param>
/// <returns>
/// The root of the parsed tree (returned even when errors were recorded), or
/// null when <paramref name="input"/> is null or the tree fails an integrity check.
/// </returns>
internal Node ParseInternal(Stream input, Stream output, TidyMessageCollection messages)
{
    /* sink for the normal (non-slide) output path */
    Out sink = new OutImpl();

    /* make the configuration self-consistent before it is consulted */
    _options.Adjust();

    /* without an input stream there is nothing to parse */
    if (input == null)
    {
        return null;
    }

    var source = new ClsStreamInImpl(input, _options.CharEncoding, _options.TabSize);
    var lexer = new Lexer(source, _options);
    lexer.Messages = messages;

    /* back-reference so the input stream can report character encoding errors */
    lexer.Input.Lexer = lexer;

    Node document;
    if (_options.XmlTags)
    {
        /* generic XML documents: the doctype is left untouched */
        document = ParserImpl.ParseXmlDocument(lexer);
    }
    else
    {
        document = ParserImpl.ParseDocument(lexer);
        if (!document.CheckNodeIntegrity())
        {
            Report.BadTree(lexer);
            return null;
        }

        var cleaner = new Clean(_options.TagTable);

        /* collapse nested emphasis such as <b><b> ... </b> ...</b> */
        cleaner.NestedEmphasis(document);

        /* tidy up <dir>indented text</dir> and similar constructs */
        cleaner.List2Bq(document);
        cleaner.Bq2Div(document);

        /* swap i for em and b for strong when logical emphasis is requested */
        if (_options.LogicalEmphasis)
        {
            cleaner.EmFromI(document);
        }

        if (_options.Word2000)
        {
            if (cleaner.IsWord2000(document, _options.TagTable))
            {
                /* prune Word2000's <![if ...]> ... <![endif]> sections */
                cleaner.DropSections(lexer, document);

                /* drop style & class attributes and empty p, span elements */
                cleaner.CleanWord2000(lexer, document);
            }
        }

        /* replace presentational markup by style rules */
        bool restyle = _options.MakeClean || _options.DropFontTags;
        if (restyle)
        {
            cleaner.CleanTree(lexer, document);
        }

        /* the clean-up passes rewrite the tree, so verify it again */
        if (!document.CheckNodeIntegrity())
        {
            Report.BadTree(lexer);
            return null;
        }

        /* captured before the doctype is patched; used for the version report below */
        Node foundDoctype = document.FindDocType();

        if (document.Content != null)
        {
            if (!_options.Xhtml)
            {
                lexer.FixDocType(document);
            }
            else
            {
                lexer.SetXhtmlDocType(document);
            }

            if (_options.TidyMark)
            {
                lexer.AddGenerator(document);
            }
        }

        /* ensure presence of the initial <?XML version="1.0"?> */
        if (_options.XmlOut && _options.XmlPi)
        {
            lexer.FixXmlPi(document);
        }

        /* checked again rather than reusing the earlier result: the calls above may have changed the tree */
        if (document.Content != null)
        {
            Report.ReportVersion(lexer, foundDoctype);
            Report.ReportNumWarnings(lexer);
        }
    }

    if (lexer.Messages.Errors > 0)
    {
        Report.NeedsAuthorIntervention(lexer);
    }

    sink.State = StreamIn.FSM_ASCII;
    sink.Encoding = _options.CharEncoding;

    /* output is only produced for documents without errors */
    if (lexer.Messages.Errors == 0)
    {
        if (_options.BurstSlides)
        {
            /*
               discard the document type to avoid a potential clash with
               markup introduced when bursting into slides
            */
            Node oldDoctype = document.FindDocType();
            if (oldDoctype != null)
            {
                Node.DiscardElement(oldDoctype);
            }

            /* slides use transitional features */
            lexer.Versions |= HtmlVersion.Html40Loose;

            /* and patch up the doctype to match */
            if (!_options.Xhtml)
            {
                lexer.FixDocType(document);
            }
            else
            {
                lexer.SetXhtmlDocType(document);
            }

            /* locate the body element, which may be implicit */
            Node body = document.FindBody(_options.TagTable);
            if (body == null)
            {
                Report.MissingBody(lexer);
            }
            else
            {
                var slidePrinter = new PPrint(_options);
                var slideCount = slidePrinter.CountSlides(body);
                Report.ReportNumberOfSlides(lexer, slideCount);
                slidePrinter.CreateSlides(lexer, document);
            }
        }
        else if (output != null)
        {
            var printer = new PPrint(_options);
            sink.Output = output;

            if (!_options.XmlTags)
            {
                printer.PrintTree(sink, 0, 0, lexer, document);
            }
            else
            {
                printer.PrintXmlTree(sink, 0, 0, lexer, document);
            }

            printer.FlushLine(sink, 0);
        }
    }

    Report.ErrorSummary(lexer);
    return document;
}