/// <summary>
/// Reads tokens from the lexical analyzer until EOF and builds an XML element
/// tree out of them under an artificial "html" root element.
/// </summary>
/// <returns>
/// The artificial root element, or - when that root ended up with exactly one
/// child element whose local name is "html" - that child element instead.
/// </returns>
/// <remarks>
/// Tag names are lower-cased with the invariant culture so that the result does
/// not depend on the current thread culture (for example, under a Turkish
/// culture "IMG".ToLower() yields a dotless i rather than "img").
/// </remarks>
private XmlElement ParseHtmlContent()
{
    // Create an artificial root element to be able to group multiple top-level elements.
    // We create an "html" element which may be a duplicate of the real HTML element, which is ok,
    // as HtmlConverter will swallow it painlessly.
    XmlElement htmlRootElement = _document.CreateElement("html", XhtmlNamespace);
    OpenStructuringElement(htmlRootElement);

    while (_htmlLexicalAnalyzer.NextTokenType != HtmlTokenType.EOF)
    {
        if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.OpeningTagStart)
        {
            _htmlLexicalAnalyzer.GetNextTagToken();
            if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
            {
                // Culture-invariant lower-casing: element names are identifiers, not linguistic text.
                string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();
                _htmlLexicalAnalyzer.GetNextTagToken();

                // Create an element
                XmlElement htmlElement = _document.CreateElement(htmlElementName, XhtmlNamespace);

                // Parse element attributes
                ParseAttributes(htmlElement);

                if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.EmptyTagEnd || HtmlSchema.IsEmptyElement(htmlElementName))
                {
                    // It is an element without content (because of an explicit slash
                    // or based on implicit knowledge about html).
                    AddEmptyElement(htmlElement);
                }
                else if (HtmlSchema.IsInlineElement(htmlElementName))
                {
                    // Elements known as formatting are pushed to some special
                    // pending stack, which allows them to be transferred
                    // over block tags - by doing this we convert
                    // overlapping tags into a normal hierarchical element structure.
                    OpenInlineElement(htmlElement);
                }
                else if (HtmlSchema.IsBlockElement(htmlElementName) || HtmlSchema.IsKnownOpenableElement(htmlElementName))
                {
                    // This includes no-scope elements
                    OpenStructuringElement(htmlElement);
                }
                else
                {
                    // Do nothing. Skip the whole opening tag.
                    // Ignoring all unknown elements on their start tags.
                    // Thus we will ignore them on the closing tag as well.
                    // Anyway we don't know what to do with them on conversion to Xaml.
                }
            }
            else
            {
                // Note that the token following an opening angle bracket must be a name - the lexical analyzer must guarantee that.
                // Otherwise - we skip the angle bracket and continue parsing the content as if it is just text.
                // Add the following assertion here, right? Or output "<" as a text run instead?:
                // InvariantAssert(false, "Angle bracket without a following name is not expected");
            }
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.ClosingTagStart)
        {
            _htmlLexicalAnalyzer.GetNextTagToken();
            if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Name)
            {
                // Same culture-invariant normalization as for opening tags, so that
                // the closing name is normalized the same way as the opening name.
                string htmlElementName = _htmlLexicalAnalyzer.NextToken.ToLowerInvariant();

                // Skip the name token. Assume that the following token is end of tag,
                // but do not check this. If it is not true, we simply ignore one token
                // - this is our recovery from bad xml in this case.
                _htmlLexicalAnalyzer.GetNextTagToken();

                CloseElement(htmlElementName);
            }
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Text)
        {
            AddTextContent(_htmlLexicalAnalyzer.NextToken);
        }
        else if (_htmlLexicalAnalyzer.NextTokenType == HtmlTokenType.Comment)
        {
            AddComment(_htmlLexicalAnalyzer.NextToken);
        }

        // Advance to the next content token; any other token type is skipped here.
        _htmlLexicalAnalyzer.GetNextContentToken();
    }

    // Get rid of the artificial root element when it wraps exactly one child
    // element that is itself named "html" (compared ignoring case, culture-independently).
    if (htmlRootElement.FirstChild is XmlElement &&
        htmlRootElement.FirstChild == htmlRootElement.LastChild &&
        string.Equals(htmlRootElement.FirstChild.LocalName, "html", StringComparison.OrdinalIgnoreCase))
    {
        htmlRootElement = (XmlElement)htmlRootElement.FirstChild;
    }

    return htmlRootElement;
}