public virtual Node GetToken(short mode)
{
int c;
int badcomment = 0;
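/* out-flag set by ParseAttrs when an XML-style empty tag ("<name/>") is parsed */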
var isempty = new MutableBoolean();
if (Pushed)
{
/* duplicate inlines in preference to pushed text nodes when appropriate */
if (Token.Type != Node.TEXT_NODE || (Insert == -1 && Inode == null))
{
Pushed = false;
return Token;
}
}
/* at the start of block elements, unclosed inline
elements are inserted into the token stream */
if (Insert != -1 || Inode != null)
{
return InsertedToken();
}
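/* record the position of the token and reset the text span */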
Lines = Input.CursorLine;
Columns = Input.CursorColumn;
Waswhite = false;
Txtstart = Lexsize;
Txtend = Lexsize;
while (true)
{
c = Input.ReadChar();
if (c == StreamIn.END_OF_STREAM)
{
break;
}
if (Insertspace && mode != IGNORE_WHITESPACE)
{
AddCharToLexer(' ');
Waswhite = true;
Insertspace = false;
}
/* treat \r\n as \n and \r as \n */
if (c == '\r')
{
c = Input.ReadChar();
if (c != '\n')
{
Input.UngetChar(c);
}
c = '\n';
}
AddCharToLexer(c);
short map;
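/* dispatch on the lexer's current state */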
switch (State)
{
case LEX_CONTENT:
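/* plain character data: collapse whitespace and watch for '<' and '&' */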
map = Map((char) c);
/*
Discard whitespace if appropriate. It's cheaper
to do this here rather than in parser methods
for elements that don't have mixed content.
*/
if (((map & WHITE) != 0) && (mode == IGNORE_WHITESPACE) && Lexsize == Txtstart + 1)
{
--Lexsize;
Waswhite = false;
Lines = Input.CursorLine;
Columns = Input.CursorColumn;
continue;
}
if (c == '<')
{
State = LEX_GT;
continue;
}
if ((map & WHITE) != 0)
{
/* was previous char white? */
if (Waswhite)
{
if (mode != PREFORMATTED && mode != IGNORE_MARKUP)
{
--Lexsize;
Lines = Input.CursorLine;
Columns = Input.CursorColumn;
}
}
/* prev char wasn't white */
else
{
Waswhite = true;
if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ')
{
ChangeChar((byte) ' ');
}
}
continue;
}
if (c == '&' && mode != IGNORE_MARKUP)
{
ParseEntity(mode);
}
/* this is needed to avoid trimming trailing whitespace */
if (mode == IGNORE_WHITESPACE)
{
mode = MIXED_CONTENT;
}
Waswhite = false;
continue;
case LEX_GT:
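/* seen '<': work out what kind of markup, if any, follows */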
if (c == '/')
{
c = Input.ReadChar();
if (c == StreamIn.END_OF_STREAM)
{
Input.UngetChar(c);
continue;
}
AddCharToLexer(c);
map = Map((char) c);
if ((map & LETTER) != 0)
{
Lexsize -= 3;
Txtend = Lexsize;
Input.UngetChar(c);
State = LEX_ENDTAG;
Lexbuf[Lexsize] = (byte) '\x0000'; /* debug */
Input.CursorColumn -= 2;
/* if there is text before the "</", return it now */
if (Txtend > Txtstart)
{
/* trim space char before end tag */
if (mode == IGNORE_WHITESPACE && Lexbuf[Lexsize - 1] == (byte) ' ')
{
Lexsize -= 1;
Txtend = Lexsize;
}
Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
return Token;
}
continue; /* no text so keep going */
}
/* otherwise treat as CDATA */
Waswhite = false;
State = LEX_CONTENT;
continue;
}
if (mode == IGNORE_MARKUP)
{
/* otherwise treat as CDATA */
Waswhite = false;
State = LEX_CONTENT;
continue;
}
/*
look out for comments, doctype or marked sections;
this isn't quite right, but it's getting there ...
*/
if (c == '!')
{
c = Input.ReadChar();
if (c == '-')
{
c = Input.ReadChar();
if (c == '-')
{
State = LEX_COMMENT; /* comment */
Lexsize -= 2;
Txtend = Lexsize;
/* if there is text before the '<', return it now */
if (Txtend > Txtstart)
{
Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
return Token;
}
Txtstart = Lexsize;
continue;
}
Report.Warning(this, null, null, Report.MALFORMED_COMMENT);
}
else if (c == 'd' || c == 'D')
{
State = LEX_DOCTYPE; /* doctype */
Lexsize -= 2;
Txtend = Lexsize;
mode = IGNORE_WHITESPACE;
/* skip until whitespace or '>' */
for (;;)
{
c = Input.ReadChar();
if (c == StreamIn.END_OF_STREAM || c == '>')
{
Input.UngetChar(c);
break;
}
map = Map((char) c);
if ((map & WHITE) == 0)
{
continue;
}
/* and skip to end of whitespace */
for (;;)
{
c = Input.ReadChar();
if (c == StreamIn.END_OF_STREAM || c == '>')
{
Input.UngetChar(c);
break;
}
map = Map((char) c);
if ((map & WHITE) != 0)
{
continue;
}
Input.UngetChar(c);
break;
}
break;
}
/* if there is text before the '<', return it now */
if (Txtend > Txtstart)
{
Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
return Token;
}
Txtstart = Lexsize;
continue;
}
else if (c == '[')
{
/* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
Lexsize -= 2;
State = LEX_SECTION;
Txtend = Lexsize;
/* if there is text before the '<', return it now */
if (Txtend > Txtstart)
{
Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
return Token;
}
Txtstart = Lexsize;
continue;
}
/* otherwise swallow chars up to and including next '>' */
while (true)
{
c = Input.ReadChar();
if (c == '>')
{
break;
}
if (c == StreamIn.END_OF_STREAM)
{
Input.UngetChar(c);
break;
}
}
Lexsize -= 2;
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
continue;
}
/* processing instructions */
if (c == '?')
{
Lexsize -= 2;
State = LEX_PROCINSTR;
Txtend = Lexsize;
/* if there is text before the '<', return it now */
if (Txtend > Txtstart)
{
Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
return Token;
}
Txtstart = Lexsize;
continue;
}
/* Microsoft ASP, e.g. <% ... server-code ... %> */
if (c == '%')
{
Lexsize -= 2;
State = LEX_ASP;
Txtend = Lexsize;
/* if there is text before the '<', return it now */
if (Txtend > Txtstart)
{
Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
return Token;
}
Txtstart = Lexsize;
continue;
}
/* Netscape's JSTE, e.g. <# ... server-code ... #> */
if (c == '#')
{
Lexsize -= 2;
State = LEX_JSTE;
Txtend = Lexsize;
/* if there is text before the '<', return it now */
if (Txtend > Txtstart)
{
Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
return Token;
}
Txtstart = Lexsize;
continue;
}
map = Map((char) c);
/* check for start tag */
if ((map & LETTER) != 0)
{
Input.UngetChar(c); /* push back letter */
Lexsize -= 2; /* discard "<" + letter */
Txtend = Lexsize;
State = LEX_STARTTAG; /* ready to read tag name */
/* if there is text before the '<', return it now */
if (Txtend > Txtstart)
{
Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
return Token;
}
continue; /* no text so keep going */
}
/* otherwise treat as CDATA */
State = LEX_CONTENT;
Waswhite = false;
continue;
case LEX_ENDTAG:
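/* seen "</" followed by a letter: parse the tag name, then skip to '>' */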
Txtstart = Lexsize - 1;
Input.CursorColumn += 2;
c = ParseTagName();
Token = NewNode(Node.END_TAG, Lexbuf, Txtstart, Txtend,
GetString(Lexbuf, Txtstart, Txtend - Txtstart));
Lexsize = Txtstart;
Txtend = Txtstart;
/* skip to '>' */
while (c != '>')
{
c = Input.ReadChar();
if (c == StreamIn.END_OF_STREAM)
{
break;
}
}
if (c == StreamIn.END_OF_STREAM)
{
Input.UngetChar(c);
continue;
}
State = LEX_CONTENT;
Waswhite = false;
return Token; /* the endtag token */
case LEX_STARTTAG:
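/* parse the tag name, then any attributes up to the closing '>' */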
Txtstart = Lexsize - 1; /* set txtstart to first letter */
c = ParseTagName();
isempty.Val = false;
AttVal attributes = null;
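/* isempty.Val is still false at this point, so a START_TAG is created;
the node type is corrected below once ParseAttrs has reported on "/>" */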
Token = NewNode((isempty.Val ? Node.START_END_TAG : Node.START_TAG), Lexbuf, Txtstart, Txtend,
GetString(Lexbuf, Txtstart, Txtend - Txtstart));
/* parse attributes, consuming closing ">" */
if (c != '>')
{
if (c == '/')
{
Input.UngetChar(c);
}
attributes = ParseAttrs(isempty);
}
if (isempty.Val)
{
Token.Type = Node.START_END_TAG;
}
Token.Attributes = attributes;
Lexsize = Txtstart;
Txtend = Txtstart;
/* swallow newline following start tag */
/* special check needed for CRLF sequence */
/* this doesn't apply to empty elements */
if (ExpectsContent(Token) || Token.Tag == Options.TagTable.TagBr)
{
c = Input.ReadChar();
if (c == '\r')
{
c = Input.ReadChar();
if (c != '\n')
{
Input.UngetChar(c);
}
}
else if (c != '\n' && c != '\f')
{
Input.UngetChar(c);
}
Waswhite = true; /* to swallow leading whitespace */
}
else
{
Waswhite = false;
}
State = LEX_CONTENT;
if (Token.Tag == null)
{
Report.Error(this, null, Token, Report.UNKNOWN_ELEMENT);
}
else if (!Options.XmlTags)
{
Versions &= Token.Tag.Versions;
if ((Token.Tag.Versions & HtmlVersion.Proprietary) != 0)
{
if (!Options.MakeClean &&
(Token.Tag == Options.TagTable.TagNobr || Token.Tag == Options.TagTable.TagWbr))
{
Report.Warning(this, null, Token, Report.PROPRIETARY_ELEMENT);
}
}
if (Token.Tag.CheckAttribs != null)
{
Token.CheckUniqueAttributes(this);
Token.Tag.CheckAttribs.Check(this, Token);
}
else
{
Token.CheckAttributes(this);
}
}
return Token; /* return start tag */
case LEX_COMMENT:
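/* seen "<!--": scan for the "-->" that ends the comment */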
if (c != '-')
{
continue;
}
c = Input.ReadChar();
AddCharToLexer(c);
if (c != '-')
{
continue;
}
while (true)
{
c = Input.ReadChar();
if (c == '>')
{
if (badcomment != 0)
{
Report.Warning(this, null, null, Report.MALFORMED_COMMENT);
}
Txtend = Lexsize - 2; /* AQ 8Jul2000 */
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
Waswhite = false;
Token = NewNode(Node.COMMENT_TAG, Lexbuf, Txtstart, Txtend);
/* now look for a line break */
c = Input.ReadChar();
if (c == '\r')
{
c = Input.ReadChar();
if (c != '\n')
{
Token.Linebreak = true;
}
}
if (c == '\n')
{
Token.Linebreak = true;
}
else
{
Input.UngetChar(c);
}
return Token;
}
/* note the position of the first "--" error within the comment */
if (badcomment == 0)
{
Lines = Input.CursorLine;
Columns = Input.CursorColumn - 3;
}
badcomment++;
if (Options.FixComments)
{
Lexbuf[Lexsize - 2] = (byte) '=';
}
AddCharToLexer(c);
/* if '-' then look for '>' to end the comment */
if (c != '-')
{
break;
}
}
/* otherwise continue to look for --> */
Lexbuf[Lexsize - 2] = (byte) '=';
continue;
case LEX_DOCTYPE:
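/* collapse runs of whitespace and scan for the '>' that ends the doctype */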
map = Map((char) c);
if ((map & WHITE) != 0)
{
if (Waswhite)
{
Lexsize -= 1;
}
Waswhite = true;
}
else
{
Waswhite = false;
}
if (c != '>')
{
continue;
}
Lexsize -= 1;
Txtend = Lexsize;
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
Waswhite = false;
Token = NewNode(Node.DOC_TYPE_TAG, Lexbuf, Txtstart, Txtend);
/* make a note of the version named by the doctype */
Doctype = FindGivenVersion(Token);
return Token;
case LEX_PROCINSTR:
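/* processing instruction: "<?php" gets its own state; otherwise scan for the terminator */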
if (Lexsize - Txtstart == 3)
{
if ((GetString(Lexbuf, Txtstart, 3)).Equals("php"))
{
State = LEX_PHP;
continue;
}
}
if (Options.XmlPIs)
{
/* insist on ?> as terminator */
if (c != '?')
{
continue;
}
/* now look for '>' */
c = Input.ReadChar();
if (c == StreamIn.END_OF_STREAM)
{
Report.Warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
Input.UngetChar(c);
continue;
}
AddCharToLexer(c);
}
if (c != '>')
{
continue;
}
Lexsize -= 1;
Txtend = Lexsize;
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
Waswhite = false;
Token = NewNode(Node.PROC_INS_TAG, Lexbuf, Txtstart, Txtend);
return Token;
case LEX_ASP:
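/* scan for the "%>" that ends an ASP section */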
if (c != '%')
{
continue;
}
/* now look for '>' */
c = Input.ReadChar();
if (c != '>')
{
Input.UngetChar(c);
continue;
}
Lexsize -= 1;
Txtend = Lexsize;
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
Waswhite = false;
Token = NewNode(Node.ASP_TAG, Lexbuf, Txtstart, Txtend);
return Token;
case LEX_JSTE:
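/* scan for the "#>" that ends a JSTE section */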
if (c != '#')
{
continue;
}
/* now look for '>' */
c = Input.ReadChar();
if (c != '>')
{
Input.UngetChar(c);
continue;
}
Lexsize -= 1;
Txtend = Lexsize;
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
Waswhite = false;
Token = NewNode(Node.JSTE_TAG, Lexbuf, Txtstart, Txtend);
return Token;
case LEX_PHP:
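/* scan for the "?>" that ends a PHP section */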
if (c != '?')
{
continue;
}
/* now look for '>' */
c = Input.ReadChar();
if (c != '>')
{
Input.UngetChar(c);
continue;
}
Lexsize -= 1;
Txtend = Lexsize;
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
Waswhite = false;
Token = NewNode(Node.PHP_TAG, Lexbuf, Txtstart, Txtend);
return Token;
case LEX_SECTION:
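/* marked section "<![ ... ]>"; "<![CDATA[" switches to the CDATA state */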
if (c == '[')
{
if (Lexsize == (Txtstart + 6) && (GetString(Lexbuf, Txtstart, 6)).Equals("CDATA["))
{
State = LEX_CDATA;
Lexsize -= 6;
continue;
}
}
if (c != ']')
{
continue;
}
/* now look for '>' */
c = Input.ReadChar();
if (c != '>')
{
Input.UngetChar(c);
continue;
}
Lexsize -= 1;
Txtend = Lexsize;
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
Waswhite = false;
Token = NewNode(Node.SECTION_TAG, Lexbuf, Txtstart, Txtend);
return Token;
case LEX_CDATA:
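/* scan for the "]]>" that ends a CDATA section */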
if (c != ']')
{
continue;
}
/* now look for ']' */
c = Input.ReadChar();
if (c != ']')
{
Input.UngetChar(c);
continue;
}
/* now look for '>' */
c = Input.ReadChar();
if (c != '>')
{
Input.UngetChar(c);
continue;
}
Lexsize -= 1;
Txtend = Lexsize;
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
Waswhite = false;
Token = NewNode(Node.CDATA_TAG, Lexbuf, Txtstart, Txtend);
return Token;
}
}
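/* end of stream: return any pending text or unterminated comment as a final token */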
if (State == LEX_CONTENT)
{
/* text string */
Txtend = Lexsize;
if (Txtend > Txtstart)
{
Input.UngetChar(c);
if (Lexbuf[Lexsize - 1] == (byte) ' ')
{
Lexsize -= 1;
Txtend = Lexsize;
}
Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
return Token;
}
}
else if (State == LEX_COMMENT)
{
/* comment */
if (c == StreamIn.END_OF_STREAM)
{
Report.Warning(this, null, null, Report.MALFORMED_COMMENT);
}
Txtend = Lexsize;
Lexbuf[Lexsize] = (byte) '\x0000';
State = LEX_CONTENT;
Waswhite = false;
Token = NewNode(Node.COMMENT_TAG, Lexbuf, Txtstart, Txtend);
return Token;
}
return null;
}