// Decodes GB18030 bytes into UTF-16 chars through an Encoding.EncodingCharBuffer,
// first finishing any partial sequence saved in the decoder (bLeftOver1..bLeftOver4).
//
// bytes / byteCount : input bytes; bytes is asserted non-null, byteCount non-negative.
// chars / charCount : output chars; chars may be null, which this method treats as
//                     "count only": the decoder's leftover bytes are then never modified.
//                     charCount is asserted non-negative.
// baseDecoder       : cast to GB18030Decoder; may be null (it is null-checked before
//                     every use), in which case no state is read or saved.
//
// Returns buffer.Count (presumably the number of chars produced or counted; the
// buffer type is not defined in this part of the file).
//
// Side effects when the decoder is non-null: decoder.m_bytesUsed is set from
// buffer.BytesUsed, and, only when chars is non-null, byte1..byte4 are stored back
// into decoder.bLeftOver1..bLeftOver4.
//
// IsGBLeadByte, IsGBTwoByteTrailing, IsGBFourByteTrailing, GetFourBytesOffset and the
// GB* constants are defined elsewhere; their exact ranges are assumed to follow the
// GB18030 layout -- TODO confirm against their definitions.
internal override unsafe int GetChars(byte* bytes, int byteCount,
char* chars, int charCount, DecoderNLS baseDecoder)
{
// Parameters are only asserted here; the original note says an internal caller has
// already validated them.
BCLDebug.Assert(bytes != null, "[GB18030Encoding.GetChars]bytes is null");
BCLDebug.Assert(byteCount >= 0, "[GB18030Encoding.GetChars]byteCount is negative");
// Deliberately not asserted, because a null chars pointer means "count only":
// BCLDebug.Assert(chars != null, "[GB18030Encoding.GetChars]chars is null");
BCLDebug.Assert(charCount >= 0, "[GB18030Encoding.GetChars]charCount is negative");
// Work with the GB18030-specific decoder type (may be null).
GB18030Decoder decoder = (GB18030Decoder)baseDecoder;
// The buffer wraps both the input bytes and the output chars.
Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
this, decoder, chars, charCount, bytes, byteCount);
// Local copies of the pending sequence, so the decoder is only written at defined
// points. Each slot holds one byte value, or -1 when the slot is empty.
short byte1 = -1;
short byte2 = -1;
short byte3 = -1;
short byte4 = -1;
// Phase 1: finish whatever partial sequence the previous call left in the decoder.
if (decoder != null && decoder.bLeftOver1 != -1)
{
// Copy the saved bytes into the local slots.
byte1 = decoder.bLeftOver1;
byte2 = decoder.bLeftOver2;
byte3 = decoder.bLeftOver3;
byte4 = decoder.bLeftOver4;
// Loop until slot 1 is empty. An invalid sequence only consumes its first byte;
// the remaining slots are shifted down and examined again.
while (byte1 != -1)
{
// Slot 1 is not a lead byte (for example a trail byte left behind by a rejected
// 4-byte sequence): emit it on its own, shift the slots down and retry.
if (!IsGBLeadByte(byte1))
{
// One output char is needed either way: the ASCII value or the fallback result.
if (byte1 <= 0x7f)
{
if (!buffer.AddChar((char)byte1)) // It's ASCII
break;
}
else
{
if (!buffer.Fallback((byte)byte1)) // Not a valid byte
break;
}
byte1 = byte2;
byte2 = byte3;
byte3 = byte4;
byte4 = -1;
continue;
}
// Slot 1 is a lead byte. Keep reading until there is a second byte, and, if
// that second byte is a four-byte trailing byte, until slot 4 is filled too.
while (byte2 == -1 ||
(IsGBFourByteTrailing(byte2) && byte4 == -1))
{
// Is there any input left?
if (!buffer.MoreData)
{
// Input exhausted. decoder is non-null here (checked on entry to this branch).
if (!decoder.MustFlush)
{
// Not flushing: keep the partial sequence for the next call,
// but do not touch the decoder when only counting.
if (chars != null)
{
decoder.bLeftOver1 = byte1;
decoder.bLeftOver2 = byte2;
decoder.bLeftOver3 = byte3;
decoder.bLeftOver4 = byte4;
}
// Early return: nothing is left for the main loop below to read.
decoder.m_bytesUsed = buffer.BytesUsed;
return buffer.Count;
}
// Flushing: stop reading and let the checks below handle the incomplete
// sequence. The unread slots are still -1, so presumably none of the
// trail/lead tests match and the invalid-sequence branch falls back slot 1
// and retries the rest (a pending third byte could be ASCII and is then
// emitted on a later iteration).
break;
}
// Fill the first empty slot, in order.
if (byte2 == -1) byte2 = buffer.GetNextByte();
else if (byte3 == -1) byte3 = buffer.GetNextByte();
else byte4 = buffer.GetNextByte();
}
// Classify what we have: a 2-byte sequence, a 4-byte sequence, or garbage.
if (IsGBTwoByteTrailing(byte2))
{
// Two-byte sequence: (lead << 8) | trail indexes mapBytesToUnicode.
int iTwoBytes = byte1 << 8;
iTwoBytes |= unchecked((byte)byte2);
// The second AddChar argument appears to be the number of input bytes the
// char represents -- TODO confirm against EncodingCharBuffer.
if (!buffer.AddChar(this.mapBytesToUnicode[iTwoBytes], 2))
break;
// Both bytes are consumed.
byte1 = -1;
byte2 = -1;
}
else if (IsGBFourByteTrailing(byte2) &&
IsGBLeadByte(byte3) &&
IsGBFourByteTrailing(byte4))
{
// Four-byte sequence: reduce the four bytes to a single offset.
int sFourBytesOffset = GetFourBytesOffset(
byte1, byte2, byte3, byte4);
// The offset range decides how the sequence is mapped.
if (sFourBytesOffset <= GBLast4ByteCode)
{
// Offsets up to GBLast4ByteCode map to one char via map4BytesToUnicode.
if (!buffer.AddChar(map4BytesToUnicode[sFourBytesOffset], 4))
break;
}
else if (sFourBytesOffset >= GBSurrogateOffset &&
sFourBytesOffset <= GBLastSurrogateOffset)
{
// Supplementary range: rebase the offset to zero, then split it into a
// surrogate pair (high = 0xd800 + offset / 0x400, low = 0xdc00 + offset % 0x400).
sFourBytesOffset -= GBSurrogateOffset;
if (!buffer.AddChar(unchecked((char)(0xd800 + (sFourBytesOffset / 0x400))),
unchecked((char)(0xdc00 + (sFourBytesOffset % 0x400))), 4))
break;
}
else
{
// Well-formed four-byte sequence with no mapping in either range above:
// hand all four bytes to the fallback.
if (!buffer.Fallback((byte)byte1, (byte)byte2, (byte)byte3, (byte)byte4))
break;
}
// All four bytes are consumed.
byte1 = -1;
byte2 = -1;
byte3 = -1;
byte4 = -1;
}
else
{
// Not a valid sequence: fall back the first byte only and retry the rest.
if (!buffer.Fallback((byte)byte1))
break;
// Shift the remaining slots down by one.
byte1 = byte2;
byte2 = byte3;
byte3 = byte4;
byte4 = -1;
}
}
}
// NOTE(review): the loop above can exit through `break` (AddChar/Fallback returned
// false) while byte1..byte4 still hold pending bytes. The main loop below still runs
// in that case and can overwrite those slots when it stashes a new partial sequence.
// Whether this is reachable depends on EncodingCharBuffer behaviour -- confirm.
//
// Phase 2: decode the remaining input bytes.
while (buffer.MoreData)
{
byte ch = buffer.GetNextByte();
// ASCII case is easy
if (ch <= 0x7f)
{
// Emit ASCII as-is.
if (!buffer.AddChar((char)ch))
break; // No room in convert buffer, so stop
}
// See if it is a lead byte
else if (IsGBLeadByte(ch))
{
// ch is a lead byte; is there at least one more input byte?
if (buffer.MoreData)
{
byte ch2 = buffer.GetNextByte();
if (IsGBTwoByteTrailing(ch2))
{
// Two-byte sequence: (lead << 8) | trail indexes mapBytesToUnicode.
int iTwoBytes = ch << 8;
iTwoBytes |= ch2;
if (!buffer.AddChar(this.mapBytesToUnicode[iTwoBytes], 2))
break;
}
else if (IsGBFourByteTrailing(ch2))
{
// Possible four-byte sequence; two more input bytes are required.
if (buffer.EvenMoreData(2))
{
// Read both and check whether they complete a valid four-byte sequence.
byte ch3 = buffer.GetNextByte();
byte ch4 = buffer.GetNextByte();
if (IsGBLeadByte(ch3) &&
IsGBFourByteTrailing(ch4))
{
// Four-byte sequence: reduce the four bytes to a single offset.
int sFourBytesOffset = GetFourBytesOffset(ch, ch2, ch3, ch4);
// The offset range decides how the sequence is mapped.
if (sFourBytesOffset <= GBLast4ByteCode)
{
// Offsets up to GBLast4ByteCode map to one char via map4BytesToUnicode.
if (!buffer.AddChar(map4BytesToUnicode[sFourBytesOffset],4))
break;
}
else if (sFourBytesOffset >= GBSurrogateOffset &&
sFourBytesOffset <= GBLastSurrogateOffset)
{
// Supplementary range: rebase the offset to zero, then split it into a
// surrogate pair (high = 0xd800 + offset / 0x400, low = 0xdc00 + offset % 0x400).
sFourBytesOffset -= GBSurrogateOffset;
if (!buffer.AddChar(unchecked((char)(0xd800 + (sFourBytesOffset / 0x400))),
unchecked((char)(0xdc00 + (sFourBytesOffset % 0x400))),4))
break;
}
else
{
// Well-formed four-byte sequence with no mapping: fall back all four bytes.
if (!buffer.Fallback(ch, ch2, ch3, ch4))
break;
}
}
else
{
// Not a valid 2- or 4-byte sequence. AdjustBytes(-3) presumably rewinds the
// input by three bytes so ch2..ch4 are read again; only ch is fallen back.
buffer.AdjustBytes(-3);
if (!buffer.Fallback(ch))
break;
}
}
else
{
// Fewer than two bytes remain: we hold ch and ch2, and at most one more
// byte is available.
if (decoder != null && !decoder.MustFlush)
{
// Not flushing: stash the partial sequence for the next call
// (but do not set the slots when counting, hence the chars check).
if (chars != null)
{
// Slots are written to the decoder after the loop.
byte1 = ch;
byte2 = ch2;
if (buffer.MoreData)
byte3 = buffer.GetNextByte();
else
byte3 = -1;
byte4=-1;
}
break;
}
// No decoder, or flushing: the pair cannot be saved, so fall it back.
// NOTE(review): this consumes ch2 together with ch, whereas the leftover
// path above falls back only the lead byte and retries the second byte.
// Any third byte still in the input is handled by the next iteration.
// Confirm the difference is intended.
if (!buffer.Fallback(ch, ch2))
break;
}
}
else
{
// ch2 is no valid trail byte. AdjustBytes(-1) presumably rewinds the input by
// one byte so ch2 is read again; only the lead byte is fallen back.
buffer.AdjustBytes(-1);
if (!buffer.Fallback(ch))
break;
}
}
else
{
// Lead byte is the last input byte, so its trail byte is unknown.
if (decoder != null && !decoder.MustFlush)
{
// Not flushing: stash the lead byte for the next call
// (but do not set the slots when counting, hence the chars check).
if (chars != null)
{
byte1 = ch;
byte2 = -1;
byte3 = -1;
byte4 = -1;
}
break;
}
// No decoder, or flushing: fall back the lone lead byte.
if (!buffer.Fallback(ch))
break;
}
}
else
{
// Neither ASCII nor a lead byte: fall it back.
if (!buffer.Fallback(ch))
break;
}
}
// Save state for the next call.
// (Leftover bytes are not written when counting, hence the chars check;
// m_bytesUsed is written in both modes.)
if (decoder != null)
{
if (chars != null)
{
decoder.bLeftOver1 = byte1;
decoder.bLeftOver2 = byte2;
decoder.bLeftOver3 = byte3;
decoder.bLeftOver4 = byte4;
}
decoder.m_bytesUsed = buffer.BytesUsed;
}
// Return the buffer's char count.
return buffer.Count;
}