public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
{
long oddBinaryNullsInSample = 0;
long evenBinaryNullsInSample = 0;
long suspiciousUTF8SequenceCount = 0;
long suspiciousUTF8BytesTotal = 0;
long likelyUSASCIIBytesInSample = 0;
//Cycle through, keeping count of binary null positions, possible UTF-8
// sequences from upper ranges of Windows-1252, and probable US-ASCII
// character counts.
long currentPos = 0;
int skipUTF8Bytes = 0;
while (currentPos < SampleBytes.Length)
{
//binary null distribution
if (SampleBytes[currentPos] == 0)
{
if (currentPos%2 == 0)
evenBinaryNullsInSample++;
else
oddBinaryNullsInSample++;
}
//likely US-ASCII characters
if (IsCommonUSASCIIByte(SampleBytes[currentPos]))
likelyUSASCIIBytesInSample++;
//suspicious sequences (look like UTF-8)
if (skipUTF8Bytes == 0)
{
int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);
if (lengthFound > 0)
{
suspiciousUTF8SequenceCount++;
suspiciousUTF8BytesTotal += lengthFound;
skipUTF8Bytes = lengthFound - 1;
}
}
else
{
skipUTF8Bytes--;
}
currentPos++;
}
//1: UTF-16 LE - in english / european environments, this is usually characterized by a
// high proportion of odd binary nulls (starting at 0), with (as this is text) a low
// proportion of even binary nulls.
// The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
// 60% nulls where you do expect nulls) are completely arbitrary.
if (((evenBinaryNullsInSample*2.0)/SampleBytes.Length) < 0.2
&& ((oddBinaryNullsInSample*2.0)/SampleBytes.Length) > 0.6
)
return Encoding.Unicode;
//2: UTF-16 BE - in english / european environments, this is usually characterized by a
// high proportion of even binary nulls (starting at 0), with (as this is text) a low
// proportion of odd binary nulls.
// The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
// 60% nulls where you do expect nulls) are completely arbitrary.
if (((oddBinaryNullsInSample*2.0)/SampleBytes.Length) < 0.2
&& ((evenBinaryNullsInSample*2.0)/SampleBytes.Length) > 0.6
)
return Encoding.BigEndianUnicode;
//3: UTF-8 - Martin Dürst outlines a method for detecting whether something CAN be UTF-8 content
// using regexp, in his w3c.org unicode FAQ entry:
// http://www.w3.org/International/questions/qa-forms-utf-8
// adapted here for C#.
string potentiallyMangledString = Encoding.ASCII.GetString(SampleBytes);
Regex UTF8Validator = new Regex(@"\A("
+ @"[\x09\x0A\x0D\x20-\x7E]"
+ @"|[\xC2-\xDF][\x80-\xBF]"
+ @"|\xE0[\xA0-\xBF][\x80-\xBF]"
+ @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"
+ @"|\xED[\x80-\x9F][\x80-\xBF]"
+ @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"
+ @"|[\xF1-\xF3][\x80-\xBF]{3}"
+ @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"
+ @")*\z");
if (UTF8Validator.IsMatch(potentiallyMangledString))
{
//Unfortunately, just the fact that it CAN be UTF-8 doesn't tell you much about probabilities.
//If all the characters are in the 0-127 range, no harm done, most western charsets are same as UTF-8 in these ranges.
//If some of the characters were in the upper range (western accented characters), however, they would likely be mangled to 2-byte by the UTF-8 encoding process.
// So, we need to play stats.
// The "Random" likelihood of any pair of randomly generated characters being one
// of these "suspicious" character sequences is:
// 128 / (256 * 256) = 0.2%.
//
// In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
// character range, so we assume that more than 1 in 500,000 of these character
// sequences indicates UTF-8. The number 500,000 is completely arbitrary - so sue me.
//
// We can only assume these character sequences will be rare if we ALSO assume that this
// IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
// not already suspicious sequences) should be plain US-ASCII bytes. This, I
// arbitrarily decided, should be 80% (a random distribution, eg binary data, would yield
// approx 40%, so the chances of hitting this threshold by accident in random data are
// VERY low).
if ((suspiciousUTF8SequenceCount*500000.0/SampleBytes.Length >= 1) //suspicious sequences
&& (
//all suspicious, so cannot evaluate proportion of US-Ascii
SampleBytes.Length - suspiciousUTF8BytesTotal == 0
||
likelyUSASCIIBytesInSample*1.0/(SampleBytes.Length - suspiciousUTF8BytesTotal) >= 0.8
)
)
return Encoding.UTF8;
}
return null;
}