FastColoredTextBoxNS.EncodingDetector.DetectUnicodeInByteSampleByHeuristics C# (CSharp) Method

DetectUnicodeInByteSampleByHeuristics() public static method

public static DetectUnicodeInByteSampleByHeuristics ( byte[] SampleBytes ) : Encoding
SampleBytes byte[]
return System.Text.Encoding
        // Matches a string, holding one char (value 0-255) per input byte, whose contents form a
        // well-formed UTF-8 byte sequence. Adapted from Martin Dürst's W3C FAQ entry:
        //   http://www.w3.org/International/questions/qa-forms-utf-8
        // Built once and reused: the pattern never changes between calls.
        private static readonly Regex WellFormedUtf8ByteStringValidator = new Regex(@"\A("
                                            + @"[\x09\x0A\x0D\x20-\x7E]"
                                            + @"|[\xC2-\xDF][\x80-\xBF]"
                                            + @"|\xE0[\xA0-\xBF][\x80-\xBF]"
                                            + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"
                                            + @"|\xED[\x80-\x9F][\x80-\xBF]"
                                            + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"
                                            + @"|[\xF1-\xF3][\x80-\xBF]{3}"
                                            + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"
                                            + @")*\z");

        /// <summary>
        /// Returns the number of leading bytes of <paramref name="bytes"/> that remain once an
        /// incomplete multi-byte UTF-8 sequence at the very end (a lead byte followed by fewer
        /// continuation bytes than it announces) has been excluded.
        /// </summary>
        private static int GetLengthWithoutTruncatedUtf8Tail(byte[] bytes)
        {
            int length = bytes.Length;

            // A UTF-8 sequence is at most 4 bytes, so an incomplete one has its lead byte
            // within the last 3 bytes of the sample.
            for (int back = 1; back <= 3 && back <= length; back++)
            {
                byte candidate = bytes[length - back];

                if (candidate < 0x80)
                    return length; // single-byte character: nothing is pending

                if (candidate >= 0xC0)
                {
                    // lead byte: work out how many bytes the sequence announces
                    int expectedLength = candidate >= 0xF0 ? 4 : (candidate >= 0xE0 ? 3 : 2);
                    return expectedLength > back ? length - back : length;
                }

                // 0x80-0xBF: continuation byte, keep looking back for its lead byte
            }

            return length;
        }

        /// <summary>
        /// Guesses, from statistical heuristics tuned for English / western European text, whether
        /// a byte sample is UTF-16 LE, UTF-16 BE or UTF-8.
        /// </summary>
        /// <param name="SampleBytes">The raw bytes to inspect.</param>
        /// <returns>
        /// <see cref="Encoding.Unicode"/>, <see cref="Encoding.BigEndianUnicode"/> or
        /// <see cref="Encoding.UTF8"/> when the sample satisfies the corresponding heuristic;
        /// <c>null</c> when none applies (always the case for an empty sample).
        /// </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="SampleBytes"/> is null.</exception>
        public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
        {
            if (SampleBytes == null)
                throw new System.ArgumentNullException("SampleBytes");

            // Nothing to measure; bail out before computing any 0/0 ratios.
            if (SampleBytes.Length == 0)
                return null;

            long oddBinaryNullsInSample = 0;
            long evenBinaryNullsInSample = 0;
            long suspiciousUTF8SequenceCount = 0;
            long suspiciousUTF8BytesTotal = 0;
            long likelyUSASCIIBytesInSample = 0;

            //Cycle through, keeping count of binary null positions, possible UTF-8
            //  sequences from upper ranges of Windows-1252, and probable US-ASCII
            //  character counts.

            // Bytes still to be skipped because they belong to a suspicious sequence already counted.
            int skipUTF8Bytes = 0;

            for (long currentPos = 0; currentPos < SampleBytes.Length; currentPos++)
            {
                byte currentByte = SampleBytes[currentPos];

                //binary null distribution
                if (currentByte == 0)
                {
                    if (currentPos % 2 == 0)
                        evenBinaryNullsInSample++;
                    else
                        oddBinaryNullsInSample++;
                }

                //likely US-ASCII characters
                if (IsCommonUSASCIIByte(currentByte))
                    likelyUSASCIIBytesInSample++;

                //suspicious sequences (look like UTF-8)
                if (skipUTF8Bytes == 0)
                {
                    int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);

                    if (lengthFound > 0)
                    {
                        suspiciousUTF8SequenceCount++;
                        suspiciousUTF8BytesTotal += lengthFound;
                        skipUTF8Bytes = lengthFound - 1;
                    }
                }
                else
                {
                    skipUTF8Bytes--;
                }
            }

            // Share of even / odd byte positions that hold a binary null. Each parity accounts
            // for half of the sample, hence the factor of 2.
            double evenNullRatio = (evenBinaryNullsInSample * 2.0) / SampleBytes.Length;
            double oddNullRatio = (oddBinaryNullsInSample * 2.0) / SampleBytes.Length;

            //1: UTF-16 LE - in english / european environments, this is usually characterized by a
            //  high proportion of odd binary nulls (starting at 0), with (as this is text) a low
            //  proportion of even binary nulls.
            //  The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
            //  60% nulls where you do expect nulls) are completely arbitrary.
            if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
                return Encoding.Unicode;

            //2: UTF-16 BE - the mirror image: a high proportion of even binary nulls with a low
            //  proportion of odd binary nulls. Same arbitrary thresholds.
            if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
                return Encoding.BigEndianUnicode;

            //3: UTF-8 - first check whether the sample CAN be UTF-8 at all.
            //  The bytes are decoded as ISO-8859-1 so that every byte becomes the char with the
            //  same numeric value, which is what the validator's \xNN classes expect. (Decoding
            //  as ASCII would replace every byte above 0x7F with '?', and the multi-byte rules
            //  would never be exercised.)
            //  A multi-byte sequence cut off by the end of the sample is left out of the check,
            //  so that a sample taken from the start of a longer stream is not rejected for it.
            int validatedLength = GetLengthWithoutTruncatedUtf8Tail(SampleBytes);
            string byteString = Encoding.GetEncoding("iso-8859-1").GetString(SampleBytes, 0, validatedLength);

            if (!WellFormedUtf8ByteStringValidator.IsMatch(byteString))
                return null;

            //Unfortunately, just the fact that it CAN be UTF-8 doesn't tell you much about probabilities.
            //If all the characters are in the 0-127 range, no harm done, most western charsets are same as UTF-8 in these ranges.
            //If some of the characters were in the upper range (western accented characters), however, they would likely be mangled to 2-byte by the UTF-8 encoding process.
            // So, we need to play stats.

            // The "Random" likelihood of any pair of randomly generated characters being one
            //   of these "suspicious" character sequences is:
            //     128 / (256 * 256) = 0.2%.
            //
            // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
            //   character range, so we assume that more than 1 in 500,000 of these character
            //   sequences indicates UTF-8. The number 500,000 is completely arbitrary.
            //
            // We can only assume these character sequences will be rare if we ALSO assume that this
            //   IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
            //   not already suspicious sequences) should be plain US-ASCII bytes. This threshold is
            //   arbitrarily set to 80% (a random distribution, eg binary data, would yield
            //   approx 40%, so the chances of hitting this threshold by accident in random data are
            //   VERY low).

            bool hasEnoughSuspiciousSequences =
                suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1;
            if (!hasEnoughSuspiciousSequences)
                return null;

            long nonSuspiciousBytes = SampleBytes.Length - suspiciousUTF8BytesTotal;

            //all suspicious, so cannot evaluate proportion of US-Ascii
            if (nonSuspiciousBytes == 0)
                return Encoding.UTF8;

            if (likelyUSASCIIBytesInSample * 1.0 / nonSuspiciousBytes >= 0.8)
                return Encoding.UTF8;

            return null;
        }