Optimizing utf8 conversion

Posted on: 2016-11-24

I ran across this article, and it talks about this open source utf8 conversion code. The 'magic subtraction' trick was just too cool. So I thought I'd see if I could improve on it.

Previously the utf8 to unicode conversion I was using was based on how they did it in the poco C++ libraries. This library is pretty well put together in many ways. The test as I have it is probably not entirely fair, as there is some extra verification that I don't bother with in my 'magic subtract' implementation to get some extra speed. It's fair enough in my use cases to have a fast 'assuming valid utf8' implementation as well as a slower checking one.

Doing some profiling on my i7 4770 I find...

Percent Wide	Poco	Magic (No Dep)	Magic	Magic speedup
0	3524	1980	1934	182.21%
10	9568	5353	4963	192.79%
20	14135	7634	7335	192.71%
30	19484	10865	10804	180.34%
40	25142	14413	13521	185.95%
50	28986	16151	15202	190.67%
60	30645	16689	13842	221.39%
70	30878	15304	12409	248.84%
80	30696	14742	11648	263.53%
90	31546	13639	10433	302.37%
100	30893	12485	8913	346.61%

The columns are Poco, Magic style - with no dependencies in unrolled loop and finally Magic style with unrolled loop. Percent Wide is the percentage of wide chars in a text sequence of 1000 characters.

The 'magic' style I ended up with looks something like this.

/* static */Int Utf8Util::Utf8ToUnicode(const UInt8* utf8, Int len, WideChar* dstIn)
{
    // Based on
    // http://www.opensource.apple.com/source/lldb/lldb-69/llvm/tools/clang/lib/Basic/ConvertUTF.c
    const UInt8* end = utf8 + len;
    const UInt8* cur = utf8;
    WideChar* dst = dstIn;
    while (cur < end)
    {
        Int c = *cur++;                        // Get the first byte
        if (c <= 0x7f)
        {
            // It's encoded in a byte, so write out
            *dst++ = WideChar(c);
        }
        else
        {
            const Int tail = TailLengths[c];
#if 1
            switch (tail)
            {
                case 5: c = (c << 6) + *cur++;
                case 4: c = (c << 6) + *cur++;
                case 3: c = (c << 6) + *cur++;
                case 2: c = (c << 6) + *cur++;
                case 1: c = (c << 6) + *cur++;
            }
#else
            // Loop unrolled with no dependencies
            switch (tail)
            {
                case 1: c = (c <<  6) + cur[0]; break;
                case 2: c = (c << 12) + (WideChar(cur[0]) << 6) + cur[1]; break;
                case 3: c = (c << 18) + (WideChar(cur[0]) << 12) + (WideChar(cur[1]) << 6) + cur[2]; break;
                case 4: c = (c << 24) + (WideChar(cur[0]) << 18) + (WideChar(cur[1]) << 12) + (WideChar(cur[2]) << 6) + cur[3]; break;
                case 5: c = (c << 30) + (WideChar(cur[0]) << 24) + (WideChar(cur[1]) << 18) + (WideChar(cur[2]) << 12) + (WideChar(cur[3]) << 6) + cur[4]; break;
            }
            cur += tail;
#endif

            // Fix
            c -= MagicSub[tail];
            *dst++ = WideChar(c);
        }
    }
    // Return the length
    return Int(dst - dstIn);
}

The Poco style like this

static const Int8 _CharTable[] =
{
    /* 00 */    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 10 */    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    /* 20 */    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    /* 30 */    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    /* 40 */    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    /* 50 */    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    /* 60 */    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    /* 70 */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    /* 80 */      -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    /* 90 */      -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    /* a0 */      -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    /* b0 */      -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    /* c0 */      -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,
    /* d0 */      -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,
    /* e0 */      -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,
    /* f0 */      -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -5,   -5,   -5,   -5,   -6,   -6,   -1,   -1,
};

/* static */Int Utf8Util::Utf8ToUnicode(const UInt8* utf8, Int len, WideChar* dstIn)
{
    const UInt8* end = utf8 + len;
    const UInt8* cur = utf8;
    WideChar* dst = dstIn;

    Int size = 0;

    while (cur < end)
    {
        Int c = *cur++;                        // Get the first byte
        if (c <= 0x7f)
        {
            // It's encoded in a byte, so write out
            *dst++ = WideChar(c);
        }
        else
        {
            Int n = _CharTable[c];            // Get the size (it's negative)
            Int uc;                            // Stores character as its built up
            switch (n)
            {
                case -6:
                case -5:
                case -1:
                {
                    return -1;
                }
                case -4:
                case -3:
                case -2:
                {
                    if (!IsLegal(cur - 1, -n))
                    {
                        return -1;
                    }
                    uc = c & ((0x07 << (n + 4)) | 0x03);
                    break;
                 }
                 default:
                 {
                     RE4_ASSERT_FAIL("Not possible");
                     return -1;
                 }
             }

            while (n++ < -1)
            {    
                uc <<= 6;
                uc |= (*cur & 0x3F);
                cur++;
            }

            // Save char
            *dst++ = WideChar(uc);
        }
    }

    // Return the length
    return Int(dst - dstIn);
}

A mystery here is why when there are no wide characters (ie it's all ascii) is the poco style so much slower? Here's the poco assembler...

00BDCFB0  movzx       ebx,byte ptr [esi]
00BDCFB3  mov         dword ptr [ebp-4],esi
00BDCFB6  inc         esi
00BDCFB7  cmp         ebx,7Fh
 if (c <= 0x7f)
00BDCFBA  ja          Re4::Utf8Util::Utf8ToUnicode+30h (0BDCFC0h)
 {
      // It's encoded in a byte, so write out
      *dst++ = WideChar(c);
00BDCFBC  mov         dword ptr [edi],ebx
 }
 else
00BDCFBE  jmp         $LN11+42h (0BDD01Bh)
00BDCFC0  // !!!!! Wide expansion !!!!!!
00BDD01B  add         edi,4
00BDD01E  cmp         esi,eax
00BDD020  jb          Re4::Utf8Util::Utf8ToUnicode+20h (0BDCFB0h)

The magic style looks like this...

0016CFA4  movzx       eax,byte ptr [esi]
0016CFA7  inc         esi
0016CFA8  cmp         eax,7Fh
 if (c <= 0x7f)
0016CFAB  jbe         $LN12+10h (16CFF7h)
0016CFAD  // !!!!!  Wide expansion !!!!
 *dst++ = WideChar(c);
0016CFF7  mov         dword ptr [edx],eax
0016CFF9  add         edx,4
0016CFFC  cmp         esi,ebx
0016CFFE  jb          Re4::Utf8Util::Utf8ToUnicode+14h (16CFA4h)

The Poco loop is a little more complex - it looks like register pressure could be part of it, as it writes the read pointer to [ebp-4]. It also has an extra non conditional branch. It's a little surprising such a different performing loop is produced, as the non 'wide' case code is very similar.

You could probably do better than this - especially if you know that most of your text is ascii characters. For example you could align the read pointer onto words, read it and AND with 0x80808080 to test for any wide chars in across the word. Still it'd have to be a significant bottleneck for the extra complexity which it isn't for me right now.