logo

Optimizing utf8 conversion

Posted on: 2016-11-24

I ran across this article, and it talks about this open source utf8 conversion code. The 'magic subtraction' trick was just too cool. So I thought I'd see if I could improve on it.

Previously the utf8 to unicode conversion I was using was based on how they did it in the poco C++ libraries. This library is pretty well put together in many ways. The test as I have it is probably not entirely fair, as there is some extra verification that I don't bother with in my 'magic subtract' implementation to get some extra speed. It's fair enough in my use cases to have a fast 'assuming valid utf8' implementation as well as a slower checking one.

Doing some profiling on my i7 4770 I find...

Percent Wide Poco Magic (No Dep) Magic Magic speedup
0 3524 1980 1934 182.21%
10 9568 5353 4963 192.79%
20 14135 7634 7335 192.71%
30 19484 10865 10804 180.34%
40 25142 14413 13521 185.95%
50 28986 16151 15202 190.67%
60 30645 16689 13842 221.39%
70 30878 15304 12409 248.84%
80 30696 14742 11648 263.53%
90 31546 13639 10433 302.37%
100 30893 12485 8913 346.61%

The columns are Poco, Magic style - with no dependencies in unrolled loop and finally Magic style with unrolled loop. Percent Wide is the percentage of wide chars in a text sequence of 1000 characters.

The 'magic' style I ended up with looks something like this.

/* static */Int Utf8Util::Utf8ToUnicode(const UInt8* utf8, Int len, WideChar* dstIn)
{
    // Based on
    // http://www.opensource.apple.com/source/lldb/lldb-69/llvm/tools/clang/lib/Basic/ConvertUTF.c
    const UInt8* end = utf8 + len;
    const UInt8* cur = utf8;
    WideChar* dst = dstIn;
    while (cur < end)
    {
        Int c = *cur++;                        // Get the first byte
        if (c <= 0x7f)
        {
            // It's encoded in a byte, so write out
            *dst++ = WideChar(c);
        }
        else
        {
            const Int tail = TailLengths[c];
#if 1
            switch (tail)
            {
                case 5: c = (c << 6) + *cur++;
                case 4: c = (c << 6) + *cur++;
                case 3: c = (c << 6) + *cur++;
                case 2: c = (c << 6) + *cur++;
                case 1: c = (c << 6) + *cur++;
            }
#else
            // Loop unrolled with no dependencies
            switch (tail)
            {
                case 1: c = (c <<  6) + cur[0]; break;
                case 2: c = (c << 12) + (WideChar(cur[0]) << 6) + cur[1]; break;
                case 3: c = (c << 18) + (WideChar(cur[0]) << 12) + (WideChar(cur[1]) << 6) + cur[2]; break;
                case 4: c = (c << 24) + (WideChar(cur[0]) << 18) + (WideChar(cur[1]) << 12) + (WideChar(cur[2]) << 6) + cur[3]; break;
                case 5: c = (c << 30) + (WideChar(cur[0]) << 24) + (WideChar(cur[1]) << 18) + (WideChar(cur[2]) << 12) + (WideChar(cur[3]) << 6) + cur[4]; break;
            }
            cur += tail;
#endif

            // Fix
            c -= MagicSub[tail];
            *dst++ = WideChar(c);
        }
    }
    // Return the length
    return Int(dst - dstIn);
}

The Poco style like this

static const Int8 _CharTable[] =
{
    /* 00 */    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 10 */    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    /* 20 */    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    /* 30 */    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    /* 40 */    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    /* 50 */    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    /* 60 */    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    /* 70 */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    /* 80 */      -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    /* 90 */      -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    /* a0 */      -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    /* b0 */      -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    /* c0 */      -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,
    /* d0 */      -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,   -2,
    /* e0 */      -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,   -3,
    /* f0 */      -4,   -4,   -4,   -4,   -4,   -4,   -4,   -4,   -5,   -5,   -5,   -5,   -6,   -6,   -1,   -1,
};

/* static */Int Utf8Util::Utf8ToUnicode(const UInt8* utf8, Int len, WideChar* dstIn)
{
    const UInt8* end = utf8 + len;
    const UInt8* cur = utf8;
    WideChar* dst = dstIn;

    Int size = 0;

    while (cur < end)
    {
        Int c = *cur++;                        // Get the first byte
        if (c <= 0x7f)
        {
            // It's encoded in a byte, so write out
            *dst++ = WideChar(c);
        }
        else
        {
            Int n = _CharTable[c];            // Get the size (it's negative)
            Int uc;                            // Stores character as its built up
            switch (n)
            {
                case -6:
                case -5:
                case -1:
                {
                    return -1;
                }
                case -4:
                case -3:
                case -2:
                {
                    if (!IsLegal(cur - 1, -n))
                    {
                        return -1;
                    }
                    uc = c & ((0x07 << (n + 4)) | 0x03);
                    break;
                 }
                 default:
                 {
                     RE4_ASSERT_FAIL("Not possible");
                     return -1;
                 }
             }

            while (n++ < -1)
            {    
                uc <<= 6;
                uc |= (*cur & 0x3F);
                cur++;
            }

            // Save char
            *dst++ = WideChar(uc);
        }
    }

    // Return the length
    return Int(dst - dstIn);
}

A mystery here is why when there are no wide characters (ie it's all ascii) is the poco style so much slower? Here's the poco assembler...

00BDCFB0  movzx       ebx,byte ptr [esi]
00BDCFB3  mov         dword ptr [ebp-4],esi
00BDCFB6  inc         esi
00BDCFB7  cmp         ebx,7Fh
 if (c <= 0x7f)
00BDCFBA  ja          Re4::Utf8Util::Utf8ToUnicode+30h (0BDCFC0h)
 {
      // It's encoded in a byte, so write out
      *dst++ = WideChar(c);
00BDCFBC  mov         dword ptr [edi],ebx
 }
 else
00BDCFBE  jmp         $LN11+42h (0BDD01Bh)
00BDCFC0  // !!!!! Wide expansion !!!!!!
00BDD01B  add         edi,4
00BDD01E  cmp         esi,eax
00BDD020  jb          Re4::Utf8Util::Utf8ToUnicode+20h (0BDCFB0h)  

The magic style looks like this...

0016CFA4  movzx       eax,byte ptr [esi]
0016CFA7  inc         esi
0016CFA8  cmp         eax,7Fh
 if (c <= 0x7f)
0016CFAB  jbe         $LN12+10h (16CFF7h)
0016CFAD  // !!!!!  Wide expansion !!!!
 *dst++ = WideChar(c);
0016CFF7  mov         dword ptr [edx],eax
0016CFF9  add         edx,4
0016CFFC  cmp         esi,ebx
0016CFFE  jb          Re4::Utf8Util::Utf8ToUnicode+14h (16CFA4h)

The Poco loop is a little more complex - it looks like register pressure could be part of it, as it writes the read pointer to [ebp-4]. It also has an extra non conditional branch. It's a little surprising such a different performing loop is produced, as the non 'wide' case code is very similar.

You could probably do better than this - especially if you know that most of your text is ascii characters. For example you could align the read pointer onto words, read it and AND with 0x80808080 to test for any wide chars in across the word. Still it'd have to be a significant bottleneck for the extra complexity which it isn't for me right now.