I ran across this article, and it talks about this open source utf8 conversion code. The 'magic subtraction' trick was just too cool. So I thought I'd see if I could improve on it.
Previously the utf8 to unicode conversion I was using was based on how they did it in the poco C++ libraries. This library is pretty well put together in many ways. The test as I have it is probably not entirely fair, as there is some extra verification that I don't bother with in my 'magic subtract' implementation to get some extra speed. It's fair enough in my use cases to have a fast 'assuming valid utf8' implementation as well as a slower checking one.
Doing some profiling on my i7 4770 I find...
Percent Wide | Poco | Magic (No Dep) | Magic | Magic speedup |
---|---|---|---|---|
0 | 3524 | 1980 | 1934 | 182.21% |
10 | 9568 | 5353 | 4963 | 192.79% |
20 | 14135 | 7634 | 7335 | 192.71% |
30 | 19484 | 10865 | 10804 | 180.34% |
40 | 25142 | 14413 | 13521 | 185.95% |
50 | 28986 | 16151 | 15202 | 190.67% |
60 | 30645 | 16689 | 13842 | 221.39% |
70 | 30878 | 15304 | 12409 | 248.84% |
80 | 30696 | 14742 | 11648 | 263.53% |
90 | 31546 | 13639 | 10433 | 302.37% |
100 | 30893 | 12485 | 8913 | 346.61% |
The columns are Poco, Magic style - with no dependencies in unrolled loop and finally Magic style with unrolled loop. Percent Wide is the percentage of wide chars in a text sequence of 1000 characters.
The 'magic' style I ended up with looks something like this.
/* static */Int Utf8Util::Utf8ToUnicode(const UInt8* utf8, Int len, WideChar* dstIn) { // Based on // http://www.opensource.apple.com/source/lldb/lldb-69/llvm/tools/clang/lib/Basic/ConvertUTF.c const UInt8* end = utf8 + len; const UInt8* cur = utf8; WideChar* dst = dstIn; while (cur < end) { Int c = *cur++; // Get the first byte if (c <= 0x7f) { // It's encoded in a byte, so write out *dst++ = WideChar(c); } else { const Int tail = TailLengths[c]; #if 1 switch (tail) { case 5: c = (c << 6) + *cur++; case 4: c = (c << 6) + *cur++; case 3: c = (c << 6) + *cur++; case 2: c = (c << 6) + *cur++; case 1: c = (c << 6) + *cur++; } #else // Loop unrolled with no dependencies switch (tail) { case 1: c = (c << 6) + cur[0]; break; case 2: c = (c << 12) + (WideChar(cur[0]) << 6) + cur[1]; break; case 3: c = (c << 18) + (WideChar(cur[0]) << 12) + (WideChar(cur[1]) << 6) + cur[2]; break; case 4: c = (c << 24) + (WideChar(cur[0]) << 18) + (WideChar(cur[1]) << 12) + (WideChar(cur[2]) << 6) + cur[3]; break; case 5: c = (c << 30) + (WideChar(cur[0]) << 24) + (WideChar(cur[1]) << 18) + (WideChar(cur[2]) << 12) + (WideChar(cur[3]) << 6) + cur[4]; break; } cur += tail; #endif // Fix c -= MagicSub[tail]; *dst++ = WideChar(c); } } // Return the length return Int(dst - dstIn); }
The Poco style like this
static const Int8 _CharTable[] = { /* 00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 40 */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 50 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* a0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* b0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* c0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, /* d0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, /* e0 */ -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, /* f0 */ -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -1, -1, }; /* static */Int Utf8Util::Utf8ToUnicode(const UInt8* utf8, Int len, WideChar* dstIn) { const UInt8* end = utf8 + len; const UInt8* cur = utf8; WideChar* dst = dstIn; Int size = 0; while (cur < end) { Int c = *cur++; // Get the first byte if (c <= 0x7f) { // It's encoded in a byte, so write out *dst++ = WideChar(c); } else { Int n = _CharTable[c]; // Get the size (it's negative) Int uc; // Stores character as its built up switch (n) { case -6: case -5: case -1: { return -1; } case -4: case -3: case -2: { if (!IsLegal(cur - 1, -n)) { return -1; } uc = c & ((0x07 << (n + 4)) | 0x03); break; } default: { RE4_ASSERT_FAIL("Not possible"); return -1; } } while (n++ < -1) { uc <<= 6; uc |= (*cur & 0x3F); cur++; } // Save char *dst++ = WideChar(uc); } } // Return the length return Int(dst - dstIn); }
A mystery here is why when there are no wide characters (ie it's all ascii) is the poco style so much slower? Here's the poco assembler...
00BDCFB0 movzx ebx,byte ptr [esi] 00BDCFB3 mov dword ptr [ebp-4],esi 00BDCFB6 inc esi 00BDCFB7 cmp ebx,7Fh if (c <= 0x7f) 00BDCFBA ja Re4::Utf8Util::Utf8ToUnicode+30h (0BDCFC0h) { // It's encoded in a byte, so write out *dst++ = WideChar(c); 00BDCFBC mov dword ptr [edi],ebx } else 00BDCFBE jmp $LN11+42h (0BDD01Bh) 00BDCFC0 // !!!!! Wide expansion !!!!!! 00BDD01B add edi,4 00BDD01E cmp esi,eax 00BDD020 jb Re4::Utf8Util::Utf8ToUnicode+20h (0BDCFB0h)
The magic style looks like this...
0016CFA4 movzx eax,byte ptr [esi] 0016CFA7 inc esi 0016CFA8 cmp eax,7Fh if (c <= 0x7f) 0016CFAB jbe $LN12+10h (16CFF7h) 0016CFAD // !!!!! Wide expansion !!!! *dst++ = WideChar(c); 0016CFF7 mov dword ptr [edx],eax 0016CFF9 add edx,4 0016CFFC cmp esi,ebx 0016CFFE jb Re4::Utf8Util::Utf8ToUnicode+14h (16CFA4h)
The Poco loop is a little more complex - it looks like register pressure could be part of it, as it writes the read pointer to [ebp-4]. It also has an extra non conditional branch. It's a little surprising such a different performing loop is produced, as the non 'wide' case code is very similar.
You could probably do better than this - especially if you know that most of your text is ascii characters. For example you could align the read pointer onto words, read it and AND with 0x80808080 to test for any wide chars in across the word. Still it'd have to be a significant bottleneck for the extra complexity which it isn't for me right now.