module more.utf8;


version(unittest_utf8) {
    import std.stdio;
    import more.common;
}

version(unittest_utf8)
{
    import std.string;
}

//
// Utf8
//
private enum genericMessage = "invalid utf8";
private enum startedInsideCodePointMessage = "utf8 string started inside a utf8 code point";
private enum missingBytesMessage = "utf8 encoding is missing some bytes";
private enum outOfRangeMessage = "the utf8 code point is out of range";

// Exception thrown by the decoders in this module.
// The `type` field tells the caller which kind of failure occurred; the
// exception message is the fixed text associated with that type.
class Utf8Exception : Exception {
    enum Type {
        generic,
        startedInsideCodePoint,
        missingBytes,
        outOfRange,
    }
    // Maps a failure type to its message text.
    static string getMessage(Type type) {
        final switch(type) {
            case Type.generic: return genericMessage;
            case Type.startedInsideCodePoint: return startedInsideCodePointMessage;
            case Type.missingBytes: return missingBytesMessage;
            case Type.outOfRange: return outOfRangeMessage;
        }
    }
    const Type type;
    this(Type type) {
        super(getMessage(type));
        this.type = type;
    }
}


// Decodes one code point starting at `utf8` and advances `utf8` past it.
//
// This method assumes that utf8 points to at least one character
// and that the first non-valid pointer is at the limit pointer
// (this means that utf8 < limit)
//
// Throws Utf8Exception with type:
//   startedInsideCodePoint - the first byte is a continuation byte (10xxxxxx)
//   missingBytes           - the lead byte announces more bytes than are
//                            available before `limit`
//   outOfRange             - the lead byte is 0xF8 or above
//
// NOTE: only the lead byte is inspected. Continuation bytes are masked with
// 0x3F but are not checked for the 10xxxxxx pattern, and overlong encodings,
// surrogates and values above U+10FFFF are not rejected.
dchar decodeUtf8(ref inout(char)* utf8, const char* limit) {
    dchar c = *utf8;
    utf8++;
    if(c <= 0x7F) {
        return c;
    }
    if((c & 0x40) == 0) {
        throw new Utf8Exception(Utf8Exception.Type.startedInsideCodePoint);
    }

    // 110xxxxx: two byte sequence
    if((c & 0x20) == 0) {
        if(utf8 >= limit) throw new Utf8Exception(Utf8Exception.Type.missingBytes);
        return cast(dchar)(((c << 6) & 0x7C0) | (*(utf8++) & 0x3F));
    }

    // 1110xxxx: three byte sequence
    if((c & 0x10) == 0) {
        // step to the last byte of the sequence so one check covers both
        utf8++;
        if(utf8 >= limit) throw new Utf8Exception(Utf8Exception.Type.missingBytes);
        return cast(dchar)(((c << 12) & 0xF000) | ((*(utf8 - 1) << 6) & 0xFC0) | (*(utf8++) & 0x3F));
    }

    // 11110xxx: four byte sequence
    if((c & 0x08) == 0) {
        // step to the last byte of the sequence so one check covers all three
        utf8 += 2;
        if(utf8 >= limit) throw new Utf8Exception(Utf8Exception.Type.missingBytes);
        return cast(dchar)(((c << 18) & 0x1C0000) | ((*(utf8 - 2) << 12) & 0x3F000) |
            ((*(utf8 - 1) << 6) & 0xFC0) | (*(utf8++) & 0x3F));
    }

    throw new Utf8Exception(Utf8Exception.Type.outOfRange);
}

//
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
//
// Table driven decoder. Decodes one code point starting at `utf8` and, like
// decodeUtf8, leaves `utf8` pointing just past the code point on success.
// Same preconditions as decodeUtf8 (utf8 < limit on entry).
//
// Throws Utf8Exception with type:
//   generic      - the state machine rejected a byte; `utf8` is left pointing
//                  at the rejected byte
//   missingBytes - the input ended in the middle of a sequence
//
dchar bjoernDecodeUtf8(ref inout(char)* utf8, const char* limit) {
    // First 256 entries: byte -> character class.
    // Remaining entries: (state * 16 + class) -> next state.
    static __gshared immutable ubyte[] utf8lookup = [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
        8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
        0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
        0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
        0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
        1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
        1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
        1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
    ];
    enum utf8Accept = 0;
    enum utf8Reject = 1;

    uint state = utf8Accept;
    // accumulated in a plain uint; converted to dchar only when returned
    uint codep = 0;

    while(true) {
        ubyte b = *utf8;
        uint type = utf8lookup[b];

        // lead byte: keep only its payload bits; continuation: append 6 bits
        codep = (state != utf8Accept) ?
            (b & 0x3fu) | (codep << 6) : (0xff >> type) & b;

        state = utf8lookup[256 + state*16 + type];

        if(state == utf8Reject) throw new Utf8Exception(Utf8Exception.Type.generic);

        // The byte has been accepted by the state machine, so consume it
        // BEFORE testing for completion. Returning without this increment
        // would leave the pointer on the last byte of the code point and a
        // caller decoding in a loop would never make progress.
        utf8++;
        if(state == utf8Accept) return cast(dchar)codep;
        if(utf8 >= limit) throw new Utf8Exception(Utf8Exception.Type.missingBytes);
    }
}


version(unittest_utf8) unittest
{
    mixin(scopedTest!("utf8"));

    // Decodes `s` with both decoders and checks that each code point matches
    // `expectedChars` and that both decoders consume the same number of bytes.
    void testDecodeUtf8(inout(char)[] s, dchar[] expectedChars, size_t line = __LINE__) {
        auto start = s.ptr;
        auto limit = s.ptr + s.length;

        foreach(expected; expectedChars) {
            if(start >= limit) {
                writefln("Expected more decoded utf8 chars but input ended");
                writefln("test on line %s", line);
                assert(0);
            }
            auto saveStart = start;
            dchar decoded = decodeUtf8(start, limit);
            if(decoded != expected) {
                writefln("decodeUtf8: Expected '%s' 0x%x but decoded '%s' 0x%x",
                    expected, expected, decoded, decoded);
                writefln("test on line %s", line);
                assert(0);
            }
            auto afterDecodeUtf8 = start;
            start = saveStart;
            decoded = bjoernDecodeUtf8(start, limit);
            if(decoded != expected) {
                writefln("bjoernDecodeUtf8: Expected '%s' 0x%x but decoded '%s' 0x%x",
                    expected, expected, decoded, decoded);
                writefln("test on line %s", line);
                assert(0);
            }
            if(start != afterDecodeUtf8) {
                writefln("bjoernDecodeUtf8 consumed %s bytes but decodeUtf8 consumed %s bytes",
                    start - saveStart, afterDecodeUtf8 - saveStart);
                writefln("test on line %s", line);
                assert(0);
            }
            debug writefln("decodeUtf8('%s')", decoded);
        }
    }
    // Checks that both decoders throw on `s`. decodeUtf8 must report
    // `expectedError`; bjoernDecodeUtf8 may also report the generic error.
    void testInvalidUtf8(Utf8Exception.Type expectedError, inout(char)[] s, size_t line = __LINE__) {
        auto start = s.ptr;
        auto limit = s.ptr + s.length;

        auto saveStart = start;
        try {
            dchar decoded = decodeUtf8(start, limit);
            assert(0, format("expected error '%s' but no error was thrown", expectedError));
        } catch(Utf8Exception e) {
            assert(e.type == expectedError, format("expected error '%s' but got '%s'", expectedError, e.type));
        }

        start = saveStart;
        try {
            dchar decoded = bjoernDecodeUtf8(start, limit);
            assert(0, format("expected error '%s' but no error was thrown", expectedError));
        } catch(Utf8Exception e) {
            assert(e.type == Utf8Exception.Type.generic || e.type == expectedError, format
                ("expected error '%s' but got '%s'", expectedError, e.type));
        }
        debug writefln("got expected error '%s'", expectedError);
    }

    char[] testString = new char[256];
    dchar[] expectedCharsBuffer = new dchar[256];


    testInvalidUtf8(Utf8Exception.Type.startedInsideCodePoint, [0x80]);
    testInvalidUtf8(Utf8Exception.Type.missingBytes, [0xC0]);
    testInvalidUtf8(Utf8Exception.Type.missingBytes, [0xE0, 0x80]);


    // dchar[] ranges =
    //   [0, 0x7F]
    for(char c = 0; c <= 0x7F; c++) {
        testString[0] = c;
        expectedCharsBuffer[0] = c;
        testDecodeUtf8(testString[0..1], expectedCharsBuffer[0..1]);
    }


    testDecodeUtf8("\u0000", [0x0000]);
    testDecodeUtf8("\u0001", [0x0001]);

    testDecodeUtf8("\u00a9", [0xa9]);
    testDecodeUtf8("\u00b1", [0xb1]);
    testDecodeUtf8("\u02c2", [0x02c2]);


    testDecodeUtf8("\u0080", [0x80]);
    testDecodeUtf8("\u07FF", [0x7FF]);

    testDecodeUtf8("\u0800", [0x800]);
    testDecodeUtf8("\u7fff", [0x7FFF]);
    testDecodeUtf8("\u8000", [0x8000]);
    testDecodeUtf8("\uFFFD", [0xFFFD]);
    //testDecodeUtf8("\uFFFE", [0xFFFE]); // DMD doesn't like this code point
    //testDecodeUtf8("\uFFFF", [0xFFFF]); // DMD doesn't like this code point


    testDecodeUtf8("\U00010000", [0x10000]);
    testDecodeUtf8("\U00100000", [0x00100000]);
    testDecodeUtf8("\U0010FFFF", [0x0010FFFF]);
    //testDecodeUtf8("\U00110000", [0x00110000]); // DMD doesn't like this code point


    // Several code points back to back (1, 2, 3 and 4 byte encodings): each
    // decoder has to leave the pointer just past what it consumed.
    testDecodeUtf8("a\u00a9\u0800\U00010000", [0x61, 0xa9, 0x800, 0x10000]);
}