module more.utf8;

import std.typecons : Flag, Yes, No;

version(unittest)
{
    import more.test;
    import std.stdio;
    import std.string;
}

/**
Base class of every exception thrown by the UTF-8 routines in this module.
*/
class Utf8Exception : Exception
{
    this(string msg, string file = __FILE__, size_t line = __LINE__) pure
    {
        super(msg, file, line);
    }
}

/**
Thrown by $(D decodeUtf8) when the input is not decodable.

Instances are created through the static factory properties
($(D invalidFirstByte), $(D missingBytes), $(D outOfRange)); the reason is
available afterwards in the $(D type) field.
*/
class Utf8DecodeException : Utf8Exception
{
    /// The reason decoding failed.
    enum Type : ubyte
    {
        invalidFirstByte, /// the first byte cannot start a code point
        missingBytes,     /// the limit was reached before the code point ended
        outOfRange,       /// the first byte announces more than 4 bytes
    }
    /// Creates an exception of type $(D Type.invalidFirstByte).
    @property static auto invalidFirstByte(string file = __FILE__, size_t line = __LINE__)
    {
        return new Utf8DecodeException(Type.invalidFirstByte,
            "a utf8 code point starts with an invalid byte", file, line);
    }
    /// Creates an exception of type $(D Type.missingBytes).
    @property static auto missingBytes(string file = __FILE__, size_t line = __LINE__)
    {
        return new Utf8DecodeException(Type.missingBytes,
            "a utf8 code point is missing one or more bytes", file, line);
    }
    /// Creates an exception of type $(D Type.outOfRange).
    @property static auto outOfRange(string file = __FILE__, size_t line = __LINE__)
    {
        return new Utf8DecodeException(Type.outOfRange,
            "a utf8 code point is out of range", file, line);
    }
    /// Why decoding failed.
    const Type type;
    private this(Type type, string msg, string file = __FILE__, size_t line = __LINE__) pure
    {
        super(msg, file, line);
        this.type = type;
    }
}

/**
Decodes a single UTF8 character from the given buffer. $(D utf8InOut) is
used to pass in the start of the utf8 encoded character, and also used to
return the end of it once it has been decoded.

The overload taking $(D limit) never reads at or past $(D limit); the
overload without it trusts the caller to supply a complete sequence.
Continuation bytes are used as-is (their top two bits are masked off but not
verified), and a 4-byte sequence is not checked against 0x10FFFF.

Params:
    utf8InOut = in: address of the first byte to decode;
                out: address of the first byte that was not consumed
    limit     = one past the last readable byte

Returns: the decoded character as a 32-bit dchar.

Throws: $(D Utf8DecodeException) if the utf8 encoding is invalid:
    $(D invalidFirstByte) when the first byte is a continuation byte,
    $(D missingBytes) when $(D limit) is reached before the sequence ends
    (including an empty input), and $(D outOfRange) when the first byte
    announces a sequence longer than 4 bytes. $(D *utf8InOut) is still
    updated when an exception is thrown once the first byte has been read.
*/
pragma(inline)
dchar decodeUtf8(T)(T** utf8InOut) pure if(T.sizeof == 1)
{
    return decodeUtf8Impl!(No.useLimit)(cast(const(char)**)utf8InOut);
}
/// ditto
pragma(inline)
dchar decodeUtf8(T)(T** utf8InOut, T* limit) pure if(T.sizeof == 1)
{
    return decodeUtf8Impl!(Yes.useLimit)(cast(const(char)**)utf8InOut, cast(const(char)*) limit);
}

/**
Shared implementation of both $(D decodeUtf8) overloads.

The body is kept in a token string and mixed into two functions that differ
only in whether they take (and check against) a $(D limit) pointer.
*/
template decodeUtf8Impl(Flag!"useLimit" useLimit)
{
    private enum MixinCode = q{
        auto utf8 = *utf8InOut;
        static if(useLimit)
        {
            // Empty input: refuse to read the first byte at or past the limit.
            if(utf8 >= limit) throw Utf8DecodeException.missingBytes;
        }
        // Report how far decoding got, on normal return and on throw alike.
        scope(exit)
        {
            *utf8InOut = utf8;
        }
        dchar first = *utf8;
        utf8++;
        // 0xxxxxxx: single byte
        if(first <= 0x7F)
        {
            return first;
        }
        // 10xxxxxx: a continuation byte can never start a code point,
        // regardless of whether a limit is being enforced.
        if((first & 0x40) == 0)
        {
            throw Utf8DecodeException.invalidFirstByte();
        }
        // 110xxxxx: two bytes
        if((first & 0x20) == 0)
        {
            static if(useLimit)
            {
                if(utf8 >= limit) throw Utf8DecodeException.missingBytes;
            }
            return ((first << 6) & 0x7C0) | (*(utf8++) & 0x3F);
        }
        // 1110xxxx: three bytes
        if((first & 0x10) == 0)
        {
            // Step to the last byte first so one limit check covers both.
            utf8++;
            static if(useLimit)
            {
                if(utf8 >= limit) throw Utf8DecodeException.missingBytes;
            }
            return ((first << 12) & 0xF000) | ((*(utf8 - 1) << 6) & 0xFC0) | (*(utf8++) & 0x3F);
        }

        // 11110xxx: four bytes
        if((first & 0x08) == 0)
        {
            // Step to the last byte first so one limit check covers all three.
            utf8 += 2;
            static if(useLimit)
            {
                if(utf8 >= limit) throw Utf8DecodeException.missingBytes;
            }
            return ((first << 18) & 0x1C0000) | ((*(utf8 - 2) << 12) & 0x3F000) |
                ((*(utf8 - 1) << 6) & 0xFC0) | (*(utf8++) & 0x3F);
        }

        // 11111xxx: would need more than four bytes
        throw Utf8DecodeException.outOfRange;
    };
    static if(useLimit)
    {
        dchar decodeUtf8Impl(const(char)** utf8InOut, const(char)* limit) pure
        {
            mixin(MixinCode);
        }
    }
    else
    {
        dchar decodeUtf8Impl(const(char)** utf8InOut) pure
        {
            mixin(MixinCode);
        }
    }
}

unittest
{
    mixin(scopedTest!("decodeUtf8"));

    // Decodes all of `s` and checks it yields exactly `expectedChars`.
    void testDecodeUtf8(inout(char)[] s, dchar[] expectedChars, size_t line = __LINE__)
    {
        auto start = s.ptr;
        auto limit = s.ptr + s.length;

        foreach(expected; expectedChars)
        {
            if(start >= limit)
            {
                writefln("Expected more decoded utf8 chars but input ended");
                writefln("test on line %s", line);
                assert(0);
            }
            dchar decoded = decodeUtf8(&start, limit);
            if(decoded != expected)
            {
                writefln("decodeUtf8: Expected '%s' 0x%x but decoded '%s' 0x%x",
                    expected, expected, decoded, decoded);
                writefln("test on line %s", line);
                assert(0);
            }
            //debug writefln("decodeUtf8('%s')", decoded);
        }
        if(start != limit)
        {
            writefln("decodeUtf8: %s byte(s) of input were not consumed", limit - start);
            writefln("test on line %s", line);
            assert(0);
        }
    }
    // Checks that decoding the start of `s` fails with `expectedError`.
    void testInvalidUtf8(Utf8DecodeException.Type expectedError, inout(char)[] s, size_t line = __LINE__)
    {
        auto start = s.ptr;
        auto limit = s.ptr + s.length;

        try
        {
            decodeUtf8(&start, limit);
            assert(0, format("expected error '%s' but no error was thrown", expectedError));
        }
        catch(Utf8DecodeException e)
        {
            assert(e.type == expectedError, format("expected error '%s' but got '%s'", expectedError, e.type));
        }
    }

    char[] testString = new char[256];
    dchar[] expectedCharsBuffer = new dchar[256];

    testInvalidUtf8(Utf8DecodeException.Type.invalidFirstByte, [0x80]);
    testInvalidUtf8(Utf8DecodeException.Type.missingBytes, [0xC0]);
    testInvalidUtf8(Utf8DecodeException.Type.missingBytes, [0xE0, 0x80]);
    // An empty range must be rejected without reading any byte.
    testInvalidUtf8(Utf8DecodeException.Type.missingBytes, []);

    // The overload without a limit must reject a stray continuation byte too.
    {
        char[2] stray = [cast(char)0x80, cast(char)0x80];
        auto next = stray.ptr;
        try
        {
            decodeUtf8(&next);
            assert(0, "expected error 'invalidFirstByte' but no error was thrown");
        }
        catch(Utf8DecodeException e)
        {
            assert(e.type == Utf8DecodeException.Type.invalidFirstByte);
        }
    }

    // dchar[] ranges =
    // [0, 0x7F]
    for(char c = 0; c <= 0x7F; c++)
    {
        testString[0] = c;
        expectedCharsBuffer[0] = c;
        testDecodeUtf8(testString[0..1], expectedCharsBuffer[0..1]);
    }

    testDecodeUtf8("\u0000", [0x0000]);
    testDecodeUtf8("\u0001", [0x0001]);

    testDecodeUtf8("\u00a9", [0xa9]);
    testDecodeUtf8("\u00b1", [0xb1]);
    testDecodeUtf8("\u02c2", [0x02c2]);


    testDecodeUtf8("\u0080", [0x80]);
    testDecodeUtf8("\u07FF", [0x7FF]);

    testDecodeUtf8("\u0800", [0x800]);
    testDecodeUtf8("\u7fff", [0x7FFF]);
    testDecodeUtf8("\u8000", [0x8000]);
    testDecodeUtf8("\uFFFD", [0xFFFD]);
    //testDecodeUtf8("\uFFFE", [0xFFFE]); // DMD doesn't like this code point
    //testDecodeUtf8("\uFFFF", [0xFFFF]); // DMD doesn't like this code point

    testDecodeUtf8("\U00010000", [0x10000]);
    testDecodeUtf8("\U00100000", [0x00100000]);
    testDecodeUtf8("\U0010FFFF", [0x0010FFFF]);
    //testDecodeUtf8("\U00110000", [0x00110000]); // DMD doesn't like this code point
}

/**
Thrown by $(D encodeUtf8) when the value cannot be encoded.
*/
class Utf8EncodeException : Utf8Exception
{
    this(string msg, string file = __FILE__, size_t line = __LINE__) pure
    {
        super(msg, file, line);
    }
}

/**
Encodes a single character into the given buffer.

Params:
    dst = where the encoded bytes are written; up to 4 bytes are stored, so
          the caller must provide that much room
    c   = the value to encode; anything up to 0x1FFFFF is accepted

Returns: the number of bytes used to encode the given character (1 to 4).

Throws: $(D Utf8EncodeException) if $(D c) is larger than 0x1FFFFF.
*/
ubyte encodeUtf8(char* dst, dchar c)
{
    if(c <= 0x7F)
    {
        *dst++ = cast(char)c;
        return 1;
    }
    if(c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        *dst++ = cast(char)(192+c/64);
        *dst++ = cast(char)(128+c%64);
        return 2;
    }
    if(c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        *dst++ = cast(char)(224+c/4096);
        *dst++ = cast(char)(128+c/64%64);
        *dst++ = cast(char)(128+c%64);
        return 3;
    }
    if(c <= 0x1FFFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        *dst++ = cast(char)(240+c/262144);
        *dst++ = cast(char)(128+c/4096%64);
        *dst++ = cast(char)(128+c/64%64);
        *dst++ = cast(char)(128+c%64);
        return 4;
    }
    import std.format;
    throw new Utf8EncodeException(format("encodeUtf8 got a value that was too large (0x%x)", c));
}

unittest
{
    mixin(scopedTest!("full utf8 encode/decode"));
    // Round-trip every value from 0 to dchar.max through both decoders.
    for(dchar c = 0; ;c++)
    {
        char[4] buffer;
        auto encodeLength = encodeUtf8(buffer.ptr, c);
        {
            auto utf8 = buffer.ptr;
            auto decoded = decodeUtf8(&utf8);
            assert(utf8 - buffer.ptr == encodeLength);
            assert(decoded == c);
        }
        {
            auto utf8 = buffer.ptr;
            auto decoded = decodeUtf8(&utf8, utf8 + encodeLength);
            assert(utf8 - buffer.ptr == encodeLength);
            assert(decoded == c);
        }
        if(c == dchar.max)
        {
            break;
        }
    }
}