1 module more.utf8;
2 
3 import std.typecons : Flag, Yes, No;
4 
// Imports that are only needed by the unittest blocks in this module
// (e.g. writefln from std.stdio and format from std.string).
version(unittest)
{
    import more.test;
    import std.stdio;
    import std.string;
}
11 
/// Common base class of the UTF-8 exceptions declared in this module.
class Utf8Exception : Exception
{
    /**
    Creates the exception.

    Params:
        msg  = description of the failure
        file = source file to report; defaults to the caller's file
        line = source line to report; defaults to the caller's line
    */
    this(string msg,
         string file = __FILE__,
         size_t line = __LINE__) pure
    {
        // All three values are handed straight to the Exception base class.
        super(msg, file, line);
    }
}
/**
Exception describing a UTF-8 decoding failure.

Every instance carries a $(D Type) tag so that callers can tell the failure
categories apart without inspecting the message text. Instances can only be
created through the static factory properties, one per category.
*/
class Utf8DecodeException : Utf8Exception
{
    /// The category of decoding failure.
    enum Type : ubyte
    {
        invalidFirstByte,
        missingBytes,
        outOfRange,
    }

    /// The category this exception was created with.
    const Type type;

    // Private: construction goes through the factory properties below.
    private this(Type type, string msg, string file = __FILE__, size_t line = __LINE__) pure
    {
        super(msg, file, line);
        this.type = type;
    }

    // NOTE(review): the factories keep an `auto` return type. Presumably this
    // is what lets the compiler infer attributes such as `pure` for them, which
    // the `pure` decoder in this module relies on -- confirm before replacing
    // `auto` with an explicit return type.

    /// Creates an exception tagged $(D Type.invalidFirstByte).
    @property static auto invalidFirstByte(string file = __FILE__, size_t line = __LINE__)
    {
        enum message = "a utf8 code point starts with an invalid byte";
        return new Utf8DecodeException(Type.invalidFirstByte, message, file, line);
    }

    /// Creates an exception tagged $(D Type.missingBytes).
    @property static auto missingBytes(string file = __FILE__, size_t line = __LINE__)
    {
        enum message = "a utf8 code point is missing one or more bytes";
        return new Utf8DecodeException(Type.missingBytes, message, file, line);
    }

    /// Creates an exception tagged $(D Type.outOfRange).
    @property static auto outOfRange(string file = __FILE__, size_t line = __LINE__)
    {
        enum message = "a utf8 code point is out of range";
        return new Utf8DecodeException(Type.outOfRange, message, file, line);
    }
}
49 
/**
Decodes a single UTF-8 encoded character from a buffer.

$(D utf8InOut) points at the pointer to the first byte of the encoded
character. When the call finishes, that pointer has been moved past the bytes
the decoder consumed.

When a $(D limit) is supplied, the decoder throws instead of reading a trailing
byte at or beyond $(D limit). Without a $(D limit) no bounds are checked, so the
caller must guarantee that the buffer holds the complete encoded character.

Any element type whose size is one byte is accepted; the pointers are
reinterpreted as $(D const(char)) pointers before decoding.

Returns: the decoded character as a 32-bit $(D dchar).

Throws: $(D Utf8DecodeException) if the decoder rejects the encoding.
*/
pragma(inline)
dchar decodeUtf8(T)(T** utf8InOut) pure if(T.sizeof == 1)
{
    // View the caller's cursor as a cursor over const(char); T is one byte
    // wide (see the template constraint), so the layout is the same.
    auto cursor = cast(const(char)**)utf8InOut;
    return decodeUtf8Impl!(No.useLimit)(cursor);
}
/// ditto
pragma(inline)
dchar decodeUtf8(T)(T** utf8InOut, T* limit) if(T.sizeof == 1)
{
    // Both pointers are reinterpreted as const(char) pointers; T is one byte
    // wide (see the template constraint), so the layout is the same.
    auto cursor = cast(const(char)**)utf8InOut;
    auto end = cast(const(char)*)limit;
    return decodeUtf8Impl!(Yes.useLimit)(cursor, end);
}
70 
/**
Implementation of the UTF-8 decoder.

Instantiated with $(D Yes.useLimit) it provides
$(D decodeUtf8Impl(const(char)** utf8InOut, const(char)* limit)), which throws
$(D Utf8DecodeException.missingBytes) rather than read a byte at or beyond
$(D limit) (this includes an empty range, where $(D *utf8InOut >= limit)).
Instantiated with $(D No.useLimit) it provides
$(D decodeUtf8Impl(const(char)** utf8InOut)), which performs no bounds checks.

In both forms:
$(UL
  $(LI $(D *utf8InOut) is updated on every exit path, including when an
       exception is thrown, to the position the decoder reached.)
  $(LI a first byte of the form 10xxxxxx throws
       $(D Utf8DecodeException.invalidFirstByte).)
  $(LI a first byte of the form 11111xxx throws
       $(D Utf8DecodeException.outOfRange).)
  $(LI trailing bytes are only masked with 0x3F; they are not checked for the
       10xxxxxx continuation pattern.)
)

Returns: the decoded character.
*/
template decodeUtf8Impl(Flag!"useLimit" useLimit)
{
    // The decoder body is kept as a token string so it can be mixed into two
    // functions whose parameter lists differ: `limit` only exists in the
    // useLimit variant, and every use of it is guarded by static if.
    private enum MixinCode = q{
        auto utf8 = *utf8InOut;
        // Publish the final position no matter how the function is left.
        scope(exit)
        {
            *utf8InOut = utf8;
        }
        static if(useLimit)
        {
            // An empty range has no first byte that could be read.
            if(utf8 >= limit) throw Utf8DecodeException.missingBytes;
        }
        dchar first = *utf8;
        utf8++;
        if(first <= 0x7F)
        {
            // 0xxxxxxx: a single byte encodes the character directly.
            return first;
        }
        // 10xxxxxx is a continuation byte and can never start a character.
        // This check does not involve the limit, so it applies to both
        // variants.
        if((first & 0x40) == 0)
        {
            throw Utf8DecodeException.invalidFirstByte;
        }
        if((first & 0x20) == 0)
        {
            // 110xxxxx: one trailing byte.
            static if(useLimit)
            {
                if(utf8 >= limit) throw Utf8DecodeException.missingBytes;
            }
            return ((first << 6) & 0x7C0) | (*(utf8++) & 0x3F);
        }
        if((first & 0x10) == 0)
        {
            // 1110xxxx: two trailing bytes. Step to the last one first so a
            // single comparison against the limit covers both of them.
            utf8++;
            static if(useLimit)
            {
                if(utf8 >= limit) throw Utf8DecodeException.missingBytes;
            }
            return ((first << 12) & 0xF000) | ((*(utf8 - 1) << 6) & 0xFC0) | (*(utf8++) & 0x3F);
        }

        if((first & 0x08) == 0)
        {
            // 11110xxx: three trailing bytes. Step to the last one first so a
            // single comparison against the limit covers all of them.
            utf8 += 2;
            static if(useLimit)
            {
                if(utf8 >= limit) throw Utf8DecodeException.missingBytes;
            }
            return ((first << 18) & 0x1C0000) | ((*(utf8 - 2) << 12) & 0x3F000) |
                ((*(utf8 - 1) << 6) & 0xFC0) | (*(utf8++) & 0x3F);
        }

        // 11111xxx: not a first byte this decoder supports.
        throw Utf8DecodeException.outOfRange;
    };
    static if(useLimit)
    {
        dchar decodeUtf8Impl(const(char)** utf8InOut, const(char)* limit) pure
        {
            mixin(MixinCode);
        }
    }
    else
    {
        dchar decodeUtf8Impl(const(char)** utf8InOut) pure
        {
            mixin(MixinCode);
        }
    }
}
138 
unittest
{
    mixin(scopedTest!("decodeUtf8"));

    // Decodes `s` one character at a time with the limit-taking overload and
    // compares each result with the matching entry of `expectedChars`. The
    // mismatch is printed together with the calling line before asserting.
    void testDecodeUtf8(inout(char)[] s, dchar[] expectedChars, size_t line = __LINE__)
    {
        auto cursor = s.ptr;
        auto end = s.ptr + s.length;

        foreach(want; expectedChars)
        {
            if(cursor >= end)
            {
                writefln("Expected more decoded utf8 chars but input ended");
                writefln("test on line %s", line);
                assert(0);
            }
            const dchar got = decodeUtf8(&cursor, end);
            if(got != want)
            {
                writefln("decodeUtf8: Expected '%s' 0x%x but decoded '%s' 0x%x",
                    want, want, got, got);
                writefln("test on line %s", line);
                assert(0);
            }
            //debug writefln("decodeUtf8('%s')", got);
        }
    }

    // Decodes the first character of `s` with the limit-taking overload and
    // requires a Utf8DecodeException whose type equals `expectedError`.
    void testInvalidUtf8(Utf8DecodeException.Type expectedError, inout(char)[] s, size_t line = __LINE__)
    {
        auto cursor = s.ptr;
        auto end = s.ptr + s.length;

        try
        {
            dchar unexpected = decodeUtf8(&cursor, end);
            // Only reached when the decoder did not throw.
            assert(0, format("expected error '%s' but no error was thrown", expectedError));
        }
        catch(Utf8DecodeException e)
        {
            assert(e.type == expectedError, format("expected error '%s' but got '%s'", expectedError, e.type));
        }
    }

    char[] testString = new char[256];
    dchar[] expectedCharsBuffer = new dchar[256];

    // Malformed input.
    testInvalidUtf8(Utf8DecodeException.Type.invalidFirstByte, [0x80]);
    testInvalidUtf8(Utf8DecodeException.Type.missingBytes, [0xC0]);
    testInvalidUtf8(Utf8DecodeException.Type.missingBytes, [0xE0, 0x80]);

    // Every single-byte value 0x00 .. 0x7F decodes to itself.
    foreach(code; 0 .. 0x80)
    {
        testString[0] = cast(char)code;
        expectedCharsBuffer[0] = cast(dchar)code;
        testDecodeUtf8(testString[0..1], expectedCharsBuffer[0..1]);
    }

    testDecodeUtf8("\u0000", [0x0000]);
    testDecodeUtf8("\u0001", [0x0001]);

    testDecodeUtf8("\u00a9", [0xa9]);
    testDecodeUtf8("\u00b1", [0xb1]);
    testDecodeUtf8("\u02c2", [0x02c2]);

    // First and last values of the two-byte form.
    testDecodeUtf8("\u0080", [0x80]);
    testDecodeUtf8("\u07FF", [0x7FF]);

    // Three-byte form.
    testDecodeUtf8("\u0800", [0x800]);
    testDecodeUtf8("\u7fff", [0x7FFF]);
    testDecodeUtf8("\u8000", [0x8000]);
    testDecodeUtf8("\uFFFD", [0xFFFD]);
    //testDecodeUtf8("\uFFFE", [0xFFFE]); // DMD doesn't like this code point
    //testDecodeUtf8("\uFFFF", [0xFFFF]); // DMD doesn't like this code point

    // Four-byte form.
    testDecodeUtf8("\U00010000", [0x10000]);
    testDecodeUtf8("\U00100000", [0x00100000]);
    testDecodeUtf8("\U0010FFFF", [0x0010FFFF]);
    //testDecodeUtf8("\U00110000", [0x00110000]); // DMD doesn't like this code point
}
224 
/// Exception thrown by $(D encodeUtf8) when it is given a value it cannot encode.
class Utf8EncodeException : Utf8Exception
{
    /**
    Creates the exception.

    Params:
        msg  = description of the failure
        file = source file to report; defaults to the caller's file
        line = source line to report; defaults to the caller's line
    */
    this(string msg,
         string file = __FILE__,
         size_t line = __LINE__) pure
    {
        // All three values are handed straight to the base class.
        super(msg, file, line);
    }
}
232 
/**
Encodes a single character as UTF-8 into the given buffer.

Between one and four bytes are written starting at $(D dst). No bounds are
checked, so the caller must provide room for the encoded character (four bytes
are always enough).

Params:
    dst = where the first encoded byte is written
    c   = the character to encode; values up to 0x1FFFFF are accepted

Returns: the number of bytes written (1 to 4).

Throws: $(D Utf8EncodeException) if $(D c) is larger than 0x1FFFFF.
*/
ubyte encodeUtf8(char* dst, dchar c)
{
    if(c <= 0x7F)
    {
        // 0xxxxxxx
        dst[0] = cast(char)c;
        return 1;
    }
    if(c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        dst[0] = cast(char)(0xC0 | (c >> 6));
        dst[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }
    if(c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        dst[0] = cast(char)(0xE0 | (c >> 12));
        dst[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        dst[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }
    if(c <= 0x1FFFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        dst[0] = cast(char)(0xF0 | (c >> 18));
        dst[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        dst[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        dst[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }
    // Imported locally because formatting is only needed on the error path.
    import std.format;
    throw new Utf8EncodeException(format("encodeUtf8 got a value that was too large (0x%x)", c));
}
268 
unittest
{
    mixin(scopedTest!("full utf8 encode/decode"));

    // Round-trip every value from 0 through dchar.max: encode it, then decode
    // it again with both decodeUtf8 overloads. Each decode must consume exactly
    // the encoded bytes and give back the original value.
    foreach(uint value; 0 .. cast(uint)dchar.max + 1)
    {
        const dchar c = cast(dchar)value;
        char[4] buffer;
        const encodeLength = encodeUtf8(buffer.ptr, c);

        // Overload without a limit.
        {
            auto cursor = buffer.ptr;
            const decoded = decodeUtf8(&cursor);
            assert(cursor - buffer.ptr == encodeLength);
            assert(decoded == c);
        }

        // Overload with the limit placed right after the encoded bytes.
        {
            auto cursor = buffer.ptr;
            const decoded = decodeUtf8(&cursor, buffer.ptr + encodeLength);
            assert(cursor - buffer.ptr == encodeLength);
            assert(decoded == c);
        }
    }
}