more.utf8 source code

1 module more.utf8;
2 
3 
4 version(unittest_utf8) {
5   import std.stdio;
6   import more.common;
7 }
8 
9 version(unittest_utf8)
10 {
11   import std.string;
12 }
13 
14 //
15 // Utf8
16 //
// Human readable messages, one per Utf8Exception.Type member.
private enum genericMessage = "invalid utf8";
private enum startedInsideCodePointMessage = "utf8 string started inside a utf8 code point";
private enum missingBytesMessage = "utf8 encoding is missing some bytes";
private enum outOfRangeMessage = "the utf8 code point is out of range";

// Exception thrown by the utf8 decoders in this module.
//
// The `type` field identifies the category of failure so callers can react
// to it without having to parse the message text.
class Utf8Exception : Exception {
  // Category of utf8 decoding failure.
  enum Type {
    generic,                // malformed input with no more specific category
    startedInsideCodePoint, // first byte was a continuation byte
    missingBytes,           // input ended before the sequence was complete
    outOfRange,             // encoded value is not a legal code point
  }

  // Returns the message text associated with the given failure type.
  static string getMessage(Type type) {
    final switch(type) {
    case Type.generic: return genericMessage;
    case Type.startedInsideCodePoint: return startedInsideCodePointMessage;
    case Type.missingBytes: return missingBytesMessage;
    case Type.outOfRange: return outOfRangeMessage;
    }
  }

  // The failure category this exception was constructed with.
  const Type type;

  // Params:
  //   type = failure category, also selects the exception message
  //   file = source file of the throw site (defaults to the caller's file)
  //   line = source line of the throw site (defaults to the caller's line)
  //   next = optional chained exception
  //
  // file/line are forwarded to Exception so that the reported location is
  // the place the exception was created rather than this constructor.
  this(Type type, string file = __FILE__, size_t line = __LINE__, Throwable next = null) {
    super(getMessage(type), file, line, next);
    this.type = type;
  }
}
42 
43 
// Decodes one utf8 encoded code point and advances `utf8` past it.
//
// Preconditions: `utf8` points to at least one readable char and `limit` is
// the first invalid pointer after the input (so utf8 < limit on entry).
//
// Params:
//   utf8  = pointer to the first byte of the sequence; on success it is left
//           pointing at the byte following the decoded sequence. On failure
//           it is left pointing at the first byte that was not consumed and
//           is never moved beyond `limit`.
//   limit = one past the last readable byte
//
// Returns: the decoded code point.
//
// Throws: Utf8Exception with type
//   startedInsideCodePoint - the first byte is a continuation byte
//   missingBytes           - the input ends before the sequence is complete
//   generic                - a continuation byte is malformed, the sequence
//                            is an overlong encoding, or it encodes a
//                            utf16 surrogate
//   outOfRange             - the lead byte or the decoded value lies above
//                            U+10FFFF
dchar decodeUtf8(ref inout(char)* utf8, const char* limit) {
  immutable uint lead = *utf8;
  utf8++;
  if(lead <= 0x7F) {
    return cast(dchar)lead;
  }
  if((lead & 0x40) == 0) {
    throw new Utf8Exception(Utf8Exception.Type.startedInsideCodePoint);
  }

  // Classify the lead byte: number of continuation bytes that must follow,
  // the payload bits carried by the lead byte itself, and the smallest code
  // point that legitimately needs a sequence of this length.
  size_t trailing;
  uint codePoint;
  uint smallest;
  if((lead & 0x20) == 0) {
    trailing  = 1;
    codePoint = lead & 0x1F;
    smallest  = 0x80;
  } else if((lead & 0x10) == 0) {
    trailing  = 2;
    codePoint = lead & 0x0F;
    smallest  = 0x800;
  } else if((lead & 0x08) == 0) {
    trailing  = 3;
    codePoint = lead & 0x07;
    smallest  = 0x10000;
  } else {
    throw new Utf8Exception(Utf8Exception.Type.outOfRange);
  }

  // Consume the continuation bytes one at a time so the pointer is bounds
  // checked before every read and never stepped past the limit.
  foreach(i; 0 .. trailing) {
    if(utf8 >= limit) throw new Utf8Exception(Utf8Exception.Type.missingBytes);
    immutable uint next = *utf8;
    // every continuation byte must look like 10xxxxxx
    if((next & 0xC0) != 0x80) throw new Utf8Exception(Utf8Exception.Type.generic);
    codePoint = (codePoint << 6) | (next & 0x3F);
    utf8++;
  }

  // Overlong encoding: the value would have fit in a shorter sequence.
  if(codePoint < smallest) throw new Utf8Exception(Utf8Exception.Type.generic);
  // utf16 surrogates are not valid in utf8.
  if(codePoint >= 0xD800 && codePoint <= 0xDFFF) throw new Utf8Exception(Utf8Exception.Type.generic);
  // Four byte sequences can express values past the end of Unicode.
  if(codePoint > 0x10FFFF) throw new Utf8Exception(Utf8Exception.Type.outOfRange);

  return cast(dchar)codePoint;
}
77 
78 //
79 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
80 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
81 //
// Decodes one utf8 encoded code point using a table driven state machine
// and advances `utf8` past it.
//
// Preconditions: `utf8` points to at least one readable char and `limit` is
// the first invalid pointer after the input (so utf8 < limit on entry).
//
// Params:
//   utf8  = pointer to the first byte of the sequence; on success it is left
//           pointing at the byte following the decoded sequence. When the
//           sequence is rejected it is left at the offending byte.
//   limit = one past the last readable byte
//
// Returns: the decoded code point.
//
// Throws: Utf8Exception with type
//   generic      - the state machine rejected the byte sequence
//   missingBytes - the input ended in the middle of a sequence
dchar bjoernDecodeUtf8(ref inout(char)* utf8, const char* limit) {
  // The first 256 entries map a byte to its character class; the remaining
  // entries are the transition table, indexed by state*16 + class.
  static __gshared immutable ubyte[] utf8lookup = [
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
  ];
  enum utf8Accept = 0;
  enum utf8Reject = 1;

  uint state = utf8Accept;
  dchar codep;

  while(true) {
    ubyte b = *utf8;
    uint type = utf8lookup[b];

    // First byte of a sequence: mask off the length marker bits.
    // Later bytes: append their low six bits to the accumulated value.
    codep = (state != utf8Accept) ?
      (b & 0x3fu) | (codep << 6) : (0xff >> type) & b;

    state = utf8lookup[256 + state*16 + type];

    if(state == utf8Accept) {
      // Consume the final byte of the sequence so that the caller resumes
      // at the start of the next code point.
      utf8++;
      return codep;
    }
    if(state == utf8Reject) throw new Utf8Exception(Utf8Exception.Type.generic);
    utf8++;
    if(utf8 >= limit) throw new Utf8Exception(Utf8Exception.Type.missingBytes);
  }
}
120 
121 
version(unittest_utf8) unittest
{
  mixin(scopedTest!("utf8"));

  // Aborts the test with a diagnostic when a decoder produced something
  // other than the expected code point. `mismatchFormat` takes the expected
  // and the decoded value, each once as '%s' and once as 0x%x.
  void requireSame(string mismatchFormat, dchar expected, dchar decoded, size_t line) {
    if(decoded == expected) return;
    writefln(mismatchFormat, expected, expected, decoded, decoded);
    writefln("test on line %s", line);
    assert(0);
  }

  // Decodes `s` with both decoders and checks every produced code point
  // against `expectedChars`.
  void testDecodeUtf8(inout(char)[] s, dchar[] expectedChars, size_t line = __LINE__) {
    auto cursor = s.ptr;
    auto limit = s.ptr + s.length;

    foreach(expected; expectedChars) {
      if(cursor >= limit) {
        writefln("Expected more decoded utf8 chars but input ended");
        writefln("test on line %s", line);
        assert(0);
      }

      // Both decoders are run from the same starting position.
      auto rewind = cursor;
      dchar decoded = decodeUtf8(cursor, limit);
      requireSame("decodeUtf8: Expected '%s' 0x%x but decoded '%s' 0x%x",
                  expected, decoded, line);

      cursor = rewind;
      decoded = bjoernDecodeUtf8(cursor, limit);
      requireSame("bjoernDecodeUtf8: Expected '%s' 0x%x but decoded '%s' 0x%x",
                  expected, decoded, line);

      debug writefln("decodeUtf8('%s')", decoded);
    }
  }

  // Checks that both decoders refuse `s`. decodeUtf8 must report exactly
  // `expectedError`; bjoernDecodeUtf8 may also report the generic error.
  void testInvalidUtf8(Utf8Exception.Type expectedError, inout(char)[] s, size_t line = __LINE__) {
    auto limit = s.ptr + s.length;

    {
      auto cursor = s.ptr;
      try {
        dchar decoded = decodeUtf8(cursor, limit);
        assert(0, format("expected error '%s' but no error was thrown", expectedError));
      } catch(Utf8Exception e) {
        assert(e.type == expectedError, format("expected error '%s' but got '%s'", expectedError, e.type));
      }
    }

    {
      auto cursor = s.ptr;
      try {
        dchar decoded = bjoernDecodeUtf8(cursor, limit);
        assert(0, format("expected error '%s' but no error was thrown", expectedError));
      } catch(Utf8Exception e) {
        const bool acceptable = e.type == Utf8Exception.Type.generic || e.type == expectedError;
        assert(acceptable, format("expected error '%s' but got '%s'", expectedError, e.type));
      }
    }

    debug writefln("got expected error '%s'", expectedError);
  }

  char[] testString = new char[256];
  dchar[] expectedCharsBuffer = new dchar[256];

  // Malformed input
  testInvalidUtf8(Utf8Exception.Type.startedInsideCodePoint, [0x80]);
  testInvalidUtf8(Utf8Exception.Type.missingBytes, [0xC0]);
  testInvalidUtf8(Utf8Exception.Type.missingBytes, [0xE0, 0x80]);

  // Every single byte (ascii) code point, 0x00 through 0x7F
  foreach(value; 0 .. 0x80) {
    char c = cast(char)value;
    testString[0] = c;
    expectedCharsBuffer[0] = c;
    testDecodeUtf8(testString[0..1], expectedCharsBuffer[0..1]);
  }

  testDecodeUtf8("\u0000", [0x0000]);
  testDecodeUtf8("\u0001", [0x0001]);

  testDecodeUtf8("\u00a9", [0xa9]);
  testDecodeUtf8("\u00b1", [0xb1]);
  testDecodeUtf8("\u02c2", [0x02c2]);

  // Two byte sequence boundaries
  testDecodeUtf8("\u0080", [0x80]);
  testDecodeUtf8("\u07FF", [0x7FF]);

  // Three byte sequences
  testDecodeUtf8("\u0800", [0x800]);
  testDecodeUtf8("\u7fff", [0x7FFF]);
  testDecodeUtf8("\u8000", [0x8000]);
  testDecodeUtf8("\uFFFD", [0xFFFD]);
  //testDecodeUtf8("\uFFFE", [0xFFFE]); // DMD doesn't like this code point
  //testDecodeUtf8("\uFFFF", [0xFFFF]); // DMD doesn't like this code point

  // Four byte sequences
  testDecodeUtf8("\U00010000", [0x10000]);
  testDecodeUtf8("\U00100000", [0x00100000]);
  testDecodeUtf8("\U0010FFFF", [0x0010FFFF]);
  //testDecodeUtf8("\U00110000", [0x00110000]); // DMD doesn't like this code point
}
220