more.fields source code

1 module more.fields;
2 
3 import std..string;
4 
5 import more.utf8;
6 import more.common;
7 
8 version(unittest)
9 {
10     import std.stdio;
11 }
12 
/// Exception thrown by the field parser when the input text is malformed.
class TextParseException : Exception
{
    /**
     * Params:
     *   msg  = human readable description of the parse error
     *   file = source file recorded in the exception; defaults to the
     *          file of the code constructing the exception
     *   line = source line recorded in the exception; defaults to the
     *          line of the code constructing the exception
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__)
    {
        // Forward file/line so the exception reports the throw site rather
        // than the location of this constructor.
        super(msg, file, line);
    }
}
20 
/**
 * A forward-only cursor over a UTF-8 buffer that splits it into fields.
 *
 * Fields are runs of characters delimited by the characters that
 * isControlChar reports as control characters.  Comments are skipped:
 * '#' and '//' run to the end of the line, '/' followed by '*' runs to the
 * matching '*' followed by '/'.
 *
 * Cursor invariant used by all methods: `cpos` points at the start of the
 * current character, `c` holds its decoded value and `next` points at the
 * character after it.  When `cpos >= limit` the input is exhausted and `c`
 * holds a stale value that must not be interpreted.
 */
struct Text
{
  /// Line of the current character, starting at 1.
  uint lineNumber;
  /// Column of the current character, starting at 1.
  uint column;

  /// The buffer being parsed.
  const(char)[] chars;
  /// One past the last byte of `chars`.
  const(char)* limit;

  /// Start of the current character.
  const(char)* cpos;
  /// Start of the character following the current one.
  const(char)* next;

  /// Decoded value of the current character (stale once `empty`).
  dchar c;

  /// Creates a cursor positioned at the first character of `chars`.
  this(const(char)[] chars) {
    setup(chars);
  }

  /// (Re)initializes the cursor to the first character of `chars`.
  void setup(const(char)[] chars) {
    this.lineNumber = 1;
    this.column = 1;

    this.chars = chars;
    this.limit = chars.ptr + chars.length;
    this.cpos = chars.ptr;
    this.next = chars.ptr;

    // Only decode when there is something to decode; for empty input the
    // cursor starts out exhausted.
    if(chars.length > 0) {
      c = decodeUtf8(&this.next, this.limit);
    }
  }

  /// True once the cursor has moved past the last character.
  @property bool empty() const {
    return cpos >= limit;
  }

  /// Advances to the next character without touching line/column.
  void skipChar() {
    cpos = next;
    if(next < limit) {
      c = decodeUtf8(&next, limit);
    }
  }

  /**
   * Discards the rest of the current line.
   * On return c/cpos are at the first character after the newline, or the
   * cursor is exhausted if the input ended first.
   */
  void toNextLine()
  {
    // cpos is not maintained inside the loop because the characters are
    // thrown away.
    while(next < limit) {
      c = decodeUtf8(&next, limit);
      if(c == '\n') {
        lineNumber++;
        column = 1;
        break;
      }
    }
    skipChar();
  }

  /**
   * Advances until c/cpos are at the next newline character (which is not
   * consumed), or until the input is exhausted.
   */
  void toNewline()
  {
    while(true) {
      cpos = next;
      if(next >= limit) break;
      c = decodeUtf8(&next, limit);
      column++;
      if(c == '\n') {
        break;
      }
    }
  }

  /**
   * Expects c/cpos at the first character of a token and advances to the
   * first character after it (a control character, the '/' that starts a
   * comment, or end of input).
   */
  void toEndOfToken()
  {
    if(c == '"') {
      implement("quoted tokens");
    } else {

      while(true) {
        cpos = next;
        if(next >= limit) break;
        c = decodeUtf8(&next, limit);
        column++;
        if(isControlChar(c)) {

          // Handle slashes that aren't comments
          if(c != '/') break;
          if(next >= limit) {
            // A trailing '/' cannot start a comment; it belongs to the token.
            cpos = next;
            break;
          }

          // Peek at the following character with a scratch pointer so that
          // c/cpos/next stay on the '/'.  The comment skipper needs to see
          // c == '/' to recognise both "//" and "/*".
          auto lookahead = next;
          auto peeked = decodeUtf8(&lookahead, limit);
          if(peeked == '*' || peeked == '/') {
            break;
          }

        }

      }
    }
  }

  // Skips a '#' or '//' comment whose marker is the current character.
  // Returns true when the caller has to stop scanning: either the cursor was
  // parked on the newline (skipNewlines == false) or the input is exhausted.
  private bool skipLineComment(bool skipNewlines)
  {
    if(!skipNewlines) {
      toNewline();
      return true;
    }
    toNextLine();
    return cpos >= limit;
  }

  // Consumes the body of a multiline comment.  Expects `next` to point just
  // after the opening "/*".  Returns with c on the closing '/' or with the
  // input exhausted if the comment is unterminated.
  private void skipMultilineCommentBody()
  {
    bool afterStar = false;
    while(next < limit) {

      c = decodeUtf8(&next, limit); // no need to save cpos since c will be thrown away
      column++;

      if(afterStar && c == '/') return;

      if(c == '\n') {
        lineNumber++;
        column = 0; // incremented to 1 when the next character is read
      }
      // Only a '*' immediately before the '/' terminates the comment.
      afterStar = (c == '*');
    }
  }

  /**
   * Advances past whitespace and comments.
   *
   * If skipNewlines is false, scanning stops at the first newline and c/cpos
   * will be pointing at that newline if no field was found before it.
   * Otherwise newlines are consumed like any other whitespace.
   */
  void skipWhitespaceAndComments(bool skipNewlines)
  {
    while(true) {

      // TODO: maybe use a lookup table here
      if(c == ' ' || c == '\t' || c =='\v' || c == '\f' || c == '\r') {

        // do nothing (check first as this is the most likely case)

      } else if(c == '\n') {

        if(!skipNewlines) return;

        // Step onto the first character of the new line, which is column 1.
        lineNumber++;
        column = 1;
        skipChar();
        if(cpos >= limit) return;
        continue;

      } else if(c == '#') {

        if(skipLineComment(skipNewlines)) return;
        // c/cpos already sit on the first character of the next line, so the
        // generic advance below must not run.
        continue;

      } else if(c == '/') {

        if(next >= limit) return;

        // NOTE(review): from here on c holds the character after the '/'
        // while cpos still points at the '/'; callers see that state when
        // the slash turns out not to start a comment.
        c = decodeUtf8(&next, limit);

        if(c == '/') {

          if(skipLineComment(skipNewlines)) return;
          continue;

        } else if(c == '*') {

          if(!skipNewlines) {
            implement("multiline comments when not skipping newlines");
          }

          column++; // account for the '*' read above
          skipMultilineCommentBody();

        } else {
          return;
        }

      } else {

        return; // Found non-whitespace and non-comment

      }

      //
      // Goto next character
      //
      cpos = next;
      if(next >= limit) return;
      c = decodeUtf8(&next, limit);
      column++;
    }

  }

  /**
   * Reads the next field into `token`.
   *
   * Params:
   *   token    = receives the field text and the position where it starts;
   *              token.text is null when no field was found
   *   sameLine = when true, the search does not continue past the end of the
   *              current line
   * Throws: TextParseException if the field would start with a control
   *         character.
   */
  void parseField(ref FieldToken token, bool sameLine = false)
  {
    skipWhitespaceAndComments(!sameLine);
    if(cpos >= limit || (sameLine && c == '\n')) {
      token.text = null;
      token.lineNumber = lineNumber;
      token.column = column;
    } else {

      if(isControlChar(c)) {
        throw new TextParseException(format("Expected non-control character but got '%s' (charcode=%s)",
                                            c, cast(uint)c));
      }

      const(char)* startOfToken = cpos;
      token.lineNumber = lineNumber;
      token.column = column;
      toEndOfToken();
      token.text = startOfToken[0..cpos-startOfToken];
    }
  }

  /// Skips whitespace/comments on the current line and reports whether the
  /// line (or the input) ended before another field was found.
  bool noMoreFieldsOnThisLine()
  {
    skipWhitespaceAndComments(false);
    return cpos >= limit || c == '\n';
  }


  /**
   * Like parseField across lines, but a control character at the token
   * position yields a null token.text instead of an exception.
   */
  void parseString(ref FieldToken token)
  {
    skipWhitespaceAndComments(true);
    token.lineNumber = lineNumber;
    token.column = column;
    if(cpos >= limit || isControlChar(c)) {
      token.text = null;
    } else {
      const(char)* startOfToken = cpos;
      toEndOfToken();
      token.text = startOfToken[0..cpos-startOfToken];
    }
  }


  alias parseString parseObjectFieldName;

  // An object starts with an open curly brace '{' or omits its curly
  // brace section with a semi-colon ';'
  // A 'NamelessObjectField' is a field before the curly-brace section
  void parseNamelessObjectField(ref FieldToken token)
  {
    // Same scanning rules as parseString.
    parseString(token);
  }

  /// Skips whitespace/comments and, if the current character is '{',
  /// consumes it and returns true.  Otherwise leaves the cursor on the
  /// character found and returns false.
  bool atObjectStart()
  {
    skipWhitespaceAndComments(true);
    if(cpos >= limit || c != '{') return false;

    skipChar();
    return true;
  }
}
292 
293 
/// A field produced by the parser together with the position where it starts.
struct FieldToken
{
  /// Slice of the parsed buffer holding the field, or null when no field was found.
  const(char)[] text;
  /// Line number recorded by the parser for this token.
  uint lineNumber;
  /// Column recorded by the parser for this token.
  uint column;

  /// True when this token carries no field (text is null).
  /// Declared const so it can be queried on const/immutable tokens as well.
  bool eof() const pure nothrow @safe
  {
    return text is null;
  }
}
305 
306 /+
307 /**
308  * Used to parse the fields in <i>line</i> to the <i>fields</i> sink.
309  * line is a single line without the line ending character.
310  * returns error message on error
311  */
312 void parseField(ref FieldToken token, ref Text text)
313 {
314   //writefln("[DEBUG] parseField(..., '%s')", escape(text.chars));
315 
316   const(char)* next = text.chars.ptr;
317   const char* limit = next + text.chars.length;
318   const(char)* cpos;
319   dchar c;
320 
321   // ExpectedState:
322   //   c/cpos: points to a character before the newline character
323   // ReturnState:
324   //   c/cpos: points to the character after the newline character or at limit if at EOF
325   void toNextLine()
326   {
327     // no need to save cpos since c will be thrown away
328     while(true) {
329       if(next >= limit) break;
330       c = decodeUtf8(&next, limit);
331       if(c == '\n') {
332         text.lineNumber++;
333         text.column = 1;
334         break;
335       }
336     }
337     cpos = next;
338     if(next < limit) {
339       c = decodeUtf8(&next, limit);
340     }
341   }
342   // ExpectedState:
343   //   c/cpos: points to the first character of the token
344   // ReturnState:
345   //   c/cpos: points to the character after the token
346   void toEndOfToken()
347   {
348     if(c == '"') {
349       implement("quoted tokens");
350     } else {
351 
352       while(true) {
353         cpos = next;
354         if(next >= limit) break;
355         c = decodeUtf8(&next, limit);
356         text.column++;
357         if(isControlChar(c)) {
358           break;
359         }
360       }
361     }
362   }
363   // ExpectedState:
364   //   c/cpos: points to the first character of the potential whitespace/comment
365   // ReturnState:
366   //   c/cpos: points to the first character after all the whitespace/comments
367   void skipWhitespaceAndComments()
368   {
369     while(true) {
370 
371       // TODO: maybe use a lookup table here
372       if(c == ' ' || c == '\t' || c =='\v' || c == '\f' || c == '\r') {
373 
374         // do nothing (check first as this is the most likely case)
375 
376       } else if(c == '\n') {
377 
378         text.lineNumber++;
379         text.column = 1;
380 
381       } else if(c == '#') {
382 
383         toNextLine();
384 
385       } else if(c == '/') {
386 
387         if(next >= limit) return;
388 
389         c = decodeUtf8(&next, limit);
390 
391         if(c == '/') {
392 
393           toNextLine();
394 
395         } else if(c == '*') {
396 
397           text.column++;
398 
399         MULTILINE_COMMENT_LOOP:
400           while(next < limit) {
401 
402             c = decodeUtf8(&next, limit); // no need to save cpos since c will be thrown away
403             text.column++;
404 
405             if(c == '\n') {
406               text.lineNumber++;
407               text.column = 0;
408               text.lineNumber++;
409             } else if(c == '*') {
410               // loop assume c is pointing to a '*' and next is pointing to the next characer
411               while(next < limit) {
412 
413                 c = decodeUtf8(&next, limit);
414                 text.column++;
415                 if(c == '/') break MULTILINE_COMMENT_LOOP;
416                 if(c == '\n') {
417                   text.lineNumber++;
418                   text.column = 0;
419                 } else if(c != '*') {
420                   break;
421                 }
422               }
423             }
424           }
425 
426         } else {
427           return;
428         }
429 
430       } else {
431 
432         return; // Found non-whitespace and non-comment
433 
434       }
435 
436       //
437       // Goto next character
438       //
439       cpos = next;
440       if(next >= limit) return;
441       c = decodeUtf8(&next, limit);
442       text.column++;
443     }
444 
445   }
446 
447   //
448   // Read the first character
449   //
450   cpos = next;
451   c = decodeUtf8(&next, limit);
452 
453   skipWhitespaceAndComments();
454   if(cpos >= limit) {
455     token.text = null;
456     text.chars = null;
457     return;
458   }
459 
460   const(char)* startOfToken = cpos;
461   token.lineNumber = text.lineNumber;
462   token.column = text.column;
463   toEndOfToken();
464   token.text = startOfToken[0..cpos-startOfToken];
465 
466   text.chars = cpos[0..limit-cpos];
467 
468   return;
469 }
470 +/
471 
472 
// Bit flags stored per character in the charLookup table.
enum : ubyte
{
  controlCharFlag = 0x01,
  whitespaceFlag  = 0x02,
  tokenStartFlag  = 0x04,
}
476 
/// Returns true if `c` falls inside the charLookup table and its entry has
/// controlCharFlag set; characters beyond the table are never control chars.
bool isControlChar(dchar c) {
  if(c >= charLookup.length) return false;
  immutable flags = charLookup[c];
  return (flags & controlCharFlag) != 0;
}
/// Returns true if `c` falls inside the charLookup table and its entry has
/// whitespaceFlag set; characters beyond the table are never whitespace.
bool isWhitespace(dchar c) {
  if(c >= charLookup.length) return false;
  immutable flags = charLookup[c];
  return (flags & whitespaceFlag) != 0;
}
// Declares `charLookup`, a 256-entry table of flag bytes indexed by character
// code, by mixing in a declaration whose initializer text is produced by
// rangeInitializers (not defined in this file -- presumably more.common;
// confirm).  The arguments are (character-or-range, flag-expression) string
// pairs:
//   - space, \t, \n, \v, \f, \r  : controlCharFlag | whitespaceFlag
//   - { } [ ] / #                : controlCharFlag
// The entries for ';' and '\\' are disabled.  The block comment at the top of
// the argument list holds entries using sdl*Flag names that are not declared
// in this file; it is inactive.
mixin("private __gshared immutable ubyte[256] charLookup = "~rangeInitializers
      (
       /*
         "'_'"    , "sdlIDFlag",

         `'a'`    , "sdlIDFlag",
         `'b'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'c'`    , "sdlIDFlag",
         `'d'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'e'`    , "sdlIDFlag",
         `'f'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'g'-'k'`, "sdlIDFlag",
         `'l'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'m'-'z'`, "sdlIDFlag",

         `'A'`    , "sdlIDFlag",
         `'B'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'C'`    , "sdlIDFlag",
         `'D'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'E'`    , "sdlIDFlag",
         `'F'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'G'-'K'`, "sdlIDFlag",
         `'L'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'M'-'Z'`, "sdlIDFlag",

         `'0'-'9'`, "sdlIDFlag | sdlNumberFlag",
         `'-'`    , "sdlIDFlag",
         `'.'`    , "sdlIDFlag | sdlNumberFlag",
         `'$'`    , "sdlIDFlag",
       */
       `' '`    , "controlCharFlag | whitespaceFlag",
       `'\t'`   , "controlCharFlag | whitespaceFlag",
       `'\n'`   , "controlCharFlag | whitespaceFlag",
       `'\v'`   , "controlCharFlag | whitespaceFlag",
       `'\f'`   , "controlCharFlag | whitespaceFlag",
       `'\r'`   , "controlCharFlag | whitespaceFlag",
       `'{'`    , "controlCharFlag",
       `'}'`    , "controlCharFlag",

       `'['`    , "controlCharFlag",
       `']'`    , "controlCharFlag",
       //`';'`    , "controlCharFlag",
       //`'\\'`    , "controlCharFlag",
       `'/'`    , "controlCharFlag",
       `'#'`    , "controlCharFlag",


       )~";");
531 
// Exercises Text.parseField on whitespace-only, comment-only and single-field
// inputs.
unittest
{
  import more.test;
  mixin(scopedTest!"fields");

  writefln("Running Unit Tests...");

  // Parses `textString` field by field and checks that exactly
  // `expectedTokens` (compared by text only) are produced, in order.
  // `testLine` identifies the calling line in failure output.
  void testParseFields(const(char)[] textString, FieldToken[] expectedTokens = [], size_t testLine = __LINE__)
  {
    auto escapedText = escape(textString);

    debug {
      writefln("[TEST] testing '%s'", escapedText);
    }

    FieldToken token;
    Text text = Text(textString);

    try {

      foreach(i, expectedToken; expectedTokens) {

        text.parseField(token);
        if(token.eof) {
          writefln("Expected %s token(s) but only got %s", expectedTokens.length, i);
          writefln("Error: test on line %s", testLine);
          // Running out of tokens early is a failure like any other mismatch.
          assert(0);
        }

        if(token.text != expectedToken.text) {
          writefln("Error: expected token '%s' but got '%s'", expectedToken.text, token.text);
          writefln("Error: test on line %s", testLine);
          assert(0);
        }
      }

      // After the expected tokens the input must be exhausted.
      text.parseField(token);
      if(!token.eof) {
        writefln("Expected %s token(s) but got at least one more (text='%s')",
                 expectedTokens.length, token.text);
        writefln("Error: test on line %s", testLine);
        assert(0);
      }

    } catch(Exception e) {
      writefln("[TEST] this sdl threw an unexpected Exception: '%s'", escape(text.chars));
      writeln(e);
      writefln("Error: test on line %s", testLine);
      assert(0);
    }
  }

  testParseFields("");
  testParseFields(" ");
  testParseFields("\n");

  testParseFields("// comment");
  testParseFields("# comment");
  testParseFields("/* comment */");
  testParseFields("/* comment\n next-line \n hey */");
  testParseFields("/* comment\n next-line *\n * ** *** \n hey **/");

  testParseFields("first", [FieldToken("first")]);

  //testParseFields("[", [FieldToken("first")]);

}