more.fields source code

1 module more.fields;
2 
3 import std.string;
4 
5 import more.utf8;
6 import more.common;
7 
8 version(unittest_fields)
9 {
10   import std.stdio;
11 }
12 
/// Exception thrown by the field parser when the input text is malformed.
class TextParseException : Exception
{
  /**
   * Params:
   *   msg  = human readable description of the problem
   *   file = source file of the throw site (defaults to the caller's file)
   *   line = source line of the throw site (defaults to the caller's line)
   *
   * file/line are forwarded explicitly: with a bare `super(msg)` the base
   * class would capture the location of this constructor instead of the
   * location of the `throw` statement.
   */
  this(string msg, string file = __FILE__, size_t line = __LINE__)
  {
    super(msg, file, line);
  }
}
20 
// A forward-only cursor over UTF-8 text that splits the input into fields
// separated by whitespace and control characters, skipping comments.
//
// Comment forms handled by skipWhitespaceAndComments:
//   '#' or '//' up to the end of the line, and slash-star ... star-slash.
//
// Cursor state: while cpos < limit, `c` normally holds the code point that
// starts at `cpos` and `next` points just past it. Once cpos >= limit the
// input is exhausted and `c` is stale and must not be interpreted.
struct Text
{
  uint lineNumber;       // 1-based line of the current character
  uint column;           // 1-based column of the current character

  const(char)[] chars;   // the complete text being parsed
  const(char)* limit;    // one past the last byte of chars

  const(char)* cpos;     // start of the current character
  const(char)* next;     // start of the character after the current one

  dchar c;               // the current (decoded) character

  this(const(char)[] chars) {
    setup(chars);
  }

  // (Re)initialises the cursor at the beginning of `chars` and decodes the
  // first character if there is one.
  void setup(const(char)[] chars) {
    this.lineNumber = 1;
    this.column = 1;

    this.chars = chars;
    this.limit = chars.ptr + chars.length;
    this.cpos = chars.ptr;
    this.next = chars.ptr;

    if(chars.length > 0) {
      c = decodeUtf8(this.next, this.limit);
    }
  }

  // True once the cursor has consumed all of the text.
  @property bool empty() {
    return cpos >= limit;
  }

  // Moves to the next character. Does not touch lineNumber/column.
  void skipChar() {
    cpos = next;
    if(next < limit) {
      c = decodeUtf8(next, limit);
    }
  }

  // Consumes the rest of the current line including its '\n'.
  // On return c/cpos is the first character of the following line (with
  // lineNumber bumped and column reset to 1), or cpos == limit if the text
  // ended first.
  void toNextLine()
  {
    while(next < limit) {
      c = decodeUtf8(next, limit);
      if(c == '\n') {
        lineNumber++;
        column = 1;
        break;
      }
    }
    skipChar();
  }

  // Advances until c/cpos is the next '\n' (which is NOT consumed) or the
  // end of the text.
  void toNewline()
  {
    while(true) {
      cpos = next;
      if(next >= limit) break;
      c = decodeUtf8(next, limit);
      column++;
      if(c == '\n') {
        break;
      }
    }
  }

  // Expects c/cpos on the first character of a token.
  // On return cpos is one past the token: either on the control character
  // that ended it (with c holding that character) or at limit.
  void toEndOfToken()
  {
    if(c == '"') {
      implement("quoted tokens");
    } else {

      while(true) {
        cpos = next;
        if(next >= limit) break;
        c = decodeUtf8(next, limit);
        column++;
        if(isControlChar(c)) {

          // Every control character except '/' ends the token
          if(c != '/') break;

          // A '/' that is the very last character belongs to the token
          if(next >= limit) {
            cpos = next;
            break;
          }

          // Peek at the following character without consuming it. The
          // peeked value is kept in a local so that `c` still matches cpos
          // ('/') when the token ends here; otherwise the next call to
          // skipWhitespaceAndComments would not recognise the comment.
          auto saveNext = next;
          immutable dchar lookahead = decodeUtf8(next, limit);
          next = saveNext;

          if(lookahead == '*' || lookahead == '/') {
            break; // start of a comment ends the token
          }

          // otherwise the '/' is part of the token; keep scanning
        }

      }
    }
  }

  // Skips whitespace and comments starting at c/cpos.
  //
  // skipNewlines == true : newlines are treated as whitespace.
  // skipNewlines == false: stops with c/cpos on the '\n' if no field was
  //                        found on the current line.
  //
  // On return c/cpos is the first character that is neither whitespace nor
  // part of a comment, or cpos >= limit.
  void skipWhitespaceAndComments(bool skipNewlines)
  {
    // At the end of the input `c` is stale; interpreting it again could,
    // for example, count a trailing newline twice.
    if(cpos >= limit) return;

    while(true) {

      // TODO: maybe use a lookup table here
      if(c == ' ' || c == '\t' || c =='\v' || c == '\f' || c == '\r') {

        // do nothing (check first as this is the most likely case)

      } else if(c == '\n') {

        if(!skipNewlines) return;

        // Step onto the first character of the next line, which is column 1
        lineNumber++;
        column = 1;
        skipChar();
        if(cpos >= limit) return;
        continue;

      } else if(c == '#') {

        if(!skipNewlines) {
          toNewline();
          return;
        }

        // toNextLine already leaves c/cpos on the first character of the
        // next line, so it must be examined, not skipped.
        toNextLine();
        if(cpos >= limit) return;
        continue;

      } else if(c == '/') {

        if(next >= limit) return;

        c = decodeUtf8(next, limit);

        if(c == '/') {

          if(!skipNewlines) {
            toNewline();
            return;
          }

          toNextLine();
          if(cpos >= limit) return;
          continue;

        } else if(c == '*') {

          if(!skipNewlines) {
            implement("multiline comments when not skipping newlines");
          }

          column++;

        MULTILINE_COMMENT_LOOP:
          while(next < limit) {

            c = decodeUtf8(next, limit); // no need to save cpos since c will be thrown away
            column++;

            if(c == '\n') {
              lineNumber++;
              column = 0; // incremented to 1 when the next character is read
            } else if(c == '*') {
              // c is a '*' and next points at the following character:
              // scan the run of '*' looking for the closing '/'
              while(next < limit) {

                c = decodeUtf8(next, limit);
                column++;
                if(c == '/') break MULTILINE_COMMENT_LOOP;
                if(c == '\n') {
                  lineNumber++;
                  column = 0;
                  break; // a newline ends the run of '*'
                }
                if(c != '*') {
                  break;
                }
              }
            }
          }

        } else {
          // A '/' that does not start a comment.
          // NOTE(review): the character after the '/' has already been
          // consumed into `c` while cpos still points at the '/'. The
          // parse* methods rely on this (they test the character after the
          // slash and start the token at the slash), so it is kept as is.
          return;
        }

      } else {

        return; // Found non-whitespace and non-comment

      }

      //
      // Goto next character
      //
      cpos = next;
      if(next >= limit) return;
      c = decodeUtf8(next, limit);
      column++;
    }

  }

  // Reads the next field into `token`.
  // If sameLine is true the search stops at the end of the current line.
  // token.text is null when there is no further field; lineNumber/column
  // are always filled in.
  // Throws TextParseException if a control character is found where a field
  // should start.
  void parseField(ref FieldToken token, bool sameLine = false)
  {
    skipWhitespaceAndComments(!sameLine);
    if(cpos >= limit || (sameLine && c == '\n')) {
      token.text = null;
      token.lineNumber = lineNumber;
      token.column = column;
    } else {

      if(isControlChar(c)) {
        throw new TextParseException(format("Expected non-control character but got '%s' (charcode=%s)",
                                            c, cast(uint)c));
      }

      const(char)* startOfToken = cpos;
      token.lineNumber = lineNumber;
      token.column = column;
      toEndOfToken();
      token.text = startOfToken[0..cpos-startOfToken];
    }
  }

  // True if only whitespace/comments remain on the current line.
  bool noMoreFieldsOnThisLine()
  {
    skipWhitespaceAndComments(false);
    return cpos >= limit || c == '\n';
  }


  // Like parseField (newlines skipped) but a control character at the start
  // of the field yields token.text == null instead of an exception.
  void parseString(ref FieldToken token)
  {
    skipWhitespaceAndComments(true);
    token.lineNumber = lineNumber;
    token.column = column;
    if(cpos >= limit || isControlChar(c)) {
      token.text = null;
    } else {
      const(char)* startOfToken = cpos;
      toEndOfToken();
      token.text = startOfToken[0..cpos-startOfToken];
    }
  }


  alias parseString parseObjectFieldName;

  // An object starts with an open curly brace '{' or omits its curly
  // brace section with a semi-colon ';'
  // A 'NamelessObjectField' is a field before the curly-brace section.
  // The parsing rules are exactly those of parseString.
  void parseNamelessObjectField(ref FieldToken token)
  {
    parseString(token);
  }

  // Skips whitespace/comments; if the next character is '{' it is consumed
  // and true is returned, otherwise nothing more is consumed.
  bool atObjectStart()
  {
    skipWhitespaceAndComments(true);
    if(cpos >= limit || c != '{') return false;

    skipChar();
    return true;
  }
}
292 
293 
/// A single field produced by the parser together with its source position.
struct FieldToken
{
  const(char)[] text;  /// the field's characters; null when no field was found
  uint lineNumber;     /// line on which the field starts
  uint column;         /// column at which the field starts

  /// True when no field was found (text is null). An empty but non-null
  /// slice does not count as end-of-input.
  /// Declared const so it can also be queried on const/immutable tokens.
  bool eof() const
  {
    return text is null;
  }
}
305 
306 /+
307 /**
308  * Used to parse the fields in <i>line</i> to the <i>fields</i> sink.
309  * line is a single line without the line ending character.
310  * returns error message on error
311  */
312 void parseField(ref FieldToken token, ref Text text)
313 {
314   //writefln("[DEBUG] parseField(..., '%s')", escape(text.chars));
315 
316   const(char)* next = text.chars.ptr;
317   const char* limit = next + text.chars.length;
318   const(char)* cpos;
319   dchar c;
320 
321   // ExpectedState:
322   //   c/cpos: points to a character before the newline character
323   // ReturnState:
324   //   c/cpos: points to the character after the newline character or at limit if at EOF
325   void toNextLine()
326   {
327     // no need to save cpos since c will be thrown away
328     while(true) {
329       if(next >= limit) break;
330       c = decodeUtf8(next, limit); 
331       if(c == '\n') {
332 	text.lineNumber++;
333 	text.column = 1;
334 	break;
335       }
336     }
337     cpos = next;
338     if(next < limit) {
339       c = decodeUtf8(next, limit);
340     }
341   }
342   // ExpectedState:
343   //   c/cpos: points to the first character of the token
344   // ReturnState:
345   //   c/cpos: points to the character after the token
346   void toEndOfToken()
347   {
348     if(c == '"') {
349       implement("quoted tokens");
350     } else {
351 
352       while(true) {
353 	cpos = next;
354 	if(next >= limit) break;
355 	c = decodeUtf8(next, limit); 
356 	text.column++;
357 	if(isControlChar(c)) {
358 	  break;
359 	}
360       }
361     }
362   }
363   // ExpectedState:
364   //   c/cpos: points to the first character of the potential whitespace/comment
365   // ReturnState:
366   //   c/cpos: points to the first character after all the whitespace/comments
367   void skipWhitespaceAndComments()
368   {
369     while(true) {
370 
371       // TODO: maybe use a lookup table here
372       if(c == ' ' || c == '\t' || c =='\v' || c == '\f' || c == '\r') {
373 
374 	// do nothing (check first as this is the most likely case)
375 
376       } else if(c == '\n') {
377 
378 	text.lineNumber++;
379 	text.column = 1;
380 
381       } else if(c == '#') {
382 
383 	toNextLine();
384 
385       } else if(c == '/') {
386 
387 	if(next >= limit) return;
388 
389 	c = decodeUtf8(next, limit);
390 	
391 	if(c == '/') {
392 
393 	  toNextLine();
394 
395 	} else if(c == '*') {
396 	  
397 	  text.column++;
398 
399 	MULTILINE_COMMENT_LOOP:
400 	  while(next < limit) {
401 
402 	    c = decodeUtf8(next, limit); // no need to save cpos since c will be thrown away
403 	    text.column++;
404 
405 	    if(c == '\n') {
406 	      text.lineNumber++;
407 	      text.column = 0;
408 	      text.lineNumber++;
409 	    } else if(c == '*') {
410 	      // loop assume c is pointing to a '*' and next is pointing to the next characer
411 	      while(next < limit) {
412 
413 		c = decodeUtf8(next, limit);
414 		text.column++;
415 		if(c == '/') break MULTILINE_COMMENT_LOOP;
416 		if(c == '\n') {
417 		  text.lineNumber++;
418 		  text.column = 0;
419 		} else if(c != '*') {
420 		  break;
421 		}
422 	      }
423 	    }
424 	  }
425 
426 	} else {
427 	  return;
428 	}
429 
430       } else {
431 
432 	return; // Found non-whitespace and non-comment
433 
434       }
435 
436       //
437       // Goto next character
438       //
439       cpos = next;
440       if(next >= limit) return;
441       c = decodeUtf8(next, limit);
442       text.column++;
443     }
444 
445   }
446 
447   //
448   // Read the first character
449   //
450   cpos = next;
451   c = decodeUtf8(next, limit);
452 
453   skipWhitespaceAndComments();
454   if(cpos >= limit) {
455     token.text = null;
456     text.chars = null;
457     return;
458   }
459 
460   const(char)* startOfToken = cpos;
461   token.lineNumber = text.lineNumber;
462   token.column = text.column;
463   toEndOfToken();
464   token.text = startOfToken[0..cpos-startOfToken];
465 
466   text.chars = cpos[0..limit-cpos];
467 
468   return;
469 }
470 +/
471 
472 
// Bit flags stored per character in the charLookup table.
enum : ubyte
{
  controlCharFlag = 0x01, // tested by isControlChar
  whitespaceFlag  = 0x02, // tested by isWhitespace
  tokenStartFlag  = 0x04, // not referenced by the code visible in this file
}
476 
// True if `c` is inside the charLookup table and its entry carries
// controlCharFlag. Code points beyond the table are never control characters.
bool isControlChar(dchar c) {
  if(c >= charLookup.length) {
    return false;
  }
  return (charLookup[c] & controlCharFlag) != 0;
}
// True if `c` is inside the charLookup table and its entry carries
// whitespaceFlag. Code points beyond the table are never whitespace.
bool isWhitespace(dchar c) {
  immutable bool inTable = c < charLookup.length;
  return inTable ? (charLookup[c] & whitespaceFlag) != 0 : false;
}
// Source text of the initializer for charLookup, produced at compile time by
// rangeInitializers (defined outside this file; presumably it expands the
// character/flag-expression pairs into an array initializer - confirm
// against its definition).
private enum charLookupInitializer = rangeInitializers
      (
       /*
         "'_'"    , "sdlIDFlag",

         `'a'`    , "sdlIDFlag",
         `'b'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'c'`    , "sdlIDFlag",
         `'d'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'e'`    , "sdlIDFlag",
         `'f'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'g'-'k'`, "sdlIDFlag",
         `'l'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'m'-'z'`, "sdlIDFlag",

         `'A'`    , "sdlIDFlag",
         `'B'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'C'`    , "sdlIDFlag",
         `'D'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'E'`    , "sdlIDFlag",
         `'F'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'G'-'K'`, "sdlIDFlag",
         `'L'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
         `'M'-'Z'`, "sdlIDFlag",

         `'0'-'9'`, "sdlIDFlag | sdlNumberFlag",
         `'-'`    , "sdlIDFlag",
         `'.'`    , "sdlIDFlag | sdlNumberFlag",
         `'$'`    , "sdlIDFlag",
       */

       // whitespace: ends a token and is skipped between tokens
       `' '`    , "controlCharFlag | whitespaceFlag",
       `'\t'`   , "controlCharFlag | whitespaceFlag",
       `'\n'`   , "controlCharFlag | whitespaceFlag",
       `'\v'`   , "controlCharFlag | whitespaceFlag",
       `'\f'`   , "controlCharFlag | whitespaceFlag",
       `'\r'`   , "controlCharFlag | whitespaceFlag",

       // structural characters
       `'{'`    , "controlCharFlag",
       `'}'`    , "controlCharFlag",

       `'['`    , "controlCharFlag",
       `']'`    , "controlCharFlag",
       //`';'`    , "controlCharFlag",
       //`'\\'`    , "controlCharFlag",

       // comment introducers
       `'/'`    , "controlCharFlag",
       `'#'`    , "controlCharFlag",


       );

// Flag table indexed by code point (0..255); read by isControlChar and
// isWhitespace.
mixin("private __gshared immutable ubyte[256] charLookup = "~charLookupInitializer~";");
531 
version(unittest_fields) unittest
{
  writefln("Running Unit Tests...");

  // Parses `textString` with Text.parseField and checks that exactly the
  // tokens in `expectedTokens` come out, in order (only the token text is
  // compared). `testLine` identifies the calling test case in messages.
  void testParseFields(const(char)[] textString, FieldToken[] expectedTokens = [], size_t testLine = __LINE__)
  {
    auto escapedText = escape(textString);

    debug {
      writefln("[TEST] testing '%s'", escapedText);
    }

    FieldToken token;
    Text text = Text(textString);

    try {

      foreach(i, expectedToken; expectedTokens) {

        text.parseField(token);
        if(token.eof) {
          writefln("Expected %s token(s) but only got %s", expectedTokens.length, i);
          writefln("Error: test on line %s", testLine);
          // Running out of tokens early is a failure in its own right; do
          // not rely on the text comparison below to catch it.
          assert(0);
        }

        if(token.text != expectedToken.text) {
          writefln("Error: expected token '%s' but got '%s'", expectedToken.text, token.text);
          writefln("Error: test on line %s", testLine);
          assert(0);
        }
      }

      // After the expected tokens the parser must report end of input
      text.parseField(token);
      if(!token.eof) {
        writefln("Expected %s token(s) but got at least one more (text='%s')",
                 expectedTokens.length, token.text);
        writefln("Error: test on line %s", testLine);
        assert(0);
      }

    } catch(Exception e) {
      writefln("[TEST] this sdl threw an unexpected Exception: '%s'", escape(text.chars));
      writeln(e);
      writefln("Error: test on line %s", testLine);
      assert(0);
    }
  }

  // Inputs without any field
  testParseFields("");
  testParseFields(" ");
  testParseFields("\n");

  // Inputs that consist only of a comment
  testParseFields("// comment");
  testParseFields("# comment");
  testParseFields("/* comment */");
  testParseFields("/* comment\n next-line \n hey */");
  testParseFields("/* comment\n next-line *\n * ** *** \n hey **/");

  // Plain fields
  testParseFields("first", [FieldToken("first")]);
  testParseFields("first second", [FieldToken("first"), FieldToken("second")]);
  testParseFields("first // trailing comment", [FieldToken("first")]);

  //testParseFields("[", [FieldToken("first")]);

}
597 
598 }