1 module more.esb.parser;
2 
3 import std.array : Appender;
4 import std.format : format;
5 import std.conv : to;
6 import std.algorithm : startsWith;
7 
8 import more.format : utf8WriteEscaped;
9 import more.utf8 : decodeUtf8;
10 
11 /// Identifies the type of an $(D Expression).
12 enum ExpressionType : ubyte
13 {
14     /**
15     A "symbol" expression is a sequence of characters that match the following regex:
16         [a-zA-Z_/\.][a-zA-Z_/\.0-9]*
17     TODO: this regex should probably be configurable by the application.
18     */
19     symbol,
20     /**
21     A string is a sequence of character surrounded by "double-quotes".
22     Might add more types of strings later.  The types of strings could
23     also be configurable by the application.
24     */
25     string_,
26     functionCall,
27 }
28 
29 struct FunctionCall
30 {
31     size_t sourceNameLength;
32     Expression[] args;
33 }
34 
35 struct Expression
36 {
37     union
38     {
39         // Note: the "string_" field may or may not be a slice of "source"
40         // If it is a symbol, then it MUST be equal to source.
41         // TODO: I think I want to change this, DO NOT USE string_ for if it is a symbol.
42         // If it is a string_, then source will be the full quoted source string, and string_
43         // will be the source without the quotes if there are no escapes, otherwise, it will
44         // be a new string outside the source with the escapes handled.
45         string string_;
46         FunctionCall functionCall;
47     }
48     string source;
49     ExpressionType type;
50     private this(string source, ExpressionType type, string string_)
51     {
52         this.source = source;
53         this.type = type;
54         this.string_ = string_;
55     }
56     private this(string source, FunctionCall functionCall)
57     {
58         this.source = source;
59         this.type = ExpressionType.functionCall;
60         this.functionCall = functionCall;
61     }
62 
63     static Expression createSymbol(string source)
64     {
65         return Expression(source, ExpressionType.symbol, source);
66     }
67     static Expression createString(string source, string processedString)
68     {
69         return Expression(source, ExpressionType.string_, processedString);
70     }
71     static Expression createFunctionCall(string source, string name, Expression[] args)
72     {
73         assert(source.ptr == name.ptr);
74         return Expression(source, FunctionCall(name.length, args));
75     }
76 
77     @property string functionName() const
78         in { assert(type == ExpressionType.functionCall); } body
79     {
80         return source[0 .. functionCall.sourceNameLength];
81     }
82     void toString(scope void delegate(const(char)[]) sink) const
83     {
84         sink(source);
85     }
86 }
87 
88 /**
89 A Statement is one of the 3 building blocks for ESB. It represents
90 a list of expressions followed by an optional block of statements.
91 */
92 struct Statement
93 {
94     private Expression[] expressions;
95     Statement[] block;
96     @property auto expressionCount() { return expressions.length; }
97     @property auto expressionAt(size_t index) { return expressions[index];}
98     auto range(size_t expressionOffset)
99         in { assert(expressionOffset <= expressions.length); } body
100     {
101         return StatementRangeReference(&this, expressionOffset);
102     }
103     @property immutable(char)* sourceStart()
104     {
105         if(expressions.length > 0)
106         {
107             return expressions[0].source.ptr;
108         }
109         // TODO: implement this
110         assert(0, "not implemented");
111     }
112     void toString(scope void delegate(const(char)[]) sink) const
113     {
114         string prefix = "";
115         foreach(expression; expressions)
116         {
117             sink(prefix);
118             sink(expression.source);
119             prefix = " ";
120         }
121         if(block is null)
122         {
123             sink(";");
124         }
125         else
126         {
127             sink("{");
128             foreach(property; block)
129             {
130                 property.toString(sink);
131             }
132             sink("}");
133         }
134     }
135 }
136 
137 /**
138 This struct is used to represent a partial statement by skipping over
139 a number of expressions. It can be created from a $(D Statement) by calling
140 statement.range(expressionOffset).
141 */
142 struct StatementRangeReference
143 {
144     Statement* statement;
145     size_t next;
146     @property auto expressionsLeft() { return statement.expressions[next..$]; }
147     @property auto expressionAt(size_t index) { return statement.expressions[next + index];}
148     @property auto block() { return statement.block; }
149 
150     auto range(size_t expressionOffset)
151     {
152         return StatementRangeReference(statement, next + expressionOffset);
153     }
154     @property immutable(char)* sourceStart()
155     {
156         // TODO: maybe use next to offset the start?
157         return statement.sourceStart();
158     }
159 
160     @property bool empty() { return next >= statement.expressions.length; }
161     auto front()
162     {
163         return statement.expressions[next];
164     }
165     void popFront() { next++; }
166     @property final size_t remainingValues()
167     {
168         return statement.expressions.length - next;
169     }
170 }
171 
172 /**
173 The default NodeBuilder implementation.  A NodeBuilder is used to allocate
174 memory for lists of $(D Statement) and $(D Expression). The parser requires
175 a NodeBuilder to be accessible via it's template parameter as the "NodeBuilder" field.
176 */
177 struct DefaultNodeBuilder
178 {
179     static struct BlockBuilder
180     {
181         Appender!(Statement[]) appender;
182         this(size_t depth)
183         {
184         }
185         void newStatement(Expression[] expressions, Statement[] block)
186         {
187             appender.put(Statement(expressions, block));
188         }
189         auto finish()
190         {
191             return appender.data;
192         }
193     }
194     static struct ExpressionBuilder
195     {
196         Expression[10] firstValues;
197         Appender!(Expression[]) extraValues;
198         ubyte state;
199         void append(Expression expression)
200         {
201             if(state < firstValues.length)
202             {
203                 firstValues[state++] = expression;
204             }
205             else
206             {
207                 if(state == firstValues.length)
208                 {
209                     extraValues.put!(Expression[])(firstValues);
210                     state++;
211                 }
212                 extraValues.put(expression);
213             }
214         }
215         auto finish()
216         {
217             if(state == 0)
218             {
219                 return null;
220             }
221             else if(state <= firstValues.length)
222             {
223                 return firstValues[0..state].dup;
224             }
225             else
226             {
227                 return extraValues.data;
228             }
229         }
230     }
231 }
232 
233 /**
234 The default set of hooks for the ESB Parser
235 */
236 struct DefaultEsbParserHooks
237 {
238     alias NodeBuilder = DefaultNodeBuilder;
239     enum StatementDelimiter = ';';
240 }
241 
242 
243 class ParseException : Exception
244 {
245     this(string msg, string filename, uint lineNumber)
246     {
247         super(msg, filename, lineNumber);
248     }
249 }
250 
251 static struct PeekedChar
252 {
253     dchar nextChar;
254     const(char)* nextNextPtr;
255 }
256 private bool validNameFirstChar(dchar c)
257 {
258     return
259         (c >= 'a' && c <= 'z') ||
260         (c >= 'A' && c <= 'Z') ||
261         (c == '_') ||
262         (c == '.') ||
263         (c == '/');
264 }
265 private bool validNameChar(dchar c)
266 {
267     return
268         validNameFirstChar(c) ||
269         (c >= '0' && c <= '9');
270 }
271 
272 /**
273 NOTE: text must be null-terminated
274 */
275 auto parse(Hooks = DefaultEsbParserHooks)(string text, string filenameForErrors = null, uint lineNumber = 1)
276 {
277     auto parser = Parser!Hooks(text.ptr, filenameForErrors, lineNumber);
278     return parser.parse();
279 }
280 struct Parser(Hooks)
281 {
282     immutable(char)* nextPtr;
283     dchar current;
284     immutable(char)* currentPtr;
285     string filenameForErrors;
286     uint lineNumber;
287     
288     this(immutable(char)* nextPtr, string filenameForErrors = null, uint lineNumber = 1)
289     {
290         this.nextPtr = nextPtr;
291         this.filenameForErrors = filenameForErrors;
292         this.lineNumber = lineNumber;
293     }
294 
295     auto parseException(T...)(string fmt, T args)
296     {
297         return new ParseException(format(fmt, args), filenameForErrors, lineNumber);
298     }
299 
300     Statement[] parse()
301     {
302         // read the first character
303         consumeChar();
304         return parseBlock(0);
305     }
306     private Statement[] parseBlock(size_t depth)
307     {
308         auto blockBuilder = Hooks.NodeBuilder.BlockBuilder(depth);
309 
310         for(;;)
311         {
312             // parse optional expressions
313             Expression[] expressions;
314             {
315                 auto expressionBuilder = Hooks.NodeBuilder.ExpressionBuilder();
316                 for(;;)
317                 {
318                     auto expression = tryPeelExpression();
319                     if(expression.source is null)
320                     {
321                         break;
322                     }
323                     expressionBuilder.append(expression);
324                 }
325                 expressions = expressionBuilder.finish();
326             }
327 
328             if(expressions.length == 0)
329             {
330                 if(current == '\0')
331                 {
332                     if(depth == 0)
333                     {
334                         break;
335                     }
336                     throw parseException("not enough closing curly-braces");
337                 }
338                 if(current == '}')
339                 {
340                     if(depth == 0)
341                     {
342                         throw parseException("too many closing curly-braces");
343                     }
344                     consumeChar();
345                     break;
346                 }
347                 if(current == '{')
348                 {
349                     consumeChar();
350                     blockBuilder.newStatement(null, parseBlock(depth + 1));
351                 }
352                 else
353                 {
354                     throw parseException("expected an expression, or a '{ block }' but got %s", formatToken(currentPtr));
355                 }
356             }
357             else
358             {
359                 if(current == Hooks.StatementDelimiter)
360                 {
361                     consumeChar();
362                     blockBuilder.newStatement(expressions, null);
363                 }
364                 else if(current == '{')
365                 {
366                     consumeChar();
367                     blockBuilder.newStatement(expressions, parseBlock(depth + 1));
368                 }
369                 else
370                 {
371                     throw parseException("expected statement to end with '" ~
372                         Hooks.StatementDelimiter ~ "' or '{ block }' but got %s", formatToken(currentPtr));
373                 }
374             }
375         }
376         return blockBuilder.finish();
377     }
378     pragma(inline) void consumeChar()
379     {
380         currentPtr = nextPtr;
381         current = decodeUtf8(&nextPtr);
382     }
383     // NOTE: only call if you know you are not currently pointing to the
384     //       terminating NULL character
385     const PeekedChar peek() in { assert(current != '\0'); } body
386     {
387         PeekedChar peeked;
388         peeked.nextNextPtr = nextPtr;
389         peeked.nextChar = decodeUtf8(&peeked.nextNextPtr);
390         return peeked;
391     }
392 
393     void skipToNextLine()
394     {
395         for(;;)
396         {
397             auto c = decodeUtf8(&nextPtr);
398             if(c == '\n')
399             {
400                 lineNumber++;
401                 return;
402             }
403             if(c == '\0')
404             {
405                 currentPtr = nextPtr;
406                 current = '\0';
407                 return;
408             }
409         }
410     }
411     void skipWhitespaceAndComments()
412     {
413         for(;;)
414         {
415             if(current == ' ' || current == '\t' || current == '\r')
416             {
417                 //do nothing
418             }
419             else if(current == '\n')
420             {
421                 lineNumber++;
422             }
423             else if(current == '/')
424             {
425                 auto next = peek(); // Note: we know current != '\0'
426                 if(next.nextChar == '/')
427                 {
428                     skipToNextLine();
429                     if(current == '\0')
430                     {
431                         return;
432                     }
433                 }
434                 else if(next.nextChar == '*')
435                 {
436                     assert(0, "multiline comments not implemented");
437                 }
438                 else
439                 {
440                     return; // not a whitespace or comment
441                 }
442             }
443             else
444             {
445                 return; // not a whitespace or comment
446             }
447             consumeChar();
448         }
449     }
450     auto tryPeelName()
451     {
452         skipWhitespaceAndComments();
453         if(!validNameFirstChar(current))
454         {
455             return null;
456         }
457         auto nameStart = currentPtr;
458         for(;;)
459         {
460             consumeChar();
461             if(!validNameChar(current))
462             {
463                 return nameStart[0..currentPtr-nameStart];
464             }
465         }
466     }
467 
468 
469     Expression tryPeelExpression()
470     {
471         for(;;)
472         {
473             Expression part = tryPeelExpressionLevel0();
474             if(part.source is null)
475             {
476                 return part;
477             }
478             skipWhitespaceAndComments();
479             // TODO: check for operations such as '+' etc
480             return part;
481         }
482     }
483     // Level 0 expressions are the expressions with the highest operator precedence.
484     // 1) symbol
485     // 2) function call (symbol '(' args.. ')')
486     // 3) string
487     Expression tryPeelExpressionLevel0()
488     {
489         skipWhitespaceAndComments();
490         if(validNameFirstChar(current))
491         {
492             auto nameStart = currentPtr;
493             for(;;)
494             {
495                 consumeChar();
496                 if(!validNameChar(current))
497                 {
498                     auto name = nameStart[0..currentPtr-nameStart];
499                     skipWhitespaceAndComments();
500                     if(current == '(')
501                     {
502                         return peelFunctionCall(name);
503                     }
504                     return Expression.createSymbol(name);
505                 }
506             }
507         }
508         if(current == '"')
509         {
510             return peelString();
511         }
512         return Expression(); // no level0 expression was found
513     }
514     // Assumption: current is at the opening quote
515     Expression peelString()
516     {
517         auto start = currentPtr;
518         immutable(char)* firstEscape = null;
519         for(;;)
520         {
521             consumeChar();
522             if(current == '"')
523             {
524                 break;
525             }
526             if(current == '\\')
527             {
528                 if(!firstEscape)
529                 {
530                     firstEscape = currentPtr;
531                 }
532                 assert(0, "escapes not implemented");
533             }
534             else if(current == '\n')
535             {
536                 // TODO: maybe provide a way to allow this
537                 throw parseException("double-quoted strings cannot contain newlines");
538             }
539             else if(current == '\0')
540             {
541                 throw parseException("file ended inside double-quoted string");
542             }
543         }
544         if(!firstEscape)
545         {
546             consumeChar();
547             auto source = start[0 .. currentPtr - start];
548             auto str = source[1..$-1];
549             return Expression.createString(source, str);
550         }
551         assert(0, "escapes not implemented");
552     }
553     // Assumption: current points to opening paren
554     Expression peelFunctionCall(string name)
555     {
556         auto sourceStart = name.ptr;
557         auto expressionBuilder = Hooks.NodeBuilder.ExpressionBuilder();
558 
559         consumeChar();
560         skipWhitespaceAndComments();
561         if(current != ')')
562         {
563             for(;;)
564             {
565                 auto expression = tryPeelExpression();
566                 if(expression.source is null)
567                 {
568                     throw parseException("expected function call to end with ')' but got '%s'", formatToken(currentPtr));
569                 }
570                 expressionBuilder.append(expression);
571                 skipWhitespaceAndComments();
572                 if(current == ')')
573                 {
574                     break;
575                 }
576                 if(current != ',')
577                 {
578                     throw parseException("expected comma ',' after function argument but got '%s'", formatToken(currentPtr));
579                 }
580                 consumeChar();
581             }
582         }
583         consumeChar();
584         auto source = sourceStart[0 .. currentPtr - sourceStart];
585         return Expression.createFunctionCall(source, name, expressionBuilder.finish());
586     }
587 }
588 auto guessEndOfToken(const(char)* ptr)
589 {
590     // TODO: should use the first char to determine the kind of token and
591     //       then find the end using that information
592     for(;;)
593     {
594         auto c = *ptr;
595         if(c == '\0' || c == ' ' || c == '\t' || c == '\r' || c == '\n')
596         {
597             return ptr;
598         }
599         decodeUtf8(&ptr);
600     }
601 }
602 auto formatToken(const(char)* token)
603 {
604     static struct Formatter
605     {
606         const(char)* token;
607         void toString(scope void delegate(const(char)[]) sink) const
608         {
609             if(*token == '\0')
610             {
611                 sink("EOF");
612             }
613             else
614             {
615                 sink("\"");
616                 sink.utf8WriteEscaped(token, guessEndOfToken(token));
617                 sink("\"");
618             }
619         }
620     }
621     return Formatter(token);
622 }
623 
624 version(unittest)
625 {
626     static void assertEqual(Statement expected, Statement actual)
627     {
628         assert(expected.expressions.length == actual.expressions.length);
629     }
630     static void test(string text, Statement[] expected)
631     {
632         auto actual = parse(text);
633         assert(actual.length == expected.length);
634         foreach(statementIndex; 0..actual.length)
635         {
636             assertEqual(expected[statementIndex], actual[statementIndex]);
637         }
638     }
639     static auto block(Statement[] statements...)
640     {
641         return statements.dup;
642     }
643     static auto statement(Expression[] expressions, Statement[] block = null)
644     {
645         return Statement(expressions, block);
646     }
647     static auto symbol(string name)
648     {
649         return Expression.createSymbol(name);
650     }
651 }
652 
653 unittest
654 {
655     test("", null);
656     test("a;", block(statement([symbol("a")])));
657     test("a{}", block(statement([symbol("a")])));
658 
659     test(`abc
660 {
661     def;
662     hij;
663 }`, block(statement([symbol("abc")], block(
664     Statement([symbol("def")]),
665     Statement([symbol("hij")])
666 ))));
667 }