1 module more.esb.parser; 2 3 import std.array : Appender; 4 import std.format : format; 5 import std.conv : to; 6 import std.algorithm : startsWith; 7 8 import more.format : utf8WriteEscaped; 9 import more.utf8 : decodeUtf8; 10 11 /// Identifies the type of an $(D Expression). 12 enum ExpressionType : ubyte 13 { 14 /** 15 A "symbol" expression is a sequence of characters that match the following regex: 16 [a-zA-Z_/\.][a-zA-Z_/\.0-9]* 17 TODO: this regex should probably be configurable by the application. 18 */ 19 symbol, 20 /** 21 A string is a sequence of character surrounded by "double-quotes". 22 Might add more types of strings later. The types of strings could 23 also be configurable by the application. 24 */ 25 string_, 26 functionCall, 27 } 28 29 struct FunctionCall 30 { 31 size_t sourceNameLength; 32 Expression[] args; 33 } 34 35 struct Expression 36 { 37 union 38 { 39 // Note: the "string_" field may or may not be a slice of "source" 40 // If it is a symbol, then it MUST be equal to source. 41 // TODO: I think I want to change this, DO NOT USE string_ for if it is a symbol. 42 // If it is a string_, then source will be the full quoted source string, and string_ 43 // will be the source without the quotes if there are no escapes, otherwise, it will 44 // be a new string outside the source with the escapes handled. 45 string string_; 46 FunctionCall functionCall; 47 } 48 string source; 49 ExpressionType type; 50 private this(string source, ExpressionType type, string string_) 51 { 52 this.source = source; 53 this.type = type; 54 this.string_ = string_; 55 } 56 private this(string source, FunctionCall functionCall) 57 { 58 this.source = source; 59 this.type = ExpressionType.functionCall; 60 this.functionCall = functionCall; 61 } 62 63 static Expression createSymbol(string source) 64 { 65 return Expression(source, ExpressionType.symbol, source); 66 } 67 static Expression createString(string source, string processedString) 68 { 69 return Expression(source, ExpressionType.string_, processedString); 70 } 71 static Expression createFunctionCall(string source, string name, Expression[] args) 72 { 73 assert(source.ptr == name.ptr); 74 return Expression(source, FunctionCall(name.length, args)); 75 } 76 77 @property string functionName() const 78 in { assert(type == ExpressionType.functionCall); } body 79 { 80 return source[0 .. functionCall.sourceNameLength]; 81 } 82 void toString(scope void delegate(const(char)[]) sink) const 83 { 84 sink(source); 85 } 86 } 87 88 /** 89 A Statement is one of the 3 building blocks for ESB. It represents 90 a list of expressions followed by an optional block of statements. 91 */ 92 struct Statement 93 { 94 private Expression[] expressions; 95 Statement[] block; 96 @property auto expressionCount() { return expressions.length; } 97 @property auto expressionAt(size_t index) { return expressions[index];} 98 auto range(size_t expressionOffset) 99 in { assert(expressionOffset <= expressions.length); } body 100 { 101 return StatementRangeReference(&this, expressionOffset); 102 } 103 @property immutable(char)* sourceStart() 104 { 105 if(expressions.length > 0) 106 { 107 return expressions[0].source.ptr; 108 } 109 // TODO: implement this 110 assert(0, "not implemented"); 111 } 112 void toString(scope void delegate(const(char)[]) sink) const 113 { 114 string prefix = ""; 115 foreach(expression; expressions) 116 { 117 sink(prefix); 118 sink(expression.source); 119 prefix = " "; 120 } 121 if(block is null) 122 { 123 sink(";"); 124 } 125 else 126 { 127 sink("{"); 128 foreach(property; block) 129 { 130 property.toString(sink); 131 } 132 sink("}"); 133 } 134 } 135 } 136 137 /** 138 This struct is used to represent a partial statement by skipping over 139 a number of expressions. It can be created from a $(D Statement) by calling 140 statement.range(expressionOffset). 141 */ 142 struct StatementRangeReference 143 { 144 Statement* statement; 145 size_t next; 146 @property auto expressionsLeft() { return statement.expressions[next..$]; } 147 @property auto expressionAt(size_t index) { return statement.expressions[next + index];} 148 @property auto block() { return statement.block; } 149 150 auto range(size_t expressionOffset) 151 { 152 return StatementRangeReference(statement, next + expressionOffset); 153 } 154 @property immutable(char)* sourceStart() 155 { 156 // TODO: maybe use next to offset the start? 157 return statement.sourceStart(); 158 } 159 160 @property bool empty() { return next >= statement.expressions.length; } 161 auto front() 162 { 163 return statement.expressions[next]; 164 } 165 void popFront() { next++; } 166 @property final size_t remainingValues() 167 { 168 return statement.expressions.length - next; 169 } 170 } 171 172 /** 173 The default NodeBuilder implementation. A NodeBuilder is used to allocate 174 memory for lists of $(D Statement) and $(D Expression). The parser requires 175 a NodeBuilder to be accessible via it's template parameter as the "NodeBuilder" field. 176 */ 177 struct DefaultNodeBuilder 178 { 179 static struct BlockBuilder 180 { 181 Appender!(Statement[]) appender; 182 this(size_t depth) 183 { 184 } 185 void newStatement(Expression[] expressions, Statement[] block) 186 { 187 appender.put(Statement(expressions, block)); 188 } 189 auto finish() 190 { 191 return appender.data; 192 } 193 } 194 static struct ExpressionBuilder 195 { 196 Expression[10] firstValues; 197 Appender!(Expression[]) extraValues; 198 ubyte state; 199 void append(Expression expression) 200 { 201 if(state < firstValues.length) 202 { 203 firstValues[state++] = expression; 204 } 205 else 206 { 207 if(state == firstValues.length) 208 { 209 extraValues.put!(Expression[])(firstValues); 210 state++; 211 } 212 extraValues.put(expression); 213 } 214 } 215 auto finish() 216 { 217 if(state == 0) 218 { 219 return null; 220 } 221 else if(state <= firstValues.length) 222 { 223 return firstValues[0..state].dup; 224 } 225 else 226 { 227 return extraValues.data; 228 } 229 } 230 } 231 } 232 233 /** 234 The default set of hooks for the ESB Parser 235 */ 236 struct DefaultEsbParserHooks 237 { 238 alias NodeBuilder = DefaultNodeBuilder; 239 enum StatementDelimiter = ';'; 240 } 241 242 243 class ParseException : Exception 244 { 245 this(string msg, string filename, uint lineNumber) 246 { 247 super(msg, filename, lineNumber); 248 } 249 } 250 251 static struct PeekedChar 252 { 253 dchar nextChar; 254 const(char)* nextNextPtr; 255 } 256 private bool validNameFirstChar(dchar c) 257 { 258 return 259 (c >= 'a' && c <= 'z') || 260 (c >= 'A' && c <= 'Z') || 261 (c == '_') || 262 (c == '.') || 263 (c == '/'); 264 } 265 private bool validNameChar(dchar c) 266 { 267 return 268 validNameFirstChar(c) || 269 (c >= '0' && c <= '9'); 270 } 271 272 /** 273 NOTE: text must be null-terminated 274 */ 275 auto parse(Hooks = DefaultEsbParserHooks)(string text, string filenameForErrors = null, uint lineNumber = 1) 276 { 277 auto parser = Parser!Hooks(text.ptr, filenameForErrors, lineNumber); 278 return parser.parse(); 279 } 280 struct Parser(Hooks) 281 { 282 immutable(char)* nextPtr; 283 dchar current; 284 immutable(char)* currentPtr; 285 string filenameForErrors; 286 uint lineNumber; 287 288 this(immutable(char)* nextPtr, string filenameForErrors = null, uint lineNumber = 1) 289 { 290 this.nextPtr = nextPtr; 291 this.filenameForErrors = filenameForErrors; 292 this.lineNumber = lineNumber; 293 } 294 295 auto parseException(T...)(string fmt, T args) 296 { 297 return new ParseException(format(fmt, args), filenameForErrors, lineNumber); 298 } 299 300 Statement[] parse() 301 { 302 // read the first character 303 consumeChar(); 304 return parseBlock(0); 305 } 306 private Statement[] parseBlock(size_t depth) 307 { 308 auto blockBuilder = Hooks.NodeBuilder.BlockBuilder(depth); 309 310 for(;;) 311 { 312 // parse optional expressions 313 Expression[] expressions; 314 { 315 auto expressionBuilder = Hooks.NodeBuilder.ExpressionBuilder(); 316 for(;;) 317 { 318 auto expression = tryPeelExpression(); 319 if(expression.source is null) 320 { 321 break; 322 } 323 expressionBuilder.append(expression); 324 } 325 expressions = expressionBuilder.finish(); 326 } 327 328 if(expressions.length == 0) 329 { 330 if(current == '\0') 331 { 332 if(depth == 0) 333 { 334 break; 335 } 336 throw parseException("not enough closing curly-braces"); 337 } 338 if(current == '}') 339 { 340 if(depth == 0) 341 { 342 throw parseException("too many closing curly-braces"); 343 } 344 consumeChar(); 345 break; 346 } 347 if(current == '{') 348 { 349 consumeChar(); 350 blockBuilder.newStatement(null, parseBlock(depth + 1)); 351 } 352 else 353 { 354 throw parseException("expected an expression, or a '{ block }' but got %s", formatToken(currentPtr)); 355 } 356 } 357 else 358 { 359 if(current == Hooks.StatementDelimiter) 360 { 361 consumeChar(); 362 blockBuilder.newStatement(expressions, null); 363 } 364 else if(current == '{') 365 { 366 consumeChar(); 367 blockBuilder.newStatement(expressions, parseBlock(depth + 1)); 368 } 369 else 370 { 371 throw parseException("expected statement to end with '" ~ 372 Hooks.StatementDelimiter ~ "' or '{ block }' but got %s", formatToken(currentPtr)); 373 } 374 } 375 } 376 return blockBuilder.finish(); 377 } 378 pragma(inline) void consumeChar() 379 { 380 currentPtr = nextPtr; 381 current = decodeUtf8(&nextPtr); 382 } 383 // NOTE: only call if you know you are not currently pointing to the 384 // terminating NULL character 385 const PeekedChar peek() in { assert(current != '\0'); } body 386 { 387 PeekedChar peeked; 388 peeked.nextNextPtr = nextPtr; 389 peeked.nextChar = decodeUtf8(&peeked.nextNextPtr); 390 return peeked; 391 } 392 393 void skipToNextLine() 394 { 395 for(;;) 396 { 397 auto c = decodeUtf8(&nextPtr); 398 if(c == '\n') 399 { 400 lineNumber++; 401 return; 402 } 403 if(c == '\0') 404 { 405 currentPtr = nextPtr; 406 current = '\0'; 407 return; 408 } 409 } 410 } 411 void skipWhitespaceAndComments() 412 { 413 for(;;) 414 { 415 if(current == ' ' || current == '\t' || current == '\r') 416 { 417 //do nothing 418 } 419 else if(current == '\n') 420 { 421 lineNumber++; 422 } 423 else if(current == '/') 424 { 425 auto next = peek(); // Note: we know current != '\0' 426 if(next.nextChar == '/') 427 { 428 skipToNextLine(); 429 if(current == '\0') 430 { 431 return; 432 } 433 } 434 else if(next.nextChar == '*') 435 { 436 assert(0, "multiline comments not implemented"); 437 } 438 else 439 { 440 return; // not a whitespace or comment 441 } 442 } 443 else 444 { 445 return; // not a whitespace or comment 446 } 447 consumeChar(); 448 } 449 } 450 auto tryPeelName() 451 { 452 skipWhitespaceAndComments(); 453 if(!validNameFirstChar(current)) 454 { 455 return null; 456 } 457 auto nameStart = currentPtr; 458 for(;;) 459 { 460 consumeChar(); 461 if(!validNameChar(current)) 462 { 463 return nameStart[0..currentPtr-nameStart]; 464 } 465 } 466 } 467 468 469 Expression tryPeelExpression() 470 { 471 for(;;) 472 { 473 Expression part = tryPeelExpressionLevel0(); 474 if(part.source is null) 475 { 476 return part; 477 } 478 skipWhitespaceAndComments(); 479 // TODO: check for operations such as '+' etc 480 return part; 481 } 482 } 483 // Level 0 expressions are the expressions with the highest operator precedence. 484 // 1) symbol 485 // 2) function call (symbol '(' args.. ')') 486 // 3) string 487 Expression tryPeelExpressionLevel0() 488 { 489 skipWhitespaceAndComments(); 490 if(validNameFirstChar(current)) 491 { 492 auto nameStart = currentPtr; 493 for(;;) 494 { 495 consumeChar(); 496 if(!validNameChar(current)) 497 { 498 auto name = nameStart[0..currentPtr-nameStart]; 499 skipWhitespaceAndComments(); 500 if(current == '(') 501 { 502 return peelFunctionCall(name); 503 } 504 return Expression.createSymbol(name); 505 } 506 } 507 } 508 if(current == '"') 509 { 510 return peelString(); 511 } 512 return Expression(); // no level0 expression was found 513 } 514 // Assumption: current is at the opening quote 515 Expression peelString() 516 { 517 auto start = currentPtr; 518 immutable(char)* firstEscape = null; 519 for(;;) 520 { 521 consumeChar(); 522 if(current == '"') 523 { 524 break; 525 } 526 if(current == '\\') 527 { 528 if(!firstEscape) 529 { 530 firstEscape = currentPtr; 531 } 532 assert(0, "escapes not implemented"); 533 } 534 else if(current == '\n') 535 { 536 // TODO: maybe provide a way to allow this 537 throw parseException("double-quoted strings cannot contain newlines"); 538 } 539 else if(current == '\0') 540 { 541 throw parseException("file ended inside double-quoted string"); 542 } 543 } 544 if(!firstEscape) 545 { 546 consumeChar(); 547 auto source = start[0 .. currentPtr - start]; 548 auto str = source[1..$-1]; 549 return Expression.createString(source, str); 550 } 551 assert(0, "escapes not implemented"); 552 } 553 // Assumption: current points to opening paren 554 Expression peelFunctionCall(string name) 555 { 556 auto sourceStart = name.ptr; 557 auto expressionBuilder = Hooks.NodeBuilder.ExpressionBuilder(); 558 559 consumeChar(); 560 skipWhitespaceAndComments(); 561 if(current != ')') 562 { 563 for(;;) 564 { 565 auto expression = tryPeelExpression(); 566 if(expression.source is null) 567 { 568 throw parseException("expected function call to end with ')' but got '%s'", formatToken(currentPtr)); 569 } 570 expressionBuilder.append(expression); 571 skipWhitespaceAndComments(); 572 if(current == ')') 573 { 574 break; 575 } 576 if(current != ',') 577 { 578 throw parseException("expected comma ',' after function argument but got '%s'", formatToken(currentPtr)); 579 } 580 consumeChar(); 581 } 582 } 583 consumeChar(); 584 auto source = sourceStart[0 .. currentPtr - sourceStart]; 585 return Expression.createFunctionCall(source, name, expressionBuilder.finish()); 586 } 587 } 588 auto guessEndOfToken(const(char)* ptr) 589 { 590 // TODO: should use the first char to determine the kind of token and 591 // then find the end using that information 592 for(;;) 593 { 594 auto c = *ptr; 595 if(c == '\0' || c == ' ' || c == '\t' || c == '\r' || c == '\n') 596 { 597 return ptr; 598 } 599 decodeUtf8(&ptr); 600 } 601 } 602 auto formatToken(const(char)* token) 603 { 604 static struct Formatter 605 { 606 const(char)* token; 607 void toString(scope void delegate(const(char)[]) sink) const 608 { 609 if(*token == '\0') 610 { 611 sink("EOF"); 612 } 613 else 614 { 615 sink("\""); 616 sink.utf8WriteEscaped(token, guessEndOfToken(token)); 617 sink("\""); 618 } 619 } 620 } 621 return Formatter(token); 622 } 623 624 version(unittest) 625 { 626 static void assertEqual(Statement expected, Statement actual) 627 { 628 assert(expected.expressions.length == actual.expressions.length); 629 } 630 static void test(string text, Statement[] expected) 631 { 632 auto actual = parse(text); 633 assert(actual.length == expected.length); 634 foreach(statementIndex; 0..actual.length) 635 { 636 assertEqual(expected[statementIndex], actual[statementIndex]); 637 } 638 } 639 static auto block(Statement[] statements...) 640 { 641 return statements.dup; 642 } 643 static auto statement(Expression[] expressions, Statement[] block = null) 644 { 645 return Statement(expressions, block); 646 } 647 static auto symbol(string name) 648 { 649 return Expression.createSymbol(name); 650 } 651 } 652 653 unittest 654 { 655 test("", null); 656 test("a;", block(statement([symbol("a")]))); 657 test("a{}", block(statement([symbol("a")]))); 658 659 test(`abc 660 { 661 def; 662 hij; 663 }`, block(statement([symbol("abc")], block( 664 Statement([symbol("def")]), 665 Statement([symbol("hij")]) 666 )))); 667 }