module more.fields;

import std.string;

import more.utf8;
import more.common;

version(unittest)
{
  import std.stdio;
}

/**
 * Thrown when the text being parsed is malformed (for example when a field
 * is expected but a control character is found instead).
 */
class TextParseException : Exception
{
  this(string msg)
  {
    super(msg);
  }
}

/**
 * A cursor over UTF-8 text that splits it into whitespace/comment separated
 * fields.
 *
 * Cursor state:
 *   c    - the current (already decoded) character
 *   cpos - start of the current character inside chars
 *   next - start of the character after the current one
 *   limit- one past the last byte of chars
 *
 * When cpos >= limit the cursor is at the end of the text and the value of
 * c is stale and must not be interpreted.
 *
 * The code relies on decodeUtf8 (from more.utf8) advancing the pointer that
 * is passed to it past the character it decodes.
 */
struct Text
{
  uint lineNumber;
  uint column;

  const(char)[] chars;
  const(char)* limit;

  const(char)* cpos;
  const(char)* next;

  dchar c;

  /// Creates a cursor positioned at the first character of chars.
  this(const(char)[] chars) {
    setup(chars);
  }

  /**
   * (Re)initializes the cursor at line 1, column 1 of chars and decodes the
   * first character if there is one.
   */
  void setup(const(char)[] chars) {
    this.lineNumber = 1;
    this.column = 1;

    this.chars = chars;
    this.limit = chars.ptr + chars.length;
    this.cpos = chars.ptr;
    this.next = chars.ptr;

    if(chars.length > 0) {
      c = decodeUtf8(&this.next, this.limit);
    }
  }

  /// True when the cursor has moved past the last character.
  @property bool empty() {
    return cpos >= limit;
  }

  /**
   * Moves the cursor to the next character. At the end of the text only
   * cpos is updated and c keeps its previous value. Does not touch
   * lineNumber/column.
   */
  void skipChar() {
    cpos = next;
    if(next < limit) {
      c = decodeUtf8(&next, limit);
    }
  }

  /**
   * Consumes the rest of the current line including its '\n'.
   * On return c/cpos point to the first character of the following line
   * (column 1), or cpos is at limit if the text ended first.
   */
  void toNextLine()
  {
    while(true) {
      if(next >= limit) break;
      c = decodeUtf8(&next, limit);
      if(c == '\n') {
        lineNumber++;
        column = 1;
        break;
      }
    }
    skipChar();
  }

  /**
   * Consumes the rest of the current line but stops ON the '\n'.
   * On return c/cpos point to the newline, or cpos is at limit if the text
   * ended first.
   */
  void toNewline()
  {
    while(true) {
      cpos = next;
      if(next >= limit) break;
      c = decodeUtf8(&next, limit);
      column++;
      if(c == '\n') {
        break;
      }
    }
  }

  /**
   * Expects c/cpos to point to the first character of a token.
   * On return c/cpos point to the first character after the token (a
   * control character or the start of a comment), or cpos is at limit.
   */
  void toEndOfToken()
  {
    if(c == '"') {
      implement("quoted tokens");
    } else {

      while(true) {
        cpos = next;
        if(next >= limit) break;
        c = decodeUtf8(&next, limit);
        column++;
        if(isControlChar(c)) {

          // Handle slashes that aren't comments
          if(c != '/') break;
          if(next >= limit) {
            // A trailing '/' at the end of the text belongs to the token
            cpos = next;
            break;
          }

          // Peek at the character after the slash without moving the
          // cursor. The peeked value is kept in a local so that c still
          // matches cpos (the '/') when the token ends here; otherwise the
          // following skipWhitespaceAndComments call would not recognize
          // the comment start.
          auto lookahead = next;
          immutable dchar following = decodeUtf8(&lookahead, limit);

          if(following == '*' || following == '/') {
            break;
          }

        }

      }
    }
  }

  /**
   * Skips whitespace, '#' comments, '//' comments and '/ * ... * /' style
   * multi-line comments starting at the current character.
   *
   * Params:
   *   skipNewlines = when true newlines are treated as whitespace. When
   *                  false the cursor stops on the first newline (c/cpos
   *                  point at the '\n') if no field is found before it.
   *
   * On return c/cpos point to the first character that is neither
   * whitespace nor part of a comment, or cpos is at limit.
   */
  void skipWhitespaceAndComments(bool skipNewlines)
  {
    while(true) {

      // At the end of the text c is stale; looking at it again would, for
      // example, count a trailing newline once per call.
      if(cpos >= limit) return;

      // TODO: maybe use a lookup table here
      if(c == ' ' || c == '\t' || c =='\v' || c == '\f' || c == '\r') {

        // do nothing (check first as this is the most likely case)

      } else if(c == '\n') {

        if(!skipNewlines) return;

        lineNumber++;
        column = 0; // the advance at the bottom of the loop makes it 1

      } else if(c == '#') {

        if(!skipNewlines) {
          toNewline();
          return;
        }

        // toNextLine already leaves c/cpos on the first character of the
        // next line, so it must not be advanced again.
        toNextLine();
        continue;

      } else if(c == '/') {

        if(next >= limit) return;

        c = decodeUtf8(&next, limit);

        if(c == '/') {

          if(!skipNewlines) {
            toNewline();
            return;
          }

          // See the '#' case: the cursor is already on the next line.
          toNextLine();
          continue;

        } else if(c == '*') {

          if(!skipNewlines) {
            implement("multiline comments when not skipping newlines");
          }

          column++;

          MULTILINE_COMMENT_LOOP:
          while(next < limit) {

            c = decodeUtf8(&next, limit); // no need to save cpos since c will be thrown away
            column++;

            if(c == '\n') {
              lineNumber++;
              column = 0;
            } else if(c == '*') {
              // loop assumes c is pointing to a '*' and next is pointing to the next character
              while(next < limit) {

                c = decodeUtf8(&next, limit);
                column++;
                if(c == '/') break MULTILINE_COMMENT_LOOP;
                if(c == '\n') {
                  lineNumber++;
                  column = 0;
                  // c is no longer a '*', so a following '/' must not
                  // close the comment
                  break;
                }
                if(c != '*') {
                  break;
                }
              }
            }
          }

        } else {
          // NOTE(review): a '/' that does not start a comment. At this
          // point c already holds the character AFTER the slash while cpos
          // still points at the slash. Callers currently depend on this
          // state; confirm the intended behavior before changing it.
          return;
        }

      } else {

        return; // Found non-whitespace and non-comment

      }

      //
      // Goto next character
      //
      cpos = next;
      if(next >= limit) return;
      c = decodeUtf8(&next, limit);
      column++;
    }

  }

  /**
   * Parses the next field into token.
   *
   * Params:
   *   token    = receives the field text and the line/column it starts at.
   *              token.text is null when there is no field.
   *   sameLine = when true only the rest of the current line is searched.
   *
   * Throws: TextParseException if a control character is found where a
   *         field was expected.
   */
  void parseField(ref FieldToken token, bool sameLine = false)
  {
    skipWhitespaceAndComments(!sameLine);
    if(cpos >= limit || (sameLine && c == '\n')) {
      token.text = null;
      token.lineNumber = lineNumber;
      token.column = column;
    } else {

      if(isControlChar(c)) {
        throw new TextParseException(format("Expected non-control character but got '%s' (charcode=%s)",
                                            c, cast(uint)c));
      }

      const(char)* startOfToken = cpos;
      token.lineNumber = lineNumber;
      token.column = column;
      toEndOfToken();
      token.text = startOfToken[0..cpos-startOfToken];
    }
  }

  /**
   * Skips whitespace and comments on the current line and returns true if
   * the end of the line or the end of the text was reached.
   */
  bool noMoreFieldsOnThisLine()
  {
    skipWhitespaceAndComments(false);
    return cpos >= limit || c == '\n';
  }

  /**
   * Like parseField (newlines are skipped) but never throws because of a
   * control character: token.text is set to null instead and the cursor is
   * left on that character.
   */
  void parseString(ref FieldToken token)
  {
    skipWhitespaceAndComments(true);
    token.lineNumber = lineNumber;
    token.column = column;
    if(cpos >= limit || isControlChar(c)) {
      token.text = null;
    } else {
      const(char)* startOfToken = cpos;
      toEndOfToken();
      token.text = startOfToken[0..cpos-startOfToken];
    }
  }


  alias parseString parseObjectFieldName;

  // An object starts with an open curly brace '{' or omits its curly
  // brace section with a semi-colon ';'
  // A 'NamelessObjectField' is a field before the curly-brace section
  void parseNamelessObjectField(ref FieldToken token)
  {
    // Parsed exactly like any other string field
    parseString(token);
  }

  /**
   * Skips whitespace/comments and, if the next character is '{', consumes
   * it and returns true. Otherwise returns false and leaves the cursor on
   * the character that was found.
   */
  bool atObjectStart()
  {
    skipWhitespaceAndComments(true);
    if(cpos >= limit || c != '{') return false;

    skipChar();
    return true;
  }
}


/**
 * A field produced by Text. text is a slice of the parsed text (null when
 * no field was found); lineNumber/column give the position of the field.
 */
struct FieldToken
{
  const(char)[] text;
  uint lineNumber;
  uint column;

  /// True when no field was found.
  bool eof()
  {
    return text is null;
  }
}

// Earlier free-function implementation, disabled. The unittest below calls
// Text.parseField instead. Kept unmodified for reference only.
/+
/**
 * Used to parse the fields in <i>line</i> to the <i>fields</i> sink.
 * line is a single line without the line ending character.
 * returns error message on error
 */
void parseField(ref FieldToken token, ref Text text)
{
  //writefln("[DEBUG] parseField(..., '%s')", escape(text.chars));

  const(char)* next = text.chars.ptr;
  const char* limit = next + text.chars.length;
  const(char)* cpos;
  dchar c;

  // ExpectedState:
  //   c/cpos: points to a character before the newline character
  // ReturnState:
  //   c/cpos: points to the character after the newline character or at limit if at EOF
  void toNextLine()
  {
    // no need to save cpos since c will be thrown away
    while(true) {
      if(next >= limit) break;
      c = decodeUtf8(&next, limit);
      if(c == '\n') {
        text.lineNumber++;
        text.column = 1;
        break;
      }
    }
    cpos = next;
    if(next < limit) {
      c = decodeUtf8(&next, limit);
    }
  }
  // ExpectedState:
  //   c/cpos: points to the first character of the token
  // ReturnState:
  //   c/cpos: points to the character after the token
  void toEndOfToken()
  {
    if(c == '"') {
      implement("quoted tokens");
    } else {

      while(true) {
        cpos = next;
        if(next >= limit) break;
        c = decodeUtf8(&next, limit);
        text.column++;
        if(isControlChar(c)) {
          break;
        }
      }
    }
  }
  // ExpectedState:
  //   c/cpos: points to the first character of the potential whitespace/comment
  // ReturnState:
  //   c/cpos: points to the first character after all the whitespace/comments
  void skipWhitespaceAndComments()
  {
    while(true) {

      // TODO: maybe use a lookup table here
      if(c == ' ' || c == '\t' || c =='\v' || c == '\f' || c == '\r') {

        // do nothing (check first as this is the most likely case)

      } else if(c == '\n') {

        text.lineNumber++;
        text.column = 1;

      } else if(c == '#') {

        toNextLine();

      } else if(c == '/') {

        if(next >= limit) return;

        c = decodeUtf8(&next, limit);

        if(c == '/') {

          toNextLine();

        } else if(c == '*') {

          text.column++;

          MULTILINE_COMMENT_LOOP:
          while(next < limit) {

            c = decodeUtf8(&next, limit); // no need to save cpos since c will be thrown away
            text.column++;

            if(c == '\n') {
              text.lineNumber++;
              text.column = 0;
              text.lineNumber++;
            } else if(c == '*') {
              // loop assume c is pointing to a '*' and next is pointing to the next characer
              while(next < limit) {

                c = decodeUtf8(&next, limit);
                text.column++;
                if(c == '/') break MULTILINE_COMMENT_LOOP;
                if(c == '\n') {
                  text.lineNumber++;
                  text.column = 0;
                } else if(c != '*') {
                  break;
                }
              }
            }
          }

        } else {
          return;
        }

      } else {

        return; // Found non-whitespace and non-comment

      }

      //
      // Goto next character
      //
      cpos = next;
      if(next >= limit) return;
      c = decodeUtf8(&next, limit);
      text.column++;
    }

  }

  //
  // Read the first character
  //
  cpos = next;
  c = decodeUtf8(&next, limit);

  skipWhitespaceAndComments();
  if(cpos >= limit) {
    token.text = null;
    text.chars = null;
    return;
  }

  const(char)* startOfToken = cpos;
  token.lineNumber = text.lineNumber;
  token.column = text.column;
  toEndOfToken();
  token.text = startOfToken[0..cpos-startOfToken];

  text.chars = cpos[0..limit-cpos];

  return;
}
+/


// Bit flags stored per character in charLookup
enum ubyte controlCharFlag = 0x01;
enum ubyte whitespaceFlag  = 0x02;
enum ubyte tokenStartFlag  = 0x04;

/**
 * True if c terminates a token. Only characters below charLookup.length
 * (256) can be control characters.
 */
bool isControlChar(dchar c) {
  return (c < charLookup.length) && ( (charLookup[c] & controlCharFlag) != 0);
}
/**
 * True if c is flagged as whitespace in charLookup. Only characters below
 * charLookup.length (256) can be whitespace.
 */
bool isWhitespace(dchar c) {
  return (c < charLookup.length) && ( (charLookup[c] & whitespaceFlag) != 0);
}
// Per-character flag table. The initializer text is produced by
// rangeInitializers (more.common) from the (character, flags) pairs below;
// characters that are not listed are presumably left with no flags set.
mixin("private __gshared immutable ubyte[256] charLookup = "~rangeInitializers
      (
       /*
       "'_'"    , "sdlIDFlag",

       `'a'`    , "sdlIDFlag",
       `'b'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
       `'c'`    , "sdlIDFlag",
       `'d'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
       `'e'`    , "sdlIDFlag",
       `'f'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
       `'g'-'k'`, "sdlIDFlag",
       `'l'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
       `'m'-'z'`, "sdlIDFlag",

       `'A'`    , "sdlIDFlag",
       `'B'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
       `'C'`    , "sdlIDFlag",
       `'D'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
       `'E'`    , "sdlIDFlag",
       `'F'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
       `'G'-'K'`, "sdlIDFlag",
       `'L'`    , "sdlIDFlag | sdlNumberFlag | sdlNumberPostfixFlag",
       `'M'-'Z'`, "sdlIDFlag",

       `'0'-'9'`, "sdlIDFlag | sdlNumberFlag",
       `'-'`    , "sdlIDFlag",
       `'.'`    , "sdlIDFlag | sdlNumberFlag",
       `'$'`    , "sdlIDFlag",
       */
       `' '`    , "controlCharFlag | whitespaceFlag",
       `'\t'`   , "controlCharFlag | whitespaceFlag",
       `'\n'`   , "controlCharFlag | whitespaceFlag",
       `'\v'`   , "controlCharFlag | whitespaceFlag",
       `'\f'`   , "controlCharFlag | whitespaceFlag",
       `'\r'`   , "controlCharFlag | whitespaceFlag",
       `'{'`    , "controlCharFlag",
       `'}'`    , "controlCharFlag",

       `'['`    , "controlCharFlag",
       `']'`    , "controlCharFlag",
       //`';'`    , "controlCharFlag",
       //`'\\'`   , "controlCharFlag",
       `'/'`    , "controlCharFlag",
       `'#'`    , "controlCharFlag",


       )~";");

unittest
{
  import more.test;
  mixin(scopedTest!"fields");

  writefln("Running Unit Tests...");

  // Parses textString and checks that exactly expectedTokens are produced
  // (only the text of each token is compared).
  void testParseFields(const(char)[] textString, FieldToken[] expectedTokens = [], size_t testLine = __LINE__)
  {
    auto escapedText = escape(textString);

    debug {
      writefln("[TEST] testing '%s'", escapedText);
    }

    FieldToken token;
    Text text = Text(textString);
    //text.setup(textString);

    try {

      foreach(i, expectedToken; expectedTokens) {

        //parseField(token, text);
        text.parseField(token);
        if(token.eof) {
          writefln("Expected %s token(s) but only got %s", expectedTokens.length, i);
          writefln("Error: test on line %s", testLine);
          assert(0);
        }

        if(token.text != expectedToken.text) {
          writefln("Error: expected token '%s' but got '%s'", expectedToken.text, token.text);
          writefln("Error: test on line %s", testLine);
          assert(0);
        }
      }

      //parseField(token, text);
      text.parseField(token);
      if(!token.eof) {
        writefln("Expected %s token(s) but got at least one more (text='%s')",
                 expectedTokens.length, token.text);
        writefln("Error: test on line %s", testLine);
        assert(0);
      }

    } catch(Exception e) {
      writefln("[TEST] this sdl threw an unexpected Exception: '%s'", escape(text.chars));
      writeln(e);
      writefln("Error: test on line %s", testLine);
      assert(0);
    }
  }

  testParseFields("");
  testParseFields(" ");
  testParseFields("\n");

  testParseFields("// comment");
  testParseFields("# comment");
  testParseFields("/* comment */");
  testParseFields("/* comment\n next-line \n hey */");
  testParseFields("/* comment\n next-line *\n * ** *** \n hey **/");

  testParseFields("first", [FieldToken("first")]);

  // The first character after a line comment must not be dropped
  testParseFields("# comment\nfirst", [FieldToken("first")]);
  testParseFields("// comment\nfirst", [FieldToken("first")]);
  testParseFields("first // comment\nsecond", [FieldToken("first"), FieldToken("second")]);

  // A comment directly after a token is not a token itself
  testParseFields("first/* comment */", [FieldToken("first")]);
  testParseFields("first// comment", [FieldToken("first")]);

  // A '*' followed by a newline and then '/' does not close the comment
  testParseFields("/* a *\n/ first */second", [FieldToken("second")]);

  // Line and column tracking across newlines and multi-line comments
  {
    FieldToken token;
    auto text = Text("\nfirst");
    text.parseField(token);
    assert(token.text == "first");
    assert(token.lineNumber == 2);
    assert(token.column == 1);

    text = Text("/* a\nb */ first\n");
    text.parseField(token);
    assert(token.text == "first");
    assert(token.lineNumber == 2);

    // Asking again at the end of the text must not keep counting lines
    text.parseField(token);
    assert(token.eof);
    auto lineAtEnd = text.lineNumber;
    text.parseField(token);
    assert(token.eof);
    assert(text.lineNumber == lineAtEnd);
  }

  //testParseFields("[", [FieldToken("first")]);

}