1 /* 2 * Copyright Lodovico Giaretta 2016 - . 3 * Distributed under the Boost Software License, Version 1.0. 4 * (See accompanying file LICENSE_1_0.txt or copy at 5 * http://www.boost.org/LICENSE_1_0.txt) 6 */ 7 8 /++ 9 + This module implements various XML lexers. 10 + 11 + The methods a lexer should implement are documented in 12 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`); 13 + The different lexers here implemented are optimized for different kinds of input 14 + and different tradeoffs between speed and memory usage. 15 + 16 + Authors: 17 + Lodovico Giaretta 18 + László Szerémi 19 + 20 + License: 21 + <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>. 22 + 23 + Copyright: 24 + Copyright Lodovico Giaretta 2016 -- 25 +/ 26 27 module newxml.lexers; 28 29 import newxml.interfaces; 30 import newxml.faststrings; 31 32 import std.range.primitives; 33 import std.traits : isArray, isSomeFunction; 34 35 //import std.experimental.allocator;//import stdx.allocator; 36 //import std.experimental.allocator.gc_allocator;//import stdx.allocator.gc_allocator; 37 38 import std.typecons : Flag, Yes; 39 40 /** 41 * Thrown on lexing errors. 42 */ 43 public class LexerException : Exception { 44 @nogc @safe pure nothrow this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable nextInChain = null) 45 { 46 super(msg, file, line, nextInChain); 47 } 48 49 @nogc @safe pure nothrow this(string msg, Throwable nextInChain, string file = __FILE__, size_t line = __LINE__) 50 { 51 super(msg, file, line, nextInChain); 52 } 53 } 54 @safe: 55 /++ 56 + A lexer that takes a sliceable input. 57 + 58 + This lexer will always return slices of the original input; thus, it does not 59 + allocate memory and calls to `start` don't invalidate the outputs of previous 60 + calls to `get`. 61 + 62 + This is the fastest of all lexers, as it only performs very quick searches and 63 + slicing operations. It has the downside of requiring the entire input to be loaded 64 + in memory at the same time; as such, it is optimal for small file but not suitable 65 + for very big ones. 66 + 67 + Parameters: 68 + T = a sliceable type used as input for this lexer 69 +/ 70 struct SliceLexer(T) 71 { 72 package T input; 73 package size_t pos; 74 package size_t begin; 75 76 /++ 77 + See detailed documentation in 78 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`) 79 +/ 80 alias CharacterType = ElementEncodingType!T; 81 /// ditto 82 alias InputType = T; 83 84 //mixin UsesAllocator!Alloc; 85 //mixin UsesErrorHandler!ErrorHandler; 86 87 /// ditto 88 void setSource(T input) 89 { 90 this.input = input; 91 pos = 0; 92 } 93 94 static if(isForwardRange!T) 95 { 96 auto save() 97 { 98 SliceLexer result = this; 99 result.input = input.save; 100 return result; 101 } 102 } 103 104 /// ditto 105 auto empty() const 106 { 107 return pos >= input.length; 108 } 109 110 /// ditto 111 void start() 112 { 113 begin = pos; 114 } 115 116 /// ditto 117 CharacterType[] get() const 118 { 119 return input[begin..pos]; 120 } 121 122 /// ditto 123 void dropWhile(string s) 124 { 125 while (pos < input.length && fastIndexOf(s, input[pos]) != -1) 126 pos++; 127 } 128 129 /// ditto 130 bool testAndAdvance(char c) 131 { 132 if (empty) throw new LexerException("No more characters are found!"); 133 //handler(); 134 if (input[pos] == c) 135 { 136 pos++; 137 return true; 138 } 139 return false; 140 } 141 142 /// ditto 143 void advanceUntil(char c, bool included) 144 { 145 if (empty) throw new LexerException("No more characters are found!"); 146 //handler(); 147 auto adv = fastIndexOf(input[pos..$], c); 148 if (adv != -1) 149 { 150 pos += adv; 151 if (empty) throw new LexerException("No more characters are found!"); 152 //handler(); 153 } 154 else 155 { 156 pos = input.length; 157 } 158 159 if (included) 160 { 161 if (empty) throw new LexerException("No more characters are found!"); 162 //handler(); 163 pos++; 164 } 165 } 166 167 /// ditto 168 size_t advanceUntilAny(string s, bool included) 169 { 170 if (empty) throw new LexerException("No more characters are found!"); 171 //handler(); 172 173 ptrdiff_t res; 174 while ((res = fastIndexOf(s, input[pos])) == -1) 175 if (++pos >= input.length) throw new LexerException("No more characters are found!"); 176 //handler(); 177 if (included) 178 pos++; 179 return res; 180 } 181 } 182 183 /++ 184 + A lexer that takes an InputRange. 185 + 186 + This lexer copies the needed characters from the input range to an internal 187 + buffer, returning slices of it. Whether the buffer is reused (and thus all 188 + previously returned slices invalidated) depends on the instantiation parameters. 189 + 190 + This is the most flexible lexer, as it imposes very few requirements on its input, 191 + which only needs to be an InputRange. It is also the slowest lexer, as it copies 192 + characters one by one, so it shall not be used unless it's the only option. 193 + 194 + Params: 195 + T = the InputRange to be used as input for this lexer 196 +/ 197 struct RangeLexer(T) 198 if (isInputRange!T) 199 { 200 //import newxml.appender; 201 202 /++ 203 + See detailed documentation in 204 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`) 205 +/ 206 alias CharacterType = ElementEncodingType!T; 207 /// ditto 208 alias InputType = T; 209 210 //mixin UsesAllocator!Alloc; 211 //mixin UsesErrorHandler!ErrorHandler; 212 213 //private Appender!(CharacterType, Alloc) app; 214 private CharacterType[] buffer; 215 216 import std.string: representation; 217 static if (is(typeof(representation!CharacterType("")))) 218 { 219 private typeof(representation!CharacterType("")) input; 220 void setSource(T input) 221 { 222 this.input = input.representation; 223 buffer.length = 0; 224 //app = typeof(app)(allocator); 225 } 226 } 227 else 228 { 229 private T input; 230 void setSource(T input) 231 { 232 this.input = input; 233 buffer.length = 0; 234 //app = typeof(app)(allocator); 235 } 236 } 237 238 static if (isForwardRange!T) 239 { 240 auto save() 241 { 242 RangeLexer result; 243 result.input = input.save; 244 result.buffer.length = 0; 245 //result.app = typeof(app)(allocator); 246 return result; 247 } 248 } 249 250 /++ 251 + See detailed documentation in 252 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`) 253 +/ 254 bool empty() const 255 { 256 return input.empty; 257 } 258 259 /// ditto 260 void start() 261 { 262 buffer.length = 0; 263 /+static if (reuseBuffer) 264 app.clear; 265 else 266 app = typeof(app)(allocator);+/ 267 } 268 269 /// ditto 270 CharacterType[] get() const 271 { 272 return buffer;//app.data; 273 } 274 275 /// ditto 276 void dropWhile(string s) 277 { 278 while (!input.empty && fastIndexOf(s, input.front) != -1) 279 input.popFront(); 280 } 281 282 /// ditto 283 bool testAndAdvance(char c) 284 { 285 if (input.empty) 286 throw new LexerException("No more characters are found!");//handler(); 287 if (input.front == c) 288 { 289 buffer ~= input.front;//app.put(input.front); 290 input.popFront(); 291 return true; 292 } 293 return false; 294 } 295 296 /// ditto 297 void advanceUntil(char c, bool included) 298 { 299 if (input.empty) 300 throw new LexerException("No more characters are found!");//handler(); 301 while (input.front != c) 302 { 303 buffer ~= input.front;//app.put(input.front); 304 input.popFront(); 305 if (input.empty) 306 throw new LexerException("No more characters are found!");//handler(); 307 } 308 if (included) 309 { 310 buffer ~= input.front;//app.put(input.front); 311 input.popFront(); 312 } 313 } 314 315 /// ditto 316 size_t advanceUntilAny(string s, bool included) 317 { 318 if (input.empty) 319 throw new LexerException("No more characters are found!");//handler(); 320 size_t res; 321 while ((res = fastIndexOf(s, input.front)) == -1) 322 { 323 buffer ~= input.front;//app.put(input.front); 324 input.popFront; 325 if (input.empty) 326 throw new LexerException("No more characters are found!");//handler(); 327 } 328 if (included) 329 { 330 buffer ~= input.front;// app.put(input.front); 331 input.popFront; 332 } 333 return res; 334 } 335 } 336 337 /++ 338 + A lexer that takes a ForwardRange. 339 + 340 + This lexer copies the needed characters from the forward range to an internal 341 + buffer, returning slices of it. Whether the buffer is reused (and thus all 342 + previously returned slices invalidated) depends on the instantiation parameters. 343 + 344 + This is slightly faster than `RangeLexer`, but shoudn't be used if a faster 345 + lexer is available. 346 + 347 + Params: 348 + T = the InputRange to be used as input for this lexer 349 +/ 350 struct ForwardLexer(T) 351 if (isForwardRange!T) 352 { 353 354 /++ 355 + See detailed documentation in 356 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`) 357 +/ 358 alias CharacterType = ElementEncodingType!T; 359 /// ditto 360 alias InputType = T; 361 362 //mixin UsesAllocator!Alloc; 363 //mixin UsesErrorHandler!ErrorHandler; 364 365 private size_t count; 366 private CharacterType[] buffer;//private Appender!(CharacterType, Alloc) app; 367 368 import std.string: representation; 369 static if (is(typeof(representation!CharacterType("")))) 370 { 371 private typeof(representation!CharacterType("")) input; 372 private typeof(input) input_start; 373 void setSource(T input) 374 { 375 buffer.length = 0;//app = typeof(app)(allocator); 376 this.input = input.representation; 377 this.input_start = this.input; 378 } 379 } 380 else 381 { 382 private T input; 383 private T input_start; 384 void setSource(T input) 385 { 386 buffer.length = 0;//app = typeof(app)(allocator); 387 this.input = input; 388 this.input_start = input; 389 } 390 } 391 392 auto save() 393 { 394 ForwardLexer result; 395 result.input = input.save(); 396 result.input_start = input.save(); 397 result.buffer.length = 0;//result.app = typeof(app)(allocator); 398 result.count = count; 399 return result; 400 } 401 402 /++ 403 + See detailed documentation in 404 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`) 405 +/ 406 bool empty() const 407 { 408 return input.empty; 409 } 410 411 /// ditto 412 void start() 413 { 414 buffer.length = 0; 415 416 input_start = input.save; 417 count = 0; 418 } 419 420 /// ditto 421 CharacterType[] get() 422 { 423 import std.range: take; 424 auto diff = count - buffer.length; 425 if (diff) 426 { 427 buffer.reserve(diff); 428 buffer ~= input_start.take(diff);//app.put(input_start.take(diff)); 429 } 430 return buffer; 431 } 432 433 /// ditto 434 void dropWhile(string s) 435 { 436 while (!input.empty && fastIndexOf(s, input.front) != -1) 437 input.popFront(); 438 input_start = input.save; 439 } 440 441 /// ditto 442 bool testAndAdvance(char c) 443 { 444 if (input.empty) 445 throw new LexerException("No data found!"); 446 if (input.front == c) 447 { 448 count++; 449 input.popFront(); 450 return true; 451 } 452 return false; 453 } 454 455 /// ditto 456 void advanceUntil(char c, bool included) 457 { 458 if (input.empty) 459 throw new LexerException("No data found!"); 460 while (input.front != c) 461 { 462 count++; 463 input.popFront(); 464 if (input.empty) 465 throw new LexerException("No data found!"); 466 } 467 if (included) 468 { 469 count++; 470 input.popFront(); 471 } 472 } 473 474 /// ditto 475 size_t advanceUntilAny(string s, bool included) 476 { 477 if (input.empty) 478 throw new LexerException("No more characters are found!"); 479 size_t res; 480 while ((res = fastIndexOf(s, input.front)) == -1) 481 { 482 count++; 483 input.popFront; 484 if (input.empty) 485 throw new LexerException("No more characters are found!"); 486 } 487 if (included) 488 { 489 count++; 490 input.popFront; 491 } 492 return res; 493 } 494 } 495 496 /++ 497 + A lexer that takes an InputRange of slices from the input. 498 + 499 + This lexer tries to merge the speed of direct slicing with the low memory requirements 500 + of ranges. Its input is a range whose elements are chunks of the input data; this 501 + lexer returns slices of the original chunks, unless the output is split between two 502 + chunks. If that's the case, a new array is allocated and returned. The various chunks 503 + may have different sizes. 504 + 505 + The bigger the chunks are, the better is the performance and higher the memory usage, 506 + so finding the correct tradeoff is crucial for maximum performance. This lexer is 507 + suitable for very large files, which are read chunk by chunk from the file system. 508 + 509 + Params: 510 + T = the InputRange to be used as input for this lexer 511 +/ 512 struct BufferedLexer(T) 513 if (isInputRange!T && isArray!(ElementType!T)) 514 { 515 //import newxml.appender; 516 517 alias BufferType = ElementType!T; 518 519 /++ 520 + See detailed documentation in 521 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`) 522 +/ 523 alias CharacterType = ElementEncodingType!BufferType; 524 /// ditto 525 alias InputType = T; 526 527 private InputType buffers; 528 private size_t pos; 529 private size_t begin; 530 531 private CharacterType[] outBuf;//private Appender!(CharacterType, Alloc) app; 532 private bool onEdge; 533 534 import std.string: representation, assumeUTF; 535 static if (is(typeof(representation!CharacterType("")))) 536 { 537 private typeof(representation!CharacterType("")) buffer; 538 void popBuffer() 539 { 540 buffer = buffers.front.representation; 541 buffers.popFront; 542 } 543 } 544 else 545 { 546 private BufferType buffer; 547 void popBuffer() 548 { 549 buffer = buffers.front; 550 buffers.popFront; 551 } 552 } 553 554 /++ 555 + See detailed documentation in 556 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`) 557 +/ 558 void setSource(T input) 559 { 560 outBuf.length = 0; //app = typeof(app)(allocator); 561 this.buffers = input; 562 popBuffer; 563 } 564 565 static if (isForwardRange!T) 566 { 567 auto save() const 568 { 569 BufferedLexer result; 570 result.buffers = buffers.save(); 571 result.buffer = buffer; 572 result.pos = pos; 573 result.begin = begin; 574 result.outBuf.length = 0;//app = typeof(app)(allocator); 575 return result; 576 } 577 } 578 579 /++ 580 + See detailed documentation in 581 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`) 582 +/ 583 bool empty() 584 { 585 return buffers.empty && pos >= buffer.length; 586 } 587 588 /// ditto 589 void start() 590 { 591 outBuf.length = 0; 592 /+static if (reuseBuffer) 593 app.clear; 594 else 595 app = typeof(app)(allocator);+/ 596 597 begin = pos; 598 onEdge = false; 599 } 600 601 private void advance() 602 { 603 if (empty) 604 throw new LexerException("No more characters are found!"); 605 if (pos + 1 >= buffer.length) 606 { 607 if (onEdge) 608 outBuf ~= buffer[pos];//app.put(buffer[pos]); 609 else 610 { 611 outBuf ~= buffer[begin..$];//app.put(buffer[begin..$]); 612 onEdge = true; 613 } 614 popBuffer; 615 begin = 0; 616 pos = 0; 617 } 618 else if (onEdge) 619 outBuf ~= buffer[pos++];//app.put(buffer[pos++]); 620 else 621 pos++; 622 } 623 private void advance(ptrdiff_t n) 624 { 625 foreach(i; 0..n) 626 advance(); 627 } 628 private void advanceNextBuffer() 629 { 630 if (empty) 631 throw new LexerException("No more characters are found!"); 632 if (onEdge) 633 outBuf ~= buffer[pos..$]; //app.put(buffer[pos..$]); 634 else 635 { 636 outBuf ~= buffer[begin..$];//app.put(buffer[begin..$]); 637 onEdge = true; 638 } 639 popBuffer; 640 begin = 0; 641 pos = 0; 642 } 643 644 /++ 645 + See detailed documentation in 646 + $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`) 647 +/ 648 CharacterType[] get() const 649 { 650 if (onEdge) 651 return outBuf;//app.data; 652 else 653 { 654 static if (is(typeof(representation!CharacterType("")))) 655 return cast(CharacterType[])buffer[begin..pos]; 656 else 657 return buffer[begin..pos]; 658 } 659 } 660 661 /// ditto 662 void dropWhile(string s) 663 { 664 while (!empty && fastIndexOf(s, buffer[pos]) != -1) 665 advance(); 666 } 667 668 /// ditto 669 bool testAndAdvance(char c) 670 { 671 if (empty) 672 throw new LexerException("No data found!"); 673 if (buffer[pos] == c) 674 { 675 advance(); 676 return true; 677 } 678 return false; 679 } 680 681 /// ditto 682 void advanceUntil(char c, bool included) 683 { 684 if (empty) 685 throw new LexerException("No data found!"); 686 ptrdiff_t adv; 687 while ((adv = fastIndexOf(buffer[pos..$], c)) == -1) 688 { 689 advanceNextBuffer(); 690 } 691 advance(adv); 692 693 if (included) 694 advance(); 695 } 696 697 /// ditto 698 size_t advanceUntilAny(string s, bool included) 699 { 700 if (empty) 701 throw new LexerException("No data found!"); 702 ptrdiff_t res; 703 while ((res = fastIndexOf(s, buffer[pos])) == -1) 704 { 705 advance(); 706 } 707 if (included) 708 advance(); 709 return res; 710 } 711 } 712 713 /++ 714 + Instantiates a specialized lexer for the given input type. 715 + 716 + The default error handler just asserts 0. 717 + If the type of the allocator is specified as template parameter, but no instance of it 718 + is passed as runtime parameter, then the static method `instance` of the allocator type is 719 + used. 720 +/ 721 auto chooseLexer(Input)() 722 { 723 static if (is(SliceLexer!(Input))) 724 { 725 auto res = SliceLexer!(Input)(); 726 return res; 727 } 728 else static if (is(BufferedLexer!(Input))) 729 { 730 auto res = BufferedLexer!(Input)(); 731 return res; 732 } 733 else static if (is(RangeLexer!(Input))) 734 { 735 auto res = RangeLexer!(Input)(); 736 return res; 737 } 738 else static assert(0); 739 740 } 741 742 template lexer() 743 { 744 auto lexer(Input)(auto ref Input input) 745 { 746 auto res = chooseLexer!(Input)(); 747 res.setSource(input); 748 return res; 749 } 750 } 751 752 version(unittest) 753 { 754 struct DumbBufferedReader 755 { 756 string content; 757 size_t chunk_size; 758 759 void popFront() @nogc 760 { 761 if (content.length > chunk_size) 762 content = content[chunk_size..$]; 763 else 764 content = []; 765 } 766 string front() const @nogc 767 { 768 if (content.length >= chunk_size) 769 return content[0..chunk_size]; 770 else 771 return content[0..$]; 772 } 773 bool empty() const @nogc 774 { 775 return !content.length; 776 } 777 } 778 } 779 780 unittest 781 { 782 void testLexer(T)(T.InputType delegate(string) @safe conv) @safe 783 { 784 string xml = q{ 785 <?xml encoding = "utf-8" ?> 786 <aaa xmlns:myns="something"> 787 <myns:bbb myns:att='>'> 788 <!-- lol --> 789 Lots of Text! 790 On multiple lines! 791 </myns:bbb> 792 <![CDATA[ Ciaone! ]]> 793 <ccc/> 794 </aaa> 795 }; 796 797 T lexer; 798 assert(lexer.empty); 799 lexer.setSource(conv(xml)); 800 801 lexer.dropWhile(" \r\n\t"); 802 lexer.start(); 803 lexer.advanceUntilAny(":>", true); 804 assert(lexer.get() == "<?xml encoding = \"utf-8\" ?>"); 805 806 lexer.dropWhile(" \r\n\t"); 807 lexer.start(); 808 lexer.advanceUntilAny("=:", false); 809 assert(lexer.get() == "<aaa xmlns"); 810 811 lexer.start(); 812 lexer.advanceUntil('>', true); 813 assert(lexer.get() == ":myns=\"something\">"); 814 815 lexer.dropWhile(" \r\n\t"); 816 lexer.start(); 817 lexer.advanceUntil('\'', true); 818 assert(lexer.testAndAdvance('>')); 819 lexer.advanceUntil('>', false); 820 assert(lexer.testAndAdvance('>')); 821 assert(lexer.get() == "<myns:bbb myns:att='>'>"); 822 823 assert(!lexer.empty); 824 } 825 826 testLexer!(SliceLexer!(string))(x => x); 827 testLexer!(RangeLexer!(string))(x => x); 828 testLexer!(ForwardLexer!(string))(x => x); 829 testLexer!(BufferedLexer!(DumbBufferedReader))(x => DumbBufferedReader(x, 10)); 830 } 831