newxml.lexers source code

1 /*
2 *             Copyright Lodovico Giaretta 2016 - .
3 *  Distributed under the Boost Software License, Version 1.0.
4 *      (See accompanying file LICENSE_1_0.txt or copy at
5 *            http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 /++
9 +   This module implements various XML lexers.
10 +
11 +   The methods a lexer should implement are documented in
12 +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`);
13 +   The different lexers here implemented are optimized for different kinds of input
14 +   and different tradeoffs between speed and memory usage.
15 +
16 +   Authors:
17 +   Lodovico Giaretta
18 +   László Szerémi
19 +
20 +   License:
21 +   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
22 +
23 +   Copyright:
24 +   Copyright Lodovico Giaretta 2016 --
25 +/
26 
27 module newxml.lexers;
28 
29 import newxml.interfaces;
30 import newxml.faststrings;
31 
32 import std.range.primitives;
33 import std.traits : isArray, isSomeFunction;
34 
35 //import std.experimental.allocator;//import stdx.allocator;
36 //import std.experimental.allocator.gc_allocator;//import stdx.allocator.gc_allocator;
37 
38 import std.typecons : Flag, Yes;
39 
40 /** 
41  * Thrown on lexing errors.
42  */
43 public class LexerException : Exception {
44     @nogc @safe pure nothrow this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable nextInChain = null)
45     {
46         super(msg, file, line, nextInChain);
47     }
48 
49     @nogc @safe pure nothrow this(string msg, Throwable nextInChain, string file = __FILE__, size_t line = __LINE__)
50     {
51         super(msg, file, line, nextInChain);
52     }
53 }
54 @safe:
55 /++
56 +   A lexer that takes a sliceable input.
57 +
58 +   This lexer will always return slices of the original input; thus, it does not
59 +   allocate memory and calls to `start` don't invalidate the outputs of previous
60 +   calls to `get`.
61 +
62 +   This is the fastest of all lexers, as it only performs very quick searches and
63 +   slicing operations. It has the downside of requiring the entire input to be loaded
64 +   in memory at the same time; as such, it is optimal for small file but not suitable
65 +   for very big ones.
66 +
67 +   Parameters:
68 +       T = a sliceable type used as input for this lexer
69 +/
70 struct SliceLexer(T)
71 {
72     package T input;
73     package size_t pos;
74     package size_t begin;
75 
76     /++
77     +   See detailed documentation in
78     +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`)
79     +/
80     alias CharacterType = ElementEncodingType!T;
81     /// ditto
82     alias InputType = T;
83 
84     //mixin UsesAllocator!Alloc;
85     //mixin UsesErrorHandler!ErrorHandler;
86 
87     /// ditto
88     void setSource(T input)
89     {
90         this.input = input;
91         pos = 0;
92     }
93 
94     static if(isForwardRange!T)
95     {
96         auto save()
97         {
98             SliceLexer result = this;
99             result.input = input.save;
100             return result;
101         }
102     }
103 
104     /// ditto
105     auto empty() const
106     {
107         return pos >= input.length;
108     }
109 
110     /// ditto
111     void start()
112     {
113         begin = pos;
114     }
115 
116     /// ditto
117     CharacterType[] get() const
118     {
119         return input[begin..pos];
120     }
121 
122     /// ditto
123     void dropWhile(string s)
124     {
125         while (pos < input.length && fastIndexOf(s, input[pos]) != -1)
126             pos++;
127     }
128 
129     /// ditto
130     bool testAndAdvance(char c)
131     {
132         if (empty) throw new LexerException("No more characters are found!");
133             //handler();
134         if (input[pos] == c)
135         {
136             pos++;
137             return true;
138         }
139         return false;
140     }
141 
142     /// ditto
143     void advanceUntil(char c, bool included)
144     {
145         if (empty) throw new LexerException("No more characters are found!");
146             //handler();
147         auto adv = fastIndexOf(input[pos..$], c);
148         if (adv != -1)
149         {
150             pos += adv;
151             if (empty) throw new LexerException("No more characters are found!");
152                 //handler();
153         }
154         else
155         {
156             pos = input.length;
157         }
158 
159         if (included)
160         {
161             if (empty) throw new LexerException("No more characters are found!");
162                 //handler();
163             pos++;
164         }
165     }
166 
167     /// ditto
168     size_t advanceUntilAny(string s, bool included)
169     {
170         if (empty) throw new LexerException("No more characters are found!");
171             //handler();
172 
173         ptrdiff_t res;
174         while ((res = fastIndexOf(s, input[pos])) == -1)
175             if (++pos >= input.length) throw new LexerException("No more characters are found!");
176                 //handler();
177         if (included)
178             pos++;
179         return res;
180     }
181 }
182 
183 /++
184 +   A lexer that takes an InputRange.
185 +
186 +   This lexer copies the needed characters from the input range to an internal
187 +   buffer, returning slices of it. Whether the buffer is reused (and thus all
188 +   previously returned slices invalidated) depends on the instantiation parameters.
189 +
190 +   This is the most flexible lexer, as it imposes very few requirements on its input,
191 +   which only needs to be an InputRange. It is also the slowest lexer, as it copies
192 +   characters one by one, so it shall not be used unless it's the only option.
193 +
194 +   Params:
195 +       T           = the InputRange to be used as input for this lexer
196 +/
197 struct RangeLexer(T)
198     if (isInputRange!T)
199 {
200     //import newxml.appender;
201 
202     /++
203     +   See detailed documentation in
204     +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`)
205     +/
206     alias CharacterType = ElementEncodingType!T;
207     /// ditto
208     alias InputType = T;
209 
210     //mixin UsesAllocator!Alloc;
211     //mixin UsesErrorHandler!ErrorHandler;
212 
213     //private Appender!(CharacterType, Alloc) app;
214     private CharacterType[] buffer;
215 
216     import std.string: representation;
217     static if (is(typeof(representation!CharacterType(""))))
218     {
219         private typeof(representation!CharacterType("")) input;
220         void setSource(T input)
221         {
222             this.input = input.representation;
223             buffer.length = 0;
224             //app = typeof(app)(allocator);
225         }
226     }
227     else
228     {
229         private T input;
230         void setSource(T input)
231         {
232             this.input = input;
233             buffer.length = 0;
234             //app = typeof(app)(allocator);
235         }
236     }
237 
238     static if (isForwardRange!T)
239     {
240         auto save()
241         {
242             RangeLexer result;
243             result.input = input.save;
244             result.buffer.length = 0;
245             //result.app = typeof(app)(allocator);
246             return result;
247         }
248     }
249 
250     /++
251     +   See detailed documentation in
252     +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`)
253     +/
254     bool empty() const
255     {
256         return input.empty;
257     }
258 
259     /// ditto
260     void start()
261     {
262         buffer.length = 0;
263         /+static if (reuseBuffer)
264             app.clear;
265         else
266             app = typeof(app)(allocator);+/
267     }
268 
269     /// ditto
270     CharacterType[] get() const
271     {
272         return buffer;//app.data;
273     }
274 
275     /// ditto
276     void dropWhile(string s)
277     {
278         while (!input.empty && fastIndexOf(s, input.front) != -1)
279             input.popFront();
280     }
281 
282     /// ditto
283     bool testAndAdvance(char c)
284     {
285         if (input.empty)
286             throw new LexerException("No more characters are found!");//handler();
287         if (input.front == c)
288         {
289             buffer ~= input.front;//app.put(input.front);
290             input.popFront();
291             return true;
292         }
293         return false;
294     }
295 
296     /// ditto
297     void advanceUntil(char c, bool included)
298     {
299         if (input.empty)
300             throw new LexerException("No more characters are found!");//handler();
301         while (input.front != c)
302         {
303             buffer ~= input.front;//app.put(input.front);
304             input.popFront();
305             if (input.empty)
306                 throw new LexerException("No more characters are found!");//handler();
307         }
308         if (included)
309         {
310             buffer ~= input.front;//app.put(input.front);
311             input.popFront();
312         }
313     }
314 
315     /// ditto
316     size_t advanceUntilAny(string s, bool included)
317     {
318         if (input.empty)
319             throw new LexerException("No more characters are found!");//handler();
320         size_t res;
321         while ((res = fastIndexOf(s, input.front)) == -1)
322         {
323             buffer ~= input.front;//app.put(input.front);
324             input.popFront;
325             if (input.empty)
326                 throw new LexerException("No more characters are found!");//handler();
327         }
328         if (included)
329         {
330             buffer ~= input.front;// app.put(input.front);
331             input.popFront;
332         }
333         return res;
334     }
335 }
336 
337 /++
338 +   A lexer that takes a ForwardRange.
339 +
340 +   This lexer copies the needed characters from the forward range to an internal
341 +   buffer, returning slices of it. Whether the buffer is reused (and thus all
342 +   previously returned slices invalidated) depends on the instantiation parameters.
343 +
344 +   This is slightly faster than `RangeLexer`, but shoudn't be used if a faster
345 +   lexer is available.
346 +
347 +   Params:
348 +       T           = the InputRange to be used as input for this lexer
349 +/
350 struct ForwardLexer(T)
351     if (isForwardRange!T)
352 {
353 
354     /++
355     +   See detailed documentation in
356     +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`)
357     +/
358     alias CharacterType = ElementEncodingType!T;
359     /// ditto
360     alias InputType = T;
361 
362     //mixin UsesAllocator!Alloc;
363     //mixin UsesErrorHandler!ErrorHandler;
364 
365     private size_t count;
366     private CharacterType[] buffer;//private Appender!(CharacterType, Alloc) app;
367 
368     import std.string: representation;
369     static if (is(typeof(representation!CharacterType(""))))
370     {
371         private typeof(representation!CharacterType("")) input;
372         private typeof(input) input_start;
373         void setSource(T input)
374         {
375             buffer.length = 0;//app = typeof(app)(allocator);
376             this.input = input.representation;
377             this.input_start = this.input;
378         }
379     }
380     else
381     {
382         private T input;
383         private T input_start;
384         void setSource(T input)
385         {
386             buffer.length = 0;//app = typeof(app)(allocator);
387             this.input = input;
388             this.input_start = input;
389         }
390     }
391 
392     auto save()
393     {
394         ForwardLexer result;
395         result.input = input.save();
396         result.input_start = input.save();
397         result.buffer.length = 0;//result.app = typeof(app)(allocator);
398         result.count = count;
399         return result;
400     }
401 
402     /++
403     +   See detailed documentation in
404     +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`)
405     +/
406     bool empty() const
407     {
408         return input.empty;
409     }
410 
411     /// ditto
412     void start()
413     {
414         buffer.length = 0;
415 
416         input_start = input.save;
417         count = 0;
418     }
419 
420     /// ditto
421     CharacterType[] get()
422     {
423         import std.range: take;
424         auto diff = count - buffer.length;
425         if (diff)
426         {
427             buffer.reserve(diff);
428             buffer ~= input_start.take(diff);//app.put(input_start.take(diff));
429         }
430         return buffer;
431     }
432 
433     /// ditto
434     void dropWhile(string s)
435     {
436         while (!input.empty && fastIndexOf(s, input.front) != -1)
437             input.popFront();
438         input_start = input.save;
439     }
440 
441     /// ditto
442     bool testAndAdvance(char c)
443     {
444         if (input.empty)
445             throw new LexerException("No data found!");
446         if (input.front == c)
447         {
448             count++;
449             input.popFront();
450             return true;
451         }
452         return false;
453     }
454 
455     /// ditto
456     void advanceUntil(char c, bool included)
457     {
458         if (input.empty)
459             throw new LexerException("No data found!");
460         while (input.front != c)
461         {
462             count++;
463             input.popFront();
464             if (input.empty)
465                 throw new LexerException("No data found!");
466         }
467         if (included)
468         {
469             count++;
470             input.popFront();
471         }
472     }
473 
474     /// ditto
475     size_t advanceUntilAny(string s, bool included)
476     {
477         if (input.empty)
478             throw new LexerException("No more characters are found!");
479         size_t res;
480         while ((res = fastIndexOf(s, input.front)) == -1)
481         {
482             count++;
483             input.popFront;
484             if (input.empty)
485                 throw new LexerException("No more characters are found!");
486         }
487         if (included)
488         {
489             count++;
490             input.popFront;
491         }
492         return res;
493     }
494 }
495 
496 /++
497 +   A lexer that takes an InputRange of slices from the input.
498 +
499 +   This lexer tries to merge the speed of direct slicing with the low memory requirements
500 +   of ranges. Its input is a range whose elements are chunks of the input data; this
501 +   lexer returns slices of the original chunks, unless the output is split between two
502 +   chunks. If that's the case, a new array is allocated and returned. The various chunks
503 +   may have different sizes.
504 +
505 +   The bigger the chunks are, the better is the performance and higher the memory usage,
506 +   so finding the correct tradeoff is crucial for maximum performance. This lexer is
507 +   suitable for very large files, which are read chunk by chunk from the file system.
508 +
509 +   Params:
510 +       T           = the InputRange to be used as input for this lexer
511 +/
512 struct BufferedLexer(T)
513     if (isInputRange!T && isArray!(ElementType!T))
514 {
515     //import newxml.appender;
516 
517     alias BufferType = ElementType!T;
518 
519     /++
520     +   See detailed documentation in
521     +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`)
522     +/
523     alias CharacterType = ElementEncodingType!BufferType;
524     /// ditto
525     alias InputType = T;
526 
527     private InputType buffers;
528     private size_t pos;
529     private size_t begin;
530 
531     private CharacterType[] outBuf;//private Appender!(CharacterType, Alloc) app;
532     private bool onEdge;
533 
534     import std.string: representation, assumeUTF;
535     static if (is(typeof(representation!CharacterType(""))))
536     {
537         private typeof(representation!CharacterType("")) buffer;
538         void popBuffer()
539         {
540             buffer = buffers.front.representation;
541             buffers.popFront;
542         }
543     }
544     else
545     {
546         private BufferType buffer;
547         void popBuffer()
548         {
549             buffer = buffers.front;
550             buffers.popFront;
551         }
552     }
553 
554     /++
555     +   See detailed documentation in
556     +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`)
557     +/
558     void setSource(T input)
559     {
560         outBuf.length = 0; //app = typeof(app)(allocator);
561         this.buffers = input;
562         popBuffer;
563     }
564 
565     static if (isForwardRange!T)
566     {
567         auto save() const
568         {
569             BufferedLexer result;
570             result.buffers = buffers.save();
571             result.buffer = buffer;
572             result.pos = pos;
573             result.begin = begin;
574             result.outBuf.length = 0;//app = typeof(app)(allocator);
575             return result;
576         }
577     }
578 
579     /++
580     +   See detailed documentation in
581     +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`)
582     +/
583     bool empty()
584     {
585         return buffers.empty && pos >= buffer.length;
586     }
587 
588     /// ditto
589     void start()
590     {
591         outBuf.length = 0;
592         /+static if (reuseBuffer)
593             app.clear;
594         else
595             app = typeof(app)(allocator);+/
596 
597         begin = pos;
598         onEdge = false;
599     }
600 
601     private void advance()
602     {
603         if (empty)
604             throw new LexerException("No more characters are found!");
605         if (pos + 1 >= buffer.length)
606         {
607             if (onEdge)
608                 outBuf ~= buffer[pos];//app.put(buffer[pos]);
609             else
610             {
611                 outBuf ~= buffer[begin..$];//app.put(buffer[begin..$]);
612                 onEdge = true;
613             }
614             popBuffer;
615             begin = 0;
616             pos = 0;
617         }
618         else if (onEdge)
619             outBuf ~= buffer[pos++];//app.put(buffer[pos++]);
620         else
621             pos++;
622     }
623     private void advance(ptrdiff_t n)
624     {
625         foreach(i; 0..n)
626             advance();
627     }
628     private void advanceNextBuffer()
629     {
630         if (empty)
631             throw new LexerException("No more characters are found!");
632         if (onEdge)
633             outBuf ~= buffer[pos..$]; //app.put(buffer[pos..$]);
634         else
635         {
636             outBuf ~= buffer[begin..$];//app.put(buffer[begin..$]);
637             onEdge = true;
638         }
639         popBuffer;
640         begin = 0;
641         pos = 0;
642     }
643 
644     /++
645     +   See detailed documentation in
646     +   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`)
647     +/
648     CharacterType[] get() const
649     {
650         if (onEdge)
651             return outBuf;//app.data;
652         else
653         {
654             static if (is(typeof(representation!CharacterType(""))))
655                 return cast(CharacterType[])buffer[begin..pos];
656             else
657                 return buffer[begin..pos];
658         }
659     }
660 
661     /// ditto
662     void dropWhile(string s)
663     {
664         while (!empty && fastIndexOf(s, buffer[pos]) != -1)
665             advance();
666     }
667 
668     /// ditto
669     bool testAndAdvance(char c)
670     {
671         if (empty)
672             throw new LexerException("No data found!");
673         if (buffer[pos] == c)
674         {
675             advance();
676             return true;
677         }
678         return false;
679     }
680 
681     /// ditto
682     void advanceUntil(char c, bool included)
683     {
684         if (empty)
685             throw new LexerException("No data found!");
686         ptrdiff_t adv;
687         while ((adv = fastIndexOf(buffer[pos..$], c)) == -1)
688         {
689             advanceNextBuffer();
690         }
691         advance(adv);
692 
693         if (included)
694             advance();
695     }
696 
697     /// ditto
698     size_t advanceUntilAny(string s, bool included)
699     {
700         if (empty)
701             throw new LexerException("No data found!");
702         ptrdiff_t res;
703         while ((res = fastIndexOf(s, buffer[pos])) == -1)
704         {
705             advance();
706         }
707         if (included)
708             advance();
709         return res;
710     }
711 }
712 
713 /++
714 +   Instantiates a specialized lexer for the given input type.
715 +
716 +   The default error handler just asserts 0.
717 +   If the type of the allocator is specified as template parameter, but no instance of it
718 +   is passed as runtime parameter, then the static method `instance` of the allocator type is
719 +   used.
720 +/
721 auto chooseLexer(Input)()
722 {
723     static if (is(SliceLexer!(Input)))
724     {
725         auto res = SliceLexer!(Input)();
726         return res;
727     }
728     else static if (is(BufferedLexer!(Input)))
729     {
730         auto res = BufferedLexer!(Input)();
731         return res;
732     }
733     else static if (is(RangeLexer!(Input)))
734     {
735         auto res = RangeLexer!(Input)();
736         return res;
737     }
738     else static assert(0);
739    
740 }
741 
742 template lexer()
743 {
744     auto lexer(Input)(auto ref Input input)
745     {
746         auto res = chooseLexer!(Input)();
747         res.setSource(input);
748         return res;
749     }
750 }
751 
752 version(unittest)
753 {
754     struct DumbBufferedReader
755     {
756         string content;
757         size_t chunk_size;
758 
759         void popFront() @nogc
760         {
761             if (content.length > chunk_size)
762                 content = content[chunk_size..$];
763             else
764                 content = [];
765         }
766         string front() const @nogc
767         {
768             if (content.length >= chunk_size)
769                 return content[0..chunk_size];
770             else
771                 return content[0..$];
772         }
773         bool empty() const @nogc
774         {
775             return !content.length;
776         }
777     }
778 }
779 
780 unittest
781 {
782     void testLexer(T)(T.InputType delegate(string) @safe conv) @safe
783     {
784         string xml = q{
785         <?xml encoding = "utf-8" ?>
786         <aaa xmlns:myns="something">
787             <myns:bbb myns:att='>'>
788                 <!-- lol -->
789                 Lots of Text!
790                 On multiple lines!
791             </myns:bbb>
792             <![CDATA[ Ciaone! ]]>
793             <ccc/>
794         </aaa>
795         };
796 
797         T lexer;
798         assert(lexer.empty);
799         lexer.setSource(conv(xml));
800 
801         lexer.dropWhile(" \r\n\t");
802         lexer.start();
803         lexer.advanceUntilAny(":>", true);
804         assert(lexer.get() == "<?xml encoding = \"utf-8\" ?>");
805 
806         lexer.dropWhile(" \r\n\t");
807         lexer.start();
808         lexer.advanceUntilAny("=:", false);
809         assert(lexer.get() == "<aaa xmlns");
810 
811         lexer.start();
812         lexer.advanceUntil('>', true);
813         assert(lexer.get() == ":myns=\"something\">");
814 
815         lexer.dropWhile(" \r\n\t");
816         lexer.start();
817         lexer.advanceUntil('\'', true);
818         assert(lexer.testAndAdvance('>'));
819         lexer.advanceUntil('>', false);
820         assert(lexer.testAndAdvance('>'));
821         assert(lexer.get() == "<myns:bbb myns:att='>'>");
822 
823         assert(!lexer.empty);
824     }
825 
826     testLexer!(SliceLexer!(string))(x => x);
827     testLexer!(RangeLexer!(string))(x => x);
828     testLexer!(ForwardLexer!(string))(x => x);
829     testLexer!(BufferedLexer!(DumbBufferedReader))(x => DumbBufferedReader(x, 10));
830 }
831