newxml.parser source code

1 /*
2 *             Copyright Lodovico Giaretta 2016 - .
3 *  Distributed under the Boost Software License, Version 1.0.
4 *      (See accompanying file LICENSE_1_0.txt or copy at
5 *            http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 /++
9 +   This module implements a low level XML parser.
10 +
11 +   The methods a parser should implement are documented in
12 +   $(LINK2 ../interfaces/isParser, `newxml.interfaces.isParser`);
13 +
14 +   Authors:
15 +   Lodovico Giaretta
16 +   László Szerémi
17 +
18 +   License:
19 +   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
20 +
21 +   Copyright:
22 +   Copyright Lodovico Giaretta 2016 --
23 +/
24 
25 module newxml.parser;
26 
27 import newxml.interfaces;
28 import newxml.faststrings;
29 import newxml.validation;
30 
31 import std.typecons : Flag, Yes, No;
32 
/++
+   Exception type thrown by the low level parser in this module, e.g. when a
+   DTD is not closed properly or when text fails the character validity test.
+/
public class ParserException : XMLException
{
    /++
    +   Creates the exception from a message and the usual source location data.
    +
    +   Params:
    +       msg = the error message
    +       file = the file where the error was raised
    +       line = the line where the error was raised
    +       nextInChain = an optional exception chained to this one
    +/
    this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable nextInChain = null)
        @nogc @safe pure nothrow
    {
        super(msg, file, line, nextInChain);
    }

    /++
    +   Same as the other constructor, but takes the chained exception right
    +   after the message.
    +/
    this(string msg, Throwable nextInChain, string file = __FILE__, size_t line = __LINE__)
        @nogc @safe pure nothrow
    {
        // Only the parameter order differs, so reuse the primary constructor.
        this(msg, file, line, nextInChain);
    }
}
44 @safe:
/++
+   A low level XML parser.
+
+   The methods a parser should implement are documented in
+   $(LINK2 ../interfaces/isParser, `newxml.interfaces.isParser`);
+
+   Params:
+       L = the underlying lexer type
+       preserveWhitespace = if set to `Yes` (default is `No`), the parser will not remove element content whitespace
+   (i.e. the whitespace that separates tags), but will report it as text.
+/
struct Parser(L, Flag!"preserveWhitespace" preserveWhitespace = No.preserveWhitespace)
    if (isLexer!L)
{
    alias CharacterType = L.CharacterType;
    alias StringType = CharacterType[];

    /++
    +   The structure returned in output from the low level parser.
    +   Represents an XML token, delimited by specific patterns, based on its kind.
    +   These delimiters are not present in the content field.
    +/
    struct XMLToken
    {
        /++ The content of the token, delimiters excluded +/
        CharacterType[] content;

        /++ Represents the kind of token +/
        XMLKind kind;
    }

    ///The lexer associated with the parser.
    package L lexer;
    ///`true` while `next` holds a token that has been fetched but not yet popped.
    private bool ready;
    ///`true` between a `dtdStart` token and the matching `dtdEnd` token.
    private bool insideDTD;
    ///if set to `true` (default is `false`), the parser will try to parse any and all badly formed document as long as
    ///it can be processed. In this struct it selects non-strict unescaping of text content.
    public bool processBadDocument;
    ///if set to `true`, the parser will test text content for invalid characters, and will throw a
    ///`ParserException` on errors. Turning it off can speed up parsing.
    ///NOTE(review): this comment used to call `true` the default, but the field has no initializer and no
    ///constructor of this struct sets it, so it starts out as `false` -- confirm the intended default.
    public bool testTextValidity;
    ///The XML version; selects which set of text validity rules is applied when `testTextValidity` is set.
    public XMLVersion xmlVersion;
    ///The most recently fetched token; only meaningful while `ready` is `true`.
    private XMLToken next;
    ///Contains character and text entities. Text entities might contain additional nodes and elements.
    ///The constructors and `setSource` fill it with the predefined XML entities.
    public StringType[StringType] chrEntities;

    //mixin UsesErrorHandler!ErrorHandler;

    /++ Builds a parser on top of an already constructed lexer. +/
    this(L lexer)
    {
        this.lexer = lexer;
        chrEntities = xmlPredefinedEntities!CharacterType();
    }

    /++ Generic constructor; forwards its arguments to the lexer constructor +/
    this(Args...)(Args args)
    {
        lexer = L(args);
        chrEntities = xmlPredefinedEntities!CharacterType();
    }

    static if (needSource!L)
    {
        alias InputType = L.InputType;

        /++
        +   See detailed documentation in
        +   $(LINK2 ../interfaces/isParser, `newxml.interfaces.isParser`)
        +
        +   Besides handing the input to the lexer, this resets the entity table
        +   to the predefined entities and clears the parsing state.
        +/
        void setSource(InputType input)
        {
            lexer.setSource(input);
            chrEntities = xmlPredefinedEntities!CharacterType();
            ready = false;
            insideDTD = false;
        }
    }

    static if (isSaveableLexer!L)
    {
        /++ Returns a copy of this parser whose lexer is a saved copy of the current one. +/
        auto save()
        {
            Parser result = this;
            result.lexer = lexer.save;
            return result;
        }
    }

    /++
    +   Returns the text accumulated by the lexer for the current token, with
    +   `start` characters removed from the front and `stop` from the back.
    +/
    private CharacterType[] fetchContent(size_t start = 0, size_t stop = 0)
    {
        return lexer.get[start..($ - stop)];
    }

    /++
    +   See detailed documentation in
    +   $(LINK2 ../interfaces/isParser, `newxml.interfaces.isParser`)
    +/
    bool empty()
    {
        // Skip whitespace under the same condition `fetchNext` uses, so that
        // trailing whitespace inside a DTD cannot make this report a token
        // that `fetchNext` would then fail to find.
        if (!preserveWhitespace || insideDTD)
            lexer.dropWhile(" \r\n\t");

        return !ready && lexer.empty;
    }

    /// ditto
    auto front()
    {
        if (!ready)
            fetchNext();
        return next;
    }

    /// ditto
    void popFront()
    {
        // Make sure the current token has been consumed from the lexer
        // before marking it as popped.
        front();
        ready = false;
    }

    /++
    +   Reads the next token from the lexer into `next` and marks it as ready.
    +
    +   The order of the `testAndAdvance` calls matters, because every
    +   successful test consumes a character.
    +
    +   Throws:
    +       `ParserException` if the `]` closing a DTD is not followed by `>`,
    +       or if `testTextValidity` is set and text contains invalid characters.
    +/
    private void fetchNext()
    {
        if (!preserveWhitespace || insideDTD)
            lexer.dropWhile(" \r\n\t");

        assert(!lexer.empty);

        lexer.start();

        if (insideDTD && lexer.testAndAdvance(']'))
        {
            fetchDTDEnd();
        }
        else if (!lexer.testAndAdvance('<'))
        {
            fetchText();
        }
        // tag end
        else if (lexer.testAndAdvance('/'))
        {
            lexer.advanceUntil('>', true);
            next.content = fetchContent(2, 1);
            next.kind = XMLKind.elementEnd;
        }
        // processing instruction
        else if (lexer.testAndAdvance('?'))
        {
            do
                lexer.advanceUntil('?', true);
            while (!lexer.testAndAdvance('>'));
            next.content = fetchContent(2, 2);
            next.kind = XMLKind.processingInstruction;
        }
        else if (!lexer.testAndAdvance('!'))
        {
            fetchElementStart();
        }
        else if (lexer.testAndAdvance('['))
        {
            fetchCDataOrConditional();
        }
        else if (lexer.testAndAdvance('-'))
        {
            fetchComment();
        }
        else
        {
            fetchDoctypeOrDeclaration();
        }

        ready = true;
    }

    /// Finishes the end of a DTD; the `]` has already been consumed.
    private void fetchDTDEnd()
    {
        lexer.dropWhile(" \r\n\t");
        if (!lexer.testAndAdvance('>'))
        {
            throw new ParserException("No \">\" character has been found after the \"]\" that closes the DTD!");//handler();
        }
        next.kind = XMLKind.dtdEnd;
        next.content = null;
        insideDTD = false;
    }

    /// Reads a text token: everything up to (not including) the next `<`.
    private void fetchText()
    {
        lexer.advanceUntil('<', false);
        next.kind = XMLKind.text;
        if (processBadDocument)
            next.content = xmlUnescape!(No.strict)(fetchContent(), chrEntities);
        else
            next.content = xmlUnescape(fetchContent(), chrEntities);

        if (!testTextValidity)
            return;

        immutable bool valid = xmlVersion == XMLVersion.XML1_0
            ? isValidXMLText10(next.content)
            : isValidXMLText11(next.content);
        if (!valid)
            throw new ParserException("Text contains invalid characters!");
    }

    /// Reads a start tag or an empty-element tag; the `<` has already been consumed.
    private void fetchElementStart()
    {
        // Quoted attribute values may contain '/' or '>', so skip over them.
        size_t c;
        while ((c = lexer.advanceUntilAny("\"'/>", true)) < 2)
        {
            if (c == 0)
                lexer.advanceUntil('"', true);
            else
                lexer.advanceUntil('\'', true);
        }

        if (c == 2)
        {
            lexer.advanceUntil('>', true); // should be the first character after '/'
            next.content = fetchContent(1, 2);
            next.kind = XMLKind.elementEmpty;
        }
        else
        {
            next.content = fetchContent(1, 1);
            next.kind = XMLKind.elementStart;
        }
    }

    /// Reads a CDATA section or a conditional section; `<![` has already been consumed.
    private void fetchCDataOrConditional()
    {
        lexer.advanceUntil('[', true);
        // cdata
        if (lexer.get.length == 9 && fastEqual(lexer.get()[3..$], "CDATA["))
        {
            do
                lexer.advanceUntil('>', true);
            while (!fastEqual(lexer.get()[($-3)..$], "]]>"));
            next.content = fetchContent(9, 3);
            next.kind = XMLKind.cdata;
        }
        // conditional
        else
        {
            // Conditional sections can nest: track the depth until the
            // outermost one is closed.
            int depth = 1;
            do
            {
                lexer.advanceUntilAny("[>", true);
                if (lexer.get()[($-3)..$] == "]]>")
                    depth--;
                else if (lexer.get()[($-3)..$] == "<![")
                    depth++;
            }
            while (depth > 0);
            next.content = fetchContent(3, 3);
            next.kind = XMLKind.conditional;
        }
    }

    /// Reads a comment; `<!-` has already been consumed.
    private void fetchComment()
    {
        lexer.testAndAdvance('-'); // second '-'
        // The closing "-->" must not overlap the opening "<!--": a comment
        // such as "<!--> text -->" first stops at "<!-->", which ends with
        // "-->" but is too short to be sliced below. Keep scanning until the
        // token holds at least opener (4) + closer (3) characters.
        do
            lexer.advanceUntil('>', true);
        while (lexer.get().length < 7 || !fastEqual(lexer.get()[($-3)..$], "-->"));
        next.content = fetchContent(4, 3);
        next.kind = XMLKind.comment;
    }

    /// Reads a doctype or another `<!...>` declaration; `<!` has already been consumed.
    private void fetchDoctypeOrDeclaration()
    {
        // Stop at '[' or '>', skipping quoted literals on the way.
        size_t c;
        while ((c = lexer.advanceUntilAny("\"'[>", true)) < 2)
        {
            if (c == 0)
                lexer.advanceUntil('"', true);
            else
                lexer.advanceUntil('\'', true);
        }

        // doctype
        if (lexer.get.length >= 9 && fastEqual(lexer.get()[2..9], "DOCTYPE"))
        {
            next.content = fetchContent(9, 1);
            if (c == 2)
            {
                // Stopped at '[': an internal subset follows.
                next.kind = XMLKind.dtdStart;
                insideDTD = true;
            }
            else
            {
                next.kind = XMLKind.dtdEmpty;
            }
            return;
        }

        // declaration
        if (c == 2)
        {
            // Stopped at '[' inside a declaration: keep going to the closing '>'.
            size_t cc;
            while ((cc = lexer.advanceUntilAny("\"'>", true)) < 2)
            {
                if (cc == 0)
                    lexer.advanceUntil('"', true);
                else
                    lexer.advanceUntil('\'', true);
            }
        }

        auto len = lexer.get().length;
        if (len > 8 && fastEqual(lexer.get()[2..9], "ATTLIST"))
        {
            next.content = fetchContent(9, 1);
            next.kind = XMLKind.attlistDecl;
        }
        else if (len > 8 && fastEqual(lexer.get()[2..9], "ELEMENT"))
        {
            next.content = fetchContent(9, 1);
            next.kind = XMLKind.elementDecl;
        }
        else if (len > 9 && fastEqual(lexer.get()[2..10], "NOTATION"))
        {
            next.content = fetchContent(10, 1);
            next.kind = XMLKind.notationDecl;
        }
        else if (len > 7 && fastEqual(lexer.get()[2..8], "ENTITY"))
        {
            next.content = fetchContent(8, 1);
            next.kind = XMLKind.entityDecl;
        }
        else
        {
            next.content = fetchContent(2, 1);
            next.kind = XMLKind.declaration;
        }
    }
}
353 
/++
+   Returns an instance of `Parser` from the given lexer.
+
+   The returned parser has its entity table filled with the predefined XML
+   entities, like a `Parser` built through its constructors.
+
+   Params:
+       preserveWhitespace = whether the returned `Parser` shall skip element content
+                            whitespace or return it as text nodes
+       lexer = the _lexer to build this `Parser` from
+
+   Returns:
+   A `Parser` instance initialized with the given lexer
+/
auto parser(Flag!"preserveWhitespace" preserveWhitespace = No.preserveWhitespace, T)(T lexer)
    if (isLexer!T)
{
    auto result = Parser!(T, preserveWhitespace)();
    //result.errorHandler = handler;
    result.lexer = lexer;
    // The parser is default-initialized above, so fill the entity table
    // explicitly instead of relying on a constructor having done so.
    result.chrEntities = xmlPredefinedEntities!(T.CharacterType)();
    return result;
}
373 /* ///Ditto
374 auto parser(Flag!"preserveWhitespace" preserveWhitespace = No.preserveWhitespace, T)(auto ref T input)
375 {
376     auto lx = input.lexer;
377     auto parser = Parser!(typeof(lx), preserveWhitespace)(lx);
378     //parser.errorHandler = handler;
379     return parser;
380 } */
381 
382 import newxml.lexers;
383 import std.experimental.allocator.gc_allocator;//import stdx.allocator.gc_allocator;
384 
/++
+   Instantiates a parser suitable for the given `InputType`.
+
+   This is completely equivalent to
+   ---
+   auto parser =
+        chooseLexer!(InputType)()
+       .parser!(preserveWhitespace)()
+   ---
+/
auto chooseParser(InputType, Flag!"preserveWhitespace" preserveWhitespace = No.preserveWhitespace)()
{
    auto chosenLexer = chooseLexer!(InputType)();
    return parser!(preserveWhitespace)(chosenLexer);
}
400 
// Checks that a DTD with an internal subset is split into the expected
// sequence of declaration tokens.
unittest
{
    import newxml.lexers;

    string xml = q"{
    <!DOCTYPE mydoc https://myUri.org/bla [
        <!ELEMENT myelem ANY>
        <!ENTITY   myent    "replacement text">
        <!ATTLIST myelem foo cdata #REQUIRED >
        <!NOTATION PUBLIC 'h'>
        <!FOODECL asdffdsa >
    ]>
    }";

    auto parser = xml.lexer.parser;

    alias XMLKind = typeof(parser.front.kind);

    // One entry per token expected before the end of the DTD, in order.
    static struct Expected
    {
        XMLKind kind;
        string content;
    }

    auto expectedTokens = [
        Expected(XMLKind.dtdStart, " mydoc https://myUri.org/bla "),
        Expected(XMLKind.elementDecl, " myelem ANY"),
        Expected(XMLKind.entityDecl, "   myent    \"replacement text\""),
        Expected(XMLKind.attlistDecl, " myelem foo cdata #REQUIRED "),
        Expected(XMLKind.notationDecl, " PUBLIC 'h'"),
        Expected(XMLKind.declaration, "FOODECL asdffdsa "),
    ];

    foreach (expected; expectedTokens)
    {
        assert(parser.front.kind == expected.kind, "unexpected kind for token: " ~ expected.content);
        assert(parser.front.content == expected.content, "unexpected content, wanted: " ~ expected.content);
        parser.popFront;
    }

    // The closing "]>" carries no content at all.
    assert(parser.front.kind == XMLKind.dtdEnd);
    assert(!parser.front.content);
    parser.popFront;

    assert(parser.empty);
}