/*
*             Copyright Lodovico Giaretta 2016 - .
*  Distributed under the Boost Software License, Version 1.0.
*      (See accompanying file LICENSE_1_0.txt or copy at
*            http://www.boost.org/LICENSE_1_0.txt)
*/

/++
+   This module implements a low level XML parser.
+
+   The methods a parser should implement are documented in
+   $(LINK2 ../interfaces/isParser, `newxml.interfaces.isParser`);
+
+   Authors:
+   Lodovico Giaretta
+   László Szerémi
+
+   License:
+   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
+
+   Copyright:
+   Copyright Lodovico Giaretta 2016 --
+/

module newxml.parser;

import newxml.interfaces;
import newxml.faststrings;
import newxml.validation;

import std.typecons : Flag, Yes, No;

/++
+   Exception thrown by the parser when it meets input it cannot process.
+/
public class ParserException : XMLException
{
    /// Creates the exception from a message and an optional chained `Throwable`.
    @nogc @safe pure nothrow this(string msg, string file = __FILE__, size_t line = __LINE__,
            Throwable nextInChain = null)
    {
        super(msg, file, line, nextInChain);
    }

    /// Ditto, with the chained `Throwable` given right after the message.
    @nogc @safe pure nothrow this(string msg, Throwable nextInChain, string file = __FILE__,
            size_t line = __LINE__)
    {
        super(msg, file, line, nextInChain);
    }
}

@safe:
/++
+   A low level XML parser.
+
+   The methods a parser should implement are documented in
+   $(LINK2 ../interfaces/isParser, `newxml.interfaces.isParser`);
+   the requirements on the underlying lexer are documented in
+   $(LINK2 ../interfaces/isLexer, `newxml.interfaces.isLexer`).
+
+   Params:
+       L = the underlying lexer type
+       preserveWhitespace = if set to `Yes` (default is `No`), the parser will not remove element content whitespace
+   (i.e. the whitespace that separates tags), but will report it as text.
+/
struct Parser(L, Flag!"preserveWhitespace" preserveWhitespace = No.preserveWhitespace)
        if (isLexer!L)
{
    import std.meta : staticIndexOf;

    alias CharacterType = L.CharacterType;
    alias StringType = CharacterType[];
    /++
    +   The structure returned in output from the low level parser.
    +   Represents an XML token, delimited by specific patterns, based on its kind.
    +   These delimiters are not present in the content field.
    +/
    struct XMLToken
    {
        /++ The content of the token, delimiters excluded +/
        CharacterType[] content;

        /++ Represents the kind of token +/
        XMLKind kind;
    }
    ///The lexer associated with the parser.
    package L lexer;
    // `true` while `next` holds a token that has been fetched but not yet popped.
    private bool ready;
    // `true` between a `dtdStart` token and the matching `dtdEnd` token.
    private bool insideDTD;
    ///if set to `true` (default is `false`), the parser will try to parse any and all badly formed document as long as
    ///it can be processed.
    public bool processBadDocument;
    ///if set to `true`, the parser will test text for invalid characters, and will throw an exception on errors.
    ///The field has no explicit initializer, so it starts as `false`; leaving it off can speed up parsing.
    public bool testTextValidity;
    ///Selects which set of character validity rules is applied when `testTextValidity` is enabled.
    public XMLVersion xmlVersion;
    // The most recently fetched token; only meaningful while `ready` is `true`.
    private XMLToken next;
    ///Contains character and text entities. Text entities might contain additional nodes and elements.
    ///By default, it is filled with XML entities.
    public StringType[StringType] chrEntities;

    //mixin UsesErrorHandler!ErrorHandler;

    /++ Builds the parser around an existing lexer and loads the predefined XML entities. +/
    this(L lexer)
    {
        this.lexer = lexer;
        chrEntities = xmlPredefinedEntities!CharacterType();
    }
    /++ Generic constructor; forwards its arguments to the lexer constructor +/
    this(Args...)(Args args)
    {
        lexer = L(args);
        chrEntities = xmlPredefinedEntities!CharacterType();
    }

    static if (needSource!L)
    {
        alias InputType = L.InputType;

        /++
        +   See detailed documentation in
        +   $(LINK2 ../interfaces/isParser, `newxml.interfaces.isParser`)
        +/
        void setSource(InputType input)
        {
            lexer.setSource(input);
            // A new source starts from a clean state: predefined entities only, no pending token, outside any DTD.
            chrEntities = xmlPredefinedEntities!CharacterType();
            ready = false;
            insideDTD = false;
        }
    }

    static if (isSaveableLexer!L)
    {
        /++ Returns a copy of the parser whose lexer is an independent `save` of the current one. +/
        auto save()
        {
            Parser result = this;
            result.lexer = lexer.save;
            return result;
        }
    }

    // Returns the current lexer slice with `start` leading and `stop` trailing characters removed.
    private CharacterType[] fetchContent(size_t start = 0, size_t stop = 0)
    {
        return lexer.get[start .. ($ - stop)];
    }

    /++
    +   See detailed documentation in
    +   $(LINK2 ../interfaces/isParser, `newxml.interfaces.isParser`)
    +/
    bool empty()
    {
        static if (preserveWhitespace == No.preserveWhitespace)
            lexer.dropWhile(" \r\n\t");

        return !ready && lexer.empty;
    }

    /// ditto
    auto front()
    {
        if (!ready)
            fetchNext();
        return next;
    }

    /// ditto
    void popFront()
    {
        // Make sure the current token has been consumed from the lexer before discarding it.
        front();
        ready = false;
    }

    // Reads one token from the lexer into `next` and marks it as ready.
    // The token kind is decided from the first characters after the lexer start position.
    private void fetchNext()
    {
        if (!preserveWhitespace || insideDTD)
            lexer.dropWhile(" \r\n\t");

        assert(!lexer.empty);

        lexer.start();

        // dtd end
        if (insideDTD && lexer.testAndAdvance(']'))
        {
            lexer.dropWhile(" \r\n\t");
            if (!lexer.testAndAdvance('>'))
            {
                throw new ParserException("No \">\" character has been found after \"]\"!"); //handler();
            }
            next.kind = XMLKind.dtdEnd;
            next.content = null;
            insideDTD = false;
        }

        // text element
        else if (!lexer.testAndAdvance('<'))
        {
            lexer.advanceUntil('<', false);
            next.kind = XMLKind.text;
            if (!processBadDocument)
                next.content = xmlUnescape(fetchContent(), chrEntities);
            else
                next.content = xmlUnescape!(No.strict)(fetchContent(), chrEntities);
            if (testTextValidity)
            {
                if (xmlVersion == XMLVersion.XML1_0)
                {
                    if (!isValidXMLText10(next.content))
                        throw new ParserException("Text contains invalid characters!");
                }
                else
                {
                    if (!isValidXMLText11(next.content))
                        throw new ParserException("Text contains invalid characters!");
                }
            }
        }

        // tag end
        else if (lexer.testAndAdvance('/'))
        {
            lexer.advanceUntil('>', true);
            next.content = fetchContent(2, 1);
            next.kind = XMLKind.elementEnd;
        }
        // processing instruction
        else if (lexer.testAndAdvance('?'))
        {
            do
                lexer.advanceUntil('?', true);
            while (!lexer.testAndAdvance('>'));
            next.content = fetchContent(2, 2);
            next.kind = XMLKind.processingInstruction;
        }
        // tag start
        else if (!lexer.testAndAdvance('!'))
        {
            // Indices 0 and 1 are the quote characters: skip the whole quoted value and keep searching.
            size_t c;
            while ((c = lexer.advanceUntilAny("\"'/>", true)) < 2)
                if (c == 0)
                    lexer.advanceUntil('"', true);
                else
                    lexer.advanceUntil('\'', true);

            if (c == 2)
            {
                lexer.advanceUntil('>', true); // should be the first character after '/'
                next.content = fetchContent(1, 2);
                next.kind = XMLKind.elementEmpty;
            }
            else
            {
                next.content = fetchContent(1, 1);
                next.kind = XMLKind.elementStart;
            }
        }

        // cdata or conditional
        else if (lexer.testAndAdvance('['))
        {
            lexer.advanceUntil('[', true);
            // cdata
            if (lexer.get.length == 9 && fastEqual(lexer.get()[3 .. $], "CDATA["))
            {
                do
                    lexer.advanceUntil('>', true);
                while (!fastEqual(lexer.get()[($ - 3) .. $], "]]>"));
                next.content = fetchContent(9, 3);
                next.kind = XMLKind.cdata;
            }
            // conditional
            else
            {
                // Conditional sections may nest: count the openings and closings until balanced.
                int count = 1;
                do
                {
                    lexer.advanceUntilAny("[>", true);
                    if (lexer.get()[($ - 3) .. $] == "]]>")
                        count--;
                    else if (lexer.get()[($ - 3) .. $] == "<![")
                        count++;
                }
                while (count > 0);
                next.content = fetchContent(3, 3);
                next.kind = XMLKind.conditional;
            }
        }
        // comment
        else if (lexer.testAndAdvance('-'))
        {
            lexer.testAndAdvance('-'); // second '-'
            // The closing "-->" must not overlap the opening "<!--": require at least 7 characters,
            // otherwise inputs such as "<!-->" would make the content slice below go out of range.
            do
                lexer.advanceUntil('>', true);
            while (lexer.get().length < 7 || !fastEqual(lexer.get()[($ - 3) .. $], "-->"));
            next.content = fetchContent(4, 3);
            next.kind = XMLKind.comment;
        }
        // declaration or doctype
        else
        {
            // Indices 0 and 1 are the quote characters: skip the whole quoted value and keep searching.
            size_t c;
            while ((c = lexer.advanceUntilAny("\"'[>", true)) < 2)
                if (c == 0)
                    lexer.advanceUntil('"', true);
                else
                    lexer.advanceUntil('\'', true);

            // doctype
            if (lexer.get.length >= 9 && fastEqual(lexer.get()[2 .. 9], "DOCTYPE"))
            {
                next.content = fetchContent(9, 1);
                if (c == 2)
                {
                    // Stopped on '[': an internal subset follows, terminated later by a dtdEnd token.
                    next.kind = XMLKind.dtdStart;
                    insideDTD = true;
                }
                else
                    next.kind = XMLKind.dtdEmpty;
            }
            // declaration
            else
            {
                if (c == 2)
                {
                    // Stopped on '[' inside a declaration: keep going up to the closing '>'.
                    size_t cc;
                    while ((cc = lexer.advanceUntilAny("\"'>", true)) < 2)
                        if (cc == 0)
                            lexer.advanceUntil('"', true);
                        else
                            lexer.advanceUntil('\'', true);
                }
                auto len = lexer.get().length;
                if (len > 8 && fastEqual(lexer.get()[2 .. 9], "ATTLIST"))
                {
                    next.content = fetchContent(9, 1);
                    next.kind = XMLKind.attlistDecl;
                }
                else if (len > 8 && fastEqual(lexer.get()[2 .. 9], "ELEMENT"))
                {
                    next.content = fetchContent(9, 1);
                    next.kind = XMLKind.elementDecl;
                }
                else if (len > 9 && fastEqual(lexer.get()[2 .. 10], "NOTATION"))
                {
                    next.content = fetchContent(10, 1);
                    next.kind = XMLKind.notationDecl;
                }
                else if (len > 7 && fastEqual(lexer.get()[2 .. 8], "ENTITY"))
                {
                    next.content = fetchContent(8, 1);
                    next.kind = XMLKind.entityDecl;
                }
                else
                {
                    next.content = fetchContent(2, 1);
                    next.kind = XMLKind.declaration;
                }
            }
        }

        ready = true;
    }
}

/++
+   Returns an instance of `Parser` from the given lexer.
+
+   Params:
+       preserveWhitespace = whether the returned `Parser` shall skip element content
+                            whitespace or return it as text nodes
+       lexer = the _lexer to build this `Parser` from
+
+   Returns:
+   A `Parser` instance initialized with the given lexer and with the predefined XML entities
+/
auto parser(Flag!"preserveWhitespace" preserveWhitespace = No.preserveWhitespace, T)(T lexer)
        if (isLexer!T)
{
    // Go through the lexer-taking constructor so that `chrEntities` is filled with the
    // predefined XML entities; a default-constructed `Parser` would leave it empty.
    return Parser!(T, preserveWhitespace)(lexer);
}
/* ///Ditto
auto parser(Flag!"preserveWhitespace" preserveWhitespace = No.preserveWhitespace, T)(auto ref T input)
{
    auto lx = input.lexer;
    auto parser = Parser!(typeof(lx), preserveWhitespace)(lx);
    //parser.errorHandler = handler;
    return parser;
} */

import newxml.lexers;
import std.experimental.allocator.gc_allocator;//import stdx.allocator.gc_allocator;

/++
+   Instantiates a parser suitable for the given `InputType`.
+
+   This is completely equivalent to
+   ---
+   auto parser =
+        chooseLexer!(InputType)()
+       .parser!(preserveWhitespace)()
+   ---
+/
auto chooseParser(InputType, Flag!"preserveWhitespace" preserveWhitespace = No.preserveWhitespace)()
{
    return chooseLexer!(InputType)()
        .parser!(preserveWhitespace)();
}

unittest
{
    import newxml.lexers;

    string xml = q"{
    <!DOCTYPE mydoc https://myUri.org/bla [
        <!ELEMENT myelem ANY>
        <!ENTITY myent "replacement text">
        <!ATTLIST myelem foo cdata #REQUIRED >
        <!NOTATION PUBLIC 'h'>
        <!FOODECL asdffdsa >
    ]>
    }";

    auto parser = xml.lexer.parser;

    alias XMLKind = typeof(parser.front.kind);

    assert(parser.front.kind == XMLKind.dtdStart);
    assert(parser.front.content == " mydoc https://myUri.org/bla ");
    parser.popFront;

    assert(parser.front.kind == XMLKind.elementDecl);
    assert(parser.front.content == " myelem ANY");
    parser.popFront;

    assert(parser.front.kind == XMLKind.entityDecl);
    assert(parser.front.content == " myent \"replacement text\"");
    parser.popFront;

    assert(parser.front.kind == XMLKind.attlistDecl);
    assert(parser.front.content == " myelem foo cdata #REQUIRED ");
    parser.popFront;

    assert(parser.front.kind == XMLKind.notationDecl);
    assert(parser.front.content == " PUBLIC 'h'");
    parser.popFront;

    assert(parser.front.kind == XMLKind.declaration);
    assert(parser.front.content == "FOODECL asdffdsa ");
    parser.popFront;

    assert(parser.front.kind == XMLKind.dtdEnd);
    assert(!parser.front.content);
    parser.popFront;

    assert(parser.empty);
}

// A '>' directly after the comment opening must not be taken for the end of the comment.
unittest
{
    import newxml.lexers;

    string xml = "<!-->-->";

    auto parser = xml.lexer.parser;

    alias XMLKind = typeof(parser.front.kind);

    assert(parser.front.kind == XMLKind.comment);
    assert(parser.front.content == ">");
    parser.popFront;

    assert(parser.empty);
}