1 /* 2 * Copyright Lodovico Giaretta 2016 - . 3 * Distributed under the Boost Software License, Version 1.0. 4 * (See accompanying file LICENSE_1_0.txt or copy at 5 * http://www.boost.org/LICENSE_1_0.txt) 6 */ 7 8 /++ 9 + This module implements fast search and compare 10 + functions on slices. In the future, these may be 11 + optimized by means of aggressive specialization, 12 + inline assembly and SIMD instructions. 13 +/ 14 15 module newxml.faststrings; 16 17 import newxml.interfaces : XMLException; 18 19 /** 20 * Compares two strings, and returns true if they're both equal. Both input must be of equal lengths. 21 */ 22 package bool fastEqual(T, S)(T[] t, S[] s) pure @nogc nothrow @trusted 23 in 24 { 25 assert(t.length == s.length); 26 } 27 do 28 { 29 import std.traits; 30 static if (is(Unqual!S == Unqual!T)) 31 { 32 import core.stdc.string : memcmp; 33 return memcmp(t.ptr, s.ptr, t.length * T.sizeof) == 0; 34 } 35 else 36 { 37 foreach (i; 0 .. t.length) 38 if (t[i] != s[i]) 39 return false; 40 return true; 41 } 42 } 43 unittest 44 { 45 assert( fastEqual("ciao"w, "ciao"w)); 46 assert(!fastEqual("ciao", "ciAo")); 47 assert( fastEqual([1, 2], [1, 2])); 48 assert(!fastEqual([1, 2], [1, 3])); 49 } 50 51 /** 52 * Returns the index of the first occurrence of a value in a slice. Returns -1 if nor found. 53 */ 54 package ptrdiff_t fastIndexOf(T, S)(T[] t, S s) pure @nogc nothrow 55 { 56 foreach (i; 0 .. t.length) 57 if (t[i] == s) 58 return i; 59 return -1; 60 } 61 unittest 62 { 63 assert(fastIndexOf("FoO"w, 'O') == 2); 64 assert(fastIndexOf([1, 2], 3.14) == -1); 65 } 66 /** 67 * Returns the index of the last occurrence of a value in a slice. Returns -1 if nor found. 68 */ 69 package ptrdiff_t fastLastIndexOf(T, S)(T[] t, S s) 70 { 71 foreach_reverse (i; 0.. t.length) 72 if (t[i] == s) 73 return i; 74 return -1; 75 } 76 unittest 77 { 78 assert(fastLastIndexOf("FoOo"w, 'o') == 3); 79 assert(fastLastIndexOf([1, 2], 3.14) == -1); 80 } 81 82 83 /++ 84 + Returns the index of the first occurrence of any of the values in the second 85 + slice inside the first one. 86 +/ 87 package ptrdiff_t fastIndexOfAny(T, S)(T[] t, S[] s) pure @nogc nothrow @safe 88 { 89 foreach (i; 0 .. t.length) 90 if (fastIndexOf(s, t[i]) != -1) 91 return i; 92 return -1; 93 } 94 95 unittest 96 { 97 assert(fastIndexOfAny([1, 2, 3, 4], [5, 4, 3]) == 2); 98 assert(fastIndexOfAny("Foo", "baz") == -1); 99 } 100 101 /++ 102 + Returns the index of the first occurrence of a value of the first slice that 103 + does not appear in the second. 104 +/ 105 package ptrdiff_t fastIndexOfNeither(T, S)(T[] t, S[] s) pure @nogc nothrow 106 { 107 foreach (i; 0 .. t.length) 108 if (fastIndexOf(s, t[i]) == -1) 109 return i; 110 return -1; 111 } 112 unittest 113 { 114 assert(fastIndexOfNeither("lulublubla", "luck") == 4); 115 assert(fastIndexOfNeither([1, 3, 2], [2, 3, 1]) == -1); 116 } 117 package bool checkStringBeforeChr(T, S)(T[] haysack, S[] needle, S before) @nogc @safe pure nothrow 118 { 119 for (sizediff_t i ; i < haysack.length ; i++) { 120 if (haysack[i] == needle[0]) 121 { 122 if (cast(sizediff_t)(haysack.length) - i > needle.length) 123 { 124 return fastEqual(haysack[i..i + needle.length], needle); 125 } 126 else 127 return false; 128 } 129 else if (haysack[i] == before) 130 return false; 131 } 132 return false; 133 } 134 unittest 135 { 136 assert(checkStringBeforeChr("extentity SYSTEM \"someexternalentity.file\"", "SYSTEM", '"')); 137 assert(!checkStringBeforeChr("extentity SYST", "SYSTEM", '"')); 138 assert(!checkStringBeforeChr("intentity \"Some internal entity\"", "SYSTEM", '"')); 139 } 140 141 /++ 142 + Returns a copy of the input string, after escaping all XML reserved characters. 143 + 144 + If the string does not contain any reserved character, it is returned unmodified; 145 + otherwise, a copy is made using the specified allocator. 146 +/ 147 T[] xmlEscape(T)(T[] str) 148 { 149 if (str.fastIndexOfAny("&<>'\"") >= 0) 150 { 151 //import newxml.appender; 152 153 T[] app; //auto app = Appender!(T, Alloc)(alloc); 154 app.reserve(str.length + 3); 155 156 app.xmlEscapedWrite(str); 157 return app; 158 } 159 return str; 160 } 161 162 /++ 163 + Writes the input string to the given output range, after escaping all XML reserved characters. 164 +/ 165 void xmlEscapedWrite(Out, T)(ref Out output, T[] str) 166 { 167 import std.conv : to; 168 static immutable amp = to!(T[])("&"); 169 static immutable lt = to!(T[])("<"); 170 static immutable gt = to!(T[])(">"); 171 static immutable apos = to!(T[])("'"); 172 static immutable quot = to!(T[])("""); 173 174 ptrdiff_t i; 175 while ((i = str.fastIndexOfAny("&<>'\"")) >= 0) 176 { 177 output ~= str[0..i]; 178 179 if (str[i] == '&') 180 output ~= amp; 181 else if (str[i] == '<') 182 output ~= lt; 183 else if (str[i] == '>') 184 output ~= gt; 185 else if (str[i] == '\'') 186 output ~= apos; 187 else if (str[i] == '"') 188 output ~= quot; 189 190 str = str[i+1..$]; 191 } 192 output ~= str; 193 } 194 195 auto xmlPredefinedEntities(T)() { 196 alias STR = T[]; 197 STR[STR] result; 198 result["amp"] = "&"; 199 result["lt"] = "<"; 200 result["gt"] = ">"; 201 result["apos"] = "'"; 202 result["quot"] = "\""; 203 204 return result; 205 } 206 207 import std.typecons: Flag, Yes; 208 209 /++ 210 + Returns a copy of the input string, after unescaping all known entity references. 211 + 212 + If the string does not contain any entity reference, it is returned unmodified; 213 + otherwise, a copy is made using the specified allocator. 214 + 215 + The set of known entities can be specified with the last parameter, which must support 216 + the `in` operator (it is treated as an associative array). 217 +/ 218 T[] xmlUnescape(Flag!"strict" strict = Yes.strict, T, U)(T[] str, U replacements) 219 { 220 if (str.fastIndexOf('&') >= 0) 221 { 222 //import newxml.appender; 223 224 T[] app;//auto app = Appender!(T, Alloc)(alloc); 225 app.reserve(str.length); 226 227 app.xmlUnescapedWrite!strict(str, replacements); 228 return app; 229 } 230 return str; 231 } 232 T[] xmlUnescape(Flag!"strict" strict = Yes.strict, T)(T[] str) 233 { 234 if (str.fastIndexOf('&') >= 0) 235 { 236 //import newxml.appender; 237 238 T[] app;//auto app = Appender!(T, Alloc)(alloc); 239 app.reserve(str.length); 240 241 app.xmlUnescapedWrite!strict(str, xmlPredefinedEntities!T()); 242 return app; 243 } 244 return str; 245 } 246 247 /++ 248 + Outputs the input string to the given output range, after unescaping all known entity references. 249 + 250 + The set of known entities can be specified with the last parameter, which must support 251 + the `in` operator (it is treated as an associative array). 252 +/ 253 void xmlUnescapedWrite(Flag!"strict" strict = Yes.strict, Out, T, U) 254 (ref Out output, T[] str, U replacements) 255 { 256 ptrdiff_t i; 257 while ((i = str.fastIndexOf('&')) >= 0) 258 { 259 output ~= str[0..i]; 260 261 ptrdiff_t j = str[(i+1)..$].fastIndexOf(';'); 262 static if (strict == Yes.strict) 263 { 264 if (j < 0) throw new XMLException("Missing ';' ending XML entity!"); 265 } 266 else 267 { 268 if (j < 0) continue; 269 } 270 auto ent = str[(i+1)..(i+j+1)]; 271 static if (strict == Yes.strict) 272 { 273 if (!ent.length) throw new XMLException("Character replacement entity not found!"); 274 } 275 else 276 { 277 if (!ent.length) continue; 278 } 279 280 // character entities 281 if (ent[0] == '#') 282 { 283 //assert(ent.length > 1); 284 ulong num; 285 // hex number 286 if (ent.length > 2 && ent[1] == 'x') 287 { 288 static if (strict == Yes.strict) 289 if (ent.length > 10) throw new XMLException("Number escape value is too large!"); 290 foreach(digit; ent[2..$]) 291 { 292 if ('0' <= digit && digit <= '9') 293 num = (num << 4) + (digit - '0'); 294 else if ('a' <= digit && digit <= 'f') 295 num = (num << 4) + (digit - 'a' + 10); 296 else if ('A' <= digit && digit <= 'F') 297 num = (num << 4) + (digit - 'A' + 10); 298 else 299 { 300 static if (strict == Yes.strict) 301 throw new XMLException("Wrong character encountered within hexadecimal number!"); 302 else 303 break; 304 } 305 } 306 } 307 // decimal number 308 else 309 { 310 static if (strict == Yes.strict) 311 if (ent.length > 12) throw new XMLException("Number escape value is too large!"); 312 foreach(digit; ent[1..$]) 313 { 314 if ('0' <= digit && digit <= '9') 315 { 316 num = (num * 10) + (digit - '0'); 317 } 318 else 319 static if (strict == Yes.strict) 320 throw new XMLException("Wrong character encountered within decimal number!"); 321 else 322 break; 323 } 324 } 325 //assert(num <= 0x10FFFF); 326 static if (strict == Yes.strict) 327 if (num > 0x10FFFF) 328 throw new XMLException("Number escape value is too large!"); 329 330 output ~= cast(dchar)num; 331 } 332 // named entities 333 else 334 { 335 auto repl = replacements.get(ent, null); 336 static if (strict == Yes.strict) 337 { 338 //assert (repl, cast(string)str[(i+1)..(i+j+1)]); 339 if (!repl) throw new XMLException("Character replacement entity not found!"); 340 } 341 else 342 { 343 if (!repl) 344 { 345 output ~= str[i]; 346 str = str[(i+1)..$]; 347 continue; 348 } 349 } 350 output ~= repl; 351 } 352 353 str = str[(i+j+2)..$]; 354 } 355 output ~= str; 356 } 357 358 unittest 359 { 360 //import std.experimental.allocator.mallocator;//import stdx.allocator.mallocator; 361 //auto alloc = Mallocator.instance; 362 assert(xmlEscape("some standard string"d) == "some standard string"d); 363 assert(xmlEscape("& \"some\" <standard> 'string'") == 364 "& "some" <standard> 'string'"); 365 assert(xmlEscape("<&'>>>\"'\"<&&"w) == 366 "<&'>>>"'"<&&"w); 367 } 368 369 unittest 370 { 371 import std.exception : assertThrown; 372 assert(xmlUnescape("some standard string"d) == "some standard string"d); 373 assert(xmlUnescape("some strange string") == "some strange string"); 374 assert(xmlUnescape("& "some" <standard> 'string'") 375 == "& \"some\" <standard> 'string'"); 376 assert(xmlUnescape("<&'>>>"'"<&&"w) 377 == "<&'>>>\"'\"<&&"w); 378 assert(xmlUnescape("Illegal markup (<% ... %>)") == "Illegal markup (<% ... %>)"); 379 assertThrown!XMLException(xmlUnescape("Fa�il")); 380 assertThrown!XMLException(xmlUnescape("Fa�il")); 381 }