1 /*
2 *             Copyright Lodovico Giaretta 2016 - .
3 *  Distributed under the Boost Software License, Version 1.0.
4 *      (See accompanying file LICENSE_1_0.txt or copy at
5 *            http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 /++
9 +   This module implements fast search and compare
10 +   functions on slices. In the future, these may be
11 +   optimized by means of aggressive specialization,
12 +   inline assembly and SIMD instructions.
13 +/
14 
15 module newxml.faststrings;
16 
17 import newxml.interfaces : XMLException;
18 
19 /** 
20  * Compares two strings, and returns true if they're both equal. Both input must be of equal lengths.
21  */
22 package bool fastEqual(T, S)(T[] t, S[] s) pure @nogc nothrow @trusted
23 in
24 {
25     assert(t.length == s.length);
26 }
27 do
28 {
29     import std.traits;
30     static if (is(Unqual!S == Unqual!T))
31     {
32         import core.stdc.string : memcmp;
33         return memcmp(t.ptr, s.ptr, t.length * T.sizeof) == 0;
34     }
35     else
36     {
37         foreach (i; 0 .. t.length)
38             if (t[i] != s[i])
39                 return false;
40         return true;
41     }
42 }
43 unittest
44 {
45     assert( fastEqual("ciao"w, "ciao"w));
46     assert(!fastEqual("ciao", "ciAo"));
47     assert( fastEqual([1, 2], [1, 2]));
48     assert(!fastEqual([1, 2], [1, 3]));
49 }
50 
51 /** 
52  * Returns the index of the first occurrence of a value in a slice. Returns -1 if nor found.
53  */
54 package ptrdiff_t fastIndexOf(T, S)(T[] t, S s) pure @nogc nothrow
55 {
56     foreach (i; 0 .. t.length)
57         if (t[i] == s)
58             return i;
59     return -1;
60 }
61 unittest
62 {
63     assert(fastIndexOf("FoO"w, 'O') == 2);
64     assert(fastIndexOf([1, 2], 3.14) == -1);
65 }
66 /** 
67  * Returns the index of the last occurrence of a value in a slice. Returns -1 if nor found.
68  */
69 package ptrdiff_t fastLastIndexOf(T, S)(T[] t, S s)
70 {
71     foreach_reverse (i; 0.. t.length)
72         if (t[i] == s)
73             return i;
74     return -1;
75 }
76 unittest
77 {
78     assert(fastLastIndexOf("FoOo"w, 'o') == 3);
79     assert(fastLastIndexOf([1, 2], 3.14) == -1);
80 }
81 
82 
83 /++
84 + Returns the index of the first occurrence of any of the values in the second
85 + slice inside the first one.
86 +/
87 package ptrdiff_t fastIndexOfAny(T, S)(T[] t, S[] s) pure @nogc nothrow @safe
88 {
89     foreach (i; 0 .. t.length)
90         if (fastIndexOf(s, t[i]) != -1)
91             return i;
92     return -1;
93 }
94 
95 unittest
96 {
97     assert(fastIndexOfAny([1, 2, 3, 4], [5, 4, 3]) == 2);
98     assert(fastIndexOfAny("Foo", "baz") == -1);
99 }
100 
101 /++
102 + Returns the index of the first occurrence of a value of the first slice that
103 + does not appear in the second.
104 +/
105 package ptrdiff_t fastIndexOfNeither(T, S)(T[] t, S[] s) pure @nogc nothrow
106 {
107     foreach (i; 0 .. t.length)
108         if (fastIndexOf(s, t[i]) == -1)
109             return i;
110     return -1;
111 }
112 unittest
113 {
114     assert(fastIndexOfNeither("lulublubla", "luck") == 4);
115     assert(fastIndexOfNeither([1, 3, 2], [2, 3, 1]) == -1);
116 }
117 package bool checkStringBeforeChr(T, S)(T[] haysack, S[] needle, S before) @nogc @safe pure nothrow
118 {
119     for (sizediff_t i ; i < haysack.length ; i++) {
120         if (haysack[i] == needle[0])
121         {
122             if (cast(sizediff_t)(haysack.length) - i > needle.length) 
123             {
124                 return fastEqual(haysack[i..i + needle.length], needle);
125             }
126             else
127                 return false;
128         }
129         else if (haysack[i] == before)
130             return false;
131     }
132     return false;
133 }
134 unittest 
135 {
136     assert(checkStringBeforeChr("extentity SYSTEM \"someexternalentity.file\"", "SYSTEM", '"'));
137     assert(!checkStringBeforeChr("extentity SYST", "SYSTEM", '"'));
138     assert(!checkStringBeforeChr("intentity \"Some internal entity\"", "SYSTEM", '"'));
139 }
140 
141 /++
142 +   Returns a copy of the input string, after escaping all XML reserved characters.
143 +
144 +   If the string does not contain any reserved character, it is returned unmodified;
145 +   otherwise, a copy is made using the specified allocator.
146 +/
147 T[] xmlEscape(T)(T[] str)
148 {
149     if (str.fastIndexOfAny("&<>'\"") >= 0)
150     {
151         //import newxml.appender;
152 
153         T[] app; //auto app = Appender!(T, Alloc)(alloc);
154         app.reserve(str.length + 3);
155 
156         app.xmlEscapedWrite(str);
157         return app;
158     }
159     return str;
160 }
161 
162 /++
163 +   Writes the input string to the given output range, after escaping all XML reserved characters.
164 +/
165 void xmlEscapedWrite(Out, T)(ref Out output, T[] str)
166 {
167     import std.conv : to;
168     static immutable amp = to!(T[])("&amp;");
169     static immutable lt = to!(T[])("&lt;");
170     static immutable gt = to!(T[])("&gt;");
171     static immutable apos = to!(T[])("&apos;");
172     static immutable quot = to!(T[])("&quot;");
173 
174     ptrdiff_t i;
175     while ((i = str.fastIndexOfAny("&<>'\"")) >= 0)
176     {
177         output ~= str[0..i];
178 
179         if (str[i] == '&')
180             output ~= amp;
181         else if (str[i] == '<')
182             output ~= lt;
183         else if (str[i] == '>')
184             output ~= gt;
185         else if (str[i] == '\'')
186             output ~= apos;
187         else if (str[i] == '"')
188             output ~= quot;
189 
190         str = str[i+1..$];
191     }
192     output ~= str;
193 }
194 
195 auto xmlPredefinedEntities(T)() {
196     alias STR = T[];
197     STR[STR] result;
198     result["amp"] = "&";
199     result["lt"] = "<";
200     result["gt"] = ">";
201     result["apos"] = "'";
202     result["quot"] = "\"";
203     
204     return result;
205 }
206 
207 import std.typecons: Flag, Yes;
208 
209 /++
210 +   Returns a copy of the input string, after unescaping all known entity references.
211 +
212 +   If the string does not contain any entity reference, it is returned unmodified;
213 +   otherwise, a copy is made using the specified allocator.
214 +
215 +   The set of known entities can be specified with the last parameter, which must support
216 +   the `in` operator (it is treated as an associative array).
217 +/
218 T[] xmlUnescape(Flag!"strict" strict = Yes.strict, T, U)(T[] str, U replacements)
219 {
220     if (str.fastIndexOf('&') >= 0)
221     {
222         //import newxml.appender;
223 
224         T[] app;//auto app = Appender!(T, Alloc)(alloc);
225         app.reserve(str.length);
226 
227         app.xmlUnescapedWrite!strict(str, replacements);
228         return app;
229     }
230     return str;
231 }
232 T[] xmlUnescape(Flag!"strict" strict = Yes.strict, T)(T[] str)
233 {
234     if (str.fastIndexOf('&') >= 0)
235     {
236         //import newxml.appender;
237 
238         T[] app;//auto app = Appender!(T, Alloc)(alloc);
239         app.reserve(str.length);
240 
241         app.xmlUnescapedWrite!strict(str, xmlPredefinedEntities!T());
242         return app;
243     }
244     return str;
245 }
246 
247 /++
248 +   Outputs the input string to the given output range, after unescaping all known entity references.
249 +
250 +   The set of known entities can be specified with the last parameter, which must support
251 +   the `in` operator (it is treated as an associative array).
252 +/
253 void xmlUnescapedWrite(Flag!"strict" strict = Yes.strict, Out, T, U)
254                       (ref Out output, T[] str, U replacements)
255 {
256     ptrdiff_t i;
257     while ((i = str.fastIndexOf('&')) >= 0)
258     {
259         output ~= str[0..i];
260 
261         ptrdiff_t j = str[(i+1)..$].fastIndexOf(';');
262         static if (strict == Yes.strict)
263         {
264             if (j < 0) throw new XMLException("Missing ';' ending XML entity!");
265         }
266         else 
267         {
268             if (j < 0) continue;
269         }
270         auto ent = str[(i+1)..(i+j+1)];
271         static if (strict == Yes.strict)
272         {
273             if (!ent.length) throw new XMLException("Character replacement entity not found!");
274         }
275         else
276         {
277             if (!ent.length) continue;
278         }
279 
280         // character entities
281         if (ent[0] == '#')
282         {
283             //assert(ent.length > 1);
284             ulong num;
285             // hex number
286             if (ent.length > 2 && ent[1] == 'x')
287             {
288                 static if (strict == Yes.strict)
289                     if (ent.length > 10) throw new XMLException("Number escape value is too large!");
290                 foreach(digit; ent[2..$])
291                 {
292                     if ('0' <= digit && digit <= '9')
293                         num = (num << 4) + (digit - '0');
294                     else if ('a' <= digit && digit <= 'f')
295                         num = (num << 4) + (digit - 'a' + 10);
296                     else if ('A' <= digit && digit <= 'F')
297                         num = (num << 4) + (digit - 'A' + 10);
298                     else
299                     {
300                         static if (strict == Yes.strict)
301                             throw new XMLException("Wrong character encountered within hexadecimal number!");
302                         else
303                             break;
304                     }
305                 }
306             }
307             // decimal number
308             else
309             {
310                 static if (strict == Yes.strict)
311                     if (ent.length > 12) throw new XMLException("Number escape value is too large!");
312                 foreach(digit; ent[1..$])
313                 {
314                     if ('0' <= digit && digit <= '9')
315                     {
316                         num = (num * 10) + (digit - '0');
317                     }
318                     else
319                         static if (strict == Yes.strict)
320                             throw new XMLException("Wrong character encountered within decimal number!");
321                         else
322                             break;
323                 }
324             }
325             //assert(num <= 0x10FFFF);
326             static if (strict == Yes.strict)
327                 if (num > 0x10FFFF)
328                     throw new XMLException("Number escape value is too large!");
329 
330             output ~= cast(dchar)num;
331         }
332         // named entities
333         else
334         {
335             auto repl = replacements.get(ent, null);
336             static if (strict == Yes.strict)
337             {
338                 //assert (repl, cast(string)str[(i+1)..(i+j+1)]);
339                 if (!repl) throw new XMLException("Character replacement entity not found!");
340             }
341             else
342             {
343                 if (!repl)
344                 {
345                     output ~= str[i];
346                     str = str[(i+1)..$];
347                     continue;
348                 }
349             }
350             output ~= repl;
351         }
352 
353         str = str[(i+j+2)..$];
354     }
355     output ~= str;
356 }
357 
358 unittest
359 {
360     //import std.experimental.allocator.mallocator;//import stdx.allocator.mallocator;
361     //auto alloc = Mallocator.instance;
362     assert(xmlEscape("some standard string"d) == "some standard string"d);
363     assert(xmlEscape("& \"some\" <standard> 'string'") ==
364                      "&amp; &quot;some&quot; &lt;standard&gt; &apos;string&apos;");
365     assert(xmlEscape("<&'>>>\"'\"<&&"w) ==
366                      "&lt;&amp;&apos;&gt;&gt;&gt;&quot;&apos;&quot;&lt;&amp;&amp;"w);
367 }
368 
369 unittest
370 {
371     import std.exception : assertThrown;
372     assert(xmlUnescape("some standard string"d) == "some standard string"d);
373     assert(xmlUnescape("some s&#116;range&#x20;string") == "some strange string");
374     assert(xmlUnescape("&amp; &quot;some&quot; &lt;standard&gt; &apos;string&apos;")
375                        == "& \"some\" <standard> 'string'");
376     assert(xmlUnescape("&lt;&amp;&apos;&gt;&gt;&gt;&quot;&apos;&quot;&lt;&amp;&amp;"w)
377                        == "<&'>>>\"'\"<&&"w);
378     assert(xmlUnescape("Illegal markup (&lt;% ... %&gt;)") == "Illegal markup (<% ... %>)");
379     assertThrown!XMLException(xmlUnescape("Fa&#xFF000000F6;il"));
380     assertThrown!XMLException(xmlUnescape("Fa&#68000000000;il"));
381 }