1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 
5 /**
6  * URL parser.
7  *
8  * Copyright: Eugene Wissner 2017-2020.
9  * License: $(LINK2 https://www.mozilla.org/en-US/MPL/2.0/,
10  *                  Mozilla Public License, v. 2.0).
11  * Authors: $(LINK2 mailto:info@caraus.de, Eugene Wissner)
12  * Source: $(LINK2 https://github.com/caraus-ecms/tanya/blob/master/source/tanya/net/uri.d,
13  *                 tanya/net/uri.d)
14  */
15 module tanya.net.uri;
16 
17 import std.ascii;
18 import tanya.conv;
19 import tanya.memory.allocator;
20 
/**
 * Exception raised when a string cannot be interpreted as a valid URI.
 */
final class URIException : Exception
{
    /**
     * Creates the exception; all arguments are passed on unchanged to the
     * $(D_PSYMBOL Exception) constructor.
     *
     * Params:
     *  msg  = Message describing the error.
     *  file = Source file in which the error was raised.
     *  line = Line number at which the error was raised.
     *  next = Previous exception in the chain of exceptions, if any.
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc nothrow pure @safe
    {
        super(msg, file, line, next);
    }
}
41 
/**
 * A Uniform Resource Locator.
 *
 * All string components are slices of the string passed to the constructor;
 * nothing is copied, so they refer to the memory of that string.
 */
struct URL
{
    /// The URL scheme.
    const(char)[] scheme;

    /// The username.
    const(char)[] user;

    /// The password.
    const(char)[] pass;

    /// The hostname.
    const(char)[] host;

    /// The port number.
    ushort port;

    /// The path.
    const(char)[] path;

    /// The query string.
    const(char)[] query;

    /// The anchor.
    const(char)[] fragment;

    /**
     * Attempts to parse a URL from a string.
     * Output string data (scheme, user, etc.) are just slices of input string
     * (i.e., no memory allocation and copying).
     *
     * The authority part (user information, host and port) ends at the first
     * '/', '?' or '#'. Characters after that delimiter are never interpreted
     * as a part of the user information or the host.
     *
     * Params:
     *  source = The string containing the URL.
     *
     * Throws: $(D_PSYMBOL URIException) if the URL is malformed.
     */
    this(const char[] source) @nogc pure
    {
        // pos:    index of the delimiter currently being looked at.
        // endPos: end of the component currently being parsed.
        // start:  beginning of the component currently being parsed.
        ptrdiff_t pos = -1, endPos = source.length, start;

        // Find the first colon (scheme or port delimiter) and the first
        // '?' or '#', which terminates the authority part.
        foreach (i, ref c; source)
        {
            if (pos == -1 && c == ':')
            {
                pos = i;
            }
            if (endPos == source.length && (c == '?' || c == '#'))
            {
                endPos = i;
            }
        }

        // Check if the colon is a part of the scheme or the port and parse
        // the appropriate part.
        if (source.length > 1 && source[0] == '/' && source[1] == '/')
        {
            // Relative scheme.
            start = 2;
        }
        else if (pos > 0)
        {
            // Validate scheme:
            // [ toLower(alpha) | digit | "+" | "-" | "." ]
            foreach (ref c; source[0 .. pos])
            {
                if (!c.isAlphaNum && c != '+' && c != '-' && c != '.')
                {
                    // Not a scheme, the whole string is handled as a path.
                    goto ParsePath;
                }
            }

            if (source.length == pos + 1) // only "scheme:" is available.
            {
                this.scheme = source[0 .. $ - 1];
                return;
            }
            else if (source.length > pos + 1 && source[pos + 1] == '/')
            {
                this.scheme = source[0 .. pos];

                if (source.length > pos + 2 && source[pos + 2] == '/')
                {
                    // "scheme://": an authority follows.
                    start = pos + 3;

                    if (source.length <= start)
                    {
                        // Only "scheme://" is available.
                        return;
                    }
                    if (this.scheme == "file" && source[start] == '/')
                    {
                        // Windows drive letters: skip the leading slash of
                        // "file:///C:/..." so the path begins with the
                        // drive letter.
                        if (source.length - start > 2
                         && source[start + 2] == ':')
                        {
                            ++start;
                        }
                        goto ParsePath;
                    }
                }
                else
                {
                    // "scheme:/path": no authority.
                    start = pos + 1;
                    goto ParsePath;
                }
            }
            else if (!parsePort(source[pos .. $]))
            {
                // Schemas like mailto: and zlib: may not have any slash after
                // them.
                this.scheme = source[0 .. pos];
                start = pos + 1;
                goto ParsePath;
            }
            // Otherwise the colon introduced a port ("host:port"), the
            // string begins with the authority and start stays 0.
        }
        else if (pos == 0 && parsePort(source[pos .. $]))
        {
            // A URL shouldn't begin with a port number.
            throw defaultAllocator.make!URIException("URL begins with port");
        }
        else
        {
            goto ParsePath;
        }

        // Parse host. endPos still refers to the first '?' or '#' (or to the
        // end of the string), so the scan is limited to the authority: a '/'
        // or '@' inside of the query or fragment must not be taken for a
        // path or user information delimiter.
        pos = -1;
        for (ptrdiff_t i = start; i < endPos; ++i)
        {
            if (source[i] == '@')
            {
                pos = i;
            }
            else if (source[i] == '/')
            {
                endPos = i;
                break;
            }
        }

        // Check for login and password.
        if (pos != -1)
        {
            // *( unreserved / pct-encoded / sub-delims / ":" )
            foreach (i, c; source[start .. pos])
            {
                if (c == ':')
                {
                    // Only the first colon separates the user name from the
                    // password.
                    if (this.user is null)
                    {
                        this.user = source[start .. start + i];
                        this.pass = source[start + i + 1 .. pos];
                    }
                }
                else if (!c.isAlphaNum &&
                         c != '!' &&
                         c != ';' &&
                         c != '=' &&
                         c != '_' &&
                         c != '~' &&
                         !(c >= '$' && c <= '.'))
                {
                    this.scheme = this.user = this.pass = null;
                    throw defaultAllocator.make!URIException(
                            "Restricted characters in user information");
                }
            }
            if (this.user is null)
            {
                this.user = source[start .. pos];
            }

            start = ++pos;
        }

        pos = endPos;
        // An empty host (start >= endPos) is checked first, otherwise
        // source[start] could be read past the end of the string, for
        // example for "//" or "http://user@".
        if (start >= endPos
         || endPos <= 1
         || source[start] != '['
         || source[endPos - 1] != ']')
        {
            // Not a bare bracketed IPv6 address, so look for the port
            // delimiter scanning backwards from the end of the authority.
            for (ptrdiff_t i = endPos - 1; i >= start; --i)
            {
                if (source[i] == ':')
                {
                    pos = i;
                    // The port may be already set if the string had the form
                    // "host:port" without a scheme.
                    if (this.port == 0 && !parsePort(source[i .. endPos]))
                    {
                        this.scheme = this.user = this.pass = null;
                        throw defaultAllocator.make!URIException("Invalid port");
                    }
                    break;
                }
            }
        }

        // Check if we have a valid host, if we don't reject the string as URL.
        if (pos <= start)
        {
            this.scheme = this.user = this.pass = null;
            throw defaultAllocator.make!URIException("Invalid host");
        }

        this.host = source[start .. pos];

        if (endPos == source.length)
        {
            return;
        }

        start = endPos;

    ParsePath:
        // Split the rest into path, query ('?') and fragment ('#').
        endPos = source.length;
        pos = -1;
        foreach (i, ref c; source[start .. $])
        {
            if (c == '?' && pos == -1)
            {
                pos = start + i;
            }
            else if (c == '#')
            {
                endPos = start + i;
                break;
            }
        }
        if (pos == -1)
        {
            pos = endPos;
        }

        if (pos > start)
        {
            this.path = source[start .. pos];
        }
        // Skip the '?'. Without a query pos becomes endPos + 1 here.
        if (endPos >= ++pos)
        {
            this.query = source[pos .. endPos];
        }
        // Skip the '#'. Without a fragment endPos becomes length + 1 here.
        if (++endPos <= source.length)
        {
            this.fragment = source[endPos .. $];
        }
    }

    /*
     * Attempts to parse and set the port.
     *
     * The port is accepted only if the number is followed by the end of the
     * string or by a slash.
     *
     * Params:
     *  port = String beginning with a colon followed by the port number and
     *         an optional path (query string and/or fragment), like:
     *         `:12345/some_path` or `:12345`.
     *
     * Returns: Whether the port could be found.
     */
    private bool parsePort(const(char)[] port) @nogc nothrow pure @safe
    {
        // Skip the leading colon.
        auto unparsed = port[1 .. $];
        // NOTE(review): readIntegral is expected to remove the digits it has
        // read from the front of "unparsed" - confirm against tanya.conv.
        auto parsed = readIntegral!ushort(unparsed);
        if (unparsed.length == 0 || unparsed[0] == '/')
        {
            this.port = parsed;
            return true;
        }
        return false;
    }
}
313 
314 ///
@nogc pure @system unittest
{
    // A bare host name without a scheme is parsed as a path.
    {
        auto bare = URL("example.org");
        assert(bare.path == "example.org");
    }

    // So is a relative path.
    {
        auto relative = URL("relative/path");
        assert(relative.path == "relative/path");
    }

    // Host and scheme
    {
        auto plain = URL("https://example.org");
        assert(plain.scheme == "https");
        assert(plain.host == "example.org");
        assert(plain.path is null);
        assert(plain.port == 0);
        assert(plain.fragment is null);
    }

    // With user and port and path
    {
        auto full = URL("https://hilary:putnam@example.org:443/foo/bar");
        assert(full.scheme == "https");
        assert(full.host == "example.org");
        assert(full.path == "/foo/bar");
        assert(full.port == 443);
        assert(full.user == "hilary");
        assert(full.pass == "putnam");
        assert(full.fragment is null);
    }

    // With query string
    {
        auto queried = URL("https://example.org/?login=true");
        assert(queried.scheme == "https");
        assert(queried.host == "example.org");
        assert(queried.path == "/");
        assert(queried.query == "login=true");
        assert(queried.fragment is null);
    }

    // With query string and fragment
    {
        auto anchored = URL("https://example.org/?login=false#label");
        assert(anchored.scheme == "https");
        assert(anchored.host == "example.org");
        assert(anchored.path == "/");
        assert(anchored.query == "login=false");
        assert(anchored.fragment == "label");
    }

    // Every component at once
    {
        auto everything =
            URL("redis://root:password@localhost:2201/path?query=value#fragment");
        assert(everything.scheme == "redis");
        assert(everything.user == "root");
        assert(everything.pass == "password");
        assert(everything.host == "localhost");
        assert(everything.port == 2201);
        assert(everything.path == "/path");
        assert(everything.query == "query=value");
        assert(everything.fragment == "fragment");
    }
}
367 
/**
 * Parses a URL from a string and returns a single component of it. The
 * overload without a template argument returns the whole
 * $(D_PSYMBOL URL) instead.
 *
 * Params:
 *  T      = Name of the requested component: "scheme", "user", "pass",
 *           "host", "port", "path", "query" or "fragment".
 *  source = The string containing the URL.
 *
 * Returns: Requested URL component.
 */
auto parseURL(string T)(const char[] source)
if (T == "scheme" || T == "user" || T == "pass" || T == "host"
 || T == "port" || T == "path" || T == "query" || T == "fragment")
{
    // The local stays mutable so that the deduced return type is exactly the
    // type of the requested field.
    auto parsed = URL(source);
    return __traits(getMember, parsed, T);
}
392 
393 /// ditto
URL parseURL(const char[] source) @nogc pure
{
    // Parsing is done entirely by the URL constructor.
    auto parsed = URL(source);
    return parsed;
}
398 
399 ///
@nogc pure @system unittest
{
    // Each single-component overload has to agree with the corresponding
    // field of the completely parsed URL.
    enum string address = "http://example.org:5326";
    auto whole = parseURL(address);

    assert(whole.scheme == parseURL!"scheme"(address));
    assert(whole.host == parseURL!"host"(address));
    assert(whole.user == parseURL!"user"(address));
    assert(whole.pass == parseURL!"pass"(address));
    assert(whole.path == parseURL!"path"(address));
    assert(whole.query == parseURL!"query"(address));
    assert(whole.fragment == parseURL!"fragment"(address));
    assert(whole.port == parseURL!"port"(address));
}