/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * URL parser.
 *
 * Copyright: Eugene Wissner 2017-2020.
 * License: $(LINK2 https://www.mozilla.org/en-US/MPL/2.0/,
 *                  Mozilla Public License, v. 2.0).
 * Authors: $(LINK2 mailto:info@caraus.de, Eugene Wissner)
 * Source: $(LINK2 https://github.com/caraus-ecms/tanya/blob/master/source/tanya/net/uri.d,
 *                 tanya/net/uri.d)
 */
module tanya.net.uri;

import std.ascii;
import tanya.conv;
import tanya.memory.allocator;

/**
 * Thrown if an invalid URI was specified.
 */
final class URIException : Exception
{
    /**
     * Params:
     *  msg  = The message for the exception.
     *  file = The file where the exception occurred.
     *  line = The line number where the exception occurred.
     *  next = The previous exception in the chain of exceptions, if any.
     */
    this(string msg,
         string file = __FILE__,
         size_t line = __LINE__,
         Throwable next = null) @nogc nothrow pure @safe
    {
        super(msg, file, line, next);
    }
}

/**
 * A Uniform Resource Locator.
 *
 * All string members are slices of the string passed to the constructor, so
 * they stay valid only as long as that string does.
 */
struct URL
{
    /// The URL scheme.
    const(char)[] scheme;

    /// The username.
    const(char)[] user;

    /// The password.
    const(char)[] pass;

    /// The hostname.
    const(char)[] host;

    /// The port number.
    ushort port;

    /// The path.
    const(char)[] path;

    /// The query string.
    const(char)[] query;

    /// The anchor.
    const(char)[] fragment;

    /**
     * Attempts to parse an URL from a string.
     * Output string data (scheme, user, etc.) are just slices of input string
     * (i.e., no memory allocation and copying).
     *
     * The authority part (user information, host and port) ends at the first
     * "/", "?" or "#" following it, or at the end of the string.
     *
     * Params:
     *  source = The string containing the URL.
     *
     * Throws: $(D_PSYMBOL URIException) if the URL is malformed.
     */
    this(const char[] source) @nogc pure
    {
        ptrdiff_t pos = -1, endPos = source.length, start;

        // Remember the first colon (scheme or port delimiter) and the first
        // "?" or "#", which is the furthest the authority can extend.
        foreach (i, ref c; source)
        {
            if (pos == -1 && c == ':')
            {
                pos = i;
            }
            if (endPos == source.length && (c == '?' || c == '#'))
            {
                endPos = i;
            }
        }

        // Check if the colon is a part of the scheme or the port and parse
        // the appropriate part.
        if (source.length > 1 && source[0] == '/' && source[1] == '/')
        {
            // Relative scheme.
            start = 2;
        }
        else if (pos > 0)
        {
            // Validate scheme:
            // [ toLower(alpha) | digit | "+" | "-" | "." ]
            foreach (ref c; source[0 .. pos])
            {
                if (!c.isAlphaNum && c != '+' && c != '-' && c != '.')
                {
                    goto ParsePath;
                }
            }

            if (source.length == pos + 1) // only "scheme:" is available.
            {
                this.scheme = source[0 .. $ - 1];
                return;
            }
            else if (source.length > pos + 1 && source[pos + 1] == '/')
            {
                this.scheme = source[0 .. pos];

                if (source.length > pos + 2 && source[pos + 2] == '/')
                {
                    start = pos + 3;

                    if (source.length <= start)
                    {
                        // Only "scheme://" is available.
                        return;
                    }
                    if (this.scheme == "file" && source[start] == '/')
                    {
                        // Windows drive letters.
                        if (source.length - start > 2
                         && source[start + 2] == ':')
                        {
                            ++start;
                        }
                        goto ParsePath;
                    }
                }
                else
                {
                    // "scheme:/path": there is no authority.
                    start = pos + 1;
                    goto ParsePath;
                }
            }
            else if (!parsePort(source[pos .. $]))
            {
                // Schemas like mailto: and zlib: may not have any slash after
                // them.
                this.scheme = source[0 .. pos];
                start = pos + 1;
                goto ParsePath;
            }
            // Otherwise the colon introduced a port ("host:port[/...]"), the
            // port is already stored and the host starts at index 0.
        }
        else if (pos == 0 && parsePort(source[pos .. $]))
        {
            // An URL shouldn't begin with a port number.
            throw defaultAllocator.make!URIException("URL begins with port");
        }
        else
        {
            goto ParsePath;
        }

        // Parse host.
        // Look for the user information delimiter and the end of the
        // authority. The scan is bounded by endPos (the first "?" or "#"), so
        // that "@" and "/" inside the query or fragment are not mistaken for
        // parts of the authority.
        pos = -1;
        for (ptrdiff_t i = start; i < endPos; ++i)
        {
            if (source[i] == '@')
            {
                pos = i;
            }
            else if (source[i] == '/')
            {
                endPos = i;
                break;
            }
        }

        // Check for login and password.
        if (pos != -1)
        {
            // *( unreserved / pct-encoded / sub-delims / ":" )
            foreach (i, c; source[start .. pos])
            {
                if (c == ':')
                {
                    // Only the first colon separates the user from the
                    // password.
                    if (this.user is null)
                    {
                        this.user = source[start .. start + i];
                        this.pass = source[start + i + 1 .. pos];
                    }
                }
                else if (!c.isAlpha() &&
                         !c.isDigit() &&
                         c != '!' &&
                         c != ';' &&
                         c != '=' &&
                         c != '_' &&
                         c != '~' &&
                         !(c >= '$' && c <= '.'))
                {
                    this.scheme = this.user = this.pass = null;
                    throw make!URIException(defaultAllocator,
                                            "Restricted characters in user information");
                }
            }
            if (this.user is null)
            {
                this.user = source[start .. pos];
            }

            start = ++pos;
        }

        // An empty host can never be valid. Rejecting it here also keeps
        // source[start] below inside the bounds when the authority is the
        // last thing in the string (e.g. "//" or "http://user@").
        if (start >= endPos)
        {
            this.scheme = this.user = this.pass = null;
            throw defaultAllocator.make!URIException("Invalid host");
        }

        pos = endPos;
        if (endPos <= 1 || source[start] != '[' || source[endPos - 1] != ']')
        {
            // The host isn't entirely a bracketed (IPv6) literal, so the last
            // colon, if any, separates the port. Scan backwards for it.
            for (ptrdiff_t i = endPos - 1; i >= start; --i)
            {
                if (source[i] == ':')
                {
                    pos = i;
                    if (this.port == 0 && !parsePort(source[i .. endPos]))
                    {
                        this.scheme = this.user = this.pass = null;
                        throw defaultAllocator.make!URIException("Invalid port");
                    }
                    break;
                }
            }
        }

        // Check if we have a valid host, if we don't reject the string as URL.
        if (pos <= start)
        {
            this.scheme = this.user = this.pass = null;
            throw defaultAllocator.make!URIException("Invalid host");
        }

        this.host = source[start .. pos];

        if (endPos == source.length)
        {
            return;
        }

        start = endPos;

    ParsePath:
        // From here on: pos is the position of the first "?" (or the end of
        // the path if there is none), endPos is the position of the first "#"
        // (or the end of the string).
        endPos = source.length;
        pos = -1;
        foreach (i, ref c; source[start .. $])
        {
            if (c == '?' && pos == -1)
            {
                pos = start + i;
            }
            else if (c == '#')
            {
                endPos = start + i;
                break;
            }
        }
        if (pos == -1)
        {
            pos = endPos;
        }

        if (pos > start)
        {
            this.path = source[start .. pos];
        }
        if (endPos >= ++pos)
        {
            this.query = source[pos .. endPos];
        }
        if (++endPos <= source.length)
        {
            this.fragment = source[endPos .. $];
        }
    }

    /*
     * Attempts to parse and set the port.
     *
     * Params:
     *  port = String beginning with a colon followed by the port number and
     *         an optional path (query string and/or fragment), like:
     *         `:12345/some_path` or `:12345`.
     *
     * Returns: Whether the port could be found.
     */
    private bool parsePort(const(char)[] port) @nogc nothrow pure @safe
    {
        auto unparsed = port[1 .. $];
        auto parsed = readIntegral!ushort(unparsed);
        // The port is accepted only if nothing or a path follows the number.
        if (unparsed.length == 0 || unparsed[0] == '/')
        {
            this.port = parsed;
            return true;
        }
        return false;
    }
}

///
@nogc pure @system unittest
{
    auto u = URL("example.org");
    assert(u.path == "example.org");

    u = URL("relative/path");
    assert(u.path == "relative/path");

    // Host and scheme
    u = URL("https://example.org");
    assert(u.scheme == "https");
    assert(u.host == "example.org");
    assert(u.path is null);
    assert(u.port == 0);
    assert(u.fragment is null);

    // With user and port and path
    u = URL("https://hilary:putnam@example.org:443/foo/bar");
    assert(u.scheme == "https");
    assert(u.host == "example.org");
    assert(u.path == "/foo/bar");
    assert(u.port == 443);
    assert(u.user == "hilary");
    assert(u.pass == "putnam");
    assert(u.fragment is null);

    // With query string
    u = URL("https://example.org/?login=true");
    assert(u.scheme == "https");
    assert(u.host == "example.org");
    assert(u.path == "/");
    assert(u.query == "login=true");
    assert(u.fragment is null);

    // With query string and fragment
    u = URL("https://example.org/?login=false#label");
    assert(u.scheme == "https");
    assert(u.host == "example.org");
    assert(u.path == "/");
    assert(u.query == "login=false");
    assert(u.fragment == "label");

    u = URL("redis://root:password@localhost:2201/path?query=value#fragment");
    assert(u.scheme == "redis");
    assert(u.user == "root");
    assert(u.pass == "password");
    assert(u.host == "localhost");
    assert(u.port == 2201);
    assert(u.path == "/path");
    assert(u.query == "query=value");
    assert(u.fragment == "fragment");
}

// The authority ends at the first "/", "?" or "#".
@nogc pure @system unittest
{
    // Slash inside the query string.
    auto u = URL("http://example.org?x=/y");
    assert(u.scheme == "http");
    assert(u.host == "example.org");
    assert(u.path is null);
    assert(u.query == "x=/y");

    // Slash inside the fragment.
    u = URL("http://example.org#a/b");
    assert(u.host == "example.org");
    assert(u.path is null);
    assert(u.fragment == "a/b");

    // "@" inside the query string.
    u = URL("http://example.org?email=a@b.c");
    assert(u.user is null);
    assert(u.host == "example.org");
    assert(u.query == "email=a@b.c");
}

/**
 * Attempts to parse an URL from a string and returns the specified component
 * of the URL or $(D_PSYMBOL URL) if no component is specified.
 *
 * Params:
 *  T      = "scheme", "host", "port", "user", "pass", "path", "query",
 *           "fragment".
 *  source = The string containing the URL.
 *
 * Returns: Requested URL component.
 */
auto parseURL(string T)(const char[] source)
if (T == "scheme"
 || T == "host"
 || T == "user"
 || T == "pass"
 || T == "path"
 || T == "query"
 || T == "fragment"
 || T == "port")
{
    auto ret = URL(source);
    return mixin("ret." ~ T);
}

/// ditto
URL parseURL(const char[] source) @nogc pure
{
    return URL(source);
}

///
@nogc pure @system unittest
{
    auto u = parseURL("http://example.org:5326");
    assert(u.scheme == parseURL!"scheme"("http://example.org:5326"));
    assert(u.host == parseURL!"host"("http://example.org:5326"));
    assert(u.user == parseURL!"user"("http://example.org:5326"));
    assert(u.pass == parseURL!"pass"("http://example.org:5326"));
    assert(u.path == parseURL!"path"("http://example.org:5326"));
    assert(u.query == parseURL!"query"("http://example.org:5326"));
    assert(u.fragment == parseURL!"fragment"("http://example.org:5326"));
    assert(u.port == parseURL!"port"("http://example.org:5326"));
}