urlapi.c (53584B)
1 /*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al. 9 * 10 * This software is licensed as described in the file COPYING, which 11 * you should have received as part of this distribution. The terms 12 * are also available at https://curl.se/docs/copyright.html. 13 * 14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell 15 * copies of the Software, and permit persons to whom the Software is 16 * furnished to do so, under the terms of the COPYING file. 17 * 18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 19 * KIND, either express or implied. 20 * 21 * SPDX-License-Identifier: curl 22 * 23 ***************************************************************************/ 24 25 #include "curl_setup.h" 26 27 #include "urldata.h" 28 #include "urlapi-int.h" 29 #include "strcase.h" 30 #include "url.h" 31 #include "escape.h" 32 #include "curl_ctype.h" 33 #include "curlx/inet_pton.h" 34 #include "curlx/inet_ntop.h" 35 #include "strdup.h" 36 #include "idn.h" 37 #include "curlx/strparse.h" 38 #include "curl_memrchr.h" 39 40 /* The last 3 #include files should be in this order */ 41 #include "curl_printf.h" 42 #include "curl_memory.h" 43 #include "memdebug.h" 44 45 /* MS-DOS/Windows style drive prefix, eg c: in c:foo */ 46 #define STARTS_WITH_DRIVE_PREFIX(str) \ 47 ((('a' <= str[0] && str[0] <= 'z') || \ 48 ('A' <= str[0] && str[0] <= 'Z')) && \ 49 (str[1] == ':')) 50 51 /* MS-DOS/Windows style drive prefix, optionally with 52 * a '|' instead of ':', followed by a slash or NUL */ 53 #define STARTS_WITH_URL_DRIVE_PREFIX(str) \ 54 ((('a' <= (str)[0] && (str)[0] <= 'z') || \ 55 ('A' <= (str)[0] && (str)[0] <= 'Z')) && \ 56 ((str)[1] == ':' || (str)[1] == '|') && \ 57 ((str)[2] == '/' || (str)[2] == '\\' || (str)[2] == 0)) 
58 59 /* scheme is not URL encoded, the longest libcurl supported ones are... */ 60 #define MAX_SCHEME_LEN 40 61 62 /* 63 * If USE_IPV6 is disabled, we still want to parse IPv6 addresses, so make 64 * sure we have _some_ value for AF_INET6 without polluting our fake value 65 * everywhere. 66 */ 67 #if !defined(USE_IPV6) && !defined(AF_INET6) 68 #define AF_INET6 (AF_INET + 1) 69 #endif 70 71 /* Internal representation of CURLU. Point to URL-encoded strings. */ 72 struct Curl_URL { 73 char *scheme; 74 char *user; 75 char *password; 76 char *options; /* IMAP only? */ 77 char *host; 78 char *zoneid; /* for numerical IPv6 addresses */ 79 char *port; 80 char *path; 81 char *query; 82 char *fragment; 83 unsigned short portnum; /* the numerical version (if 'port' is set) */ 84 BIT(query_present); /* to support blank */ 85 BIT(fragment_present); /* to support blank */ 86 BIT(guessed_scheme); /* when a URL without scheme is parsed */ 87 }; 88 89 #define DEFAULT_SCHEME "https" 90 91 static CURLUcode parseurl_and_replace(const char *url, CURLU *u, 92 unsigned int flags); 93 94 static void free_urlhandle(struct Curl_URL *u) 95 { 96 free(u->scheme); 97 free(u->user); 98 free(u->password); 99 free(u->options); 100 free(u->host); 101 free(u->zoneid); 102 free(u->port); 103 free(u->path); 104 free(u->query); 105 free(u->fragment); 106 } 107 108 /* 109 * Find the separator at the end of the hostname, or the '?' in cases like 110 * http://www.example.com?id=2380 111 */ 112 static const char *find_host_sep(const char *url) 113 { 114 /* Find the start of the hostname */ 115 const char *sep = strstr(url, "//"); 116 if(!sep) 117 sep = url; 118 else 119 sep += 2; 120 121 /* Find first / or ? */ 122 while(*sep && *sep != '/' && *sep != '?') 123 sep++; 124 125 return sep; 126 } 127 128 /* convert CURLcode to CURLUcode */ 129 #define cc2cu(x) ((x) == CURLE_TOO_LARGE ? 
CURLUE_TOO_LARGE : \ 130 CURLUE_OUT_OF_MEMORY) 131 132 /* urlencode_str() writes data into an output dynbuf and URL-encodes the 133 * spaces in the source URL accordingly. 134 * 135 * URL encoding should be skipped for hostnames, otherwise IDN resolution 136 * will fail. 137 */ 138 static CURLUcode urlencode_str(struct dynbuf *o, const char *url, 139 size_t len, bool relative, 140 bool query) 141 { 142 /* we must add this with whitespace-replacing */ 143 bool left = !query; 144 const unsigned char *iptr; 145 const unsigned char *host_sep = (const unsigned char *) url; 146 CURLcode result = CURLE_OK; 147 148 if(!relative) { 149 size_t n; 150 host_sep = (const unsigned char *) find_host_sep(url); 151 152 /* output the first piece as-is */ 153 n = (const char *)host_sep - url; 154 result = curlx_dyn_addn(o, url, n); 155 len -= n; 156 } 157 158 for(iptr = host_sep; len && !result; iptr++, len--) { 159 if(*iptr == ' ') { 160 if(left) 161 result = curlx_dyn_addn(o, "%20", 3); 162 else 163 result = curlx_dyn_addn(o, "+", 1); 164 } 165 else if((*iptr < ' ') || (*iptr >= 0x7f)) { 166 unsigned char out[3]={'%'}; 167 Curl_hexbyte(&out[1], *iptr); 168 result = curlx_dyn_addn(o, out, 3); 169 } 170 else { 171 result = curlx_dyn_addn(o, iptr, 1); 172 if(*iptr == '?') 173 left = FALSE; 174 } 175 } 176 177 if(result) 178 return cc2cu(result); 179 return CURLUE_OK; 180 } 181 182 /* 183 * Returns the length of the scheme if the given URL is absolute (as opposed 184 * to relative). Stores the scheme in the buffer if TRUE and 'buf' is 185 * non-NULL. The buflen must be larger than MAX_SCHEME_LEN if buf is set. 186 * 187 * If 'guess_scheme' is TRUE, it means the URL might be provided without 188 * scheme. 
189 */ 190 size_t Curl_is_absolute_url(const char *url, char *buf, size_t buflen, 191 bool guess_scheme) 192 { 193 size_t i = 0; 194 DEBUGASSERT(!buf || (buflen > MAX_SCHEME_LEN)); 195 (void)buflen; /* only used in debug-builds */ 196 if(buf) 197 buf[0] = 0; /* always leave a defined value in buf */ 198 #ifdef _WIN32 199 if(guess_scheme && STARTS_WITH_DRIVE_PREFIX(url)) 200 return 0; 201 #endif 202 if(ISALPHA(url[0])) 203 for(i = 1; i < MAX_SCHEME_LEN; ++i) { 204 char s = url[i]; 205 if(s && (ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') )) { 206 /* RFC 3986 3.1 explains: 207 scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 208 */ 209 } 210 else { 211 break; 212 } 213 } 214 if(i && (url[i] == ':') && ((url[i + 1] == '/') || !guess_scheme)) { 215 /* If this does not guess scheme, the scheme always ends with the colon so 216 that this also detects data: URLs etc. In guessing mode, data: could 217 be the hostname "data" with a specified port number. */ 218 219 /* the length of the scheme is the name part only */ 220 size_t len = i; 221 if(buf) { 222 Curl_strntolower(buf, url, i); 223 buf[i] = 0; 224 } 225 return len; 226 } 227 return 0; 228 } 229 230 /* 231 * Concatenate a relative URL onto a base URL making it absolute. 
232 */ 233 static CURLUcode redirect_url(const char *base, const char *relurl, 234 CURLU *u, unsigned int flags) 235 { 236 struct dynbuf urlbuf; 237 bool host_changed = FALSE; 238 const char *useurl = relurl; 239 const char *cutoff = NULL; 240 size_t prelen; 241 CURLUcode uc; 242 243 /* protsep points to the start of the hostname, after [scheme]:// */ 244 const char *protsep = base + strlen(u->scheme) + 3; 245 DEBUGASSERT(base && relurl && u); /* all set here */ 246 if(!base) 247 return CURLUE_MALFORMED_INPUT; /* should never happen */ 248 249 /* handle different relative URL types */ 250 switch(relurl[0]) { 251 case '/': 252 if(relurl[1] == '/') { 253 /* protocol-relative URL: //example.com/path */ 254 cutoff = protsep; 255 useurl = &relurl[2]; 256 host_changed = TRUE; 257 } 258 else 259 /* absolute /path */ 260 cutoff = strchr(protsep, '/'); 261 break; 262 263 case '#': 264 /* fragment-only change */ 265 if(u->fragment) 266 cutoff = strchr(protsep, '#'); 267 break; 268 269 default: 270 /* path or query-only change */ 271 if(u->query && u->query[0]) 272 /* remove existing query */ 273 cutoff = strchr(protsep, '?'); 274 else if(u->fragment && u->fragment[0]) 275 /* Remove existing fragment */ 276 cutoff = strchr(protsep, '#'); 277 278 if(relurl[0] != '?') { 279 /* append a relative path after the last slash */ 280 cutoff = memrchr(protsep, '/', 281 cutoff ? (size_t)(cutoff - protsep) : strlen(protsep)); 282 if(cutoff) 283 cutoff++; /* truncate after last slash */ 284 } 285 break; 286 } 287 288 prelen = cutoff ? 
(size_t)(cutoff - base) : strlen(base); 289 290 /* build new URL */ 291 curlx_dyn_init(&urlbuf, CURL_MAX_INPUT_LENGTH); 292 293 if(!curlx_dyn_addn(&urlbuf, base, prelen) && 294 !urlencode_str(&urlbuf, useurl, strlen(useurl), !host_changed, FALSE)) { 295 uc = parseurl_and_replace(curlx_dyn_ptr(&urlbuf), u, 296 flags & ~CURLU_PATH_AS_IS); 297 } 298 else 299 uc = CURLUE_OUT_OF_MEMORY; 300 301 curlx_dyn_free(&urlbuf); 302 return uc; 303 } 304 305 /* scan for byte values <= 31, 127 and sometimes space */ 306 CURLUcode Curl_junkscan(const char *url, size_t *urllen, bool allowspace) 307 { 308 size_t n = strlen(url); 309 size_t i; 310 unsigned char control; 311 const unsigned char *p = (const unsigned char *)url; 312 if(n > CURL_MAX_INPUT_LENGTH) 313 return CURLUE_MALFORMED_INPUT; 314 315 control = allowspace ? 0x1f : 0x20; 316 for(i = 0; i < n; i++) { 317 if(p[i] <= control || p[i] == 127) 318 return CURLUE_MALFORMED_INPUT; 319 } 320 *urllen = n; 321 return CURLUE_OK; 322 } 323 324 /* 325 * parse_hostname_login() 326 * 327 * Parse the login details (username, password and options) from the URL and 328 * strip them out of the hostname 329 * 330 */ 331 static CURLUcode parse_hostname_login(struct Curl_URL *u, 332 const char *login, 333 size_t len, 334 unsigned int flags, 335 size_t *offset) /* to the hostname */ 336 { 337 CURLUcode result = CURLUE_OK; 338 CURLcode ccode; 339 char *userp = NULL; 340 char *passwdp = NULL; 341 char *optionsp = NULL; 342 const struct Curl_handler *h = NULL; 343 344 /* At this point, we assume all the other special cases have been taken 345 * care of, so the host is at most 346 * 347 * [user[:password][;options]]@]hostname 348 * 349 * We need somewhere to put the embedded details, so do that first. 
350 */ 351 char *ptr; 352 353 DEBUGASSERT(login); 354 355 *offset = 0; 356 ptr = memchr(login, '@', len); 357 if(!ptr) 358 goto out; 359 360 /* We will now try to extract the 361 * possible login information in a string like: 362 * ftp://user:password@ftp.site.example:8021/README */ 363 ptr++; 364 365 /* if this is a known scheme, get some details */ 366 if(u->scheme) 367 h = Curl_get_scheme_handler(u->scheme); 368 369 /* We could use the login information in the URL so extract it. Only parse 370 options if the handler says we should. Note that 'h' might be NULL! */ 371 ccode = Curl_parse_login_details(login, ptr - login - 1, 372 &userp, &passwdp, 373 (h && (h->flags & PROTOPT_URLOPTIONS)) ? 374 &optionsp : NULL); 375 if(ccode) { 376 result = CURLUE_BAD_LOGIN; 377 goto out; 378 } 379 380 if(userp) { 381 if(flags & CURLU_DISALLOW_USER) { 382 /* Option DISALLOW_USER is set and URL contains username. */ 383 result = CURLUE_USER_NOT_ALLOWED; 384 goto out; 385 } 386 free(u->user); 387 u->user = userp; 388 } 389 390 if(passwdp) { 391 free(u->password); 392 u->password = passwdp; 393 } 394 395 if(optionsp) { 396 free(u->options); 397 u->options = optionsp; 398 } 399 400 /* the hostname starts at this offset */ 401 *offset = ptr - login; 402 return CURLUE_OK; 403 404 out: 405 406 free(userp); 407 free(passwdp); 408 free(optionsp); 409 u->user = NULL; 410 u->password = NULL; 411 u->options = NULL; 412 413 return result; 414 } 415 416 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, struct dynbuf *host, 417 bool has_scheme) 418 { 419 const char *portptr; 420 char *hostname = curlx_dyn_ptr(host); 421 /* 422 * Find the end of an IPv6 address on the ']' ending bracket. 
423 */ 424 if(hostname[0] == '[') { 425 portptr = strchr(hostname, ']'); 426 if(!portptr) 427 return CURLUE_BAD_IPV6; 428 portptr++; 429 /* this is a RFC2732-style specified IP-address */ 430 if(*portptr) { 431 if(*portptr != ':') 432 return CURLUE_BAD_PORT_NUMBER; 433 } 434 else 435 portptr = NULL; 436 } 437 else 438 portptr = strchr(hostname, ':'); 439 440 if(portptr) { 441 curl_off_t port; 442 size_t keep = portptr - hostname; 443 444 /* Browser behavior adaptation. If there is a colon with no digits after, 445 just cut off the name there which makes us ignore the colon and just 446 use the default port. Firefox, Chrome and Safari all do that. 447 448 Do not do it if the URL has no scheme, to make something that looks like 449 a scheme not work! 450 */ 451 curlx_dyn_setlen(host, keep); 452 portptr++; 453 if(!*portptr) 454 return has_scheme ? CURLUE_OK : CURLUE_BAD_PORT_NUMBER; 455 456 if(curlx_str_number(&portptr, &port, 0xffff) || *portptr) 457 return CURLUE_BAD_PORT_NUMBER; 458 459 u->portnum = (unsigned short) port; 460 /* generate a new port number string to get rid of leading zeroes etc */ 461 free(u->port); 462 u->port = aprintf("%" CURL_FORMAT_CURL_OFF_T, port); 463 if(!u->port) 464 return CURLUE_OUT_OF_MEMORY; 465 } 466 467 return CURLUE_OK; 468 } 469 470 /* this assumes 'hostname' now starts with [ */ 471 static CURLUcode ipv6_parse(struct Curl_URL *u, char *hostname, 472 size_t hlen) /* length of hostname */ 473 { 474 size_t len; 475 DEBUGASSERT(*hostname == '['); 476 if(hlen < 4) /* '[::]' is the shortest possible valid string */ 477 return CURLUE_BAD_IPV6; 478 hostname++; 479 hlen -= 2; 480 481 /* only valid IPv6 letters are ok */ 482 len = strspn(hostname, "0123456789abcdefABCDEF:."); 483 484 if(hlen != len) { 485 hlen = len; 486 if(hostname[len] == '%') { 487 /* this could now be '%[zone id]' */ 488 char zoneid[16]; 489 int i = 0; 490 char *h = &hostname[len + 1]; 491 /* pass '25' if present and is a URL encoded percent sign */ 492 if(!strncmp(h, 
"25", 2) && h[2] && (h[2] != ']')) 493 h += 2; 494 while(*h && (*h != ']') && (i < 15)) 495 zoneid[i++] = *h++; 496 if(!i || (']' != *h)) 497 return CURLUE_BAD_IPV6; 498 zoneid[i] = 0; 499 u->zoneid = strdup(zoneid); 500 if(!u->zoneid) 501 return CURLUE_OUT_OF_MEMORY; 502 hostname[len] = ']'; /* insert end bracket */ 503 hostname[len + 1] = 0; /* terminate the hostname */ 504 } 505 else 506 return CURLUE_BAD_IPV6; 507 /* hostname is fine */ 508 } 509 510 /* Normalize the IPv6 address */ 511 { 512 char dest[16]; /* fits a binary IPv6 address */ 513 hostname[hlen] = 0; /* end the address there */ 514 if(1 != curlx_inet_pton(AF_INET6, hostname, dest)) 515 return CURLUE_BAD_IPV6; 516 if(curlx_inet_ntop(AF_INET6, dest, hostname, hlen)) { 517 hlen = strlen(hostname); /* might be shorter now */ 518 hostname[hlen + 1] = 0; 519 } 520 hostname[hlen] = ']'; /* restore ending bracket */ 521 } 522 return CURLUE_OK; 523 } 524 525 static CURLUcode hostname_check(struct Curl_URL *u, char *hostname, 526 size_t hlen) /* length of hostname */ 527 { 528 size_t len; 529 DEBUGASSERT(hostname); 530 531 if(!hlen) 532 return CURLUE_NO_HOST; 533 else if(hostname[0] == '[') 534 return ipv6_parse(u, hostname, hlen); 535 else { 536 /* letters from the second string are not ok */ 537 len = strcspn(hostname, " \r\n\t/:#?!@{}[]\\$\'\"^`*<>=;,+&()%"); 538 if(hlen != len) 539 /* hostname with bad content */ 540 return CURLUE_BAD_HOSTNAME; 541 } 542 return CURLUE_OK; 543 } 544 545 /* 546 * Handle partial IPv4 numerical addresses and different bases, like 547 * '16843009', '0x7f', '0x7f.1' '0177.1.1.1' etc. 548 * 549 * If the given input string is syntactically wrong IPv4 or any part for 550 * example is too big, this function returns HOST_NAME. 551 * 552 * Output the "normalized" version of that input string in plain quad decimal 553 * integers. 554 * 555 * Returns the host type. 
/*
 * Handle partial IPv4 numerical addresses and different bases, like
 * '16843009', '0x7f', '0x7f.1' '0177.1.1.1' etc.
 *
 * If the given input string is syntactically wrong IPv4 or any part for
 * example is too big, this function returns HOST_NAME.
 *
 * Output the "normalized" version of that input string in plain quad decimal
 * integers.
 *
 * Returns the host type.
 */

#define HOST_ERROR   -1 /* out of memory */

#define HOST_NAME    1
#define HOST_IPV4    2
#define HOST_IPV6    3

/*
 * ipv4_normalize() classifies the content of the dynbuf 'host'.
 *
 * A host starting with '[' is reported as HOST_IPV6 without being looked at
 * further. One to four dot-separated numbers (decimal, "0x" hex or
 * 0-prefixed octal) that fit the a, a.b, a.b.c or a.b.c.d layouts make
 * HOST_IPV4, and the dynbuf content is then replaced with the dotted-quad
 * decimal form. Anything else is HOST_NAME and leaves the dynbuf untouched.
 * HOST_ERROR means that rewriting the dynbuf failed.
 */
static int ipv4_normalize(struct dynbuf *host)
{
  bool done = FALSE;
  int n = 0; /* index of the part being parsed */
  const char *c = curlx_dyn_ptr(host);
  unsigned int parts[4] = {0, 0, 0, 0};
  CURLcode result = CURLE_OK;

  if(*c == '[')
    return HOST_IPV6;

  while(!done) {
    int rc;
    curl_off_t l;
    /* a leading zero selects hex ("0x") or octal, otherwise decimal */
    if(*c == '0') {
      if(c[1] == 'x') {
        c += 2; /* skip the prefix */
        rc = curlx_str_hex(&c, &l, UINT_MAX);
      }
      else
        rc = curlx_str_octal(&c, &l, UINT_MAX);
    }
    else
      rc = curlx_str_number(&c, &l, UINT_MAX);

    if(rc)
      return HOST_NAME;

    parts[n] = (unsigned int)l;

    switch(*c) {
    case '.':
      /* a fifth part is one too many */
      if(n == 3)
        return HOST_NAME;
      n++;
      c++;
      break;

    case '\0':
      done = TRUE;
      break;

    default:
      return HOST_NAME;
    }
  }

  /* 'n' is now the index of the last part, so n + 1 parts were given */
  switch(n) {
  case 0: /* a -- 32 bits */
    curlx_dyn_reset(host);

    result = curlx_dyn_addf(host, "%u.%u.%u.%u",
                            (parts[0] >> 24),
                            ((parts[0] >> 16) & 0xff),
                            ((parts[0] >> 8) & 0xff),
                            (parts[0] & 0xff));
    break;
  case 1: /* a.b -- 8.24 bits */
    if((parts[0] > 0xff) || (parts[1] > 0xffffff))
      return HOST_NAME;
    curlx_dyn_reset(host);
    result = curlx_dyn_addf(host, "%u.%u.%u.%u",
                            (parts[0]),
                            ((parts[1] >> 16) & 0xff),
                            ((parts[1] >> 8) & 0xff),
                            (parts[1] & 0xff));
    break;
  case 2: /* a.b.c -- 8.8.16 bits */
    if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xffff))
      return HOST_NAME;
    curlx_dyn_reset(host);
    result = curlx_dyn_addf(host, "%u.%u.%u.%u",
                            (parts[0]),
                            (parts[1]),
                            ((parts[2] >> 8) & 0xff),
                            (parts[2] & 0xff));
    break;
  case 3: /* a.b.c.d -- 8.8.8.8 bits */
    if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xff) ||
       (parts[3] > 0xff))
      return HOST_NAME;
    curlx_dyn_reset(host);
    result = curlx_dyn_addf(host, "%u.%u.%u.%u",
                            (parts[0]),
                            (parts[1]),
                            (parts[2]),
                            (parts[3]));
    break;
  }
  if(result)
    return HOST_ERROR;
  return HOST_IPV4;
}

/*
 * if necessary, replace the host content with a URL decoded version
 *
 * A host without any '%' is left as-is. Otherwise it is decoded with
 * Curl_urldecode() using REJECT_CTRL; a decode failure is reported as
 * CURLUE_BAD_HOSTNAME and a failure to store the result is converted with
 * cc2cu().
 */
static CURLUcode urldecode_host(struct dynbuf *host)
{
  char *per = NULL;
  const char *hostname = curlx_dyn_ptr(host);
  per = strchr(hostname, '%');
  if(!per)
    /* nothing to decode */
    return CURLUE_OK;
  else {
    /* encoded */
    size_t dlen;
    char *decoded;
    CURLcode result = Curl_urldecode(hostname, 0, &decoded, &dlen,
                                     REJECT_CTRL);
    if(result)
      return CURLUE_BAD_HOSTNAME;
    curlx_dyn_reset(host);
    result = curlx_dyn_addn(host, decoded, dlen);
    free(decoded);
    if(result)
      return cc2cu(result);
  }

  return CURLUE_OK;
}

/*
 * parse_authority() parses the 'authlen' bytes long authority 'auth'
 * ([login@]host[:port]).
 *
 * Login details and port end up in 'u'. The hostname is added to the dynbuf
 * 'host', which the caller has initialized and still owns afterwards, and
 * is then normalized or verified depending on whether it is a numerical
 * IPv4 address, a bracketed IPv6 address or a name.
 *
 * 'has_scheme' is passed on to Curl_parse_port().
 *
 * Returns CURLUE_OK or the error from the step that failed. The dynbuf is
 * not freed on error.
 */
static CURLUcode parse_authority(struct Curl_URL *u,
                                 const char *auth, size_t authlen,
                                 unsigned int flags,
                                 struct dynbuf *host,
                                 bool has_scheme)
{
  size_t offset;
  CURLUcode uc;
  CURLcode result;

  /*
   * Parse the login details and strip them out of the hostname.
   */
  uc = parse_hostname_login(u, auth, authlen, flags, &offset);
  if(uc)
    goto out;

  /* what follows the login part is host and port */
  result = curlx_dyn_addn(host, auth + offset, authlen - offset);
  if(result) {
    uc = cc2cu(result);
    goto out;
  }

  uc = Curl_parse_port(u, host, has_scheme);
  if(uc)
    goto out;

  if(!curlx_dyn_len(host))
    return CURLUE_NO_HOST;

  switch(ipv4_normalize(host)) {
  case HOST_IPV4:
    break;
  case HOST_IPV6:
    uc = ipv6_parse(u, curlx_dyn_ptr(host), curlx_dyn_len(host));
    break;
  case HOST_NAME:
    uc = urldecode_host(host);
    if(!uc)
      uc = hostname_check(u, curlx_dyn_ptr(host), curlx_dyn_len(host));
    break;
  case HOST_ERROR:
    uc = CURLUE_OUT_OF_MEMORY;
    break;
  default:
    uc = CURLUE_BAD_HOSTNAME; /* Bad IPv4 address even */
    break;
  }

out:
  return uc;
}

/*
 * used for HTTP/2 server push
 *
 * Parses 'authority' with CURLU_DISALLOW_USER set. On success the previous
 * u->host is freed and replaced with the parsed hostname; the parsing may
 * also have updated the port and zone id of 'u'. On failure u->host is left
 * untouched.
 */
CURLUcode Curl_url_set_authority(CURLU *u, const char *authority)
{
  CURLUcode result;
  struct dynbuf host;

  DEBUGASSERT(authority);
  curlx_dyn_init(&host, CURL_MAX_INPUT_LENGTH);

  result = parse_authority(u, authority, strlen(authority),
                           CURLU_DISALLOW_USER, &host, !!u->scheme);
  if(result)
    curlx_dyn_free(&host);
  else {
    free(u->host);
    /* the handle takes over the buffer from the dynbuf */
    u->host = curlx_dyn_ptr(&host);
  }
  return result;
}

/*
 * "Remove Dot Segments"
 * https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
 */

/*
 * is_dot() checks if '*str' starts with a dot, given either as a plain '.'
 * or URL encoded as "%2e" / "%2E". If so, it advances '*str' past the dot,
 * reduces '*clen' by the number of bytes consumed and returns TRUE.
 * Otherwise both are left untouched and FALSE is returned.
 */
static bool is_dot(const char **str, size_t *clen)
{
  const char *p = *str;
  if(*p == '.') {
    (*str)++;
    (*clen)--;
    return TRUE;
  }
  else if((*clen >= 3) &&
          (p[0] == '%') && (p[1] == '2') && ((p[2] | 0x20) == 'e')) {
    /* '| 0x20' makes the comparison match both 'e' and 'E' */
    *str += 3;
    *clen -= 3;
    return TRUE;
  }
  return FALSE;
}

#define ISSLASH(x) ((x) == '/')
/*
 * dedotdotify()
 * @unittest: 1395
 *
 * This function gets a null-terminated path with dot and dotdot sequences
 * passed in and strips them off according to the rules in RFC 3986 section
 * 5.2.4.
 *
 * The function handles a path. It should not contain the query nor fragment.
 *
 * 'clen' is the length of 'input'. An encoded dot (%2e) is treated like a
 * plain dot.
 *
 * RETURNS
 *
 * Zero for success and '*outp' set to an allocated dedotdotified string, or
 * to NULL if the input is shorter than two bytes and thus needs no work.
 * Non-zero on failure, with '*outp' set to NULL.
 */
UNITTEST int dedotdotify(const char *input, size_t clen, char **outp);
UNITTEST int dedotdotify(const char *input, size_t clen, char **outp)
{
  struct dynbuf out;
  CURLcode result = CURLE_OK;

  *outp = NULL;
  /* the path always starts with a slash, and a slash has no dot */
  if(clen < 2)
    return 0;

  curlx_dyn_init(&out, clen + 1);

  /*  A. If the input buffer begins with a prefix of "../" or "./", then
      remove that prefix from the input buffer; otherwise, */
  if(is_dot(&input, &clen)) {
    const char *p = input;
    size_t blen = clen;

    if(!clen)
      /* . [end] */
      goto end;
    else if(ISSLASH(*p)) {
      /* one dot followed by a slash */
      input = p + 1;
      clen--;
    }

    /*  D. if the input buffer consists only of "." or "..", then remove
        that from the input buffer; otherwise, */
    else if(is_dot(&p, &blen)) {
      if(!blen)
        /* .. [end] */
        goto end;
      else if(ISSLASH(*p)) {
        /* ../ */
        input = p + 1;
        clen = blen - 1;
      }
    }
  }

  while(clen && !result) { /* until end of path content */
    if(ISSLASH(*input)) {
      /* look ahead past the slash without consuming anything yet */
      const char *p = &input[1];
      size_t blen = clen - 1;
      /*  B. if the input buffer begins with a prefix of "/./" or "/.", where
          "."  is a complete path segment, then replace that prefix with "/" in
          the input buffer; otherwise, */
      if(is_dot(&p, &blen)) {
        if(!blen) { /* /. */
          result = curlx_dyn_addn(&out, "/", 1);
          break;
        }
        else if(ISSLASH(*p)) { /* /./ */
          input = p;
          clen = blen;
          continue;
        }

        /*  C. if the input buffer begins with a prefix of "/../" or "/..",
            where ".." is a complete path segment, then replace that prefix
            with "/" in the input buffer and remove the last segment and its
            preceding "/" (if any) from the output buffer; otherwise, */
        else if(is_dot(&p, &blen) && (ISSLASH(*p) || !blen)) {
          /* remove the last segment from the output buffer */
          size_t len = curlx_dyn_len(&out);
          if(len) {
            char *ptr = curlx_dyn_ptr(&out);
            char *last = memrchr(ptr, '/', len);
            if(last)
              /* trim the output at the slash */
              curlx_dyn_setlen(&out, last - ptr);
          }

          if(blen) { /* /../ */
            input = p;
            clen = blen;
            continue;
          }
          result = curlx_dyn_addn(&out, "/", 1);
          break;
        }
      }
    }

    /*  E. move the first path segment in the input buffer to the end of
        the output buffer, including the initial "/" character (if any) and
        any subsequent characters up to, but not including, the next "/"
        character or the end of the input buffer. */

    /* done one byte per iteration */
    result = curlx_dyn_addn(&out, input, 1);
    input++;
    clen--;
  }
end:
  if(!result) {
    if(curlx_dyn_len(&out))
      /* the caller takes over the buffer from the dynbuf */
      *outp = curlx_dyn_ptr(&out);
    else {
      *outp = strdup("");
      if(!*outp)
        return 1;
    }
  }
  return result ? 1 : 0; /* success */
}

/*
 * parseurl() parses the null-terminated 'url' into the handle 'u', which is
 * expected to be all zeroes on entry.
 *
 * The string is first checked by Curl_junkscan(), then split into scheme,
 * authority, path, query and fragment. 'flags' is a bitmask of CURLU_* bits
 * that controls scheme guessing and defaults, URL encoding, acceptance of
 * unsupported schemes and of a blank authority, and whether dot segments
 * are kept in the path.
 *
 * Returns CURLUE_OK on success. On failure, everything stored in 'u' has
 * been freed (the pointers are left dangling) and the error is returned.
 */
static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
{
  const char *path;
  size_t pathlen;
  char *query = NULL;
  char *fragment = NULL;
  char schemebuf[MAX_SCHEME_LEN + 1];
  size_t schemelen = 0;
  size_t urllen;
  CURLUcode result = CURLUE_OK;
  size_t fraglen = 0;
  struct dynbuf host;

  DEBUGASSERT(url);

  curlx_dyn_init(&host, CURL_MAX_INPUT_LENGTH);

  result = Curl_junkscan(url, &urllen, !!(flags & CURLU_ALLOW_SPACE));
  if(result)
    goto fail;

  schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf),
                                   flags & (CURLU_GUESS_SCHEME|
                                            CURLU_DEFAULT_SCHEME));

  /* handle the file: scheme */
  if(schemelen && !strcmp(schemebuf, "file")) {
    bool uncpath = FALSE;
    if(urllen <= 6) {
      /* file:/ is not enough to actually be a complete file: URL */
      result = CURLUE_BAD_FILE_URL;
      goto fail;
    }

    /* the path starts right after "file:", within the caller's string */
    path = &url[5];
    pathlen = urllen - 5;

    u->scheme = strdup("file");
    if(!u->scheme) {
      result = CURLUE_OUT_OF_MEMORY;
      goto fail;
    }

    /* Extra handling URLs with an authority component (i.e. that start with
     * "file://")
     *
     * We allow omitted hostname (e.g. file:/<path>) -- valid according to
     * RFC 8089, but not the (current) WHAT-WG URL spec.
     */
    if(path[0] == '/' && path[1] == '/') {
      /* swallow the two slashes */
      const char *ptr = &path[2];

      /*
       * According to RFC 8089, a file: URL can be reliably dereferenced if:
       *
       *  o it has no/blank hostname, or
       *
       *  o the hostname matches "localhost" (case-insensitively), or
       *
       *  o the hostname is a FQDN that resolves to this machine, or
       *
       *  o it is an UNC String transformed to an URI (Windows only, RFC 8089
       *    Appendix E.3).
       *
       * For brevity, we only consider URLs with empty, "localhost", or
       * "127.0.0.1" hostnames as local, otherwise as an UNC String.
       *
       * Additionally, there is an exception for URLs with a Windows drive
       * letter in the authority (which was accidentally omitted from RFC 8089
       * Appendix E, but believe me, it was meant to be there. --MK)
       */
      if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
        /* the URL includes a hostname, it must match "localhost" or
           "127.0.0.1" to be valid */
        if(checkprefix("localhost/", ptr) ||
           checkprefix("127.0.0.1/", ptr)) {
          /* both accepted names are nine bytes long */
          ptr += 9; /* now points to the slash after the host */
        }
        else {
#ifdef _WIN32
          size_t len;

          /* the hostname, NetBIOS computer name, can not contain disallowed
             chars, and the delimiting slash character must be appended to the
             hostname */
          path = strpbrk(ptr, "/\\:*?\"<>|");
          if(!path || *path != '/') {
            result = CURLUE_BAD_FILE_URL;
            goto fail;
          }

          len = path - ptr;
          if(len) {
            CURLcode code = curlx_dyn_addn(&host, ptr, len);
            if(code) {
              result = cc2cu(code);
              goto fail;
            }
            uncpath = TRUE;
          }

          ptr -= 2; /* now points to the // before the host in UNC */
#else
          /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
             none */
          result = CURLUE_BAD_FILE_URL;
          goto fail;
#endif
        }
      }

      path = ptr;
      pathlen = urllen - (ptr - url);
    }

    if(!uncpath)
      /* no host for file: URLs by default */
      curlx_dyn_reset(&host);

#if !defined(_WIN32) && !defined(MSDOS) && !defined(__CYGWIN__)
    /* Do not allow Windows drive letters when not in Windows.
     * This catches both "file:/c:" and "file:c:" */
    if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
       STARTS_WITH_URL_DRIVE_PREFIX(path)) {
      /* File drive letters are only accepted in MS-DOS/Windows */
      result = CURLUE_BAD_FILE_URL;
      goto fail;
    }
#else
    /* If the path starts with a slash and a drive letter, ditch the slash */
    if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
      /* nothing is moved in memory, the pointer just steps past the slash */
      path++;
      pathlen--;
    }
#endif

  }
  else {
    /* every scheme but file: */
    const char *schemep = NULL;
    const char *hostp;
    size_t hostlen;

    if(schemelen) {
      int i = 0;
      /* count the slashes after "[scheme]:", but no more than four */
      const char *p = &url[schemelen + 1];
      while((*p == '/') && (i < 4)) {
        p++;
        i++;
      }

      schemep = schemebuf;
      if(!Curl_get_scheme_handler(schemep) &&
         !(flags & CURLU_NON_SUPPORT_SCHEME)) {
        result = CURLUE_UNSUPPORTED_SCHEME;
        goto fail;
      }

      if((i < 1) || (i > 3)) {
        /* less than one or more than three slashes */
        result = CURLUE_BAD_SLASHES;
        goto fail;
      }
      hostp = p; /* hostname starts here */
    }
    else {
      /* no scheme! */

      if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME))) {
        result = CURLUE_BAD_SCHEME;
        goto fail;
      }
      if(flags & CURLU_DEFAULT_SCHEME)
        schemep = DEFAULT_SCHEME;

      /*
       * The URL was badly formatted, let's try without scheme specified.
       */
      hostp = url;
    }

    if(schemep) {
      u->scheme = strdup(schemep);
      if(!u->scheme) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
    }

    /* find the end of the hostname + port number */
    hostlen = strcspn(hostp, "/?#");
    path = &hostp[hostlen];

    /* this pathlen also contains the query and the fragment */
    pathlen = urllen - (path - url);
    if(hostlen) {

      result = parse_authority(u, hostp, hostlen, flags, &host, schemelen);
      if(result)
        goto fail;

      /* 'schemep' is still NULL here only if there is neither a scheme in
         the URL nor a default one asked for */
      if((flags & CURLU_GUESS_SCHEME) && !schemep) {
        const char *hostname = curlx_dyn_ptr(&host);
        /* legacy curl-style guess based on hostname */
        if(checkprefix("ftp.", hostname))
          schemep = "ftp";
        else if(checkprefix("dict.", hostname))
          schemep = "dict";
        else if(checkprefix("ldap.", hostname))
          schemep = "ldap";
        else if(checkprefix("imap.", hostname))
          schemep = "imap";
        else if(checkprefix("smtp.", hostname))
          schemep = "smtp";
        else if(checkprefix("pop3.", hostname))
          schemep = "pop3";
        else
          schemep = "http";

        u->scheme = strdup(schemep);
        if(!u->scheme) {
          result = CURLUE_OUT_OF_MEMORY;
          goto fail;
        }
        u->guessed_scheme = TRUE;
      }
    }
    else if(flags & CURLU_NO_AUTHORITY) {
      /* allowed to be empty. */
      if(curlx_dyn_add(&host, "")) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
    }
    else {
      result = CURLUE_NO_HOST;
      goto fail;
    }
  }

  /* 'path' now points to where the path starts, and 'pathlen' covers
     everything from there to the end of the URL */
  fragment = strchr(path, '#');
  if(fragment) {
    fraglen = pathlen - (fragment - path);
    u->fragment_present = TRUE;
    if(fraglen > 1) {
      /* skip the leading '#' in the copy but include the terminating null */
      if(flags & CURLU_URLENCODE) {
        struct dynbuf enc;
        curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
        result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE);
        if(result)
          goto fail;
        u->fragment = curlx_dyn_ptr(&enc);
      }
      else {
        u->fragment = Curl_memdup0(fragment + 1, fraglen - 1);
        if(!u->fragment) {
          result = CURLUE_OUT_OF_MEMORY;
          goto fail;
        }
      }
    }
    /* after this, pathlen still contains the query */
    pathlen -= fraglen;
  }

  query = memchr(path, '?', pathlen);
  if(query) {
    /* the query length includes the leading question mark */
    size_t qlen = fragment ? (size_t)(fragment - query) :
      pathlen - (query - path);
    pathlen -= qlen;
    u->query_present = TRUE;
    if(qlen > 1) {
      if(flags & CURLU_URLENCODE) {
        struct dynbuf enc;
        curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
        /* skip the leading question mark */
        result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE);
        if(result)
          goto fail;
        u->query = curlx_dyn_ptr(&enc);
      }
      else {
        u->query = Curl_memdup0(query + 1, qlen - 1);
        if(!u->query) {
          result = CURLUE_OUT_OF_MEMORY;
          goto fail;
        }
      }
    }
    else {
      /* single byte query */
      u->query = strdup("");
      if(!u->query) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
    }
  }

  if(pathlen && (flags & CURLU_URLENCODE)) {
    struct dynbuf enc;
    curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
    result = urlencode_str(&enc, path, pathlen, TRUE, FALSE);
    if(result)
      goto fail;
    pathlen = curlx_dyn_len(&enc);
    path = u->path = curlx_dyn_ptr(&enc);
  }

  if(pathlen <= 1) {
    /* there is no path left or just the slash, unset */
    path = NULL;
  }
  else {
    if(!u->path) {
      u->path = Curl_memdup0(path, pathlen);
      if(!u->path) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
      path = u->path;
    }
    else if(flags & CURLU_URLENCODE)
      /* it might have encoded more than just the path so cut it */
      u->path[pathlen] = 0;

    if(!(flags & CURLU_PATH_AS_IS)) {
      /* remove ../ and ./ sequences according to RFC3986 */
      char *dedot;
      int err = dedotdotify(path, pathlen, &dedot);
      if(err) {
        result = CURLUE_OUT_OF_MEMORY;
        goto fail;
      }
      if(dedot) {
        free(u->path);
        u->path = dedot;
      }
    }
  }

  /* the handle takes over the buffer from the dynbuf */
  u->host = curlx_dyn_ptr(&host);

  return result;
fail:
  curlx_dyn_free(&host);
  free_urlhandle(u);
  return result;
}
and, if successful, replace everything in the Curl_URL struct. 1260 */ 1261 static CURLUcode parseurl_and_replace(const char *url, CURLU *u, 1262 unsigned int flags) 1263 { 1264 CURLUcode result; 1265 CURLU tmpurl; 1266 memset(&tmpurl, 0, sizeof(tmpurl)); 1267 result = parseurl(url, &tmpurl, flags); 1268 if(!result) { 1269 free_urlhandle(u); 1270 *u = tmpurl; 1271 } 1272 return result; 1273 } 1274 1275 /* 1276 */ 1277 CURLU *curl_url(void) 1278 { 1279 return calloc(1, sizeof(struct Curl_URL)); 1280 } 1281 1282 void curl_url_cleanup(CURLU *u) 1283 { 1284 if(u) { 1285 free_urlhandle(u); 1286 free(u); 1287 } 1288 } 1289 1290 #define DUP(dest, src, name) \ 1291 do { \ 1292 if(src->name) { \ 1293 dest->name = strdup(src->name); \ 1294 if(!dest->name) \ 1295 goto fail; \ 1296 } \ 1297 } while(0) 1298 1299 CURLU *curl_url_dup(const CURLU *in) 1300 { 1301 struct Curl_URL *u = calloc(1, sizeof(struct Curl_URL)); 1302 if(u) { 1303 DUP(u, in, scheme); 1304 DUP(u, in, user); 1305 DUP(u, in, password); 1306 DUP(u, in, options); 1307 DUP(u, in, host); 1308 DUP(u, in, port); 1309 DUP(u, in, path); 1310 DUP(u, in, query); 1311 DUP(u, in, fragment); 1312 DUP(u, in, zoneid); 1313 u->portnum = in->portnum; 1314 u->fragment_present = in->fragment_present; 1315 u->query_present = in->query_present; 1316 } 1317 return u; 1318 fail: 1319 curl_url_cleanup(u); 1320 return NULL; 1321 } 1322 1323 #ifndef USE_IDN 1324 #define host_decode(x,y) CURLUE_LACKS_IDN 1325 #define host_encode(x,y) CURLUE_LACKS_IDN 1326 #else 1327 static CURLUcode host_decode(const char *host, char **allochost) 1328 { 1329 CURLcode result = Curl_idn_decode(host, allochost); 1330 if(result) 1331 return (result == CURLE_OUT_OF_MEMORY) ? 1332 CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME; 1333 return CURLUE_OK; 1334 } 1335 1336 static CURLUcode host_encode(const char *host, char **allochost) 1337 { 1338 CURLcode result = Curl_idn_encode(host, allochost); 1339 if(result) 1340 return (result == CURLE_OUT_OF_MEMORY) ? 
1341 CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME; 1342 return CURLUE_OK; 1343 } 1344 #endif 1345 1346 static CURLUcode urlget_format(const CURLU *u, CURLUPart what, 1347 const char *ptr, char **part, 1348 bool plusdecode, unsigned int flags) 1349 { 1350 size_t partlen = strlen(ptr); 1351 bool urldecode = (flags & CURLU_URLDECODE) ? 1 : 0; 1352 bool urlencode = (flags & CURLU_URLENCODE) ? 1 : 0; 1353 bool punycode = (flags & CURLU_PUNYCODE) && (what == CURLUPART_HOST); 1354 bool depunyfy = (flags & CURLU_PUNY2IDN) && (what == CURLUPART_HOST); 1355 *part = Curl_memdup0(ptr, partlen); 1356 if(!*part) 1357 return CURLUE_OUT_OF_MEMORY; 1358 if(plusdecode) { 1359 /* convert + to space */ 1360 char *plus = *part; 1361 size_t i = 0; 1362 for(i = 0; i < partlen; ++plus, i++) { 1363 if(*plus == '+') 1364 *plus = ' '; 1365 } 1366 } 1367 if(urldecode) { 1368 char *decoded; 1369 size_t dlen; 1370 /* this unconditional rejection of control bytes is documented 1371 API behavior */ 1372 CURLcode res = Curl_urldecode(*part, 0, &decoded, &dlen, REJECT_CTRL); 1373 free(*part); 1374 if(res) { 1375 *part = NULL; 1376 return CURLUE_URLDECODE; 1377 } 1378 *part = decoded; 1379 partlen = dlen; 1380 } 1381 if(urlencode) { 1382 struct dynbuf enc; 1383 CURLUcode uc; 1384 curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH); 1385 uc = urlencode_str(&enc, *part, partlen, TRUE, what == CURLUPART_QUERY); 1386 if(uc) 1387 return uc; 1388 free(*part); 1389 *part = curlx_dyn_ptr(&enc); 1390 } 1391 else if(punycode) { 1392 if(!Curl_is_ASCII_name(u->host)) { 1393 char *allochost = NULL; 1394 CURLUcode ret = host_decode(*part, &allochost); 1395 if(ret) 1396 return ret; 1397 free(*part); 1398 *part = allochost; 1399 } 1400 } 1401 else if(depunyfy) { 1402 if(Curl_is_ASCII_name(u->host)) { 1403 char *allochost = NULL; 1404 CURLUcode ret = host_encode(*part, &allochost); 1405 if(ret) 1406 return ret; 1407 free(*part); 1408 *part = allochost; 1409 } 1410 } 1411 1412 return CURLUE_OK; 1413 } 1414 1415 static 
CURLUcode urlget_url(const CURLU *u, char **part, unsigned int flags) 1416 { 1417 char *url; 1418 const char *scheme; 1419 char *options = u->options; 1420 char *port = u->port; 1421 char *allochost = NULL; 1422 bool show_fragment = 1423 u->fragment || (u->fragment_present && flags & CURLU_GET_EMPTY); 1424 bool show_query = (u->query && u->query[0]) || 1425 (u->query_present && flags & CURLU_GET_EMPTY); 1426 bool punycode = (flags & CURLU_PUNYCODE) ? 1 : 0; 1427 bool depunyfy = (flags & CURLU_PUNY2IDN) ? 1 : 0; 1428 bool urlencode = (flags & CURLU_URLENCODE) ? 1 : 0; 1429 char portbuf[7]; 1430 if(u->scheme && curl_strequal("file", u->scheme)) { 1431 url = aprintf("file://%s%s%s%s%s", 1432 u->path, 1433 show_query ? "?": "", 1434 u->query ? u->query : "", 1435 show_fragment ? "#": "", 1436 u->fragment ? u->fragment : ""); 1437 } 1438 else if(!u->host) 1439 return CURLUE_NO_HOST; 1440 else { 1441 const struct Curl_handler *h = NULL; 1442 char schemebuf[MAX_SCHEME_LEN + 5]; 1443 if(u->scheme) 1444 scheme = u->scheme; 1445 else if(flags & CURLU_DEFAULT_SCHEME) 1446 scheme = DEFAULT_SCHEME; 1447 else 1448 return CURLUE_NO_SCHEME; 1449 1450 h = Curl_get_scheme_handler(scheme); 1451 if(!port && (flags & CURLU_DEFAULT_PORT)) { 1452 /* there is no stored port number, but asked to deliver 1453 a default one for the scheme */ 1454 if(h) { 1455 msnprintf(portbuf, sizeof(portbuf), "%u", h->defport); 1456 port = portbuf; 1457 } 1458 } 1459 else if(port) { 1460 /* there is a stored port number, but asked to inhibit if it matches 1461 the default one for the scheme */ 1462 if(h && (h->defport == u->portnum) && 1463 (flags & CURLU_NO_DEFAULT_PORT)) 1464 port = NULL; 1465 } 1466 1467 if(h && !(h->flags & PROTOPT_URLOPTIONS)) 1468 options = NULL; 1469 1470 if(u->host[0] == '[') { 1471 if(u->zoneid) { 1472 /* make it '[ host %25 zoneid ]' */ 1473 struct dynbuf enc; 1474 size_t hostlen = strlen(u->host); 1475 curlx_dyn_init(&enc, CURL_MAX_INPUT_LENGTH); 1476 if(curlx_dyn_addf(&enc, 
"%.*s%%25%s]", (int)hostlen - 1, u->host, 1477 u->zoneid)) 1478 return CURLUE_OUT_OF_MEMORY; 1479 allochost = curlx_dyn_ptr(&enc); 1480 } 1481 } 1482 else if(urlencode) { 1483 allochost = curl_easy_escape(NULL, u->host, 0); 1484 if(!allochost) 1485 return CURLUE_OUT_OF_MEMORY; 1486 } 1487 else if(punycode) { 1488 if(!Curl_is_ASCII_name(u->host)) { 1489 CURLUcode ret = host_decode(u->host, &allochost); 1490 if(ret) 1491 return ret; 1492 } 1493 } 1494 else if(depunyfy) { 1495 if(Curl_is_ASCII_name(u->host)) { 1496 CURLUcode ret = host_encode(u->host, &allochost); 1497 if(ret) 1498 return ret; 1499 } 1500 } 1501 1502 if(!(flags & CURLU_NO_GUESS_SCHEME) || !u->guessed_scheme) 1503 msnprintf(schemebuf, sizeof(schemebuf), "%s://", scheme); 1504 else 1505 schemebuf[0] = 0; 1506 1507 url = aprintf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", 1508 schemebuf, 1509 u->user ? u->user : "", 1510 u->password ? ":": "", 1511 u->password ? u->password : "", 1512 options ? ";" : "", 1513 options ? options : "", 1514 (u->user || u->password || options) ? "@": "", 1515 allochost ? allochost : u->host, 1516 port ? ":": "", 1517 port ? port : "", 1518 u->path ? u->path : "/", 1519 show_query ? "?": "", 1520 u->query ? u->query : "", 1521 show_fragment ? "#": "", 1522 u->fragment ? 
u->fragment : ""); 1523 free(allochost); 1524 } 1525 if(!url) 1526 return CURLUE_OUT_OF_MEMORY; 1527 *part = url; 1528 return CURLUE_OK; 1529 } 1530 1531 CURLUcode curl_url_get(const CURLU *u, CURLUPart what, 1532 char **part, unsigned int flags) 1533 { 1534 const char *ptr; 1535 CURLUcode ifmissing = CURLUE_UNKNOWN_PART; 1536 char portbuf[7]; 1537 bool plusdecode = FALSE; 1538 if(!u) 1539 return CURLUE_BAD_HANDLE; 1540 if(!part) 1541 return CURLUE_BAD_PARTPOINTER; 1542 *part = NULL; 1543 1544 switch(what) { 1545 case CURLUPART_SCHEME: 1546 ptr = u->scheme; 1547 ifmissing = CURLUE_NO_SCHEME; 1548 flags &= ~CURLU_URLDECODE; /* never for schemes */ 1549 if((flags & CURLU_NO_GUESS_SCHEME) && u->guessed_scheme) 1550 return CURLUE_NO_SCHEME; 1551 break; 1552 case CURLUPART_USER: 1553 ptr = u->user; 1554 ifmissing = CURLUE_NO_USER; 1555 break; 1556 case CURLUPART_PASSWORD: 1557 ptr = u->password; 1558 ifmissing = CURLUE_NO_PASSWORD; 1559 break; 1560 case CURLUPART_OPTIONS: 1561 ptr = u->options; 1562 ifmissing = CURLUE_NO_OPTIONS; 1563 break; 1564 case CURLUPART_HOST: 1565 ptr = u->host; 1566 ifmissing = CURLUE_NO_HOST; 1567 break; 1568 case CURLUPART_ZONEID: 1569 ptr = u->zoneid; 1570 ifmissing = CURLUE_NO_ZONEID; 1571 break; 1572 case CURLUPART_PORT: 1573 ptr = u->port; 1574 ifmissing = CURLUE_NO_PORT; 1575 flags &= ~CURLU_URLDECODE; /* never for port */ 1576 if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) { 1577 /* there is no stored port number, but asked to deliver 1578 a default one for the scheme */ 1579 const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme); 1580 if(h) { 1581 msnprintf(portbuf, sizeof(portbuf), "%u", h->defport); 1582 ptr = portbuf; 1583 } 1584 } 1585 else if(ptr && u->scheme) { 1586 /* there is a stored port number, but ask to inhibit if 1587 it matches the default one for the scheme */ 1588 const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme); 1589 if(h && (h->defport == u->portnum) && 1590 (flags & 
CURLU_NO_DEFAULT_PORT)) 1591 ptr = NULL; 1592 } 1593 break; 1594 case CURLUPART_PATH: 1595 ptr = u->path; 1596 if(!ptr) 1597 ptr = "/"; 1598 break; 1599 case CURLUPART_QUERY: 1600 ptr = u->query; 1601 ifmissing = CURLUE_NO_QUERY; 1602 plusdecode = flags & CURLU_URLDECODE; 1603 if(ptr && !ptr[0] && !(flags & CURLU_GET_EMPTY)) 1604 /* there was a blank query and the user do not ask for it */ 1605 ptr = NULL; 1606 break; 1607 case CURLUPART_FRAGMENT: 1608 ptr = u->fragment; 1609 ifmissing = CURLUE_NO_FRAGMENT; 1610 if(!ptr && u->fragment_present && flags & CURLU_GET_EMPTY) 1611 /* there was a blank fragment and the user asks for it */ 1612 ptr = ""; 1613 break; 1614 case CURLUPART_URL: 1615 return urlget_url(u, part, flags); 1616 default: 1617 ptr = NULL; 1618 break; 1619 } 1620 if(ptr) 1621 return urlget_format(u, what, ptr, part, plusdecode, flags); 1622 1623 return ifmissing; 1624 } 1625 1626 static CURLUcode set_url_scheme(CURLU *u, const char *scheme, 1627 unsigned int flags) 1628 { 1629 size_t plen = strlen(scheme); 1630 const struct Curl_handler *h = NULL; 1631 if((plen > MAX_SCHEME_LEN) || (plen < 1)) 1632 /* too long or too short */ 1633 return CURLUE_BAD_SCHEME; 1634 /* verify that it is a fine scheme */ 1635 h = Curl_get_scheme_handler(scheme); 1636 if(!h) { 1637 const char *s = scheme; 1638 if(!(flags & CURLU_NON_SUPPORT_SCHEME)) 1639 return CURLUE_UNSUPPORTED_SCHEME; 1640 if(ISALPHA(*s)) { 1641 /* ALPHA *( ALPHA / DIGIT / "+" / "-" / "." 
) */ 1642 while(--plen) { 1643 if(ISALNUM(*s) || (*s == '+') || (*s == '-') || (*s == '.')) 1644 s++; /* fine */ 1645 else 1646 return CURLUE_BAD_SCHEME; 1647 } 1648 } 1649 else 1650 return CURLUE_BAD_SCHEME; 1651 } 1652 u->guessed_scheme = FALSE; 1653 return CURLUE_OK; 1654 } 1655 1656 static CURLUcode set_url_port(CURLU *u, const char *provided_port) 1657 { 1658 char *tmp; 1659 curl_off_t port; 1660 if(!ISDIGIT(provided_port[0])) 1661 /* not a number */ 1662 return CURLUE_BAD_PORT_NUMBER; 1663 if(curlx_str_number(&provided_port, &port, 0xffff) || *provided_port) 1664 /* weirdly provided number, not good! */ 1665 return CURLUE_BAD_PORT_NUMBER; 1666 tmp = aprintf("%" CURL_FORMAT_CURL_OFF_T, port); 1667 if(!tmp) 1668 return CURLUE_OUT_OF_MEMORY; 1669 free(u->port); 1670 u->port = tmp; 1671 u->portnum = (unsigned short)port; 1672 return CURLUE_OK; 1673 } 1674 1675 static CURLUcode set_url(CURLU *u, const char *url, size_t part_size, 1676 unsigned int flags) 1677 { 1678 /* 1679 * Allow a new URL to replace the existing (if any) contents. 1680 * 1681 * If the existing contents is enough for a URL, allow a relative URL to 1682 * replace it. 1683 */ 1684 CURLUcode uc; 1685 char *oldurl = NULL; 1686 1687 if(!part_size) { 1688 /* a blank URL is not a valid URL unless we already have a complete one 1689 and this is a redirect */ 1690 if(!curl_url_get(u, CURLUPART_URL, &oldurl, flags)) { 1691 /* success, meaning the "" is a fine relative URL, but nothing 1692 changes */ 1693 free(oldurl); 1694 return CURLUE_OK; 1695 } 1696 return CURLUE_MALFORMED_INPUT; 1697 } 1698 1699 /* if the new thing is absolute or the old one is not (we could not get an 1700 * absolute URL in 'oldurl'), then replace the existing with the new. 
*/ 1701 if(Curl_is_absolute_url(url, NULL, 0, 1702 flags & (CURLU_GUESS_SCHEME|CURLU_DEFAULT_SCHEME)) 1703 || curl_url_get(u, CURLUPART_URL, &oldurl, flags)) { 1704 return parseurl_and_replace(url, u, flags); 1705 } 1706 DEBUGASSERT(oldurl); /* it is set here */ 1707 /* apply the relative part to create a new URL */ 1708 uc = redirect_url(oldurl, url, u, flags); 1709 free(oldurl); 1710 return uc; 1711 } 1712 1713 static CURLUcode urlset_clear(CURLU *u, CURLUPart what) 1714 { 1715 switch(what) { 1716 case CURLUPART_URL: 1717 free_urlhandle(u); 1718 memset(u, 0, sizeof(struct Curl_URL)); 1719 break; 1720 case CURLUPART_SCHEME: 1721 Curl_safefree(u->scheme); 1722 u->guessed_scheme = FALSE; 1723 break; 1724 case CURLUPART_USER: 1725 Curl_safefree(u->user); 1726 break; 1727 case CURLUPART_PASSWORD: 1728 Curl_safefree(u->password); 1729 break; 1730 case CURLUPART_OPTIONS: 1731 Curl_safefree(u->options); 1732 break; 1733 case CURLUPART_HOST: 1734 Curl_safefree(u->host); 1735 break; 1736 case CURLUPART_ZONEID: 1737 Curl_safefree(u->zoneid); 1738 break; 1739 case CURLUPART_PORT: 1740 u->portnum = 0; 1741 Curl_safefree(u->port); 1742 break; 1743 case CURLUPART_PATH: 1744 Curl_safefree(u->path); 1745 break; 1746 case CURLUPART_QUERY: 1747 Curl_safefree(u->query); 1748 u->query_present = FALSE; 1749 break; 1750 case CURLUPART_FRAGMENT: 1751 Curl_safefree(u->fragment); 1752 u->fragment_present = FALSE; 1753 break; 1754 default: 1755 return CURLUE_UNKNOWN_PART; 1756 } 1757 return CURLUE_OK; 1758 } 1759 1760 CURLUcode curl_url_set(CURLU *u, CURLUPart what, 1761 const char *part, unsigned int flags) 1762 { 1763 char **storep = NULL; 1764 bool urlencode = (flags & CURLU_URLENCODE) ? 
1 : 0; 1765 bool plusencode = FALSE; 1766 bool urlskipslash = FALSE; 1767 bool leadingslash = FALSE; 1768 bool appendquery = FALSE; 1769 bool equalsencode = FALSE; 1770 size_t nalloc; 1771 1772 if(!u) 1773 return CURLUE_BAD_HANDLE; 1774 if(!part) 1775 /* setting a part to NULL clears it */ 1776 return urlset_clear(u, what); 1777 1778 nalloc = strlen(part); 1779 if(nalloc > CURL_MAX_INPUT_LENGTH) 1780 /* excessive input length */ 1781 return CURLUE_MALFORMED_INPUT; 1782 1783 switch(what) { 1784 case CURLUPART_SCHEME: { 1785 CURLUcode status = set_url_scheme(u, part, flags); 1786 if(status) 1787 return status; 1788 storep = &u->scheme; 1789 urlencode = FALSE; /* never */ 1790 break; 1791 } 1792 case CURLUPART_USER: 1793 storep = &u->user; 1794 break; 1795 case CURLUPART_PASSWORD: 1796 storep = &u->password; 1797 break; 1798 case CURLUPART_OPTIONS: 1799 storep = &u->options; 1800 break; 1801 case CURLUPART_HOST: 1802 storep = &u->host; 1803 Curl_safefree(u->zoneid); 1804 break; 1805 case CURLUPART_ZONEID: 1806 storep = &u->zoneid; 1807 break; 1808 case CURLUPART_PORT: 1809 return set_url_port(u, part); 1810 case CURLUPART_PATH: 1811 urlskipslash = TRUE; 1812 leadingslash = TRUE; /* enforce */ 1813 storep = &u->path; 1814 break; 1815 case CURLUPART_QUERY: 1816 plusencode = urlencode; 1817 appendquery = (flags & CURLU_APPENDQUERY) ? 
1 : 0; 1818 equalsencode = appendquery; 1819 storep = &u->query; 1820 u->query_present = TRUE; 1821 break; 1822 case CURLUPART_FRAGMENT: 1823 storep = &u->fragment; 1824 u->fragment_present = TRUE; 1825 break; 1826 case CURLUPART_URL: 1827 return set_url(u, part, nalloc, flags); 1828 default: 1829 return CURLUE_UNKNOWN_PART; 1830 } 1831 DEBUGASSERT(storep); 1832 { 1833 const char *newp; 1834 struct dynbuf enc; 1835 curlx_dyn_init(&enc, nalloc * 3 + 1 + leadingslash); 1836 1837 if(leadingslash && (part[0] != '/')) { 1838 CURLcode result = curlx_dyn_addn(&enc, "/", 1); 1839 if(result) 1840 return cc2cu(result); 1841 } 1842 if(urlencode) { 1843 const unsigned char *i; 1844 1845 for(i = (const unsigned char *)part; *i; i++) { 1846 CURLcode result; 1847 if((*i == ' ') && plusencode) { 1848 result = curlx_dyn_addn(&enc, "+", 1); 1849 if(result) 1850 return CURLUE_OUT_OF_MEMORY; 1851 } 1852 else if(ISUNRESERVED(*i) || 1853 ((*i == '/') && urlskipslash) || 1854 ((*i == '=') && equalsencode)) { 1855 if((*i == '=') && equalsencode) 1856 /* only skip the first equals sign */ 1857 equalsencode = FALSE; 1858 result = curlx_dyn_addn(&enc, i, 1); 1859 if(result) 1860 return cc2cu(result); 1861 } 1862 else { 1863 unsigned char out[3]={'%'}; 1864 Curl_hexbyte(&out[1], *i); 1865 result = curlx_dyn_addn(&enc, out, 3); 1866 if(result) 1867 return cc2cu(result); 1868 } 1869 } 1870 } 1871 else { 1872 char *p; 1873 CURLcode result = curlx_dyn_add(&enc, part); 1874 if(result) 1875 return cc2cu(result); 1876 p = curlx_dyn_ptr(&enc); 1877 while(*p) { 1878 /* make sure percent encoded are lower case */ 1879 if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) && 1880 (ISUPPER(p[1]) || ISUPPER(p[2]))) { 1881 p[1] = Curl_raw_tolower(p[1]); 1882 p[2] = Curl_raw_tolower(p[2]); 1883 p += 3; 1884 } 1885 else 1886 p++; 1887 } 1888 } 1889 newp = curlx_dyn_ptr(&enc); 1890 1891 if(appendquery && newp) { 1892 /* Append the 'newp' string onto the old query. 
Add a '&' separator if 1893 none is present at the end of the existing query already */ 1894 1895 size_t querylen = u->query ? strlen(u->query) : 0; 1896 bool addamperand = querylen && (u->query[querylen -1] != '&'); 1897 if(querylen) { 1898 struct dynbuf qbuf; 1899 curlx_dyn_init(&qbuf, CURL_MAX_INPUT_LENGTH); 1900 1901 if(curlx_dyn_addn(&qbuf, u->query, querylen)) /* add original query */ 1902 goto nomem; 1903 1904 if(addamperand) { 1905 if(curlx_dyn_addn(&qbuf, "&", 1)) 1906 goto nomem; 1907 } 1908 if(curlx_dyn_add(&qbuf, newp)) 1909 goto nomem; 1910 curlx_dyn_free(&enc); 1911 free(*storep); 1912 *storep = curlx_dyn_ptr(&qbuf); 1913 return CURLUE_OK; 1914 nomem: 1915 curlx_dyn_free(&enc); 1916 return CURLUE_OUT_OF_MEMORY; 1917 } 1918 } 1919 1920 else if(what == CURLUPART_HOST) { 1921 size_t n = curlx_dyn_len(&enc); 1922 if(!n && (flags & CURLU_NO_AUTHORITY)) { 1923 /* Skip hostname check, it is allowed to be empty. */ 1924 } 1925 else { 1926 bool bad = FALSE; 1927 if(!n) 1928 bad = TRUE; /* empty hostname is not okay */ 1929 else if(!urlencode) { 1930 /* if the host name part was not URL encoded here, it was set ready 1931 URL encoded so we need to decode it to check */ 1932 size_t dlen; 1933 char *decoded = NULL; 1934 CURLcode result = 1935 Curl_urldecode(newp, n, &decoded, &dlen, REJECT_CTRL); 1936 if(result || hostname_check(u, decoded, dlen)) 1937 bad = TRUE; 1938 free(decoded); 1939 } 1940 else if(hostname_check(u, (char *)CURL_UNCONST(newp), n)) 1941 bad = TRUE; 1942 if(bad) { 1943 curlx_dyn_free(&enc); 1944 return CURLUE_BAD_HOSTNAME; 1945 } 1946 } 1947 } 1948 1949 free(*storep); 1950 *storep = (char *)CURL_UNCONST(newp); 1951 } 1952 return CURLUE_OK; 1953 }