tool_urlglob.c (20216B)
1 /*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al. 9 * 10 * This software is licensed as described in the file COPYING, which 11 * you should have received as part of this distribution. The terms 12 * are also available at https://curl.se/docs/copyright.html. 13 * 14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell 15 * copies of the Software, and permit persons to whom the Software is 16 * furnished to do so, under the terms of the COPYING file. 17 * 18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 19 * KIND, either express or implied. 20 * 21 * SPDX-License-Identifier: curl 22 * 23 ***************************************************************************/ 24 #include "tool_setup.h" 25 26 #include "tool_cfgable.h" 27 #include "tool_doswin.h" 28 #include "tool_urlglob.h" 29 #include "tool_vms.h" 30 #include "memdebug.h" /* keep this as LAST include */ 31 32 #define GLOBERROR(string, column, code) \ 33 glob->error = string, glob->pos = column, code 34 35 static CURLcode glob_fixed(struct URLGlob *glob, char *fixed, size_t len) 36 { 37 struct URLPattern *pat = &glob->pattern[glob->size]; 38 pat->type = UPTSet; 39 pat->content.Set.size = 1; 40 pat->content.Set.ptr_s = 0; 41 pat->globindex = -1; 42 43 pat->content.Set.elements = malloc(sizeof(char *)); 44 45 if(!pat->content.Set.elements) 46 return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY); 47 48 pat->content.Set.elements[0] = malloc(len + 1); 49 if(!pat->content.Set.elements[0]) 50 return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY); 51 52 memcpy(pat->content.Set.elements[0], fixed, len); 53 pat->content.Set.elements[0][len] = 0; 54 55 return CURLE_OK; 56 } 57 58 /* multiply 59 * 60 * Multiplies and checks for overflow. 61 */ 62 static int multiply(curl_off_t *amount, curl_off_t with) 63 { 64 curl_off_t sum; 65 DEBUGASSERT(*amount >= 0); 66 DEBUGASSERT(with >= 0); 67 if((with <= 0) || (*amount <= 0)) { 68 sum = 0; 69 } 70 else { 71 #if defined(__GNUC__) && \ 72 ((__GNUC__ > 5) || ((__GNUC__ == 5) && (__GNUC_MINOR__ >= 1))) 73 if(__builtin_mul_overflow(*amount, with, &sum)) 74 return 1; 75 #else 76 sum = *amount * with; 77 if(sum/with != *amount) 78 return 1; /* did not fit, bail out */ 79 #endif 80 } 81 *amount = sum; 82 return 0; 83 } 84 85 static CURLcode glob_set(struct URLGlob *glob, const char **patternp, 86 size_t *posp, curl_off_t *amount, 87 int globindex) 88 { 89 /* processes a set expression with the point behind the opening '{' 90 ','-separated elements are collected until the next closing '}' 91 */ 92 struct URLPattern *pat; 93 bool done = FALSE; 94 char *buf = glob->glob_buffer; 95 const char *pattern = *patternp; 96 const char *opattern = pattern; 97 size_t opos = *posp-1; 98 99 pat = &glob->pattern[glob->size]; 100 /* patterns 0,1,2,... correspond to size=1,3,5,... */ 101 pat->type = UPTSet; 102 pat->content.Set.size = 0; 103 pat->content.Set.ptr_s = 0; 104 pat->content.Set.elements = NULL; 105 pat->globindex = globindex; 106 107 while(!done) { 108 switch(*pattern) { 109 case '\0': /* URL ended while set was still open */ 110 return GLOBERROR("unmatched brace", opos, CURLE_URL_MALFORMAT); 111 112 case '{': 113 case '[': /* no nested expressions at this time */ 114 return GLOBERROR("nested brace", *posp, CURLE_URL_MALFORMAT); 115 116 case '}': /* set element completed */ 117 if(opattern == pattern) 118 return GLOBERROR("empty string within braces", *posp, 119 CURLE_URL_MALFORMAT); 120 121 /* add 1 to size since it will be incremented below */ 122 if(multiply(amount, pat->content.Set.size + 1)) 123 return GLOBERROR("range overflow", 0, CURLE_URL_MALFORMAT); 124 125 FALLTHROUGH(); 126 case ',': 127 128 *buf = '\0'; 129 if(pat->content.Set.elements) { 130 char **new_arr = realloc(pat->content.Set.elements, 131 (size_t)(pat->content.Set.size + 1) * 132 sizeof(char *)); 133 if(!new_arr) 134 return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY); 135 136 pat->content.Set.elements = new_arr; 137 } 138 else 139 pat->content.Set.elements = malloc(sizeof(char *)); 140 141 if(!pat->content.Set.elements) 142 return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY); 143 144 pat->content.Set.elements[pat->content.Set.size] = 145 strdup(glob->glob_buffer); 146 if(!pat->content.Set.elements[pat->content.Set.size]) 147 return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY); 148 ++pat->content.Set.size; 149 150 if(*pattern == '}') { 151 pattern++; /* pass the closing brace */ 152 done = TRUE; 153 continue; 154 } 155 156 buf = glob->glob_buffer; 157 ++pattern; 158 ++(*posp); 159 break; 160 161 case ']': /* illegal closing bracket */ 162 return GLOBERROR("unexpected close bracket", *posp, CURLE_URL_MALFORMAT); 163 164 case '\\': /* escaped character, skip '\' */ 165 if(pattern[1]) { 166 ++pattern; 167 ++(*posp); 168 } 169 FALLTHROUGH(); 170 default: 171 *buf++ = *pattern++; /* copy character to set element */ 172 ++(*posp); 173 } 174 } 175 176 *patternp = pattern; /* return with the new position */ 177 return CURLE_OK; 178 } 179 180 static CURLcode glob_range(struct URLGlob *glob, const char **patternp, 181 size_t *posp, curl_off_t *amount, 182 int globindex) 183 { 184 /* processes a range expression with the point behind the opening '[' 185 - char range: e.g. "a-z]", "B-Q]" 186 - num range: e.g. "0-9]", "17-2000]" 187 - num range with leading zeros: e.g. "001-999]" 188 expression is checked for well-formedness and collected until the next ']' 189 */ 190 struct URLPattern *pat; 191 const char *pattern = *patternp; 192 const char *c; 193 194 pat = &glob->pattern[glob->size]; 195 pat->globindex = globindex; 196 197 if(ISALPHA(*pattern)) { 198 /* character range detected */ 199 bool pmatch = FALSE; 200 char min_c = 0; 201 char max_c = 0; 202 char end_c = 0; 203 unsigned long step = 1; 204 205 pat->type = UPTCharRange; 206 207 if((pattern[1] == '-') && pattern[2] && pattern[3]) { 208 min_c = pattern[0]; 209 max_c = pattern[2]; 210 end_c = pattern[3]; 211 pmatch = TRUE; 212 213 if(end_c == ':') { 214 curl_off_t num; 215 const char *p = &pattern[4]; 216 if(curlx_str_number(&p, &num, 256) || curlx_str_single(&p, ']')) 217 step = 0; 218 else 219 step = (unsigned long)num; 220 pattern = p; 221 } 222 else if(end_c != ']') 223 /* then this is wrong */ 224 pmatch = FALSE; 225 else 226 /* end_c == ']' */ 227 pattern += 4; 228 } 229 230 *posp += (pattern - *patternp); 231 232 if(!pmatch || !step || 233 (min_c == max_c && step != 1) || 234 (min_c != max_c && (min_c > max_c || step > (unsigned)(max_c - min_c) || 235 (max_c - min_c) > ('z' - 'a')))) 236 /* the pattern is not well-formed */ 237 return GLOBERROR("bad range", *posp, CURLE_URL_MALFORMAT); 238 239 /* if there was a ":[num]" thing, use that as step or else use 1 */ 240 pat->content.CharRange.step = (int)step; 241 pat->content.CharRange.ptr_c = pat->content.CharRange.min_c = min_c; 242 pat->content.CharRange.max_c = max_c; 243 244 if(multiply(amount, ((pat->content.CharRange.max_c - 245 pat->content.CharRange.min_c) / 246 pat->content.CharRange.step + 1))) 247 return GLOBERROR("range overflow", *posp, CURLE_URL_MALFORMAT); 248 } 249 else if(ISDIGIT(*pattern)) { 250 /* numeric range detected */ 251 unsigned long min_n = 0; 252 unsigned long max_n = 0; 253 unsigned long step_n = 0; 254 curl_off_t num; 255 256 pat->type = UPTNumRange; 257 pat->content.NumRange.padlength = 0; 258 259 if(*pattern == '0') { 260 /* leading zero specified, count them! */ 261 c = pattern; 262 while(ISDIGIT(*c)) { 263 c++; 264 ++pat->content.NumRange.padlength; /* padding length is set for all 265 instances of this pattern */ 266 } 267 } 268 269 if(!curlx_str_number(&pattern, &num, CURL_OFF_T_MAX)) { 270 min_n = (unsigned long)num; 271 if(!curlx_str_single(&pattern, '-')) { 272 curlx_str_passblanks(&pattern); 273 if(!curlx_str_number(&pattern, &num, CURL_OFF_T_MAX)) { 274 max_n = (unsigned long)num; 275 if(!curlx_str_single(&pattern, ']')) 276 step_n = 1; 277 else if(!curlx_str_single(&pattern, ':') && 278 !curlx_str_number(&pattern, &num, CURL_OFF_T_MAX) && 279 !curlx_str_single(&pattern, ']')) { 280 step_n = (unsigned long)num; 281 } 282 /* else bad syntax */ 283 } 284 } 285 } 286 287 *posp += (pattern - *patternp); 288 289 if(!step_n || 290 (min_n == max_n && step_n != 1) || 291 (min_n != max_n && (min_n > max_n || step_n > (max_n - min_n)))) 292 /* the pattern is not well-formed */ 293 return GLOBERROR("bad range", *posp, CURLE_URL_MALFORMAT); 294 295 /* typecasting to ints are fine here since we make sure above that we 296 are within 31 bits */ 297 pat->content.NumRange.ptr_n = pat->content.NumRange.min_n = min_n; 298 pat->content.NumRange.max_n = max_n; 299 pat->content.NumRange.step = step_n; 300 301 if(multiply(amount, ((pat->content.NumRange.max_n - 302 pat->content.NumRange.min_n) / 303 pat->content.NumRange.step + 1))) 304 return GLOBERROR("range overflow", *posp, CURLE_URL_MALFORMAT); 305 } 306 else 307 return GLOBERROR("bad range specification", *posp, CURLE_URL_MALFORMAT); 308 309 *patternp = pattern; 310 return CURLE_OK; 311 } 312 313 #define MAX_IP6LEN 128 314 315 static bool peek_ipv6(const char *str, size_t *skip) 316 { 317 /* 318 * Scan for a potential IPv6 literal. 319 * - Valid globs contain a hyphen and <= 1 colon. 320 * - IPv6 literals contain no hyphens and >= 2 colons. 321 */ 322 char hostname[MAX_IP6LEN]; 323 CURLU *u; 324 char *endbr = strchr(str, ']'); 325 size_t hlen; 326 CURLUcode rc; 327 if(!endbr) 328 return FALSE; 329 330 hlen = endbr - str + 1; 331 if(hlen >= MAX_IP6LEN) 332 return FALSE; 333 334 u = curl_url(); 335 if(!u) 336 return FALSE; 337 338 memcpy(hostname, str, hlen); 339 hostname[hlen] = 0; 340 341 /* ask to "guess scheme" as then it works without an https:// prefix */ 342 rc = curl_url_set(u, CURLUPART_URL, hostname, CURLU_GUESS_SCHEME); 343 344 curl_url_cleanup(u); 345 if(!rc) 346 *skip = hlen; 347 return rc ? FALSE : TRUE; 348 } 349 350 static CURLcode glob_parse(struct URLGlob *glob, const char *pattern, 351 size_t pos, curl_off_t *amount) 352 { 353 /* processes a literal string component of a URL 354 special characters '{' and '[' branch to set/range processing functions 355 */ 356 CURLcode res = CURLE_OK; 357 int globindex = 0; /* count "actual" globs */ 358 359 *amount = 1; 360 361 while(*pattern && !res) { 362 char *buf = glob->glob_buffer; 363 size_t sublen = 0; 364 while(*pattern && *pattern != '{') { 365 if(*pattern == '[') { 366 /* skip over IPv6 literals and [] */ 367 size_t skip = 0; 368 if(!peek_ipv6(pattern, &skip) && (pattern[1] == ']')) 369 skip = 2; 370 if(skip) { 371 memcpy(buf, pattern, skip); 372 buf += skip; 373 pattern += skip; 374 sublen += skip; 375 continue; 376 } 377 break; 378 } 379 if(*pattern == '}' || *pattern == ']') 380 return GLOBERROR("unmatched close brace/bracket", pos, 381 CURLE_URL_MALFORMAT); 382 383 /* only allow \ to escape known "special letters" */ 384 if(*pattern == '\\' && 385 (*(pattern + 1) == '{' || *(pattern + 1) == '[' || 386 *(pattern + 1) == '}' || *(pattern + 1) == ']') ) { 387 388 /* escape character, skip '\' */ 389 ++pattern; 390 ++pos; 391 } 392 *buf++ = *pattern++; /* copy character to literal */ 393 ++pos; 394 sublen++; 395 } 396 if(sublen) { 397 /* we got a literal string, add it as a single-item list */ 398 *buf = '\0'; 399 res = glob_fixed(glob, glob->glob_buffer, sublen); 400 } 401 else { 402 switch(*pattern) { 403 case '\0': /* done */ 404 break; 405 406 case '{': 407 /* process set pattern */ 408 pattern++; 409 pos++; 410 res = glob_set(glob, &pattern, &pos, amount, globindex++); 411 break; 412 413 case '[': 414 /* process range pattern */ 415 pattern++; 416 pos++; 417 res = glob_range(glob, &pattern, &pos, amount, globindex++); 418 break; 419 } 420 } 421 422 if(++glob->size >= GLOB_PATTERN_NUM) 423 return GLOBERROR("too many globs", pos, CURLE_URL_MALFORMAT); 424 } 425 return res; 426 } 427 428 CURLcode glob_url(struct URLGlob **glob, char *url, curl_off_t *urlnum, 429 FILE *error) 430 { 431 /* 432 * We can deal with any-size, just make a buffer with the same length 433 * as the specified URL! 434 */ 435 struct URLGlob *glob_expand; 436 curl_off_t amount = 0; 437 char *glob_buffer; 438 CURLcode res; 439 440 *glob = NULL; 441 442 glob_buffer = malloc(strlen(url) + 1); 443 if(!glob_buffer) 444 return CURLE_OUT_OF_MEMORY; 445 glob_buffer[0] = 0; 446 447 glob_expand = calloc(1, sizeof(struct URLGlob)); 448 if(!glob_expand) { 449 tool_safefree(glob_buffer); 450 return CURLE_OUT_OF_MEMORY; 451 } 452 glob_expand->urllen = strlen(url); 453 glob_expand->glob_buffer = glob_buffer; 454 455 res = glob_parse(glob_expand, url, 1, &amount); 456 if(!res) 457 *urlnum = amount; 458 else { 459 if(error && glob_expand->error) { 460 char text[512]; 461 const char *t; 462 if(glob_expand->pos) { 463 msnprintf(text, sizeof(text), "%s in URL position %zu:\n%s\n%*s^", 464 glob_expand->error, 465 glob_expand->pos, url, (int)glob_expand->pos - 1, " "); 466 t = text; 467 } 468 else 469 t = glob_expand->error; 470 471 /* send error description to the error-stream */ 472 fprintf(error, "curl: (%d) %s\n", res, t); 473 } 474 /* it failed, we cleanup */ 475 glob_cleanup(&glob_expand); 476 *urlnum = 1; 477 return res; 478 } 479 480 *glob = glob_expand; 481 return CURLE_OK; 482 } 483 484 void glob_cleanup(struct URLGlob **globp) 485 { 486 size_t i; 487 curl_off_t elem; 488 struct URLGlob *glob = *globp; 489 490 if(!glob) 491 return; 492 493 for(i = 0; i < glob->size; i++) { 494 if((glob->pattern[i].type == UPTSet) && 495 (glob->pattern[i].content.Set.elements)) { 496 for(elem = glob->pattern[i].content.Set.size - 1; 497 elem >= 0; 498 --elem) { 499 tool_safefree(glob->pattern[i].content.Set.elements[elem]); 500 } 501 tool_safefree(glob->pattern[i].content.Set.elements); 502 } 503 } 504 tool_safefree(glob->glob_buffer); 505 tool_safefree(glob); 506 *globp = NULL; 507 } 508 509 CURLcode glob_next_url(char **globbed, struct URLGlob *glob) 510 { 511 struct URLPattern *pat; 512 size_t i; 513 size_t len; 514 size_t buflen = glob->urllen + 1; 515 char *buf = glob->glob_buffer; 516 517 *globbed = NULL; 518 519 if(!glob->beenhere) 520 glob->beenhere = 1; 521 else { 522 bool carry = TRUE; 523 524 /* implement a counter over the index ranges of all patterns, starting 525 with the rightmost pattern */ 526 for(i = 0; carry && (i < glob->size); i++) { 527 carry = FALSE; 528 pat = &glob->pattern[glob->size - 1 - i]; 529 switch(pat->type) { 530 case UPTSet: 531 if((pat->content.Set.elements) && 532 (++pat->content.Set.ptr_s == pat->content.Set.size)) { 533 pat->content.Set.ptr_s = 0; 534 carry = TRUE; 535 } 536 break; 537 case UPTCharRange: 538 pat->content.CharRange.ptr_c = 539 (char)(pat->content.CharRange.step + 540 (int)((unsigned char)pat->content.CharRange.ptr_c)); 541 if(pat->content.CharRange.ptr_c > pat->content.CharRange.max_c) { 542 pat->content.CharRange.ptr_c = pat->content.CharRange.min_c; 543 carry = TRUE; 544 } 545 break; 546 case UPTNumRange: 547 pat->content.NumRange.ptr_n += pat->content.NumRange.step; 548 if(pat->content.NumRange.ptr_n > pat->content.NumRange.max_n) { 549 pat->content.NumRange.ptr_n = pat->content.NumRange.min_n; 550 carry = TRUE; 551 } 552 break; 553 default: 554 printf("internal error: invalid pattern type (%d)\n", (int)pat->type); 555 return CURLE_FAILED_INIT; 556 } 557 } 558 if(carry) { /* first pattern ptr has run into overflow, done! */ 559 return CURLE_OK; 560 } 561 } 562 563 for(i = 0; i < glob->size; ++i) { 564 pat = &glob->pattern[i]; 565 switch(pat->type) { 566 case UPTSet: 567 if(pat->content.Set.elements) { 568 msnprintf(buf, buflen, "%s", 569 pat->content.Set.elements[pat->content.Set.ptr_s]); 570 len = strlen(buf); 571 buf += len; 572 buflen -= len; 573 } 574 break; 575 case UPTCharRange: 576 if(buflen) { 577 *buf++ = pat->content.CharRange.ptr_c; 578 *buf = '\0'; 579 buflen--; 580 } 581 break; 582 case UPTNumRange: 583 msnprintf(buf, buflen, "%0*" CURL_FORMAT_CURL_OFF_T, 584 pat->content.NumRange.padlength, 585 pat->content.NumRange.ptr_n); 586 len = strlen(buf); 587 buf += len; 588 buflen -= len; 589 break; 590 default: 591 printf("internal error: invalid pattern type (%d)\n", (int)pat->type); 592 return CURLE_FAILED_INIT; 593 } 594 } 595 596 *globbed = strdup(glob->glob_buffer); 597 if(!*globbed) 598 return CURLE_OUT_OF_MEMORY; 599 600 return CURLE_OK; 601 } 602 603 #define MAX_OUTPUT_GLOB_LENGTH (10*1024) 604 605 CURLcode glob_match_url(char **result, const char *filename, 606 struct URLGlob *glob) 607 { 608 char numbuf[18]; 609 const char *appendthis = ""; 610 size_t appendlen = 0; 611 struct dynbuf dyn; 612 613 *result = NULL; 614 615 /* We cannot use the glob_buffer for storage since the filename may be 616 * longer than the URL we use. 617 */ 618 curlx_dyn_init(&dyn, MAX_OUTPUT_GLOB_LENGTH); 619 620 while(*filename) { 621 if(*filename == '#' && ISDIGIT(filename[1])) { 622 const char *ptr = filename; 623 curl_off_t num; 624 struct URLPattern *pat = NULL; 625 filename++; 626 if(!curlx_str_number(&filename, &num, glob->size) && num) { 627 unsigned long i; 628 num--; /* make it zero based */ 629 /* find the correct glob entry */ 630 for(i = 0; i < glob->size; i++) { 631 if(glob->pattern[i].globindex == (int)num) { 632 pat = &glob->pattern[i]; 633 break; 634 } 635 } 636 } 637 638 if(pat) { 639 switch(pat->type) { 640 case UPTSet: 641 if(pat->content.Set.elements) { 642 appendthis = pat->content.Set.elements[pat->content.Set.ptr_s]; 643 appendlen = 644 strlen(pat->content.Set.elements[pat->content.Set.ptr_s]); 645 } 646 break; 647 case UPTCharRange: 648 numbuf[0] = pat->content.CharRange.ptr_c; 649 numbuf[1] = 0; 650 appendthis = numbuf; 651 appendlen = 1; 652 break; 653 case UPTNumRange: 654 msnprintf(numbuf, sizeof(numbuf), "%0*" CURL_FORMAT_CURL_OFF_T, 655 pat->content.NumRange.padlength, 656 pat->content.NumRange.ptr_n); 657 appendthis = numbuf; 658 appendlen = strlen(numbuf); 659 break; 660 default: 661 fprintf(tool_stderr, "internal error: invalid pattern type (%d)\n", 662 (int)pat->type); 663 curlx_dyn_free(&dyn); 664 return CURLE_FAILED_INIT; 665 } 666 } 667 else { 668 /* #[num] out of range, use the #[num] in the output */ 669 filename = ptr; 670 appendthis = filename++; 671 appendlen = 1; 672 } 673 } 674 else { 675 appendthis = filename++; 676 appendlen = 1; 677 } 678 if(curlx_dyn_addn(&dyn, appendthis, appendlen)) 679 return CURLE_OUT_OF_MEMORY; 680 } 681 682 if(curlx_dyn_addn(&dyn, "", 0)) 683 return CURLE_OUT_OF_MEMORY; 684 685 #if defined(_WIN32) || defined(MSDOS) 686 { 687 char *sanitized; 688 SANITIZEcode sc = sanitize_file_name(&sanitized, curlx_dyn_ptr(&dyn), 689 (SANITIZE_ALLOW_PATH | 690 SANITIZE_ALLOW_RESERVED)); 691 curlx_dyn_free(&dyn); 692 if(sc) 693 return CURLE_URL_MALFORMAT; 694 *result = sanitized; 695 return CURLE_OK; 696 } 697 #else 698 *result = curlx_dyn_ptr(&dyn); 699 return CURLE_OK; 700 #endif /* _WIN32 || MSDOS */ 701 }