summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Stenberg <daniel@haxx.se>2001-01-03 08:35:16 +0000
committerDaniel Stenberg <daniel@haxx.se>2001-01-03 08:35:16 +0000
commit0d12c567386d54474ababcde3678a0cc5e1fb6b8 (patch)
tree5912828824b04f1b47d778992af1e15fcc282040
parent880208c5b25b33a51fc5dc0bc4a61dd9da639006 (diff)
downloadgnurl-0d12c567386d54474ababcde3678a0cc5e1fb6b8.tar.gz
gnurl-0d12c567386d54474ababcde3678a0cc5e1fb6b8.tar.bz2
gnurl-0d12c567386d54474ababcde3678a0cc5e1fb6b8.zip
Added -i to allow ignore-patterns to get added
-rwxr-xr-xperl/crawlink.pl42
1 files changed, 32 insertions, 10 deletions
diff --git a/perl/crawlink.pl b/perl/crawlink.pl
index d7855c383..53be500cc 100755
--- a/perl/crawlink.pl
+++ b/perl/crawlink.pl
@@ -9,10 +9,14 @@
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <daniel@haxx.se>
-# Version: 0.2 Dec 19, 2000
+# Version: 0.3 Jan 3, 2001
#
# HISTORY
#
+# 0.3 - The -i option now adds regexes; a link whose full URL matches one of
+# them is not followed. This can be used to prevent this script from
+# following '.*\.cgi', specific pages or whatever.
+#
# 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot
# faster to skip large non HTML files such as pdfs or big RFCs! ;-)
# Added a -c option that allows me to pass options to curl.
@@ -32,6 +36,8 @@ my $help;
my $external;
my $curlopts;
+my @ignorelist;
+
argv:
if($ARGV[0] eq "-v" ) {
$verbose++;
@@ -44,6 +50,12 @@ elsif($ARGV[0] eq "-c" ) {
shift @ARGV;
goto argv;
}
+elsif($ARGV[0] eq "-i" ) {
+ push @ignorelist, $ARGV[1];
+ shift @ARGV;
+ shift @ARGV;
+ goto argv;
+}
elsif($ARGV[0] eq "-l" ) {
$linenumber = 1;
shift @ARGV;
@@ -72,10 +84,12 @@ $rooturls{$ARGV[0]}=1;
if(($geturl eq "") || $help) {
print "Usage: $0 [-hilvx] <full URL>\n",
" Use a traling slash for directory URLs!\n",
- " -h This help text\n",
- " -l Line number report for BAD links\n",
- " -v Verbose mode\n",
- " -x Check non-local (external?) links only\n";
+ " -c [data] Pass [data] as argument to every curl invoke\n",
+ " -h This help text\n",
+ " -i [regex] Ignore root links that match this pattern\n",
+ " -l Line number report for BAD links\n",
+ " -v Verbose mode\n",
+ " -x Check non-local (external?) links only\n";
exit;
}
@@ -303,9 +317,6 @@ while(1) {
if($geturl == -1) {
last;
}
- if($verbose) {
- print "ROOT: $geturl\n";
- }
#
# Splits the URL in its different parts
@@ -332,6 +343,8 @@ while(1) {
next;
}
+ print " ==== $geturl ====\n";
+
if($verbose == 2) {
printf("Error code $error, Content-Type: $ctype, got %d bytes\n",
length($in));
@@ -405,8 +418,17 @@ while(1) {
}
}
else {
- # the link works, add it!
- $rooturls{$link}++; # check this if not checked already
+ # the link works, add it if it isn't in the ignore list
+ my $ignore=0;
+ for(@ignorelist) {
+ if($link =~ /$_/) {
+ $ignore=1;
+ }
+ }
+ if(!$ignore) {
+ # not ignored, add
+ $rooturls{$link}++; # check this if not checked already
+ }
}
}