webscrape/webscrape 0000755 0001750 0000146 00000005704 10226706400 015376 0 ustar msells sells 0000000 0000000 #!/usr/bin/perl
use XML::LibXML;
use WWW::Mechanize;
use strict;
use Getopt::Std;
my $VERSION='Tue Apr 12 04:54:18 EDT 2005';

# Package globals shared with the data file named by -d.  The original note
# here says the data file sets them; $argument is filled in from -a below,
# before that file is loaded.
#
our ($baseurl,$basepath,%paths,%nextpage,$argument);

# Text printed ahead of the header row when -c is not given.
our $docheader="\n
\n\n";

my %opts;
my $limit   = 1e99;   # upper bound on printed rows; -l overrides it
my $follow  = 0;      # -f: keep going via the %nextpage link
our $csv    = 0;      # -c: tab-separated output instead of the document form
my $verbose = 0;      # -v: progress messages on STDERR

# Handle options and require our data file
#
getopts('hca:vfd:l:', \%opts);
HelpMsg() if exists($opts{h});
$argument = $opts{a} if exists($opts{a});
$limit    = $opts{l} if exists($opts{l});
$follow++  if exists($opts{f});
$csv++     if exists($opts{c});
$verbose++ if exists($opts{v});

die "You didn't supply a -d parameter!\n" unless exists($opts{d});
{
    my $datafile = $opts{d};

    # `require` looks a relative path up through @INC, which no longer
    # contains "." on Perl 5.26 and later.  When -d names a file that really
    # exists, load it by absolute path; otherwise keep the @INC lookup so
    # data files installed on the library path still work.
    if (-f $datafile) {
        require File::Spec;
        $datafile = File::Spec->rel2abs($datafile);
    }
    require $datafile;
}
# Build our list of URLs (possibly file:: ones)
#
# Arguments naming an existing local file become file:// URLs; anything else
# is passed through as a URL.  A relative filename is made absolute first,
# because in "file://name" the first path component would be read as the
# host part of the URL.
#
my @pages;
for my $f (@ARGV) {
    if (-f $f) {
        require File::Spec;
        my $path = File::Spec->file_name_is_absolute($f)
                 ? $f
                 : File::Spec->rel2abs($f);
        push(@pages, "file://$path");
    }
    else {
        push(@pages, $f);
    }
}

# With nothing on the command line, fall back to the data file's $baseurl.
if (!@pages) {
    die "No base URL defined in data file and no files/URLs supplied!\n" unless ($baseurl);
    push(@pages, $baseurl);
}

# Emit the header row: tab-separated for -c, otherwise the document header
# followed by the formatted header line.
if ($csv) {
    print join("\t",ItemData('headers')) . "\n";
}
else {
    print $docheader . " " . join(" | ",ItemData('headers')) . " |
\n";
}
print STDERR "Base path is $basepath\n" if ($verbose);
# Fetch every queued page, print one row per node matching $basepath, and
# stop once the queue is empty or $limit rows have been printed.
#
my $mech = WWW::Mechanize->new();

# Setup the parser
#
# Its settings never change, so it is built once and reused for every page.
my $parser = XML::LibXML->new();
$parser->recover(1);
$parser->validation(0);
$parser->expand_entities(0);
$parser->keep_blanks(0);

while (my $url = shift(@pages)) {
    # Get our document
    #
    print STDERR "GET $url\n" if ($verbose);
    my $resp = $mech->get($url);
    unless ($resp->is_success) {
        # Do not scrape an error page as if it were content.
        print STDERR "Skipping $url: " . $resp->status_line . "\n";
        next;
    }

    # necessary evil since libxml dumps junk to STDERR
    #
    # STDERR is pointed at /dev/null only for the duration of the parse.  The
    # parse runs inside eval so that STDERR is always restored before any
    # parser exception is rethrown; otherwise the message would vanish.
    my $have_saved = open(my $saved_err, '>&', \*STDERR);
    # Best effort: if this open fails STDERR is left closed, which silences
    # the parser just the same.
    open(STDERR, '>', '/dev/null') if ($have_saved);
    my $doc;
    my $parsed_ok   = eval { $doc = $parser->parse_html_string($resp->content); 1 };
    my $parse_error = $@;
    if ($have_saved) {
        close(STDERR);
        # Nothing useful can be reported if this fails: STDERR is the channel.
        open(STDERR, '>&', $saved_err);
        close($saved_err);
    }
    die $parse_error unless ($parsed_ok);

    for my $val ($doc->findnodes($basepath)) {
        print STDERR "Base node found.\n" if ($verbose);

        # Collect one value per configured path.  Keys starting with "HTML"
        # keep the serialized node; all others keep its text content.
        my %item;
        for my $n (keys %paths) {
            my @nodes = $val->findnodes($paths{$n});
            next if (! @nodes);
            my $first = $nodes[0];
            if ($n =~ /^HTML/) {
                $item{$n} = $first->toString('HTML',1);
            }
            else {
                $item{$n} = $first->textContent;
            }
        }
        CleanItem(\%item);
        my @data = ItemData(\%item);

        # Only rows with at least two fields are printed and counted.
        if ($#data > 0) {
            if ($csv) {
                print join("\t",@data) . "\n";
            }
            else {
                print " "; print join(" | ",@data) . " |
\n";
            }
            $limit--;
        }
        last if ( $limit <= 0);
    }
    last if ( $limit <= 0);

    if ($follow) {
        # Queue the absolute URL: a relative href would otherwise be resolved
        # against whichever page happens to be current when it is dequeued.
        my $next = $mech->find_link(%nextpage);
        push(@pages, $next->url_abs()->as_string) if ($next);
    }
}

# NOTE(review): no declaration or value for $docfooter is visible in this
# part of the file; presumably the data file (or a declaration lost from this
# copy) supplies it -- confirm.  It is declared here so the script compiles
# under `use strict`, and printed only when it actually has a value.
our $docfooter;
print $docfooter if (!$csv && defined($docfooter));
sub HelpMsg {
print <