#!/usr/bin/perl
use strict;
use warnings;

use HTML::TreeBuilder;
use LWP::Simple qw(get);

# element: extract the contents of an element by element id
# or class from a file or URL.
# Don Marti
#
# Usage: element id document
#
# The inner content of the first element whose id (or, failing that,
# whose class attribute) equals the requested value is written to
# standard output.
#
# References:
# "Scanning HTML" by Sean M. Burke
#   http://www.foo.be/docs/tpj/issues/vol5_3/tpj0503-0008.html
# perldoc HTML::TreeBuilder
# perldoc HTML::Element

# $id:       which id (or failing that, class) to get
# $document: filename or URL of the source
my ($id, $document) = @ARGV;

# Test for defined/non-empty rather than truth so that a value of "0"
# is still accepted as an id or document name.
die "Usage: $0 id document\n"
    if !defined $id || !length $id || !defined $document || !length $document;

# Try the argument as a local file first; only when it cannot be opened
# is it treated as a URL. The check is on definedness, so an existing
# but empty file is not mistaken for a failed read.
my $html = snarf_file($document);
$html = get($document) if !defined $html;
die "Can't get $document\n" if !defined $html;

my $tree = HTML::TreeBuilder->new();
$tree->parse($html);
$tree->eof;

# Prefer a match on id; fall back to class only if no id matches.
# The criterion is compared against the attribute's value, so the class
# fallback is expected to match only when the class attribute is exactly
# $id -- see perldoc HTML::Element (look_down) to confirm.
foreach my $attr (qw(id class)) {
    my $element = $tree->look_down($attr, $id);
    next if !defined $element;

    # content_list yields child elements (objects) and text segments
    # (plain strings); serialize the former and pass the latter through.
    print join '',
        map { ref($_) ? $_->as_HTML : $_ } $element->content_list;
    last;
}

# Release the parse tree explicitly now that it is no longer needed.
$tree->delete;

# snarf_file($filename)
#
# Read the whole of $filename and return its contents as one string.
# Returns undef (empty list in list context) if the file cannot be
# opened, so the caller can fall back to another source. Dies if the
# handle cannot be closed cleanly.
sub snarf_file {
    my ($filename) = @_;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode or a command.
    open my $fh, '<', $filename or return;

    local $/ = undef;    # slurp mode: read everything in one go
    my $content = <$fh>;

    close $fh or die $!;
    return $content;
}