# -------------------------------------------------- # # XML::RSS::Tools # Version 0.01 # April 2002 # Copyright iredale Consulting, all rights reserved # http://www.iredale.net/ # # -------------------------------------------------- # -------------------------------------------------- # Module starts here... # -------------------------------------------------- package XML::RSS::Tools; use 5.006; use strict; use warnings; use Diagnostics; use Carp; #use HTTP::GHTTP; # Delayed loading #use LWP::UserAgent; # In case you don't have GHTTP use XML::RSS; # Handle the RSS/RDF files use XML::LibXML; # Hand the XML file for XSL-T use XML::LibXSLT; # Hand the XSL file and do the XSL-T use HTML::Entities; # Try and fix entities require Exporter; our $VERSION = '0.01'; our @ISA = qw(Exporter); # Preloaded methods go here. #{ # Class Data #} # # Tools Constructor # sub new { my $self = shift; my %args = @_; bless { _rss_version => $args{version} || 0.91, _debug => $args{debug} || 4, _xml_string => "", _xsl_string => "", _output_string => "", _transformed => 0, _auto_wash => $args{auto_wash} || 1, }, $self; } # # Output what we have as a string # sub as_string { my $self = shift; my $mode = shift; if ($mode) { if ($mode =~ /rss/i) { croak "No RSS File to output" unless $self->{_rss_string}; return $self->{_rss_string}; } elsif ($mode =~ /xsl/i) { croak "No XSL Template to output" unless $self->{_xsl_string}; return $self->{_xsl_string}; } else { croak "Unknown mode: $mode"; } } else { croak "Nothing to output..." unless $self->{_transformed}; return $self->{_output_string}; } } # # Set/Read the debug level # sub debug { my $self = shift; my $debug = shift; $self->{_debug} = $debug if defined $debug; return $self->{_debug}; } # # Set/Read the auto_wash level # sub auto_wash { my $self = shift; my $wash = shift; $self->{_auto_wash} = $wash if defined $wash; return $self->{_auto_wash}; } # # Get the RSS Version # sub get_version { my $self = shift; return $self->{_rss_version}; } # # Set the RSS Version # sub set_version { my $self = shift; my $version = shift; $self->{_rss_version} = $version if defined $version; return $self->{_rss_version}; } # # Load an RSS file, and call RSS conversion to standard RSS format # sub rss_file { my $self = shift; my $file_name = shift; _check_file($file_name); open SOURCE_FILE, "<$file_name" or croak "Unable to open $file_name for reading"; $self->{_rss_string} = _load_filehandle(\*SOURCE_FILE); close SOURCE_FILE; _parse_rss_string($self); $self->{_transformed} = 0; } # # Load an XSL file # sub xsl_file { my $self = shift; my $file_name = shift; _check_file($file_name); open SOURCE_FILE, "<$file_name" or croak "Unable to open $file_name for reading"; $self->{_xsl_string} = _load_filehandle(\*SOURCE_FILE); close SOURCE_FILE; $self->{_transformed} = 0; } # # Load an RSS file via HTTP and call RSS conversion to standard RSS format # sub rss_uri { my $self = shift; my $uri = shift; $self->{_rss_string} = _http_get($uri); _parse_rss_string($self); $self->{_transformed} = 0; } # # Load an XSL file via HTTP # sub xsl_uri { my $self = shift; my $uri = shift; $self->{_xsl_string} = _http_get($uri); $self->{_transformed} = 0; } # # Parse a string and convert to standard RSS # sub rss_string { my $self = shift; my $xml = shift; croak "Nothing to parse" unless $xml; $self->{_rss_string} = $xml; _parse_rss_string($self); $self->{_transformed} = 0; } # # Import an XSL from string # sub xsl_string { my $self = shift; my $xml = shift; croak "No XSL-T" unless $xml; _$self->{_xsl_string} = $xml; $self->{_transformed} = 0; } # # Do the transformatoin # sub transform { my $self = shift; croak "No XSL-T loaded" unless $self->{_xsl_string}; croak "No RSS loaded" unless $self->{_rss_string}; croak "Can't transform twice without a change" if $self->{_transformed}; my $xslt = XML::LibXSLT->new; my $xml_parser = XML::LibXML->new; $xml_parser->keep_blanks(0); my $source_xml = $xml_parser->parse_string($self->{_rss_string}); # Parse the source XML my $style_xsl = $xml_parser->parse_string($self->{_xsl_string}); # and Template XSL files my $stylesheet = $xslt->parse_stylesheet($style_xsl); # Load the parsed XSL into XSLT my $result_xml = $stylesheet->transform($source_xml); # Transform the source XML $self->{_output_string} = $stylesheet->output_string($result_xml); # Store the result $self->{_transformed} = 1; } # --------------- # Private Methods # --------------- # # Parse the RSS string # sub _parse_rss_string { my $self = shift; my $xml = $self->{_rss_string}; $xml = _wash_xml($xml) if $self->{_auto_wash}; my $rss = XML::RSS->new; $rss->parse($xml); if ($rss->{version} != $self->{_rss_version}) { $rss->{output} = $self->{_rss_version}; $xml = $rss->as_string; } $self->{_rss_string} = $xml; } # # Load file from File Handle # sub _load_filehandle { my $handle = shift; my $content; while (<$handle>) { $content .= $_ } return $content; } # # Wash the XML File of known nasties # sub _wash_xml { my $xml = shift; my %entity = ( trade => "™", ); my $entities = join('|', keys %entity); decode_entities($xml); # Try and deal with known entities $xml =~ s/&($entities);/$entity{$1}/g; # Deal with odd entities $xml =~ s/&(?!(#[0-9]+|#x[0-9a-fA-F]+|\w+);)/&/g; # Matt's ampersand entity fixer $xml =~ s/\s+/ /gs; $xml =~ s/> />/g; return $xml } # # Check that the requested file is there and readable # sub _check_file { my $file_name = shift; croak "No file name supplied" unless $file_name; croak "Cannot find $file_name" unless -e $file_name; croak "$file_name isn't a real file" unless -f _; croak "Cannot read $file_name" unless -r _; croak "$file_name is zero bytes long" if -z _; } # # Grab something via HTTP # sub _http_get { my $uri = shift; my $xml; eval { # Try and use Gnome HTTP, it's faster require HTTP::GHTTP; }; if ($@) { # Otherwise use LWP require LWP::UserAgent; my $ua = LWP::UserAgent->new; $ua->agent("iC-XML::RSS::Tools/$VERSION " . $ua->agent . " ($^O)"); my $response = $ua->request(HTTP::Request->new('GET', $uri)); croak "HTTP error: " . $response->status_line if $response->is_error; $xml = $response->content(); } else { my $r = HTTP::GHTTP->new($uri); $r->process_request; $xml = $r->get_body; croak "HTTP error: Unable to connect to server.\n" unless $xml; my ($status, $message) = $r->get_status; croak "HTTP error: $status, $message\n" unless $status == 200; } return $xml; } 1; __END__ =head1 NAME XML::RSS::Tools - Perl extension for very high level RSS Feed manipulation =head1 SYNOPSIS use XML::RSS::Tools; my $rss_feed = XML::RSS::Tools->new; $rss_feed->rss_uri('http:://foo/bar.rdf'); $rss_feed->xsl_file('/my/rss_transformation.xsl'); $rss_feed->transform; print $rss_feed->as_string; =head1 DESCRIPTION RSS/RDF feeds are commonly available ways of distributing the latest news about a given web site for news syndication. This module provides a VERY high level way of manipulating them. You can easily use LWP, the XML::RSS and XML::LibXSLT do to this yourself. =head2 EXPORT None by default. =head1 AUTHOR Adam Trickett, Eadam@iredale.net =head1 SEE ALSO C, C, C, C and C. =head1 COPYRIGHT XML::RSS::Tools, Copyright iredale Consulting 2002 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. =cut