<?xml version="1.0" encoding="windows-1252"?>
<node id="32291" title="HTML Token Diff" created="2000-09-13 16:25:20" updated="2005-08-13 02:54:49">
<type id="11">
note</type>
<author id="9685">
mdillon</author>
<data>
<field name="doctext">
how about this:

&lt;code&gt;#!/usr/bin/perl -w

use strict;

use Algorithm::Diff qw(diff LCS);
use HTML::TokeParser;
use LWP::Simple;

# Fetch the document at a URL and split it into HTML tokens.
#
# Takes the URL to retrieve.  Returns a reference to an array holding
# every token that HTML::TokeParser's get_token yields for the document,
# in document order.  Dies with a message naming the URL when no content
# could be retrieved.
sub tokenize_url
{
	my ($url) = @_;

	my $content = get($url);

	# LWP::Simple's get() signals failure by returning undef and is not
	# documented to set $!, so name the URL in the message instead.
	# Testing defined/length rather than truth keeps a document whose
	# entire body is "0" from being treated as a failed fetch.
	die "Unable to retrieve any content from $url"
		unless defined $content and length $content;

	my $parser = HTML::TokeParser-&gt;new(\$content);

	my @tokens;

	# The loop ends on the first undefined token, i.e. once the parser
	# has nothing more to hand back.
	while (defined(my $token = $parser-&gt;get_token))
	{
		push @tokens, $token;
	}

	return \@tokens;
}

# Token lists of the two documents to compare: the original first, the
# revision second.
my @content;

for my $url (qw{
http://perlmonks.org/index.pl?node_id=32285
http://perlmonks.org/index.pl?node_id=32286
})
{
	push @content, tokenize_url($url);
}

# Key-generation callback: reduce a token (an array reference) to the
# string it is compared by.  A token whose type, held in element 0, is
# 'T' is keyed by element 1; a token of any other type is keyed by its
# last element.
sub hash_token
{
	my ($token) = @_;

	my $key_index = $$token[0] eq 'T' ? 1 : -1;

	return $$token[$key_index];
}

# Compare the two token lists, keying each token through hash_token:
# @diffs collects the hunks of changes, @LCS the longest common
# subsequence.
my @diffs = diff(@content[0, 1], \&amp;hash_token);
my @LCS   = LCS(@content[0, 1], \&amp;hash_token);

# Size of the biggest hunk, where a hunk's size is the greater of its
# number of '-' entries and its number of '+' entries.
my $largest = 0;

for my $hunk (@diffs)
{
	my $removed = grep { $$_[0] eq '-' } @$hunk;
	my $added   = grep { $$_[0] eq '+' } @$hunk;

	my $size = $added &gt; $removed ? $added : $removed;

	$largest = $size unless $largest &gt;= $size;
}

# Summarise the comparison.  The sequences being compared are HTML
# tokens rather than lines of text, so the counts are labelled as tokens.
my $original_size = @{$content[0]};
my $revision_size = @{$content[1]};
my $hunk_count    = @diffs;

# A literal "\n" ends each line of output: $/ is the input record
# separator and only happens to hold a newline by default.
print $original_size, " token", ($original_size == 1 ? '' : 's'),
	" in original\n";
print $revision_size, " token", ($revision_size == 1 ? '' : 's'),
	" in revision\n";

print $hunk_count, " hunk", ($hunk_count == 1 ? ' differs' : 's differ'),
	"\n";
print $largest, " token", ($largest == 1 ? '' : 's'),
	" in largest hunk\n";

# Similarity is the share of the original's tokens that also appear in
# the longest common subsequence.  An original without any tokens would
# make the divisor zero, so that case is reported as 0% instead of dying.
my $similarity = $original_size ? 100 * @LCS / $original_size : 0;

printf "Revision %0.2f%% similar to original\n", $similarity;
&lt;/code&gt;

&lt;p&gt;&lt;strong&gt;updated 2001-Aug-01&lt;/strong&gt;: small code changes; renamed from "RE: Re: HTML Document Comparison"&lt;/p&gt;</field>
<field name="root_node">
32285</field>
<field name="parent_node">
32286</field>
</data>
</node>
