#!/usr/bin/perl # backlink.pl -- script to rewrite .ic files by adding backlink text. use strict; use warnings; use English; use Prosaix; use Data::Dumper::Simple; my %hrefs; my $url; my $re_url = qr/^\/url: (.*)/; my $re_end = qr/^\/endtext/; my $re_wc = qr/^\/wc: (.*)/; my $notfound = 0; my $total = 0; my $success = '_____found_____'; my $count = 1; my $base_file = $ARGV[0] or die "Missing input file name\n"; (my $output_file = $base_file) =~ s/\.txt/\+bl.txt/; open( my $fh_HREF, '<', $base_file . '.links' ) or die "Couldn't open '$base_file.links': $OS_ERROR\n"; open( my $fh_IN, '<', $base_file ) or die "Couldn't open '$base_file': $OS_ERROR\n"; open( my $fh_OUT, '>', $output_file ) or die "Couldn't open 'base_file.backlnked': $OS_ERROR\n"; open( my $fh_ERR, '>', $base_file . '.unlinked' ) or die "Couldn't open 'base_file.unlinked': $OS_ERROR\n"; binmode $fh_OUT; start(); while (<$fh_HREF>) { chomp; my ( $key, $value ) = split(/\|/); unless ( defined( $hrefs{$key} ) ) { $hrefs{$key} = []; } push( @{ $hrefs{$key} }, $value ); } while (<$fh_IN>) { print $fh_OUT $_; if (/$re_url/) { $url = $1; } if (/$re_wc/) { print "wc $count $1\n"; $count++; } elsif (/$re_end/) { print $fh_OUT "/backlinks\n"; if ( defined( $hrefs{$url} ) ) { my $s = $hrefs{$url}; if (defined($s->[0])) { my @text = collapse(@$s); print $fh_OUT join( ".\n", @text ), ".\n"; print join( ".\n", @text ), ".\n"; } push( @{ $hrefs{$url} }, $success ); } } } while ( my ( $url, $text ) = each(%hrefs) ) { if ( !defined( $text->[-1] ) ) { print $fh_ERR "$url|_____NO_TEXT_____\n"; } elsif ( $text->[-1] ne $success ) { $notfound++; if (defined($text->[0])) { my @text = collapse(@$text); print $fh_ERR "$url|", join( ",", @text ), "\n"; } else { print $fh_ERR "$url|\n"; } } $total++; } close($fh_HREF); close($fh_IN); close($fh_OUT); close($fh_ERR); print "$notfound URLs (of $total) pointing to about.com pages not\n"; print "in this IC ($base_file) written to $base_file.unlinked\n"; finish(); sub collapse { my @s = @_; my @t; for (@s) { next unless defined($_); push(@t,$_); } return @t; }