use strict;
use warnings;

use FileHandle;
use HTTP::Request;
use HTTP::Response;
use LWP::UserAgent;
use Spreadsheet::WriteExcel;

# Fetch one dictionary page, extract the IPA pronunciation text from its
# HTML, and write that text into cell A1 of the workbook "IPA.xls".

# create useragent, open an excel workbook and sheet
my $ua       = LWP::UserAgent->new;
my $workbook = Spreadsheet::WriteExcel->new("IPA.xls");
die "Cannot create IPA.xls: $!\n" unless defined $workbook;

my $sheet = $workbook->add_worksheet();
$sheet->set_column(0, 0, 100);    # widen column A so the text is readable

# get html source and parse
my $address  = "http://dictionary.reference.com/browse/hello";
my $request  = HTTP::Request->new(GET => $address);
my $response = $ua->request($request);

my $writestring = '';
if ($response->is_success) {
    # decoded_content applies the response charset so that IPA symbols
    # arrive as Perl characters instead of raw bytes; fall back to the raw
    # body if decoding is not possible.
    my $htmlsource = $response->decoded_content;
    $htmlsource = $response->content unless defined $htmlsource;

    $writestring = parse($htmlsource);
    warn "No IPA text found in $address\n" if $writestring eq '';
}
else {
    warn "GET $address failed: ", $response->status_line, "\n";
}

# write to spreadsheet, close excel
$sheet->write(0, 0, $writestring);
$workbook->close();

# parse($source)
#
# Extract the IPA pronunciation text from a page's HTML source.
#
#   $source - HTML source of the page (string; may be undef).
#
# Returns the text of the pronunciation chunk with all HTML tags removed,
# or the empty string when $source is undef or the chunk cannot be located.
sub parse {
    my $source = shift;
    return '' unless defined $source;

    # Select from the html source the chunk which contains the IPA-encoded
    # symbols.  It starts right after the first opening pronunciation
    # delimiter.
    my $open_marker = 'prondelim">/</span>';
    my $start       = index($source, $open_marker);
    return '' if $start < 0;
    my $htmlchunk = substr($source, $start + length $open_marker);

    # NOTE(review): the closing delimiter was lost from the original notes;
    # it is assumed to be the next "prondelim" span -- confirm against the
    # live page.  Without it the chunk cannot be bounded, so give up rather
    # than return the rest of the page.
    my $end = index($htmlchunk, '<span class="prondelim">');
    return '' if $end < 0;
    $htmlchunk = substr($htmlchunk, 0, $end);

    # This chunk still contains html tags that need to be removed.  A single
    # global substitution drops every <...> tag (tags spanning newlines
    # included) and keeps all text around them.
    (my $ipa = $htmlchunk) =~ s/<[^>]*>//g;

    return $ipa;
}