#!/usr/bin/perl
# Scan a file for occurrences of a set of "entity pair" patterns and write one
# line per occurrence to pairs.txt.
#
# Usage: <script> INPUT_FILE
#
# Each output line is the source text of the pattern that matched (not the
# matched text itself), so the number of lines per pattern equals the number
# of occurrences of that pattern in the input.
use warnings;
use strict;
use diagnostics;

my $infile = $ARGV[0];
die "Usage: $0 <input-file>\n" unless defined $infile and length $infile;

print "Find pair of entities without/with separating space\n";

# Comment out whichever pattern set below is not relevant.

# For modified files: currently known cases.
# my @regexes = (qr/–§/, qr/–Ü/, qr/ߧ/);

# For original files: check for entities separated by whitespace.
my @regexes = (qr/;\s&/);

# Slurp the whole input into $xml so patterns can match across line breaks.
open my $in, '<', $infile or die "Cannot open $infile for reading: $!";
my $xml = do { local $/; <$in> };
close $in;
$xml = '' unless defined $xml;

# Report file, created (or truncated) in the current working directory.
my $outfile = 'pairs.txt';
open my $out, '>', $outfile or die "Cannot open $outfile for writing: $!";

print {$out} "Find pair of entities without/with separating space\n\ninput file: ",
    $infile,
    "\n========================================================================\n\n";

for my $regex (@regexes) {
    # Match with the compiled pattern (keeps any flags it was built with);
    # only the human-readable source is derived from its string form.
    my $label = pattern_source($regex);
    print {$out} "$label\n" while $xml =~ /$regex/g;
}

# Buffered write errors only surface at close, so check it.
close $out or die "Cannot close $outfile: $!";

# Return the source text of a compiled qr// pattern without the wrapper Perl
# adds when stringifying it, e.g. "(?^:;\s&)" or "(?^u:...)" becomes ";\s&".
# A string that does not carry such a wrapper is returned unchanged.
sub pattern_source {
    my ($compiled) = @_;
    my $source = "$compiled";
    if ($source =~ s/\A \(\? \^? [a-z-]* ://x) {
        $source =~ s/\) \z//x;
    }
    return $source;
}