# file_is_valid_utf8($f)
#
# Reads the file named $f as raw bytes and checks that its entire content is
# well-formed UTF-8.
#
# Returns 1 if the content is well-formed UTF-8 (an empty file counts as
# valid), and 0 if the file cannot be opened or read, or if it contains any
# malformed byte sequence.
sub file_is_valid_utf8 {
    my $f = shift;

    # The :raw layer matters: is_valid_utf8 must see the undecoded octets.
    open(my $fh, '<:raw', $f) or return 0;
    my $bytes = do { local $/; <$fh> };    # slurp the whole file
    close $fh;

    # readline yields undef if the read failed; treat that as "not valid".
    return 0 unless defined $bytes;
    return is_valid_utf8($bytes);
}

# is_valid_utf8($x)
#
# What's passed to this routine has to be a stream of bytes, not a decoded
# Perl string whose characters are already complete Unicode characters.
# That's why you typically want to call file_is_valid_utf8 rather than
# calling this directly.
#
# Returns 1 if $x consists solely of well-formed UTF-8 sequences, 0 otherwise.
# An empty (or undefined) argument is considered valid.
#
# Besides checking the lead-byte/continuation-byte structure, this rejects
# overlong encodings, encoded UTF-16 surrogates (U+D800..U+DFFF) and values
# above U+10FFFF, following the well-formed byte ranges of RFC 3629.
sub is_valid_utf8 {
    my $x = shift;
    return 1 unless defined $x && length $x;

    # One well-formed sequence (or a run of ASCII), anchored at the current
    # match position.  The second byte ranges are narrowed where needed to
    # exclude overlong forms, surrogates and out-of-range code points.
    my $well_formed = qr{
        \G (?:
            [\x00-\x7F]+                            # ASCII run
          | [\xC2-\xDF]         [\x80-\xBF]         # U+0080..U+07FF
          | \xE0    [\xA0-\xBF] [\x80-\xBF]         # U+0800..U+0FFF
          | [\xE1-\xEC\xEE\xEF] [\x80-\xBF]{2}      # U+1000..U+CFFF, U+E000..U+FFFF
          | \xED    [\x80-\x9F] [\x80-\xBF]         # U+D000..U+D7FF (no surrogates)
          | \xF0    [\x90-\xBF] [\x80-\xBF]{2}      # U+10000..U+3FFFF
          | [\xF1-\xF3]         [\x80-\xBF]{3}      # U+40000..U+FFFFF
          | \xF4    [\x80-\x8F] [\x80-\xBF]{2}      # U+100000..U+10FFFF
        )
    }x;

    # Walk the string one sequence at a time instead of matching a single
    # quantified group over the whole input; this keeps the regex engine's
    # work per match small regardless of how large the input is.
    # /c keeps pos() where the last successful match ended when a match fails.
    pos($x) = 0;
    1 while $x =~ /$well_formed/gc;

    # Valid only if the scan consumed every byte.
    return pos($x) == length($x) ? 1 : 0;
}