use strict;
use warnings;

# readbackwards_utf8($fn, $window)
#
# Returns an iterator (a code ref) that walks the UTF-8 encoded file $fn
# from its end towards its beginning.  Every call reads up to $window
# bytes and returns them as a decoded character string; chunks never
# split a multi-byte character.  Once the start of the file has been
# reached the file handle is closed and every further call returns
# nothing (undef in scalar context).
#
# Parameters:
#   $fn     - name of the file to read
#   $window - maximum number of bytes read per call; must be >= 4 so that
#             each chunk holds at least one complete character (a UTF-8
#             character is at most 4 bytes long)
#
# Dies when $window is too small, when the file cannot be opened, seeked
# or read, and when the file content is not well-formed UTF-8.
sub readbackwards_utf8 {
    my ($fn, $window) = @_;
    die "Bad window " . ( defined $window ? $window : 'undef' )
        unless defined $window && $window >= 4;
    open my $fh, '<:raw', $fn or die "open $fn: $!";
    my $curpos = -s $fh;    # offset of the first byte already handed out

    return sub {
        if ( $curpos < 1 ) {
            close $fh if $fh;
            $fh = undef;
            return;
        }

        my $bytes = $curpos < $window ? $curpos : $window;
        seek($fh, $curpos -= $bytes, 0) or die "seek $curpos $fn: $!";
        my $got = read($fh, my $buf, $bytes);
        die "read $bytes bytes at $curpos from $fn: $!"
            unless defined $got && $got == $bytes;

        # The chunk may begin in the middle of a character.  Drop leading
        # continuation bytes (0b10xxxxxx) and move $curpos forward so the
        # next call picks them up together with their lead byte.
        my $start   = $curpos;
        my $skipped = 0;
        while ( (ord(substr $buf, 0, 1) & 0b11000000) == 0b10000000 ) {
            # A character has at most 3 continuation bytes and cannot
            # start before offset 0.  Anything else is malformed, and
            # skipping it would leave $curpos unchanged, so the iterator
            # would return empty strings forever.
            die "Malformed UTF-8 in $fn near byte $curpos"
                if $start == 0 || $skipped >= 3;
            $buf = substr $buf, 1;
            $curpos++;
            $skipped++;
        }

        utf8::decode($buf)
            or die "Malformed UTF-8 in $fn near byte $curpos";
        return $buf;
    };
}

####

use open qw/:std :utf8/;
use Test::More;
use File::Temp qw/tempfile/;

# 16 bytes of UTF-8 mixing 1-, 2-, 3- and 4-byte characters.
my $text = "H\N{U+20AC}ll\N{U+00F6}, \N{U+1F5FA}!\n";

my ($tempfh, $filename) = tempfile( UNLINK => 1 );
binmode $tempfh, ':encoding(UTF-8)';
print {$tempfh} $text or die "print $filename: $!";
close $tempfh or die "close $filename: $!";
#system('hexdump','-C',$filename);

# Expected chunks, in the order the iterator returns them, per window size.
my @cases = (
    { windows => [4],
      chunks  => [ "!\n", "\N{U+1F5FA}", "\N{U+00F6}, ", "ll", "H\N{U+20AC}" ] },
    { windows => [5],
      chunks  => [ "!\n", " \N{U+1F5FA}", "ll\N{U+00F6},", "H\N{U+20AC}" ] },
    { windows => [6],
      chunks  => [ "\N{U+1F5FA}!\n", "ll\N{U+00F6}, ", "H\N{U+20AC}" ] },
    { windows => [7],
      chunks  => [ " \N{U+1F5FA}!\n", "ll\N{U+00F6},", "H\N{U+20AC}" ] },
    { windows => [8, 9],
      chunks  => [ ", \N{U+1F5FA}!\n", "H\N{U+20AC}ll\N{U+00F6}" ] },
    { windows => [10],
      chunks  => [ "\N{U+00F6}, \N{U+1F5FA}!\n", "H\N{U+20AC}ll" ] },
    { windows => [11],
      chunks  => [ "l\N{U+00F6}, \N{U+1F5FA}!\n", "H\N{U+20AC}l" ] },
    { windows => [12 .. 14],
      chunks  => [ "ll\N{U+00F6}, \N{U+1F5FA}!\n", "H\N{U+20AC}" ] },
    { windows => [15],
      chunks  => [ "\N{U+20AC}ll\N{U+00F6}, \N{U+1F5FA}!\n", "H" ] },
    { windows => [16, 17],
      chunks  => [ $text ] },
);

for my $case (@cases) {
    for my $n ( @{ $case->{windows} } ) {
        my $iter = readbackwards_utf8($filename, $n);
        my $i = 0;
        for my $expected ( @{ $case->{chunks} } ) {
            $i++;
            is $iter->(), $expected, "window $n: chunk $i";
        }
        is $iter->(), undef, "window $n: exhausted";
        is $iter->(), undef, "window $n: still exhausted on a repeated call";
    }
}

# Stray continuation bytes must raise an error rather than make the
# iterator return empty strings forever.
my ($badfh, $badname) = tempfile( UNLINK => 1 );
binmode $badfh, ':raw';
print {$badfh} "\x80" x 5 or die "print $badname: $!";
close $badfh or die "close $badname: $!";

my $bad = readbackwards_utf8($badname, 4);
ok( !eval { $bad->(); 1 }, 'malformed UTF-8 is rejected' );
like $@, qr/Malformed UTF-8/, 'malformed UTF-8 error message';

done_testing;