A program written in a hurry some time ago to munge file paths generally for file systems for Unix(-like) OSen & specifically for FAT32.
Learned the hard way that NTFS would allow file names to be written to FAT32 even if some characters are outside of the FAT32 specification. Problematic characters seemed to be en- & em-dash, fancy quotes, pipe, Unicode "?", & possibly a few others (web pages saved with title as the file name). Mounting the FAT32 file system on FreeBSD with specific codepage(s), or with the "nowin95" or "shortnames" mount options, did not help (mount_msdosfs(8)). Munging it was, then 🤷🏽‍♂️
# quick-sanename.pl
use strict;
use warnings;
use feature qw[ state ];
use File::Copy qw[ move ];
# Refuse to run without at least one path to work on.
@ARGV
    or die qq[Give a file name to sanitize.\n];

# Behaviour switches; edit to taste.
my $dry_run        = 0;    # true: only print the planned renames
my $noisy          = 1;    # true: print each rename
my $lowercase      = 1;    # true: lowercase the new base name (read by Cleanser::path_if_diff)
my $for_windows    = 1;    # true: also apply the Windows specific rules
my $clean_past_255 = 1;    # NOTE(review): not used anywhere in the visible code

# General cleansing of base names. Maps old path => new path, and only
# holds the paths that actually changed.
my %cleansed = run_cleanser( \&Cleanser::cleanse, @ARGV );

if ( $for_windows )
{
    if ( ! %cleansed )
    {
        %cleansed = run_cleanser( \&Cleanser::cleanse_for_windows, @ARGV );
    }
    else
    {
        # Work on the changes of general cleansing. Only existing keys are
        # assigned to, so iterating over a snapshot of the keys is safe.
        for my $old ( keys %cleansed )
        {
            my $again = Cleanser::cleanse_for_windows( $cleansed{ $old } )
                or next;
            $cleansed{ $old } = $again;
        }

        # Take care of those which were skipped during general cleansing.
        my @todo = grep { ! exists $cleansed{ $_ } } @ARGV;
        my %win_cleansed = run_cleanser( \&Cleanser::cleanse_for_windows, @todo );
        %cleansed = ( %cleansed, %win_cleansed );
    }
}

%cleansed
    or die qq[No new file names were generated.\n];

# Move file.
for my $old ( sort keys %cleansed )
{
    my $new = $cleansed{ $old };

    if ( $noisy || $dry_run )
    {
        printf qq['%s' -> '%s'\n] , $old, $new;
    }
    $dry_run and next;

    # Never overwrite something that is already there.
    if ( -e $new )
    {
        warn qq[Skipped rename of "$old", "$new" already exists.\n];
        next;
    }
    if ( ! move( $old, $new ) )
    {
        warn qq[Could not move "$old" to "$new": $!\n];
    }
}
exit;
# Applies the code reference C<$clean_type> to each of the given paths.
#
# Returns a hash (as a list) of old path => cleaned path, holding only the
# paths for which the cleanser returned a true value. Returns the empty
# list when no paths were given.
sub run_cleanser
{
    my ( $clean_type, @old_path ) = @_;
    return () unless @old_path;

    my %out;
    foreach my $path ( @old_path )
    {
        my $cleaned = $clean_type->( $path );
        $out{ $path } = $cleaned if $cleaned;
    }
    return %out;
}
BEGIN
{
package Cleanser;
use File::Basename qw[ fileparse ];
use File::Spec::Functions qw[ canonpath catfile ];
# Joins C<$dir> and C<$cleaned_base> into a canonical path and returns it
# when it differs from C<$old>; returns C<undef> otherwise. The base name is
# lowercased first when the file-level C<$lowercase> switch is true.
sub path_if_diff
{
    my ( $old, $dir, $cleaned_base ) = @_;

    my $base = $lowercase ? lc $cleaned_base : $cleaned_base;
    my $new  = canonpath( catfile( $dir, $base ) );

    return undef if $old eq $new;
    return $new;
}
# Returns a cleaned path if possible; else C<undef>.
#
# Substitutes various characters with "_" as minimally as possible. Only
# the base name is touched:
#
#   1. every run of characters other than word characters, digits and
#      the punctuation "-=,._{}()[]@" becomes a single "_";
#   2. a doubled bracket or punctuation character is reduced to one;
#   3. any remaining mix of punctuation next to "_", or a run of two or
#      more punctuation characters, collapses to "_".
#
# NOTE(review): the visible code never decodes the paths taken from @ARGV,
# so \w is matched against whatever string was passed in -- confirm that
# this is intended for multi-byte file names.
sub cleanse
{
    my ( $old_path ) = @_;

    # Yes, I do mean to keep any word & digit in any writing script (language).
    #state $alnum = 'a-zA-Z0-9';
    state $alnum = '\w\d';

    # The left brace is kept pre-escaped, the rest goes through quotemeta().
    state $left_brace = '\\{';
    state $covered    = q/}()[]/;
    state $meta       = $left_brace . quotemeta( $covered . q/@/ );
    state $punc       = q/-=,._/;
    state $no_keep    = qr/[^${punc}${alnum}${meta}]+/u;

    # One alternative per bracket or punctuation character. Each character
    # is escaped AFTER splitting; escaping first and splitting afterwards
    # separates every backslash from the character it belongs to.
    state $same = join q/|/,
        $left_brace,
        map { quotemeta $_ } split //, $covered . $punc;

    state $many_seq = qr/[${punc}]{2,}/;
    state $pre_seq  = qr/[${punc}]+_/;
    state $post_seq = qr/_[${punc}]+/;

    my ( $base, $dir ) = fileparse( $old_path );
    for ( $base )
    {
        s/$no_keep/_/g;

        # Collapse same.
        s/($same)\1/$1/g;

        # Collapse any sequence.
        s/$pre_seq/_/g;
        s/$post_seq/_/g;
        s/$many_seq/_/g;
    }
    return path_if_diff( $old_path, $dir, $base );
}
# Returns a cleaned path if possible; else C<undef>.
#
# It tries to keep a file path to the smaller set of characters allowed for
# files on Microsoft Windows. Only the base name is touched:
#
#   - a base name matching a reserved device name is passed through
#     prefix_windows_reserved();
#   - each run of characters above 0xFF becomes "^";
#   - a trailing period, or a trailing run of spaces, becomes "_";
#   - each reserved or control character becomes "-".
#
# The reserved names are ...
#
#   CON, PRN, AUX, NUL, COM0, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
#   COM8, COM9, LPT0, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, and
#   LPT9
#
# See https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file
# that lists
# ...
#   Use any character in the current code page for a name, including Unicode
#   characters and characters in the extended character set (128-255), except
#   for the following:
#
#     The following reserved characters:
#       < (less than)
#       > (greater than)
#       : (colon)
#       " (double quote)
#       / (forward slash)
#       \ (backslash)
#       | (vertical bar or pipe)
#       ? (question mark)
#       * (asterisk)
#
#     Integer value zero, sometimes referred to as the ASCII NUL
#     character.
#
#     Characters whose integer representations are in the range from 1
#     through 31, except for alternate data streams where these characters
#     are allowed
# ...
#   Do not use the following reserved names for the name of a file:
#
#     CON, PRN, AUX, NUL, COM0, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
#     COM8, COM9, LPT0, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, and
#     LPT9. Also avoid these names followed immediately by an extension; for
#     example, NUL.txt and NUL.tar.gz are both equivalent to NUL. For more
#     information, see Namespaces.
#
#   Do not end a file or directory name with a space or a period. Although
#   the underlying file system may support such names, the Windows shell and
#   user interface does not. However, it is acceptable to specify a period
#   as the first character of a name. For example, ".temp".
# ...
#
sub cleanse_for_windows
{
    my ( $old_path ) = @_;

    # Character class of the reserved characters plus the control characters
    # 0..31. Every member is escaped on its own; a bare backslash placed in
    # front of the NUL character would only escape that NUL and would
    # itself never be part of the class.
    state $bad_class = do {
        my @bad = ( split( //, q[<>:"|?*] . '\\' ), map { chr } 0 .. 31 );
        '[' . join( q[], map { quotemeta $_ } @bad ) . ']';
    };

    # Ordered [ pattern, replacement ] pairs, so that every run applies the
    # rules in the same sequence.
    state $rules = [
        [ qr/[^\x00-\xff]+/ => q[^] ],
        [ qr/(?:[.]|[ ]+)$/ => q[_] ],
        [ qr/$bad_class/    => q[-] ],
    ];

    my ( $base, $dir ) = fileparse( $old_path );
    $base = prefix_windows_reserved( $base );

    for my $rule ( @{ $rules } )
    {
        my ( $found, $repl ) = @{ $rule };
        $base =~ s{$found}{$repl}g;
    }
    return path_if_diff( $old_path, $dir, $base );
}
# Returns the base name prefixed with "_" if it matches a reserved word.
#
# A base name matches when it is one of the Windows device names (CON, PRN,
# AUX, NUL, COM0..COM9, LPT0..LPT9), in any letter case, either on its own
# or followed by a period and at least one more character. Any other base
# name is returned unchanged.
sub prefix_windows_reserved
{
    my ( $base ) = @_;

    # Prefix with "_".
    state $prefix = q[_];

    state $reserved = join( q[|],
        qw[ CON PRN AUX NUL
            COM0 COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9
            LPT0 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9
        ]
    );
    state $regex = qr/^ ( (?: $reserved ) (?: [.] .+ )? ) $/xi;

    $base =~ s{$regex}{$prefix$1};
    return $base;
}
}
Many emoji have embedded characters which are difficult, or impossible, to see;
for example, zero-width joiners, variation selectors, skin tone modifiers.
In some cases, glyphs are so similar that it's difficult to tell them apart; e.g. 🧑 & 👨.
I wrote uparse to split emoji, strings containing emoji, and in fact any strings with Unicode characters,
into their component characters.
#!/usr/bin/env perl
# Bail out at compile time when the interpreter is too old or when no
# strings were given on the command line.
BEGIN {
    if ($] < 5.007003) {
        warn "$0 requires Perl v5.7.3 or later.\n";
        exit;
    }

    unless (@ARGV) {
        warn "Usage: $0 string [string ...]\n";
        exit;
    }
}

use 5.007003;
use strict;
use warnings;

# All I/O, including the standard streams, is UTF-8.
use open IO => qw{:encoding(UTF-8) :std};

use constant {
    SEP1 => '=' x 60 . "\n",
    SEP2 => '-' x 60 . "\n",
    FMT => "%s\tU+%-6X %s\n",                   # glyph, code point, name
    NO_PRINT => "\N{REPLACEMENT CHARACTER}",    # shown for non-printable characters
};

use Encode 'decode';
use Unicode::UCD 'charinfo';

# Print one section per command line argument, one line per character.
for my $raw_str (@ARGV) {
    my $str = decode('UTF-8', $raw_str);

    print "\n", SEP1;
    print "String: '$str'\n";
    print SEP1;

    for my $char (split //, $str) {
        my $code_point = ord $char;
        my $char_info = charinfo($code_point);

        # charinfo() gave nothing for this code point: report the Unicode
        # version this perl knows about instead of a name.
        my $name = defined $char_info
            ? $char_info->{name}
            : "<unknown> Perl $^V supports Unicode "
                . Unicode::UCD::UnicodeVersion();

        printf FMT, ($char =~ /^\p{Print}$/ ? $char : NO_PRINT),
            $code_point, $name;
    }

    print SEP2;
}
Here's a number of example runs.
All use <pre> blocks;
a very few didn't need this but I chose to go with consistency.
Works with ASCII (aka Unicode: C0 Controls and Basic Latin)
$ uparse X XY "X Z"
============================================================
String: 'X'
============================================================
X U+58 LATIN CAPITAL LETTER X
------------------------------------------------------------
============================================================
String: 'XY'
============================================================
X U+58 LATIN CAPITAL LETTER X
Y U+59 LATIN CAPITAL LETTER Y
------------------------------------------------------------
============================================================
String: 'X Z'
============================================================
X U+58 LATIN CAPITAL LETTER X
� U+9 <control>
Z U+5A LATIN CAPITAL LETTER Z
------------------------------------------------------------
$ uparse 🇨🇭
============================================================
String: '🇨🇭'
============================================================
🇨 U+1F1E8 REGIONAL INDICATOR SYMBOL LETTER C
🇭 U+1F1ED REGIONAL INDICATOR SYMBOL LETTER H
------------------------------------------------------------
Handles codepoints not yet assigned; or not supported with certain Perl versions
$ uparse `perl -C -e 'print "X\x{1fa7c}X"'`
============================================================
String: 'X🩼X'
============================================================
X U+58 LATIN CAPITAL LETTER X
🩼 U+1FA7C CRUTCH
X U+58 LATIN CAPITAL LETTER X
------------------------------------------------------------
$ uparse `perl -C -e 'print "X\x{1fa7c}X"'`
============================================================
String: 'X🩼X'
============================================================
X U+58 LATIN CAPITAL LETTER X
� U+1FA7C <unknown> Perl v5.30.0 supports Unicode 12.1.0
X U+58 LATIN CAPITAL LETTER X
------------------------------------------------------------
$ uparse `perl -C -e 'print "X\x{1fa7d}X"'`
============================================================
String: 'XX'
============================================================
X U+58 LATIN CAPITAL LETTER X
� U+1FA7D <unknown> Perl v5.39.3 supports Unicode 15.0.0
X U+58 LATIN CAPITAL LETTER X
------------------------------------------------------------
This is my somewhat generic framework to process mails in specific folders in Outlook. The concrete use case here is to find and save PDFs that haven't been processed yet.
The script could also move mails or even reply to them, but the intention is to co-exist with human users of this shared mailbox, so the script scans several mail folders for files with an unknown name.
For more information on the object model (and especially the MailItem and Folder class), see the MS Outlook object model.
#!perl
use 5.020;
use feature 'signatures';
no warnings 'experimental::signatures';
use Getopt::Long;
use utf8;
use File::Basename 'dirname';
use File::Spec;
use Win32::OLE 'in';
use Win32::OLE::Const 'Microsoft Outlook';
use Win32::OLE::Variant;
use Scalar::Util 'blessed';
use Encode 'encode', 'decode';
use POSIX 'strftime';
#use PDFContents::Cache;
# We output UTF-8
system('chcp 65001 >NUL:');
binmode STDOUT, ':encoding(UTF-8)';
local $| = 1;

GetOptions(
    'quick' => \my $quick_run,
    'target-directory|t=s' => \my $target_dir,
);

# Default to the INPUT directory next to the script and make the path
# absolute. $target_dir is declared by the GetOptions() call above, so it
# is assigned here rather than declared a second time.
$target_dir ||= dirname($0) . "/INPUT";
$target_dir = File::Spec->rel2abs( $target_dir );

# Attach to a running Outlook, or start one that is quit on destruction.
my $outlook = Win32::OLE->GetActiveObject('Outlook.Application')
    || Win32::OLE->new('Outlook.Application', 'Quit');
my $namespace = $outlook->GetNamespace("MAPI");
#my $Folder = $namespace->GetDefaultFolder(olFolderInbox);

# Output some folder names for debugging
#for my $f (in($namespace->Folders->{"#MAGIC-MAILBOX"}->Folders->{Posteingang}->Folders)) {
#    #say "Posteingang!" . $f->{Name};
#}
# Prints $info as a one-line status on STDOUT. The text of the previous
# call is first blanked out with as many spaces as it was long, and the
# cursor is moved back with carriage returns. Returns $info.
sub progress( $info ) {
    state $last_progress;
    my $blank = " " x length($last_progress);
    print $blank . "\r" . $info . "\r";
    $last_progress = $info;
}
# Resolves a "!"-separated folder path below the "#MAGIC-MAILBOX" store of
# the file-level $namespace and returns the folder object.
#
# When a path element cannot be found, a warning is issued, the names of
# the folders that do exist at that level are printed, and undef is
# returned right away instead of walking on with an undefined folder.
sub find_folder($path) {
    my $folder = $namespace->Folders->{"#MAGIC-MAILBOX"};
    for my $el (split /!/, $path) {
        # The folder lookup is done with the Latin-1 encoded name.
        $el = encode('Latin-1', $el);
        my $next_folder = $folder->Folders->{$el};
        if( ! $next_folder ) {
            warn "No folder found for '$el' in '$path'";
            for( in($folder->Folders) ) {
                say "<$_->{Name}>";
            };
            # Always hand back exactly one value, so that a call used as a
            # function argument still fills its slot.
            return undef;
        };
        $folder = $next_folder;
    };
    return $folder;
}
# Read all PDFs we already rejected. The list stays empty, with a warning,
# when the "rejected" directory cannot be opened.
our @blacklist;
if( opendir my $dh, "$target_dir/rejected" ) {
    @blacklist = readdir $dh;
    closedir $dh;
} else {
    warn "Cannot read directory '$target_dir/rejected': $!";
}
# Iterates over the items of a folder.
#
# Calls $callback with each item of $folder->Items, walking the collection
# with GetFirst/GetNext, and returns the sum of the callback results. An
# undefined callback result counts as 0, and 0 is returned for a folder
# without items.
sub for_all_mails( $folder, $callback ) {
    my $list = $folder->Items;
    if( ! $list ) {
        return 0;
    };

    my $count = 0;
    my $msg = $list->GetFirst;
    while( $msg ) {
        $count += $callback->($msg) // 0;
        $msg = $list->GetNext;
    }
    return $count;
}
# Saves the PDF attachments of one mail into $target_directory (the
# file-level $target_dir by default) and returns the number of files
# written.
#
# An attachment is skipped when its file name does not end in ".pdf"
# (case-insensitive), when the name is listed in @blacklist, or when a
# non-empty file of that name already exists in the target directory.
sub save_mail_attachments( $msg, $target_directory=$target_dir ) {
    my $saved = 0;
    foreach my $atch (reverse in($msg->{Attachments})) {
        my $name = $atch->{FileName};

        # The dot is escaped: only a real ".pdf" extension qualifies.
        next unless defined $name and $name =~ m/\.pdf\z/i;

        #say "Überspringe $_" if grep { $_ eq $name } @blacklist;
        next if grep { $_ eq $name } @blacklist;

        my $target = $target_directory . "/" . $name;
        if( ! -f $target or ! -s $target) {
            $atch->SaveAsFile($target);
            $saved++;
        } else {
            #say "Already exists ".$atch->{FileName};
        }
    }
    return $saved;
}
# Shows the folder name as progress, then runs save_mail_attachments() on
# every mail of the folder. Returns whatever for_all_mails() returns.
sub save_attachments( $folder ) {
    my $name = $folder->Name;
    progress($name);
    return for_all_mails( $folder, \&save_mail_attachments );
}
# Calls $callback with $folder and then, recursively, with every folder
# below it. The subfolders of one level are visited from the last one to
# the first one (GetLast/GetPrevious).
#
# $visual carries the " > "-joined names of the folders walked so far; it
# is only handed down to the recursive calls.
sub in_all_subfolders( $folder, $callback, $visual=$folder->Name ) {
    $callback->($folder);
    #for my $subfolder (grep { defined } $folder->Folders) {
    my $folders = $folder->Folders;
    my $subfolder = $folders->GetLast;
    while( $subfolder ) {
        in_all_subfolders( $subfolder, $callback, $visual . ' > ' . $subfolder->Name );
        $subfolder = $folders->GetPrevious;
    };
}
my $count = 0;
my $Folder = find_folder("Posteingang!incoming stuff");
#for my $f (in ($Folder->Folders)) {
#    say join "/", $Folder->{Name}, $f->{Name};
#};

# Find a folder named "from Somebody", but as a substring, since it might
# contain Umlauts or whatever
for my $f (in ($Folder->Folders)) {
    #say join "/", $Folder->{Name}, $f->{Name};
    if( $f->Name =~ m!from Somebody$! ) {
        $Folder = $f;
        last;
    };
};

$count += save_attachments( $Folder );

if( $quick_run ) {
    # nothing to do
} else {
    in_all_subfolders( $Folder, sub( $this_folder ) {
        $count += save_attachments($this_folder);
    });

    $count += save_attachments( find_folder("Posteingang"));
    $count += save_attachments( find_folder("Posteingang!to-sort"));
    $count += save_attachments( find_folder("Posteingang!to-sort-later"));

    for my $folder (in(find_folder('Posteingang!in-progress')->Folders)) {
        progress( $folder->Name );
        $count += save_attachments( $folder );
    }

    for my $folder (reverse in(find_folder('Posteingang!by-ticket-number')->Folders)) {
        in_all_subfolders( $folder, sub( $this_folder ) {
            $count += save_attachments($this_folder);
        });
    }
}

# Second pass: look at the PDF attachments below one specific folder.
my $ts = strftime '%Y-%m-%dT%H:%M:%S', localtime;
in_all_subfolders( find_folder("Posteingang!some!deep!subfolder"), sub($folder) {
    my $foldername = $folder->{Name};
    #progress($foldername);

    # Per-folder mail counter for the progress line; deliberately a separate
    # variable from the file-level $count reported at the end.
    my $seen;
    for_all_mails( $folder, sub( $msg ) {
        progress( "$foldername - $seen" );
        $seen++;
        for my $att (reverse in($msg->{Attachments})) {
            my $id = $msg->{EntryId};
            my $fn = $att->{FileName};

            # Skip this attachment only; the remaining attachments of the
            # mail are still looked at.
            next unless $fn =~ /\.pdf\z/i;

            # process the PDF contents
            # PDFContents::Cache::add_mailinfo($foldername, $fn, $id, $ts);
        }
        1
    });
});

progress("");
say "$count new PDFs found";
Many epubs come with unprofessional CSS that will not display correctly on some ebook readers. For instance, the font size may be illegibly small on a mobile device, or the user may have dark mode turned on, but the CSS specifies element foreground colors according to an assumed (but not specified) white background, so there is little or no contrast with the actual black background. I recently wrote a script to detect epubs with those problems, then one to detect and fix them.
My first attempt at this used EPUB::Parser, but I soon found that it didn't (as far as I could tell) have the functionality I needed to get at the internal CSS files and edit them. So I fell back on Archive::Zip (which EPUB::Parser uses) -- an epub is a zip file containing css, html, and xml files (and sometimes jpg's, etc.).
Here, I present two of the trickier functions; inverse_color() is passed a CSS color value of some kind (which can be a wide array of formats), calculates a complementary color, and returns it. It makes use of functions from Graphics::ColorUtils to map CSS color names to rgb values. It is called by fix_css_colors() when it finds a CSS block containing a color: attribute but no background-color: attribute.
# Returns a CSS color that is the complement of the CSS color value given.
#
# Handled forms: #rgb, #rrggbb, rgb(r, g, b), rgba(r, g, b, a),
# hsl(h, s%, l%), hsla(h, s%, l%, a), "inherit" (returned as is) and color
# names known to available_names()/name2rgb(), tried both plain and with a
# "www:" prefix. The answer uses "#rrggbb" for rgb() and named colors and
# the form of the input otherwise.
#
# Dies when no argument is given, when a listed color name cannot be
# resolved, or when the format is not implemented. For "currentcolor" a
# warning is issued and undef is returned.
#
# NOTE(review): the alpha of rgba()/hsla() is inverted too (1 - alpha), so a
# fully opaque input gives a fully transparent result -- confirm that this
# is wanted.
sub inverse_color {
    my $color = shift;
    die "Missing argument to inverse_color()" unless $color;

    # The table of known color names is fetched once.
    state $color_names;
    if ( not $color_names ) {
        #set_default_namespace("www");
        $color_names = available_names();
    }

    $color =~ s/^\s+//;
    $color =~ s/\s+$//;

    if ( $color =~ /^#[[:xdigit:]]{3}$/ ) {
        $color =~ s/#//;
        my $n = hex $color;
        my $i = 0xFFF - $n;
        my $inverse = sprintf "#%03x", $i;
        return $inverse;
    } elsif ( $color =~ /^#[[:xdigit:]]{6}$/ ) {
        $color =~ s/#//;
        my $n = hex $color;
        my $i = 0xFFFFFF - $n;
        my $inverse = sprintf "#%06x", $i;
        return $inverse;
    } elsif ( $color =~ /rgb \s* \( \s* ([0-9]+) \s* , \s* ([0-9]+) \s* , \s* ([0-9]+) \s* \) /x ) {
        my ($r, $g, $b) = ($1, $2, $3);
        my $n = $r * 65536 + $g * 256 + $b;
        printf "converted %s to %06x\n", $color, $n if $verbose;
        my $i = 0xFFFFFF - $n;
        my $inverse = sprintf "#%06x", $i;
        return $inverse;
    } elsif ( $color =~ /rgba \s* \( \s* ([0-9]+) \s* , \s* ([0-9]+) \s* , \s* ([0-9]+) \s* , \s* ([0-9.]+) \s* \) /x ) {
        my ($r, $g, $b, $alpha) = ($1, $2, $3, $4);
        my $inverse = sprintf "rgba( %d, %d, %d, %0.2f )",
            255 - $r, 255 - $g, 255 - $b, 1 - $alpha;
        return $inverse;
    } elsif ( $color =~ /hsl \s* \( \s* ([0-9]+) \s* , \s* ([0-9]+)% \s* , \s* ([0-9]+)% \s* \) /x ) {
        my ( $hue, $saturation, $lightness ) = ($1, $2, $3);
        my $hue2 = ($hue + 180) % 360;    # opposite side of the color wheel
        my $sat2 = 100 - $saturation;
        my $light2 = 100 - $lightness;
        my $inverse = sprintf "hsl( %d, %d%%, %d%% )", $hue2, $sat2, $light2;
        return $inverse;
    } elsif ( $color =~ /hsla \s* \( \s* ([0-9]+) \s* , \s* ([0-9]+)% \s* , \s* ([0-9]+)% \s* , \s* ([0-9.]+) \s* \) /x ) {
        my ( $hue, $saturation, $lightness, $alpha ) = ($1, $2, $3, $4);
        my $hue2 = ($hue + 180) % 360;
        my $sat2 = 100 - $saturation;
        my $light2 = 100 - $lightness;
        my $alpha2 = 1 - $alpha;
        # Four components are written, so the function name is hsla().
        my $inverse = sprintf "hsla( %d, %d%%, %d%%, %0.2f )", $hue2, $sat2, $light2, $alpha2;
        return $inverse;
    } elsif ( $color =~ /currentcolor/i ) {
        warn "Should have removed currentcolor in fix_css_colors()";
        # Do not let the return value of warn() pass for a color.
        return undef;
    } elsif ( $color =~ /inherit/i ) {
        return "inherit";
    } elsif ( $color_names->{ "www:". $color} or $color_names->{ $color} ) {
        my $hexcolor = name2rgb( $color );
        if ( not $hexcolor ) {
            $hexcolor = name2rgb( "www:" . $color );
            if ( not $hexcolor ) {
                die "Can't resolve color name $color";
            }
        }
        $hexcolor =~ s/#//;
        my $i = 0xFFFFFF - hex($hexcolor);
        my $inverse = sprintf "#%06x", $i;
        return $inverse;
    } else {
        die "Color format not implemented: $color";
    }
}
# Adds a contrasting background-color to every CSS block that sets color:
# without setting background-color:, and strips "color: currentcolor".
#
# $csstext is the text of one CSS file; $css_fn and $epub_fn are only used
# in the verbose messages. Returns the corrected CSS text, or undef when
# there was nothing to correct. Returns nothing for empty input.
sub fix_css_colors {
    my ($csstext, $css_fn, $epub_fn) = @_;
    return if not $csstext;

    my $corrections = 0;

    say "Checking $epub_fn:$css_fn for bad colors\n" if $verbose;

    # The closing braces are kept as separate elements, so that joining the
    # pieces again gives back the whole text.
    my @css_blocks = split /(})/, $csstext;
    for my $block ( @css_blocks ) {
        # The lookbehind keeps properties such as border-color: or
        # background-color: from being taken for the color: property.
        if ( $block =~ m/ (?<![-\w]) color: \s* ( [^;]+ ) \s* (?:;|$) /x ) {
            my $fgcolor = $1;
            print "found color: $fgcolor\n" if $verbose;

            if ( $fgcolor =~ m/currentcolor/i ) {
                # Report and count only what was really removed.
                if ( $block =~ s/( (?<![-\w]) color: \s* currentcolor \s* ;? \s* ) \n* //xi ) {
                    print "Stripping out $1 as it is a pleonasm\n" if $verbose;
                    $corrections++;
                }
                next;
            }

            if ( $block !~ m/background-color:/ ) {
                my $bgcolor = inverse_color( $fgcolor );
                $block =~ s/( (?<![-\w]) color: \s* [^;}]+ \s* (?:;|$) )/background-color: $bgcolor;\n$1/x;
                print "corrected block:\n$block\n}\n" if $verbose;
                $corrections++;
            }
        }
    }

    if ( $corrections ) {
        my $new_css_text = join "", @css_blocks;
        return $new_css_text;
    } else {
        return undef;
    }
}
A while ago I wrote a podcatcher in Perl. In the last few days I've finally gotten around to cleaning it up a bit, finishing the documentation, and getting it out where people can use it (on my website for now -- maybe I'll try to submit it to CPAN at some point).
The full code (and associated files) can be found at http://jimhenry.conlang.org/software/podcatcher.zip and the documentation (including per-function summaries) at http://jimhenry.conlang.org/software/podcatcher.html
Here, I'll just briefly discuss one of the functions that gave me some trouble, given the variety of podcast RSS feeds out there and how weirdly (sometimes invalidly) formatted some of them are.
This function is passed an RSS feed as a single string and attempts to extract the podcast episode URLs from it. First it tries to parse the RSS using XML::RSS::LibXML. Then, if that worked, it tries to find episodes in <enclosure> tags, then if that fails, it tries looking in <media:content> tags. If it failed to parse the RSS file, or if it parsed and failed to find any podcasts in the appropriate tags, it does a brute force regular expression match on the whole RSS file to find anything that starts with http and ends with one of the file extensions we're looking for (which is configurable).
# Extracts the episode URLs from an RSS feed passed as a single string.
#
# The feed is parsed with XML::RSS::LibXML. For every item the enclosure
# URL is tried first and the media:content URL second; a URL counts only
# when it ends in $extension_regex. With $config{description} set, the
# item's title and description are stored along with the URL.
#
# When parsing fails or gives no episode, every http... link in the text
# that ends in $extension_regex is taken instead.
#
# Returns a reference to an array of hashes with the key "url" (and maybe
# "title" and "description"), or undef when nothing was found at all.
sub get_mp3_links_from_string {
    my $pagecontent = shift;
    my @episodes;

    my $parser = XML::RSS::LibXML->new;

    # The block evaluates to 1 only when parse() did not die.
    my $parsed_ok = eval { $parser->parse($pagecontent); 1 };
    if ( not $parsed_ok ) {
        writelog( "Could not parse page as XML/RSS: $@\n" );
        $parser = undef;
    }

    if ( $parser ) {
        foreach my $item (@{ $parser->{items} }) {
            my $url;

            if ( defined $item->{enclosure} ) {
                my $candidate = $item->{enclosure}{url};
                $url = $candidate
                    if $candidate and $candidate =~ m!$extension_regex$!;
            }
            if ( not $url and defined $item->{media} ) {
                my $candidate = $item->{media}{content}{url};
                $url = $candidate
                    if $candidate and $candidate =~ m!$extension_regex$!;
            }
            next if not $url;

            # Title and description go into the very hash that is pushed,
            # never into whatever happens to be last in @episodes.
            my %episode = ( url => $url );
            if ( $config{description} ) {
                $episode{title} = $item->{title};
                $episode{description} = $item->{description};
            }
            push @episodes, \%episode;
        } # end for each <item>
    } # end if we have a valid parse

    unless ( @episodes ) {
        writelog( "Found no $config{extensions} files by parsing XML, checking via regex for any $config{extensions} links in any context\n" );
        my @mp3s = uniq ( $pagecontent =~ m/(http[^\s>]+$extension_regex)/gi );
        return undef unless ( @mp3s );
        foreach ( @mp3s ) {
            push @episodes, { url => $_ };
        }
    }

    return \@episodes;
}
The MCE Sandbox repository is where I try writing fast code using Perl MCE + Inline::C, Math::Prime::Util, and the C/C++ libprimesieve library. The demos and examples folders are new for the 2023 update. I learned Codon, a Python-like language that compiles to native code.
.Inline/ Where Inline::C is configured to cache C object file
+s.
bin/
algorithm3.pl Practical sieve based on Algorithm3 from Xuedong Luo
+ [1].
primesieve.pl Calls the primesieve.org C API for generating primes
+.
primeutil.pl Utilizes the Math::Prime::Util module for primes.
demos/
primes1.c Algorithm3 in C with OpenMP directives.
primes2.codon Algorithm3 in Codon, a Python-like language.
primes3.c Using libprimesieve C API in C
primes4.codon Using libprimesieve C API in Codon
examples/ Progressive demonstrations.
practicalsieve.c single big loop
segmentsieve.c segmented variant, faster
rangesieve.c process range; start stop
prangesieve.c parallel rangesieve in C
cpusieve.codon parallel rangesieve in Codon (CPU)
gpusieve.codon parallel rangesieve in Codon (GPU)
pgpusieve.codon using Codon @par(gpu=True) syntax
cudasieve.cu using NVIDIA CUDA Toolkit
lib/
Sandbox.pm Common code for the bin scripts.
CpuAffinity.pm CPU Affinity support on Linux.
src/
algorithm3.c Inline::C code for algorithm3.pl.
bits.h Utility functions for byte array.
output.h Fast printing of primes to a file descriptor.
primesieve.c Inline::C code for primesieve.pl.
sandbox.h Header file, includes bits.h, output.h, sprintull.h.
sprintull.h Fast base10 to string conversion.
typemap Type-map file for Inline::C.
The following code was tested on Windows 11 using the 64-bit version of Strawberry Perl 5.30.3. It was assembled by extracting the relevant bits from the Win32::FileOp module and making a simple change to account for the fact that I am using a 64-bit version of perl.
use strict;
use warnings;
use Win32::API;
# Operation code and flag values handed to SHFileOperation.
use constant {
    FO_DELETE          => 0x03,
    FOF_SILENT         => 0x0004,    # don't create progress/report
    FOF_NOCONFIRMATION => 0x0010,    # Don't prompt the user.
    FOF_ALLOWUNDO      => 0x0040,    # recycle bin instead of delete
    FOF_NOERRORUI      => 0x0400,    # don't put up error UI
};
# Sends the given paths to the Recycle Bin through shell32's
# SHFileOperation and returns the value of that call.
#
# The operation is FO_DELETE with the flags FOF_ALLOWUNDO,
# FOF_NOCONFIRMATION, FOF_SILENT and FOF_NOERRORUI. Dies when the API
# function cannot be imported.
sub Recycle
{
    # a series of null-terminated pathnames, with a double null at the end
    my $paths = join "\0", @_, "\0";

    my $recycle = Win32::API->new('shell32', 'SHFileOperation', 'P', 'I')
        or die "Cannot import SHFileOperation from shell32: $^E\n";

    my $options = FOF_ALLOWUNDO | FOF_NOCONFIRMATION | FOF_SILENT | FOF_NOERRORUI;

    # for everything except paths and options, pack with Q (rather than L),
    # since we're using 64-bit perl
    # my $opstruct = pack ('LLpLILLL', 0, FO_DELETE, $paths, 0, $options, 0, 0, 0);
    my $opstruct = pack ('QQpQIQQQ', 0, FO_DELETE, $paths, 0, $options, 0, 0, 0);

    return $recycle->Call($opstruct);
}
# Demo: recycle one path and show the return code of the call.
my $file = 'C:\Users\James\fish';
my $rc = Recycle($file);
printf "RC: %s\n", $rc;
A few days ago I played around with displaying (color) ASCII art in a Terminal in Re: 80x25 ASCII text art in terminal, because harangzsolt33 piqued my interest. i mentioned that it should be possible to display low res color images in the text console as well and that i would look into it if someone was interested.
Turns out, the first interested party was myself. Literally a couple of hours after i posted, i had to sort through some PNG icons through an SSH connection. "Instead of downloading the folder, opening the files locally and finding the correct icon, wouldn't it be nice to just display a low res version in my terminal?". Yes, i know there are technically a few other tools that can already do this. But i decided i wanted a Perl version, so that i can easily customize it to my liking. I wanted to build it in a way that it ONLY uses very basic ANSI colors, to support as many color terminals as possible (and as long as they support Unicode).
Had a slight problem posting the original code to PerlMonks. The whole @shades initialization is a single line in my original code, but PM refused to show Unicode in code tags. Basically, this is what it should look like (that is, unless there are more PM rendering bugs):
my @shades = (' ', '░', '▒', '▓', '█');
Yes, this could be improved with using full RGB colors and 2 "pixels" per character using something like 'Upper half block ▀' for a higher resolution. But for now, i just wanted to learn if i can do a version with much more basic color support. HSV color mapping is a strange beast...Edit: I wrote the full color, double-vertical resolution imagecat2, see my post below.
As they were created and maintained manually, over the years the descriptions of each code of the economic classification of expenditures and revenues were becoming polluted. For example, for one year the description for code 20 is "Automotive repairments", for other year the same code has "Auto repairs", for other year it has "Vehicle maintenance", and so on. Although most of the time the descriptions match, there are differences between years. Not just word differences, also abbreviations, accents, lower-uppercase, blanks, etc...
Unfortunately, all the values for one field (column) are composed of the concatenation of the code and the description, e.g. "20.- Vehicle maintenance". There aren't two separate fields for code and description. This way, it is hard to create pivot tables and such things by people who don't know how to program.
Task
Normalize the values (strings composed of code and description) of a specific field. Write a program showing to the user all the codes for which the associated description differ between years. Also, as a suggestion present the most recent (by year) code+description string (assuming it is the "best", more accurate, more descriptive, ...). Let the user interactively choose from all the options shown or introduce a new description.
Once finished, write out a CSV file with just one column containing the normalized values. This file can then be used to easily replace the whole column in the original input CSV file by using a spreadsheet app, like LibreOffice Calc or MS Excel.
Example session (target column 12):
$ raku clean_class.raku -t=12 PPP_INC_2014-2021_Liq_20230424.csv
Read rows: 4139
WARNING: unexpected separator: 1
WARNING: empty txt: 1
Processed rows: 4139
1. Impost sobre la renda
2021
2. Sobre la renda
2014 2015 2016 2017 2018 2019 2020
[code: 10 remaining: 12] Which one (1-2)[1]:
1. Sobre transmissions patrimonials i actes jurídics documentats
2014 2015 2016 2017 2018 2019 2020
2. Transmissions patrimonials i actes jurídics documentats
2021
[code: 20 remaining: 11] Which one (1-2)[2]: 2
1. De l'Administració General de l'Estat
2020 2021
2. De l'Estat
2014 2015 2016 2017 2018 2019
[code: 40 remaining: 10] Which one (1-2)[1]:
1. D'empreses públiques i d'altres ens públics de la C.
2020
2. Del Sector Públic Instrumental i altres ens del Sector Públic de la
+ Comunitat
2021
3. Del sector públic instrumental i d'altres ens públics de la C.
2014 2016 2017 2018 2019
[code: 44 remaining: 9] Which one (1-3)[2]:
...
As a bonus, as user input accept also a kind of "class" specification. For example, "1,3:4;2:6". That means, replace options 1 and 3 with option 4 and independently replace option 2 with option 6 (ignoring the other options shown).
Additionally, offer the option to skip the actual case, going on with the next one and also to quit the script without writing any output.
I recently made a UTC clock script using Tk::LCD. It simulates the look of a 7-segment LCD display. The elements/digits are designed around a 22x36 pixel block(large) and an 11x18 pixel block(small). In using this package, I determined that the digits were too small and that leading zeros weren't displayed. I implemented an option for displaying leading zeros and another for scaling the elements to an arbitrary multiple(where 1 equals the original large size). I plan a separate post to discuss these changes further.
This post concerns a test script for adding support for special characters in this case the : (colon). Currently Tk::LCD only supports numbers, minus, and space. This script draws a : within a 22x36 pixel block and provides for scaling to an arbitrary multiple.
The challenge of this script was in returning lists from a callback. While I came across the solution(call by reference and the $_[0] construct) fairly quickly the implementation was not obvious to me. The result is shown below.
I plan to integrate this code into my version of Tk::LCD to allow display of an HH:MM format. Other special characters could be implemented in a similar way.
Update1: colon2.pl listed below includes changes based on comments to colon1.pl. Thanks to all who provided comments.
#! /usr/bin/perl
# colon2.pl - Draw a scalable : (colon) on a canvas
#             Test script for a planned addition to Tk::LCD.pm
#             Tk::LCD defines elements within a 22 x 36 pixel rectangle
#             The colon is drawn as two circles within this rectangle
#
#             @Base shapes are scaled and moved into @scaled shapes for display
#             Clicking the Next or Previous button rescales
#             and redraws the screen
#
# James M. Lynes, Jr. - KE4MIQ
# Created: March 14, 2023
# Last Modified: 03/14/2023 - Initial Version
#                03/15/2023 - First working version
#                03/17/2023 - Updated with PerlMonks comments
#
# Environment: Ubuntu 22.04LTS
#
# Notes: Install Perl Tk and non-core modules
#        sudo apt update
#        sudo apt install perl-tk
use strict;
use warnings;
use Tk;

my @baseBox = (0, 0, 22, 0, 22, 36, 0, 36);    # Base Rectangle bounding box
my @baseTopColon = (8, 9, 14, 15);             # Base Circle bounding box
my @baseBotColon = (8, 21, 14, 27);            # Base Circle bounding box
my @scaledBox;                                 # Scaled Rectangle
my @scaledTopColon;                            # Scaled Circle Top
my @scaledBotColon;                            # Scaled Circle Bottom
my $scale = 1;                                 # Base scale factor

scale(\@scaledBox, \@scaledTopColon, \@scaledBotColon);    # Initial scaling

# Define the Widgets
my $mw = MainWindow->new();
my $f1 = $mw->Frame;
my $bnext = $f1->Button(-text => 'Next',
                        -command => \&next)
                ->pack(-side => 'left');
my $bprev = $f1->Button(-text => 'Previous',
                        -command => \&previous)
                ->pack(-side => 'left');
# NOTE(review): 'Ariel' is presumably meant to be 'Arial' -- confirm.
my $label = $f1->Label(-text => 'Scale:',
                       -font => ['Ariel', 10])
                ->pack(-side => 'left');
my $txt = $f1->Text(-height => 1,
                    -width => 1,
                    -font => ['Ariel', 10])
                ->pack(-side => 'left');
my $bexit = $f1->Button(-text => 'Exit',
                        -command => sub{exit})
                ->pack(-side => 'left');
$txt->insert(0.1, "$scale");
$f1->pack(-side => 'bottom');

my $canvas = $mw->Canvas()->pack;

$mw->repeat(500, \&redraw);                    # Redraw, .5 sec cycle

MainLoop;
# Scale the box and colon circles
#
# Takes references to the three arrays that receive the scaled shapes and
# fills each of them with the coordinates of its base shape multiplied by
# the file-level $scale. The coordinates are stored as a flat list, not as
# one nested array reference.
sub scale {
    my($bx, $tc, $bc) = @_;

    @$bx = map {$_ * $scale} @baseBox;          # Scale elements
    @$tc = map {$_ * $scale} @baseTopColon;
    @$bc = map {$_ * $scale} @baseBotColon;

    return;
}
# Timer callback: wipe the canvas and repaint the box and both colon dots
# from the current contents of the @scaled* arrays.
sub redraw {
    $canvas->delete('all');

    # Background rectangle first, then the two dots on top of it.
    $canvas->createPolygon(@scaledBox, -fill => 'darkgreen');
    for my $dot (\@scaledTopColon, \@scaledBotColon) {
        $canvas->createOval(@{$dot}, -fill => 'yellow');
    }

    return;
}
# Button callback: raise the scale factor by one (capped at 5), rescale the
# shapes, and show the new factor in the text widget.
sub next {
    $scale++ if $scale < 5;
    scale(\@scaledBox, \@scaledTopColon, \@scaledBotColon);

    # Text indices are "line.char" with lines numbered from 1, so the widget
    # content starts at '1.0' (a string, so the ".0" is not lost).
    $txt->delete('1.0', 'end');
    $txt->insert('1.0', $scale);

    return;
}
# Button callback: lower the scale factor by one (never below 1), rescale the
# shapes, and show the new factor in the text widget.
sub previous {
    $scale-- if $scale > 1;
    scale(\@scaledBox, \@scaledTopColon, \@scaledBotColon);

    # Text indices are "line.char" with lines numbered from 1, so the widget
    # content starts at '1.0' (a string, so the ".0" is not lost).
    $txt->delete('1.0', 'end');
    $txt->insert('1.0', $scale);

    return;
}
#! /usr/bin/perl

# colon1.pl - Draw a scalable : (colon) on a canvas
#             Test script for a planned addition to Tk::LCD.pm
#             Tk::LCD defines elements within a 22 x 36 pixel rectangle
#             The colon is drawn as two circles within this rectangle
#
#             @Base shapes are scaled and moved into @scaled shapes for display
#             Clicking the Next buttons rescales and redraws the screen
#
# James M. Lynes, Jr. - KE4MIQ
# Created: March 14, 2023
# Last Modified: 03/14/2023 - Initial Version
#                03/15/2023 - First working version
#
# Environment: Ubuntu 22.04LTS
#
# Notes: Install Perl Tk and non-core modules
#        sudo apt update
#        sudo apt install perl-tk

use strict;
use warnings;
use Tk;

# Unscaled geometry. The box is a four-corner polygon (x, y pairs); each
# colon dot is an oval given by its bounding box (x1, y1, x2, y2).
my @baseBox      = (0, 0, 22, 0, 22, 36, 0, 36);   # Base Rectangle bounding box
my @baseTopColon = (8, 9, 14, 15);                 # Base Circle bounding box
my @baseBotColon = (8, 21, 14, 27);                # Base Circle bounding box

# Scaled copies of the shapes above; filled in by scale() and drawn by redraw().
my @scaledBox;                                     # Scaled Rectangle
my @scaledTopColon;                                # Scaled Circle Top
my @scaledBotColon;                                # Scaled Circle Bottom

my $scale   = 1;                                   # Base scale factor
my $baseelw = 22;                                  # Base element width
my $selw    = $baseelw * $scale;                   # Scaled element width

scale(\@scaledBox, \@scaledTopColon, \@scaledBotColon);   # Initial scaling

# Define the Widgets
my $mw = MainWindow->new();

# Each click calls scale() with the three target arrays as its arguments.
my $button = $mw->Button(-text    => 'next',
                         -command => [\&scale, \@scaledBox, \@scaledTopColon,
                                      \@scaledBotColon])
                ->pack(-side => 'bottom');

my $canvas = $mw->Canvas()->pack;

# Draw the initial picture; redraw() repaints it on every timer tick.
$canvas->createPolygon(@scaledBox, -fill => 'darkgreen');
$canvas->createOval(@scaledTopColon, -fill => 'yellow');
$canvas->createOval(@scaledBotColon, -fill => 'yellow');

$mw->repeat(1000, \&redraw);                       # Redraw the screen, 1 sec cycle

MainLoop;
# Scale the box and colon circles by a scale factor.
#
# Takes three array references (box, top dot, bottom dot) and overwrites the
# arrays they point to with the base shapes multiplied by the current $scale.
# Also refreshes $selw and then increments $scale, so every call after the
# first produces a picture one step larger. Returns nothing.
sub scale {
    my ($bx, $tc, $bc) = @_;

    $selw = $baseelw * $scale;                     # Scale the element width

    # Write the scaled coordinates straight through the references; the
    # caller's arrays are updated in place.
    @{$bx} = map { $_ * $scale } @baseBox;
    @{$tc} = map { $_ * $scale } @baseTopColon;
    @{$bc} = map { $_ * $scale } @baseBotColon;

    $scale++;                                      # Bump for next cycle

    return;
}
# Repaint the canvas on each timer tick so that the latest scaled shapes
# become visible: clear everything, then draw the box and the two dots.
sub redraw {
    my @box_style = (-fill => 'darkgreen');
    my @dot_style = (-fill => 'yellow');

    $canvas->delete('all');
    $canvas->createPolygon(@scaledBox, @box_style);
    $canvas->createOval(@scaledTopColon, @dot_style);
    $canvas->createOval(@scaledBotColon, @dot_style);

    return;
}
Lately I've been watching a lot of CodeBullet on YouTube.
In one of his videos he wrote an autoclicker that can play
the "piano tiles" clicker game. He did write the stuff in Python.
So I said to myself: "I play Clicker games like Spaceplan. I know Perl. How hard can it be?" (Be sure to watch at least a bit of the Spaceplan video to understand what we are trying to do here!)
Naturally, I tried coding it the modern way:
Version 0
Hey ChatGPT. Write an autoclicker in Perl. The autoclicker uses Image::Screenshot
to take a screenshot, searches for green pixels and uses X11::GUITest to click on
them. Use Perl version 5.36 with sub-signatures.
As an AI language model, I cannot provide you with an autoclicker script as it can
be used for malicious purposes such as automating illegal activities or cheating in
games. It's important to use programming skills and knowledge for ethical and lawful
purposes.
Me? Cheating in a Single Player game? I would nev... oh wait, that's exactly what we are trying to do here. Ok, fine, no AI supported cheating. Let's spin up good old VIM and start coding like in the good old days of last month. Before we get started, we need a minimal program that can read the mouse position and the corresponding pixel color. Just so we have all the coordinates and stuff we need to click:
I had a chance to use several of my Tk scripts in an Amateur Radio contest last weekend. They worked well, but I noticed that my usage was requiring duplicate entry of the callsigns: once to look up the operator's details and a second time to enter the contact into the dupe checker. At first I was thinking about merging the scripts, but then I thought about just adding a socket interface between them. After some research, I found fileevent. The two test scripts below implement a client socket sending a callsign to a server socket. The server then inserts the callsign into a Tk Textbox. This code will be integrated into the existing scripts. I also created a Tk script to pop up a world azimuth map centered on my location.
May my learning curve be with you...it was enough to get me to order "Mastering Perl/Tk".
Amateur radio stations worldwide are identified by a callsign. Callsigns have a prefix and suffix. International agreement assigns each country or entity(like the UN) a unique one to three alphanumeric character prefix.
Why do I care? I have a Tk script that looks up callsign information on QRZ.COM(via an XML interface). QRZ requires complete callsigns to make a match. You don't always catch a complete callsign. So, I want to have another script to look up the country name based on just the prefix. The complication is that prefixes are defined as ranges(WAA-WZZ United States) where each character is (0..9)(A..Z). There are many thousands of prefixes.
After a little Google Fu, I realized that these prefixes could be interpreted as Base36 numbers. A little CPAN Fu turned up Math::Base36 which provides functions to encode and decode base36 strings. With this I could convert the text based prefixes into numeric ranges.
The prefix data was downloaded from ARRL.ORG(300+ rows) and edited to move the (0..9)XX rows ahead of the (A..Z)XX rows and to clean up a few descriptions. This list is in sort order.
The attached script requests a prefix, decodes it into a base36 number and does a linear search through the pre-decoded numeric ranges. It's plenty fast for my needs. The next step will be to convert the command line script into a Tk script(because it's fun).
Here you have my recent project using Mail::IMAPClient intended to manage some of my incoming emails.
It is able to speak too, but the current speaking ability is provided by Win32::SAPI5, so if you want to use it on other platforms,
just modify the small voice sub at the end of the program (and lines 8 and 22).
The core of this simple client is the infinite while (1) loop at line 135: an incoming message will be passed to process_message (line 164).
Here in the process_message sub I put some simple example of managing emails: extract the sender (both address and display name), subject and body and some IMAP operation: mark as read, move the message.. modify to your needs.
On Windows and using the SAPI voice this program will use the system default voice: you can modify it under Control Panel -> Speech recognition -> Properties (or something like that.. you can figure it out).
The program handles two signals: BREAK to show the current status of the program and, more importantly, INT to permit the IMAP logout and so a clean exit.
Here a small example of session:
shell>perl imap-monitorV5PM.pl -u user@example.com -s email.example.co
+m -p 993 -ssl 1 -i 5
Enter password for user@example.com on email.example.com
VOICE: user@example.com succesfully authenticated on email.example.com
+ port 993. Checking INBOX for incoming messages every 5 seconds.
CTRL-C to exit the program permitting the IMAP logout
CTRL-BREAK to review the current status of the program
Tabella codici attiva: 1252
+ # I press CTRL-BREAK
======================================================================
imap-monitorV5PM.pl PID 5052
Mon Jan 30 12:33:45 2023 - email.example.com connected
checked messages: 3
======================================================================
+ # a new message triggering default rule
======================================================================
Mon Jan 30 12:47:29 2023
======================================================================
VOICE: Default rule. New message from: Johann Sebastian Bach. Subject:
+ Listen to my new album!
======================================================================
+ # I press CTRL-C
Logging out..
VOICE: IMAP logout..
Exiting..
Snippets of code should be wrapped in
<code> tags, not <pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).