Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl: the Markov chain saw
 
PerlMonks  

corpus - print a large corpus of code text

by sflitman (Hermit)
on Mar 31, 2010 at 05:58 UTC ( #831985=CUFP: print w/ replies, xml ) Need Help??

Corpus is a Perl script which scans your code directories looking for executable text files, plus any non-executable files with optional specified extensions, and puts them in a nice HTML file suitable for submitting to a code review or federal agency. It optionally adds line numbers too, and will ignore any directory's contents (including subdirs) containing the file .nocorpus. Meant for documenting work, it undoubtedly duplicates lots of other pretty printers out there, and it doesn't even do any syntax highlighting! Like all good tools, I wrote it for myself, but maybe it will help someone else.

#!/usr/bin/perl # corpus - Generate a text of all code in a corpus # Written by S. Flitman, MD; released under GPLv3 # # 032710 Initial implementation # 032810 Unangle to avoid premature termination, also unamp # 032810 Fold long lines to avoid problems, -n for line numbers # 032810 If .nocorpus is in a folder it is ignored # 032810 Better displayed names for duplicates in different subdirs li +ke index.fcgi # 032810 Wanted extensions will also be included, even if not executab +le use warnings; use strict 'vars'; use vars qw/%opt %doc $tpl $pgbrk $lnbrk $maxline %maxlineByFontsize $title $sigIgnore $reWantExts $nLines/; use File::Find; use Getopt::Std; $pgbrk=qq{<p style="page-break-before: always">&nbsp;</p>\n}; $lnbrk=qq{ &crarr;\n}; $maxline=0; %maxlineByFontsize=( 12=>120,10=>144,8=>180 ); $sigIgnore='.nocorpus'; getopts('f:hno:t:T:w:x:',\%opt); if (!@ARGV || $opt{h}) { print <<"EOT"; Usage: corpus [-f size] [-hn] [-t title] [-T tpl] [-o file] [-w ext] [ +-x pat] [folder]... Grab all code in folders and produced a nice listing in HTML to stdout or to outfile, folds long lines too -f N Font size N pt, also sets how long a line has to be to g +et folded -h This help -n Add line numbers -o X Send output to file X -t X Use X as title for generated HTML page -T X Use alternate template file X -w X,Y.. Additional wanted extensions, no dots, separate by comma -x P Also extract all lines matching pattern P to STDERR EOT exit(1); } $opt{f}||=12; $maxline=$maxlineByFontsize{$opt{f}}; die "Bad fontsize $opt{f}\n" unless $maxline; $title=$opt{t}||join(',',@ARGV); if ($opt{T}) { open(TPL,$opt{T}) || die "$opt{T}: $!"; $tpl.=$_ while (<TPL>); close TPL; } else { $tpl=<<"EOT"; <table border=2 width="100%" height="100%" cellspacing=0 cellpadding=3 +> <tr height=16> <td align=left><em>:n:</em></td> <td align=left>:name:</td> <td align=center>:modified:</td> <td align=right>:size:</td> </tr> <tr> <td colspan=4 valign=top> <pre> :text: </pre> </td> </table> EOT } $reWantExts=join('|',split(/,/,$opt{w})); find({ wanted=>\&process,follow=>0,no_chdir=>1 },@ARGV); sub process { my $file=$File::Find::name; $File::Find::prune=1,return if -e "$File::Find::dir/$sigIgnore"; my ($tplFile,$text,@s,$ln,$name,$fWantExts); return if -l $file; # no symlinks $fWantExts=$file=~/\.($reWantExts)$/o; return unless (-x $file && -T $file) or $fWantExts; $name=substr($file,$fWantExts ? rindex($file,'/',rindex($file,'/')- +1)+1 : rindex($file,'/')+1); $ln=0; open(FILE,$file) || die "$file: $!"; while (<FILE>) { print STDERR "$name: $_" if $opt{x} && /$opt{x}/o; $text.=($opt{n} ? sprintf('%4d ',++$ln) : '').$_; ++$nLines; } close FILE; $text=~s/&/&amp;/g; $text=~s/</&lt;/g; $text=~s/>/&gt;/g; @s=stat($file); $tplFile=$tpl; $tplFile=~s/:text:/$text/; $tplFile=~s!:modified:!getdate('M0/DD/YY',$s[9])!e; $tplFile=~s!:size:!comma($s[7])!e; $doc{$file}{name}=$name; $doc{$file}{text}=$tplFile; } if ($opt{o}) { open(STDOUT,'>',$opt{o}) || die "$opt{o}: $!"; } print <<"EOT"; <html> <head> <title>$title</title> <style> pre { font-size:$opt{f}pt } </style> </head> <body> <center> EOT # disambiguate filenames in corpus my ($i,@doc,$text,$name,$file,%docnames,@names); for $file (keys %doc) { push @{$docnames{$doc{$file}{name}}},$file; } for $name (keys %docnames) { if (scalar @{$docnames{$name}}>1) { # not unique? for $file (@{$docnames{$name}}) { $doc{$file}{name}=substr($file,rindex($file,'/',rindex($file, +'/')-1)+1); } } } @doc=sort keys %doc; for ($i=0; $i<=$#doc; $i++) { $file=$doc[$i]; $text=$doc{$file}{text}; $text=~s/:n:/sprintf('%03d',$i+1)/e; $text=~s/:name:/$doc{$file}{name}/; $text=~s/^(.{$maxline})(.*)$/$1$lnbrk$2/omg; print $text; print $pgbrk if $i<$#doc; } print <<"EOT"; </center> </body> </html> EOT print STDERR comma($i)," files, ",comma($nLines)," lines\n"; exit; sub fmtdate { # format a date. MMMM=full month name, MMM=short month +name, # M0=2 digit month, leading 0, MM=digit month, no leading 0 # YY=2 digit year, YYYY=full year; DD=2 digit day, dd=day # DDDD=ordinal day +st/nd/rd/th; WWW=weekday # All with leading 0: hh=2 digit hour, mm=2 digit minute, ss=2 digi +t second my ($fmt,$mon,$day,$yr,$hr,$min,$sec)=@_; # note month is 1..12 my @months=qw/January February March April May June July August September October November December/; $fmt=~s/WWW/substr('MonTueWedThuFriSatSun',Day_of_Week($yr,$mon,$da +y)*3-3,3)/e; $fmt=~s/MMMM/$months[$mon-1]/; $fmt=~s/MMM/substr($months[$mon-1],0,3)/e; $fmt=~s/M0/sprintf("%02d",$mon)/e; $fmt=~s/MM/$mon/; $fmt=~s/YYYY/$yr/; $fmt=~s/YY/sprintf("%02d",$yr%100)/e; $fmt=~s/DDDD/getOrdinal($day)/e; $fmt=~s/DD/sprintf("%02d",$day)/e; $fmt=~s/dd/$day/; $fmt=~s/hh/sprintf("%02d",$hr)/e; $fmt=~s/mm/sprintf("%02d",$min)/e; $fmt=~s/ss/sprintf("%02d",$sec)/e; $fmt; } sub getdate { my ($fmt,$time)=@_; $fmt||='M0DDYY_hhmmss'; $time||=time; my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =localtime($time); $year+=1900 if $year<1900; return fmtdate($fmt,$mon+1,$mday,$year,$hour,$min,$sec); } sub getOrdinal { my $n=shift; my @ordend=qw/th st nd rd th th th th th th/; $n . ($n>=10 && $n<=20 ? 'th' : $ordend[$n%10]); } sub comma { # print longs formatted with commas local($_) = shift; 1 while s/^(-?\d+)(\d{3})/$1,$2/; return $_; }

Comment on corpus - print a large corpus of code text
Select or Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: CUFP [id://831985]
Front-paged by Arunbear
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others avoiding work at the Monastery: (6)
As of 2014-12-20 06:15 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    Is guessing a good strategy for surviving in the IT business?





    Results (95 votes), past polls