So let's focus on hippo's step 1 first, since it is the key to the next steps.
In some ways this is easy, and in other ways it will be hard.
If I were to do this based on what you have shown us, I would start with a base you have already identified: in one case a table begins with "work and language :65", another table begins with "place and year data: 67", and yet another with "Position log".
This takes that concept and uses what's sometimes called a state machine to separate the lines into table parts; I then kept going to parse all the data into a hash of arrays of hashes. I realize it's not quite the output style you wanted, but it shows a lot of the techniques, and you could modify it to get what you want.
use strict;
use warnings;
use Data::Dumper;

# Split one table line into its cells.
#
# Surrounding whitespace and the outer bars are removed first, then the line
# is split on '|'. Each cell has its underscores removed and is trimmed.
# The split uses a negative limit so that empty cells at the end of a row
# (e.g. "|1|7||") are kept instead of being silently dropped.
# Returns the list of cells; the list is empty when the line holds nothing
# but bars and whitespace.
sub _split_row {
    my ($line) = @_;

    $line =~ s/^\s+//;    # take off leading spaces
    $line =~ s/\s+$//;    # take off trailing spaces
    $line =~ s/^\|//;     # take off leading bar
    $line =~ s/\|$//;     # take off trailing bar

    my @cells = split /\|/, $line, -1;
    for my $cell (@cells) {
        $cell =~ s/_//g;      # underscores are only used to draw rulers
        $cell =~ s/^\s+//;
        $cell =~ s/\s+$//;
    }
    return @cells;
}

# Parse the ASCII tables read from filehandle $fh.
#
# This is a small state machine: a line containing one of the known marker
# texts starts a new table (and forgets the previous column titles); every
# following line that contains a '|' is either a title line (first cell is
# not all digits) or a data line (first cell is all digits).  Title lines may
# be stacked, in which case the pieces of each column are joined with a
# single space ("Pos" + "(dfg)" gives "Pos (dfg)").
#
# Returns a reference to a hash of arrays of hashes:
#   { table_name => [ { column_title => cell, ... }, ... ], ... }
# Data lines seen before any marker are stored under the table name ''.
# A data cell that has no title is stored under "colN" (N counted from 1)
# so that it neither warns nor overwrites another untitled cell.
sub parse_tables {
    my ($fh) = @_;

    # Checked in this order; the first marker found in a line wins.
    my @markers = (
        [ 'place and year data: 67', 'place' ],
        [ 'work and language :65',   'work' ],
        [ 'Position log',            'position' ],
    );

    my $state = '';
    my %tables;
    my @titles;

    LINE:
    while ( my $line = <$fh> ) {
        chomp $line;

        my $new_state;
        for my $marker (@markers) {
            if ( index( $line, $marker->[0] ) != -1 ) {
                $new_state = $marker->[1];
                last;
            }
        }
        if ( defined $new_state ) {
            $state  = $new_state;
            @titles = ();
            next LINE;
        }

        # Rulers and free text carry no bar and are not part of a table.
        next LINE if index( $line, '|' ) == -1;

        my @cells = _split_row($line);
        next LINE unless @cells;    # e.g. a line holding only "|"

        if ( $cells[0] =~ m/^\d+$/ ) {

            # First cell is digits, so this is a data line.
            my %row;
            for my $ix ( 0 .. $#cells ) {
                my $title =
                    defined $titles[$ix]
                    ? $titles[$ix]
                    : 'col' . ( $ix + 1 );
                $row{$title} = $cells[$ix];
            }
            push @{ $tables{$state} }, \%row;
        }
        else {

            # Anything else contributes to the (possibly multi-line) titles.
            for my $ix ( 0 .. $#cells ) {
                next if $cells[$ix] eq '';
                if ( defined $titles[$ix] ) {
                    $titles[$ix] .= ' ' . $cells[$ix];
                }
                else {
                    $titles[$ix] = $cells[$ix];
                }
            }
        }
    }

    return \%tables;
}

print Dumper( parse_tables( \*DATA ) );
__DATA__
place and year data: 67
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
|no.| name | age | place | year |
|_ _|_ _ _ _|_ _ _ | _ _ _ | _ _ |
|1 | sue |33 | NY | 2015 |
|2 | mark |28 | cal | 2106 |
work and language :65
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
|no.| name | languages | proficiency | time taken|
|_ _| _ _ _| _ _ _ _ _ |_ _ _ _ _ _ _| _ _ _ _ _ |
|1 | eliz | English | good | 24 hrs |
|2 | susan| Spanish | good | 13 hrs |
|3 | danny| Italian | decent | 21 hrs |
Position log
| | |Pos |value | |bulk|lot| prev| newest|
|# |Locker|(dfg) |(no) |nul|val |Id | val |val |
-----------------------------------------------------------
| 0| 1| 302832| -11.88| 1| 0|Pri| 16| 0|
| 1| 9| 302836| 11.88| 9| 0|Pri| 10| 0|
| 2| 1| 302832| -11.88| 5| 3|Pri| 14| 4|
| 3| 3| 302833| 11.88| 1| 0|sec| 12| 0|
| 4| 6| 302837| -11.88| 1| 0|Pri| 16| 3|
Result:
$VAR1 = {
'work' => [
{
'languages' => 'English',
'no.' => '1',
'name' => 'eliz',
'time taken' => '24 hrs',
'proficiency' => 'good'
},
{
'no.' => '2',
'languages' => 'Spanish',
'name' => 'susan',
'proficiency' => 'good',
'time taken' => '13 hrs'
},
{
'name' => 'danny',
'time taken' => '21 hrs',
'proficiency' => 'decent',
'languages' => 'Italian',
'no.' => '3'
}
],
'place' => [
{
'year' => '2015',
'place' => 'NY',
'name' => 'sue',
'no.' => '1',
'age' => '33'
},
{
'year' => '2106',
'name' => 'mark',
'place' => 'cal',
'no.' => '2',
'age' => '28'
}
],
'position' => [
{
'newest val' => '0',
'prev val' => '16',
'bulk val' => '0',
'value (no)' => '-11.88',
'Locker' => '1',
'#' => '0',
'nul' => '1',
'Pos (dfg)' => '302832',
'lot Id' => 'Pri'
},
{
'newest val' => '0',
'bulk val' => '0',
'prev val' => '10',
'Locker' => '9',
'value (no)' => '11.88',
'nul' => '9',
'#' => '1',
'lot Id' => 'Pri',
'Pos (dfg)' => '302836'
},
{
'lot Id' => 'Pri',
'Pos (dfg)' => '302832',
'newest val' => '4',
'bulk val' => '3',
'prev val' => '14',
'Locker' => '1',
'value (no)' => '-11.88',
'nul' => '5',
'#' => '2'
},
{
'nul' => '1',
'#' => '3',
'Locker' => '3',
'value (no)' => '11.88',
'bulk val' => '0',
'prev val' => '12',
'newest val' => '0',
'lot Id' => 'sec',
'Pos (dfg)' => '302833'
},
{
'#' => '4',
'nul' => '1',
'value (no)' => '-11.88',
'Locker' => '6',
'bulk val' => '0',
'prev val' => '16',
'newest val' => '3',
'Pos (dfg)' => '302837',
'lot Id' => 'Pri'
}
]
};
|