Re^2: Parsing .txt into arrays

Replies are listed 'Best First'.

Re^3: Parsing .txt into arrays
by huck (Prior) on May 25, 2017 at 07:07 UTC

So let's focus on hippo's step 1 first, since it is the key to the next steps.

In some ways this is easy, and in other ways it will be hard

If I were to do this based on what you have shown us, I would start with a base you have already identified: in one case a table begins with "work and language :65", another table begins with "place and year data: 67", and yet another with "Position log".

This takes that concept and uses what's sometimes called a state machine to separate the lines into table parts. I then kept going to parse all the data into a hash of arrays of hashes. I realize it's not quite the output style you wanted, but it shows a lot of the techniques, and you could modify it to get what you want.

 
use strict;
use warnings;
use Data::Dumper;

# Read the sample tables that follow __DATA__ and dump the parsed structure:
# a hash (table name) of arrays (rows) of hashes (column title => value).
my $tables = parse_tables(\*DATA);
print Dumper($tables);

# parse_tables($fh)
#
# Reads lines from the filehandle $fh and sorts them into tables using a
# small state machine:
#   * a line containing one of the marker texts starts a new table and
#     resets the column titles;
#   * a line containing '|' is a table row.  If its first cell is all
#     digits it is a data row, otherwise it is (part of) the heading;
#   * every other line (blank lines, rules made of '-' or '_') is ignored.
#
# Headings may span several lines; the non-empty cells of each heading line
# are joined per column with a single space ("Pos" + "(dfg)" => "Pos (dfg)").
#
# Returns a reference to a hash of the form
#   { table_name => [ { title => value, ... }, ... ], ... }
# Data rows seen before any marker line are stored under the key ''.
sub parse_tables {
    my ($fh) = @_;

    # Marker text => name of the table it introduces.  Checked in this
    # order; the first marker found in a line wins.
    my @markers = (
        [ 'place and year data: 67', 'place'    ],
        [ 'work and language :65',   'work'     ],
        [ 'Position log',            'position' ],
    );

    my $state = '';    # name of the table currently being read
    my %tables;
    my @titles;        # column titles collected for the current table

    LINE:
    while (my $line = <$fh>) {
        chomp $line;

        for my $marker (@markers) {
            my ($text, $name) = @{$marker};
            next if index($line, $text) == -1;
            $state  = $name;
            @titles = ();
            next LINE;
        }

        next LINE if index($line, '|') == -1;    # not a table row

        my @cells = _split_row($line);
        next LINE unless @cells;                 # nothing but a lone bar

        if ($cells[0] =~ m/^\d+$/) {
            # First cell is all digits, so this is a data row.
            my %row;
            for my $ix (0 .. $#cells) {
                # A row may be wider than the heading; give the surplus
                # cells a generated title instead of an undefined hash key.
                my $title = defined $titles[$ix] ? $titles[$ix] : "column $ix";
                $row{$title} = $cells[$ix];
            }
            push @{ $tables{$state} }, \%row;
        }
        else {
            # Anything else is a heading line (possibly one of several).
            for my $ix (0 .. $#cells) {
                next if $cells[$ix] eq '';
                $titles[$ix] = defined $titles[$ix]
                             ? "$titles[$ix] $cells[$ix]"
                             : $cells[$ix];
            }
        }
    }

    return \%tables;
}

# _split_row($line)
#
# Splits one '|'-delimited row into a list of trimmed cells.  The outer
# bars are removed first and split() is given a negative limit so that a
# genuinely empty last cell ("|1|x||") is kept rather than silently dropped.
sub _split_row {
    my ($line) = @_;

    $line =~ s/^\s+//;    # take off leading spaces
    $line =~ s/\s+$//;    # take off trailing spaces
    $line =~ s/^\|//;     # take off leading bar
    $line =~ s/\|$//;     # take off trailing bar

    my @cells = split /\|/, $line, -1;
    for my $cell (@cells) {
        # Underscores are only expected in the "_ _ _" rule under a heading.
        # NOTE: this also strips underscores from real data values; restrict
        # it if your data can legitimately contain them.
        $cell =~ s/_//g;
        $cell =~ s/^\s+//;
        $cell =~ s/\s+$//;
    }
    return @cells;
}
  

__DATA__

place and year data: 67

_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
|no.|  name | age  | place | year |
|_ _|_ _ _ _|_ _ _ | _ _ _ |  _ _ |
|1  |  sue  |33    | NY    | 2015 |
|2  |  mark |28    | cal   | 2106 |


work and language :65
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
|no.| name | languages | proficiency | time taken|
|_ _| _ _ _| _ _ _ _ _ |_ _ _ _ _ _ _| _ _ _ _ _ |      
|1  | eliz | English   | good        | 24 hrs    |
|2  | susan| Spanish   | good        | 13 hrs    |
|3  | danny| Italian   | decent      | 21 hrs    |

Position log

   |   |      |Pos     |value   |   |bulk|lot|   prev| newest|
   |#  |Locker|(dfg)   |(no)    |nul|val |Id |   val |val   |
   -----------------------------------------------------------
   |  0|     1|  302832|  -11.88|  1|   0|Pri|     16|      0|
   |  1|     9|  302836|   11.88|  9|   0|Pri|     10|      0|
   |  2|     1|  302832|  -11.88|  5|   3|Pri|     14|      4|
   |  3|     3|  302833|   11.88|  1|   0|sec|     12|      0|
   |  4|     6|  302837|  -11.88|  1|   0|Pri|     16|      3|
[download]

$VAR1 = {
          'work' => [
                      {
                        'languages' => 'English',
                        'no.' => '1',
                        'name' => 'eliz',
                        'time taken' => '24 hrs',
                        'proficiency' => 'good'
                      },
                      {
                        'no.' => '2',
                        'languages' => 'Spanish',
                        'name' => 'susan',
                        'proficiency' => 'good',
                        'time taken' => '13 hrs'
                      },
                      {
                        'name' => 'danny',
                        'time taken' => '21 hrs',
                        'proficiency' => 'decent',
                        'languages' => 'Italian',
                        'no.' => '3'
                      }
                    ],
          'place' => [
                       {
                         'year' => '2015',
                         'place' => 'NY',
                         'name' => 'sue',
                         'no.' => '1',
                         'age' => '33'
                       },
                       {
                         'year' => '2106',
                         'name' => 'mark',
                         'place' => 'cal',
                         'no.' => '2',
                         'age' => '28'
                       }
                     ],
          'position' => [
                          {
                            'newest val' => '0',
                            'prev val' => '16',
                            'bulk val' => '0',
                            'value (no)' => '-11.88',
                            'Locker' => '1',
                            '#' => '0',
                            'nul' => '1',
                            'Pos (dfg)' => '302832',
                            'lot Id' => 'Pri'
                          },
                          {
                            'newest val' => '0',
                            'bulk val' => '0',
                            'prev val' => '10',
                            'Locker' => '9',
                            'value (no)' => '11.88',
                            'nul' => '9',
                            '#' => '1',
                            'lot Id' => 'Pri',
                            'Pos (dfg)' => '302836'
                          },
                          {
                            'lot Id' => 'Pri',
                            'Pos (dfg)' => '302832',
                            'newest val' => '4',
                            'bulk val' => '3',
                            'prev val' => '14',
                            'Locker' => '1',
                            'value (no)' => '-11.88',
                            'nul' => '5',
                            '#' => '2'
                          },
                          {
                            'nul' => '1',
                            '#' => '3',
                            'Locker' => '3',
                            'value (no)' => '11.88',
                            'bulk val' => '0',
                            'prev val' => '12',
                            'newest val' => '0',
                            'lot Id' => 'sec',
                            'Pos (dfg)' => '302833'
                          },
                          {
                            '#' => '4',
                            'nul' => '1',
                            'value (no)' => '-11.88',
                            'Locker' => '6',
                            'bulk val' => '0',
                            'prev val' => '16',
                            'newest val' => '3',
                            'Pos (dfg)' => '302837',
                            'lot Id' => 'Pri'
                          }
                        ]
        };
[download]

[reply]
[d/l]
[select]


go ahead... be a heretic
	PerlMonks