Re: Data extraction with specific keywords

Does something like this work for you?

#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use feature 'say';

# Keys to report. Each entry is a numeric index followed by the ':' separator,
# because the lookup below compares against the token's "index:" prefix.
# The list is written with qw() and short lines so that line wrapping cannot
# split a key in the middle of a string literal.
my @array = qw(
    91:  86:  184: 430: 391: 254: 121: 192: 404: 12:
    87:  638: 417: 129: 549: 548: 122: 443: 378: 365:
    665: 148: 185: 88:  629: 637: 149: 625: 635: 627:
    650: 468: 92:  618: 212: 85:  628: 171: 649: 15:
    61:  169: 104: 202: 523: 60:  672: 291: 658: 59:
    547: 491: 234: 411: 620: 581: 414: 14:  412: 416:
    345: 626: 457: 72:  384: 371: 9:   580: 436: 356:
    385: 58:  669: 388: 386: 390: 636: 619: 16:  413:
    17:  524: 579: 624: 90:  471: 410: 551: 289: 387:
    531: 64:  166: 211: 467: 415: 232: 550: 362: 375:
    401: 359: 372: 398: 360: 364: 399: 403: 373: 377:
    18:  118: 585: 427: 424: 586: 469: 425: 429: 13:
    423: 500: 62:  109: 19:  539: 499: 532: 400: 63:
    361: 374: 73:  449: 175: 426: 89:  507: 397: 389:
    582: 475: 20:  22:  541: 492: 503: 555: 595: 596:
    450: 23:  611: 509: 3:   485: 24:  438: 442: 440:
    484: 117: 32:  437: 31:  663: 339: 535: 21:  470:
    439: 525: 172: 40:  65:  487: 50:  517: 597: 545:
    516: 402: 347: 614: 540: 613: 346: 67:  363: 583:
    376: 428: 71:  615: 332: 271: 5:   508: 74:
);

# Index the wanted keys once so that each token costs a single hash lookup
# instead of a regex scan over the whole list. Exact string comparison also
# means a key containing regex metacharacters (e.g. "1.:") can no longer
# match a different key such as "12:".
my %is_wanted = map { $_ => 1 } @array;

# Read every line from the files named on the command line (or STDIN) and
# print each "key:value" token whose "key:" prefix is in @array.
while ( my $line = <> ) {
    chomp $line;

    # split ' ' breaks on runs of whitespace and ignores leading whitespace.
    for my $token ( split ' ', $line ) {

        # Tokens without a separator (such as the leading label) are skipped.
        next if index( $token, ':' ) == -1;

        # LIMIT 2 splits at the first ':' only, so $value is always defined
        # (possibly empty) and keeps any further ':' characters intact.
        my ( $key, $value ) = split /:/, $token, 2;

        say "Found it: $key:$value" if exists $is_wanted{"$key:"};
    }
}

__END__

$ perl test.pl in.txt
Found it: 3:0.33582025
Found it: 5:65.316997
Found it: 9:28
Found it: 12:5
Found it: 13:4328.520884
Found it: 14:5279.218852
Found it: 15:7434.371708
Found it: 16:7829.126536
Found it: 17:7560.24877
Found it: 18:7380.518025
Found it: 19:7094.262906
Found it: 20:6916.621367
Found it: 21:6198.40255
Found it: 22:11858.88819484
Found it: 23:15547.317962699
Found it: 24:23174.9885578928
Found it: 31:419.275504
Found it: 32:463.700544
Found it: 40:83.0884742406
Found it: 50:8977.2048903741
Found it: 58:128.1944444444
Found it: 59:155.2777777778
Found it: 60:253.1111111111
Found it: 61:311
Found it: 62:335.75
Found it: 63:328.4722222222
Found it: 64:338.4166666667
Found it: 65:318.0277777778
Found it: 67:75.9389628772
Found it: 71:44.7351998225
Found it: 72:47.3110129808
Found it: 73:42.4806162036
Found it: 74:42.4332599202
Found it: 85:7.3557105965
Found it: 86:7.6016482623
Found it: 87:7.5850301982
Found it: 88:7.4495976828
Found it: 89:7.3768447337
Found it: 90:7.4461809487
Found it: 91:7.3609459641
Found it: 92:7.2909765644
Found it: 104:147.1672932848
Found it: 109:161.9295657107
[download]

Input data that I used, based on what you provided us:

0 2:-0.5795 3:0.33582025 4:55.8255 5:65.316997 6:15 7:16 8:57 9:28 10:
+29 11:23 12:5 13:4328.520884 14:5279.218852 15:7434.371708 16:7829.12
+6536 17:7560.24877 18:7380.518025
19:7094.262906 20:6916.621367 21:6198.40255 22:11858.88819484 23:15547
+.317962699 24:23174.9885578928 25:26259.9933684153 26:26163.282596974
+5 27:26115.0415561043
28:25798.4258540249 29:24623.253542266 30:23630.8474599248 31:419.2755
+04 32:463.700544 33:841.938352 34:1080.191664 35:1246.68676 36:1161.6
+04228 37:1229.277976 38:1188.42918
39:1084.302664 40:83.0884742406 41:109.16288499 42:167.636308343 43:19
+9.4597516818 44:203.7524145431 45:201.2240880658 46:204.6352052546 47
+:194.1195252421 48:187.4305290355
49:9335.0824128232 50:8977.2048903741 51:17564.3373325462 52:22978.817
+901802 53:27754.6236137749 54:25282.4660739928 55:27042.2374736936 56
+:26691.0761515138
57:23720.386147332 58:128.1944444444 59:155.2777777778 60:253.11111111
+11 61:311 62:335.75 63:328.4722222222 64:338.4166666667 65:318.027777
+7778 66:301.6388888889
67:75.9389628772 68:86.5445713443 69:66.9763216937 70:53.9939761103 71
+:44.7351998225 72:47.3110129808 73:42.4806162036 74:42.4332599202 75:
+41.8810983108 76:208.0506700849
77:254.8740649623 78:208.7836807017 79:181.1034025408 80:154.812323059
+ 81:167.4041125391 82:154.4815919403 83:151.0629051673 84:159.6678882
+427 85:7.3557105965 86:7.6016482623
87:7.5850301982 88:7.4495976828 89:7.3768447337 90:7.4461809487 91:7.3
+609459641 92:7.2909765644 93:7.3263693514 94:1.4576925305 95:1.789555
+4916 96:1.5102370121 97:1.3755844944
98:1.205635589 99:1.2898980004 100:1.2253605105 101:1.1909173328 102:1
+.2664224935 103:163.7733756636 104:147.1672932848 105:158.2372732662 
+106:158.4746062193
107:164.2285420933 108:162.0670902179 109:161.9295657107 110:163.74893
+34449 111:160.2728793739 112:2.2490253411 113:2.5455373406 114:2.2802
+802803 115:2.1448275862
[download]

Even if the sample code that I provided works for you, I do not think it is the most efficient solution, but it is a good starting point (I think).

Update 2: I was testing a minor modification that might help to make the script a bit more efficient. Admittedly, 4% is not a big difference for an input file of only a few lines, but if your files are big, any small saving in resources could be helpful.

#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use feature 'say';
# use Benchmark qw( timethese cmpthese ) ; # WindowsOS
use Benchmark::Forking qw( timethese cmpthese ); # UnixOS

# Keys to look for. Each entry is a numeric index followed by the ':'
# separator, because both benchmarked subs compare against the token's
# "index:" prefix. The list is written with qw() and short lines so that
# line wrapping cannot split a key in the middle of a string literal.
my @array = qw(
    91:  86:  184: 430: 391: 254: 121: 192: 404: 12:
    87:  638: 417: 129: 549: 548: 122: 443: 378: 365:
    665: 148: 185: 88:  629: 637: 149: 625: 635: 627:
    650: 468: 92:  618: 212: 85:  628: 171: 649: 15:
    61:  169: 104: 202: 523: 60:  672: 291: 658: 59:
    547: 491: 234: 411: 620: 581: 414: 14:  412: 416:
    345: 626: 457: 72:  384: 371: 9:   580: 436: 356:
    385: 58:  669: 388: 386: 390: 636: 619: 16:  413:
    17:  524: 579: 624: 90:  471: 410: 551: 289: 387:
    531: 64:  166: 211: 467: 415: 232: 550: 362: 375:
    401: 359: 372: 398: 360: 364: 399: 403: 373: 377:
    18:  118: 585: 427: 424: 586: 469: 425: 429: 13:
    423: 500: 62:  109: 19:  539: 499: 532: 400: 63:
    361: 374: 73:  449: 175: 426: 89:  507: 397: 389:
    582: 475: 20:  22:  541: 492: 503: 555: 595: 596:
    450: 23:  611: 509: 3:   485: 24:  438: 442: 440:
    484: 117: 32:  437: 31:  663: 339: 535: 21:  470:
    439: 525: 172: 40:  65:  487: 50:  517: 597: 545:
    516: 402: 347: 614: 540: 613: 346: 67:  363: 583:
    376: 428: 71:  615: 332: 271: 5:   508: 74:
);

# Keep a copy of the command-line file list: reading with <> consumes @ARGV,
# and each benchmarked sub restores it from this copy before reading.
my @preserved = @ARGV;

# Benchmark candidate: isolate each token's "key:" prefix with index/substr.
#
# Re-reads every input file named in @preserved and, for each
# whitespace-separated token containing ':', checks whether its prefix up to
# and including the first ':' is listed in @array. Takes no arguments and
# returns nothing; the reporting line is left commented out so that only the
# extraction and lookup are timed.
sub subIndex {
    @ARGV = @preserved;    # restore original @ARGV

    # A lexical line variable keeps the caller's $_ untouched.
    while ( my $line = <> ) {
        chomp $line;

        for my $element ( split /\s+/, $line ) {

            # Locate the separator once and reuse the position.
            my $colon_at = index( $element, ':' );
            next if $colon_at == -1;

            my $match = substr $element, 0, $colon_at + 1;

            # \Q...\E makes the key match literally, so regex metacharacters
            # in the input cannot match a different key. The linear scan is
            # kept so the comparison with subSplit stays like for like.
            if ( grep { /\A\Q$match\E\z/ } @array ) {
                # say "Found it: $element";
            }
        }
    }

    return;
}

# Benchmark candidate: isolate each token's "key:" prefix with split.
#
# Re-reads every input file named in @preserved and, for each
# whitespace-separated token containing ':', checks whether the text before
# the first ':' (plus the ':') is listed in @array. Takes no arguments and
# returns nothing; the reporting line is left commented out so that only the
# extraction and lookup are timed.
sub subSplit {
    @ARGV = @preserved;    # restore original @ARGV

    # A lexical line variable keeps the caller's $_ untouched.
    while ( my $line = <> ) {
        chomp $line;

        for my $element ( split /\s+/, $line ) {
            next if index( $element, ':' ) == -1;

            # LIMIT 2 preserves empty fields, so a token made only of ':'
            # yields an empty key instead of undef (and a warning).
            my ($key) = split /:/, $element, 2;
            my $match = $key . ':';

            # \Q...\E makes the key match literally, so regex metacharacters
            # in the input cannot match a different key. The linear scan is
            # kept so the comparison with subIndex stays like for like.
            if ( grep { /\A\Q$match\E\z/ } @array ) {
                # say "Found it: $element";
            }
        }
    }

    return;
}

# Run each candidate 100000 times. The 'none' style argument is passed to
# timethese so that the comparison table from cmpthese is the report.
my %candidate_for = (
    'split' => \&subSplit,
    'index' => \&subIndex,
);
my $results = timethese( 100000, \%candidate_for, 'none' );
cmpthese($results);

__END__

$ perl test.pl in.txt
       Rate split index
split 145/s    --   -4%
index 151/s    4%    --
[download]

Hope this helps, BR.

Seeking for Perl wisdom...on the process of learning...not there...yet!

Comment on Re: Data extraction with specific keywords Select or Download Code


Don't ask to ask, just ask
	PerlMonks