Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl Monk, Perl Meditation
 
PerlMonks  

Re^2: Getstore to avoid of memory?

by wrkrbeee (Scribe)
on Mar 01, 2017 at 22:08 UTC ( #1183330=note: print w/replies, xml ) Need Help??


in reply to Re: Getstore to avoid of memory?
in thread Getstore to avoid of memory?

My apologies, revised/cleaned code appears below:
#!/usr/bin/perl
# Walk the SEC EDGAR quarterly company index files, collect the paths of
# 10-K-family filings, fetch each filing not already on disk, and append
# selected SGML header fields (pipe-delimited) to an output file.
use strict;
use warnings;
use Tie::File;
use Fcntl;
use LWP::Simple;

# --- configuration --------------------------------------------------------
my $startyear = 2016;    # first year you want files for
my $endyear   = 2016;    # last year you want files for
my $startqtr  = 1;       # first quarter (usually 1)
my $endqtr    = 1;       # last quarter of the final year (usually 4)

# Directory holding the downloaded company$qtr$yr.idx index files.
my $inddirect = 'C:/Volumes/EDGAR1/Edgar/full-index';
# Directory the filings are downloaded into (one subdirectory per year).
my $direct    = 'G:/Research/SEC filings 10K and 10Q/Data';
# File listing the filings selected for download.
my $outfile   = 'G:/Research/SEC filings 10K and 10Q/Data/sizefiles1.txt';
# File receiving the pipe-delimited header records.
my $write_dir = 'G:\Research\SEC filings 10K and 10Q\Data\Header Data\data2016.txt';

my $base_url   = 'http://www.sec.gov/Archives';
my $file_count = 0;      # filings fetched so far

# Form types to harvest. Each pattern is anchored at the start of the
# fixed-width form-type column; the trailing space plus the (?!/) look-ahead
# reject amended variants such as "10-K/A". This replaces the original
# seven near-identical $formgetN regexes and their duplicated elsif chain.
my @form_patterns = map { qr/^\Q$_\E(?!\/)/ }
    ('10-K ', '10-K405 ', '10KSB ', '10-KSB ', '10KSB40 ', '10-KT ', '10KT405 ');

for my $yr ($startyear .. $endyear) {
    # Intermediate years run all four quarters; the last year stops at $endqtr.
    my $eqtr = $yr < $endyear ? 4 : $endqtr;

    for my $qtr ($startqtr .. $eqtr) {
        my $idx_path = "$inddirect/company$qtr$yr.idx";
        open my $in, '<', $idx_path or die "Cannot open $idx_path: $!";

        # Truncate $outfile on the very first pass, append afterwards.
        my $mode = ($yr == $startyear && $qtr == $startqtr) ? '>' : '>>';
        open my $out, $mode, $outfile or die "Cannot open $outfile: $!";

        my $count = 1;
        while (my $line = <$in>) {
            next if $. < 11;    # the first 10 lines are header text

            # company.idx is a fixed-width report; slice out the columns.
            my $form_type    = substr($line, 62, 12);
            my $fullfilename = trim(substr($line, 98, 43));

            for my $pat (@form_patterns) {
                if ($form_type =~ $pat) {
                    print {$out} "$fullfilename\n";
                    print "$count $form_type $base_url/$fullfilename\n";
                    $count++;
                    last;
                }
            }
        }
        close $in;
        close $out or die "Cannot close $outfile: $!";

        # Make sure the per-year download directory exists.
        my $yeardir = "$direct/$yr";
        unless (-d $yeardir) {
            mkdir $yeardir or die "Cannot create $yeardir: $!";
        }

        # Filenames already present in the year directory.
        opendir my $dh, $yeardir or die "Cannot open directory $yeardir: $!";
        my %seen = map { $_ => 1 } readdir $dh;
        closedir $dh;

        # Tie the selection file so each of its lines appears as one element.
        tie my @selected, 'Tie::File', $outfile, mode => O_RDWR
            or die "Cannot tie $outfile: $!";

        # Keep only filings we have not downloaded yet.
        # NOTE(review): the original compared the full EDGAR path against the
        # bare filenames returned by readdir, so %seen never matched and every
        # filing was re-fetched; matching on the basename is the apparent
        # intent -- confirm against the actual directory layout.
        my @to_get;
        for my $item (@selected) {
            my ($basename) = $item =~ m{edgar/data/.*/(.*\.txt)};
            push @to_get, $item
                unless defined $basename && $seen{$basename};
        }
        untie @selected;

        for my $filetoget (@to_get) {
            my $fullfile = "$base_url/$filetoget";
            my $content  = get($fullfile);
            unless (defined $content) {
                warn "Could not fetch $fullfile\n";
                next;
            }

            # Extract the SGML header fields; /m lets ^ and $ match at each
            # line boundary of the fetched document. The placeholders are
            # re-initialized per filing so values from one filing can no
            # longer leak into the record of the next (the original kept
            # them in globals that were never reset).
            my ($cik, $report_date, $file_date) = (-99, -99, -99);
            my ($form_type, $name) = ('', '');
            $cik         = $1 if $content =~ /^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m;
            $form_type   = $1 if $content =~ /^\s*FORM\s*TYPE:\s*(.*)$/m;
            $report_date = $1 if $content =~ /^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m;
            $file_date   = $1 if $content =~ /^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m;
            $name        = $1 if $content =~ /^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*)$/m;

            print "$cik, $form_type, $report_date, $file_date, $name\n";

            # Append one pipe-delimited record per filing (trailing '|'
            # matches the original $, = '|' output format).
            open my $fh_out, '>>', $write_dir
                or die "Can't open file $write_dir: $!";
            print {$fh_out} join('|', $cik, $form_type, $report_date,
                                      $file_date, $name), "|\n";
            close $fh_out or die "Cannot close $write_dir: $!";

            ++$file_count;
            print "$file_count filings processed ($fullfile)\n";
        }
    }
}

# trim: strip leading and trailing whitespace from a string and return it.
sub trim {
    my ($phrase) = @_;
    $phrase =~ s/^\s+//;
    $phrase =~ s/\s+$//;
    return $phrase;
}

Replies are listed 'Best First'.
Re^3: Getstore to avoid of memory?
by GotToBTru (Prior) on Mar 01, 2017 at 22:42 UTC

    Never comment out use strict. Just don't.

    In fact, make it help you by defining variables within the scope they are needed, and only there. The memory they take up will be released once the variables go out of scope. That could possibly fix your problem.

    Do your print statements give any clue about where your program runs out of memory? If not, I would pare the program down into steps until you can find out where.

    But God demonstrates His own love toward us, in that while we were yet sinners, Christ died for us. Romans 5:8 (NASB)

Re^3: Getstore to avoid of memory?
by huck (Parson) on Mar 02, 2017 at 01:59 UTC

    #!/usr/bin/perl -w use strict; use warnings; my $base_url = 'http://www.sec.gov/Archives'; my @aonly=qw{ edgar/data/1122304/0001193125-15-118890.txt' edgar/data/1640984/0001052918-16-000754.txt edgar/data/1640984/0001052918-16-000803.txt edgar/data/1084869/0001437749-16-024828.txt edgar/data/1084869/0001084869-16-000045.txt edgar/data/1084869/0001084869-16-000046.txt edgar/data/1084869/0001084869-16-000047.txt edgar/data/1084869/0000950162-16-000085.txt edgar/data/1511144/0001206774-16-004470.txt edgar/data/1665022/0001665022-16-000001.txt edgar/data/1662965/0001662965-16-000001.txt edgar/data/1651654/0001651654-16-000002.txt edgar/data/1664237/0001664237-16-000001.txt edgar/data/1664513/0001664513-16-000001.txt edgar/data/1665711/0001665711-16-000001.txt edgar/data/1665354/0001665354-16-000001.txt edgar/data/1664635/0001664635-16-000001.txt edgar/data/1625109/0001625109-16-000002.txt edgar/data/1658659/0001658659-16-000002.txt edgar/data/1666635/0001666635-16-000001.txt edgar/data/1614102/0001614102-16-000002.txt edgar/data/1665218/0001665218-16-000001.txt edgar/data/1663921/0000905729-16-000383.txt edgar/data/1666561/0001666561-16-000002.txt edgar/data/1668972/0001668972-16-000002.txt edgar/data/1540531/0000905718-16-001186.txt edgar/data/1540531/0000905718-16-001254.txt }; my $file_count=0; my $FH_OUT=\*STDOUT; my @fields=qw/cik form_type report_date file_date name/; foreach my $filetoget(@aonly) { my $res=get_process_trunc ($filetoget); if (scalar(keys(%$res))) { my $lineout=''; for my $field (@fields){ if ($res->{$field}) {$lineout.=$res->{$field}} $lineout.='|'; } print $FH_OUT $lineout."\n"; } } exit; sub get_process_trunc { # http://www.perlmonks.org/?node_id=1183107 my $filetoget=shift; my $fullfile="$base_url/$filetoget"; my $res={}; use LWP::UserAgent; my $received_size = 0; my $partial = ''; my $ua = LWP::UserAgent->new; my $response = $ua->get($fullfile , ':content_cb'=> sub { my ($data, $response, $protocol) = @_; $partial.=$data; $received_size 
+= length $data; + die if ($received_size>10000); # die inside this callback interrupt th +e request, not the program!! } ); if ($partial) { # print 'length:'.length($partial)."\n"; my $line_count=0; for my $line (split qr/\'\n'/, $partial) { if($line=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m) +{$res->{cik} =$1;} if($line=~m/^\s*FORM\s*TYPE:\s*(.*$)/m) +{$res->{form_type} =$1;} if($line=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m) +{$res->{report_date}=$1;} if($line=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m) +{$res->{file_date} =$1;} if($line=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m) +{$res->{name} =$1;} $line_count++; last if ($line_count>50); } } # success return $res; } # get_process_trunc
    result
    0001394872|4|20160119|20160121|Ambient Water Corp| 0001394872|5|20150422|20160209|Ambient Water Corp| 0001084869|10-Q|20151227|20160205|1 800 FLOWERS COM INC| 0001084869|4|20160202|20160331|1 800 FLOWERS COM INC| 0001084869|4|20160329|20160331|1 800 FLOWERS COM INC| 0001084869|4|20160202|20160331|1 800 FLOWERS COM INC| 0001084869|SC 13G/A||20160216|1 800 FLOWERS COM INC| 0001511144|13F-HR|20151231|20160212|10-15 ASSOCIATES, INC.| 0001665022|D||20160127|10-20 Channel Center REIT| 0001662965|D||20160106|100 Wall Investments LLC| 0001651654|D/A||20160127|100INSIGHTS, INC| 0001664237|D||20160212|1075 Weybridge Holding, LLC| 0001664513|D||20160205|10Stories, Inc.| 0001665711|D||20160210|11 Madison Investor II LLC| 0001665354|D||20160210|11 Madison Investor LLC| 0001664635|D||20160223|11 Pine, Inc.| 0001625109|D||20160226|11 Roniin, LLC| 0001658659|D/A||20160113|110 Corcoran Property Partners, LLC| 0001666635|D||20160210|1111 Broadway Distribution, LLC| 0001614102|D||20160301|1125 North Fairfax LLC| 0001665218|D||20160128|114 REIT LP| 0001663921|D||20160126|1143 Highland Drive, LLC| 0001666561|D||20160211|1155 Boulder, LLC| 0001668972|D||20160323|11619 Euclid, LLC| 0001540531|13F-HR|20151231|20160216|12 West Capital Management LP| 0001386301|SC 13G||20160307|Research Solutions, Inc.|
    I thought I recognized this type of data.
    notice reference to @Discipulus at Re: Split web page, first 30 lines only -- :content_cb trick

      Yeah, I know I left an extra quote behind — well, it shows what happens if it can't find a URL....

        thank you huck, impressive! Grateful for your insight, time and effort!!
Re^3: Getstore to avoid of memory?
by Anonymous Monk on Mar 02, 2017 at 00:26 UTC
    You know how you have sub trim? You need about 5-10 more of those; all those big blocks of code need to be subs.

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://1183330]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others about the Monastery: (5)
As of 2021-05-07 17:09 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    Perl 7 will be out ...





    Results (93 votes). Check out past polls.

    Notices?