Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl-Sensitive Sunglasses
 
PerlMonks  

Re: The 10**21 Problem (Part 2)

by eyepopslikeamosquito (Canon)
on May 18, 2014 at 04:07 UTC ( #1086481=note: print w/ replies, xml ) Need Help??


in reply to The 10**21 Problem (Part 2)

To avoid cluttering the main node with too much code, I omitted the program to generate the lookup table, mentioned above in the "Memoization" section of the root node.

I include a standalone program here (built with a 64-bit compile, with 32-bit int and 64-bit size_t) that generates the "bytevecM" 4GB lookup table in case anyone wants to duplicate what I did (this is not pretty code, only needs to be run once).

// Generate the "M" lookup table (aka bytemap). #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #define XmItemOf(array) (sizeof(array) / sizeof(array[0])) // Start at one because '\0' needs escaping in Python. #define MIN_SEARCH_CHAR 1 #define MAX_SEARCH_CHAR 128 // Next one is used to check for no hit #define INVALID_SEARCH_CHAR 0 // Change this line for different mod #define MOD_TARG 1001 // Change this line for different length #define HASH_LEN 11 // Change these two lines to generate different bytemaps for C,L,X,V,I +. #define ZPD_BYTE "lookzpM.byte" #define D_TARG 1000 // Maximum of all hits, including terminating zero. #define MAX_ELT 256 #define MAXI 2147483647 #define MAXU 4294967295U #define BYTE_MAP_SIZE_BYTES ((size_t)MAXU+1) #define ARR_SIZE MAXU static inline int rubymod(int x, int y) { int xmody = x - (x / y) * y; if (xmody && ((y ^ xmody) < 0)) { xmody += y; } return xmody; } static int hitval[MAX_ELT]; static int nhitval; static unsigned int histo[MAX_ELT]; int main() { int ii = 0; int jj = 0; unsigned int uu = 0; unsigned int ucnt = 0; unsigned char hitchar; // the (presumed unique) hit char int k = 0; int z = 0; int q = 0; int qq = 0; int maxpos = 0; int posind = 0; printf("sizeof int=%d\n", (int)sizeof(int)); printf("sizeof long=%d\n", (int)sizeof(long)); printf("sizeof size_t=%d\n", (int)sizeof(size_t)); printf("sizeof void*=%d\n", (int)sizeof(void*)); if (sizeof(int) != 4) { fprintf(stderr, "oops sizeof int != 4 (%d)\n", sizeof(int)); exit(1); } if (sizeof(hitval) != sizeof(int) * MAX_ELT) { fprintf(stderr, "oops array hitval is padded (%d v %d)\n", sizeof(hitval), sizeof(int) * MAX_ELT); exit(1); } memset(histo, 0, sizeof(histo)); size_t siQuarter = BYTE_MAP_SIZE_BYTES/4; if (siQuarter + siQuarter + siQuarter + siQuarter != BYTE_MAP_SIZE_ +BYTES) { fprintf(stderr, "oops quarter is not even, quarter=%lu\n", (unsi +gned long)siQuarter); exit(1); } unsigned int uiQuarter = (unsigned int)siQuarter; fprintf(stderr, "uiQuarter=%u\n", uiQuarter); char* fnames[4]; FILE* fhs[4]; int i; for (i = 0; i < 4; ++i) { fnames[i] = (char*)malloc(64); if (fnames[i] == NULL) { fprintf(stderr, "out of memory\n"); exit(1); } sprintf(fnames[i], "%s-%d", ZPD_BYTE, i); fprintf(stderr, "%d: %s\n", i, fnames[i]); fhs[i] = fopen(fnames[i], "wb"); if (fhs[i] == NULL) { fprintf(stderr, "oops open '%s' failed: errno=%d\n", fhs[i], +errno); exit(1); } } int ifh = 0; for (;;) { ii = (int)uu; if (uu % 10000000 == 0) { printf("uu=%u ucnt=%u ii=%d: maxpos=%d posind=%d\n", uu, ucnt, ii, maxpos, posind); } nhitval = 0; memset(hitval, 0, sizeof(hitval)); qq = -1; hitchar = INVALID_SEARCH_CHAR; // We ignore q==0, 10, 13 because these three require escaping i +n Python. for (q = MIN_SEARCH_CHAR; q < MAX_SEARCH_CHAR; ++q) { if (q == 10 || q == 13) continue; jj = (1000003 * ii) ^ q; jj ^= HASH_LEN; if (jj == -1) jj = -2; k = rubymod(jj, MOD_TARG); if (k == D_TARG) { hitval[nhitval] = MOD_TARG; ++nhitval; if (nhitval >= MAX_ELT) { fprintf(stderr, "oops nhitval=%d uu=%u ii=%d jj=%d (%d)\ +n", nhitval, uu, ii, jj, MOD_TARG); exit(1); } qq = q; } } if (nhitval > maxpos) { maxpos = nhitval; posind = (int)uu; } if (nhitval > 0) { if (nhitval != 1 || qq < 0) { fprintf(stderr, "oops nhitval != 1 is =%d uu=%u ii=%d qq=% +d (%d)\n", nhitval, uu, ii, qq, MOD_TARG); exit(1); } ++ucnt; hitchar = (unsigned char)qq; } ++histo[nhitval]; if (uu == uiQuarter) { printf("closing first file handle %d: uu=%u\n", ifh, uu); fclose(fhs[ifh]); ++ifh; } else if (uu == uiQuarter * 2) { printf("closing second file handle %d: uu=%u\n", ifh, uu); fclose(fhs[ifh]); ++ifh; } else if (uu == uiQuarter * 3) { printf("closing third file handle %d: uu=%u\n", ifh, uu); fclose(fhs[ifh]); ++ifh; } if (fputc(hitchar, fhs[ifh]) == EOF) { fprintf(stderr, "oops fputs '%s' failed: errno=%d\n", ZPD_BYT +E, errno); exit(1); } if (uu == ARR_SIZE) { printf("breaking: uu=%u ii=%d\n", uu, ii); break; } ++uu; } printf("closing last file handle %d: uu=%u\n", ifh, uu); fclose(fhs[ifh]); printf("ucnt=%u maxpos=%d (ind=%d)\n", ucnt, maxpos, posind); printf("\n"); printf("histo:\n"); for (z = 0; z < XmItemOf(histo); ++z) { printf("%d: %u\n", z, histo[z]); } printf("\n"); return 0; }


Comment on Re: The 10**21 Problem (Part 2)
Download Code
Re^2: The 10**21 Problem (Part 2)
by oiskuu (Friar) on May 18, 2014 at 09:21 UTC

    $ perl -lwe '$/ = \128; ++$hist[tr/\0//c] while <>; print "$_: $hist[$ +_]" for 0..$#hist;' lookzpM.byte-* 0: 12003438 1: 148200 2: 147874 ... 42: 330481 43: 33028

    This means you have plenty of room to pack the lines as: {N} ({i}{x})* {pad}.
    Compute q8 = (i ^ m7) & 127; q9 = (x ^ HASH_LEN);

    Average 15.984 elements per line. Lots of looping and branches can be avoided this way.

    Update. One more thing occurs to me. Indeed, those files compress rather well: lzma packs them 1:2500!

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://1086481]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others pondering the Monastery: (6)
As of 2015-07-04 08:04 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (58 votes), past polls