(OP here again, with long-forgotten PWC 342-2, exploring depths in muddy waters around 4-5 lines of trivial C)
What a pity, I only have 4 cores and can't achieve more than 4x increase, would be fun to watch
Fun should never be denied:
v5.42.0 / MSWin32-x64-multi-thread / 13.2.0
String length: 10,000
Rate/s %
c 87663 100
va 622005 710
String length: 100,000
Rate/s %
c 8875 100
c_omp 15439 174
va 71646 807
String length: 1,000,000
Rate/s %
c 891 100
c_omp 2307 259
va 12770 1434
String length: 10,000,000
Rate/s %
c 88.3 100
c_omp 361.2 409
va 1615.6 1829
String length: 100,000,000
Rate/s %
c 8.8 100
c_omp 36.2 410
va 167.7 1900
'c' and 'c_omp' are for 'mscore_c' (straightforward i.e. "dumb") and 'mscore_c_omp' found in this thread. The 'va' stands for 'vectors, assembly'; where OMP (I have 4 cores) is "automatically" switched on for strings longer than 4e5. CPU is supposed to support AVX2 i.e. to be 2017+.
There's a "cheat": sequential runs of either '0' or '1's, in target string, shouldn't be longer than 2**31. Because current (running) scores are maintained (dragged along) as 8 per 256-bit registers. Perhaps there is more optimal way to find (horizontal) cumulative sums along 16 bytes, other than 4 shifts/shuffles and 4 additions. What's really funny is that to find (rather, maintain) running maximum over 32 numbers I'm only doing 3 comparisons. But maybe all of the above (i.e. below) looks not funny but ugly to "people who know how".
use Inline C => Config =>
ccflagsex => '-fopenmp',
libs => '-lgomp -ldl';
use Inline C => << 'END_OF_C';
#include <omp.h>
#define MANY 400000
#define MAX_CORES 4
void _helper_a( char* buf, size_t len,
int32_t* acc, int32_t* cum, int32_t* max );
void _helper_c( char* buf, size_t len,
int32_t* acc, int32_t* cum, int32_t* max );
int mscore_va( SV* str ) {
STRLEN len;
char* buf = SvPVbyte( str, len );
int ncores = 1;
if ( len >= MANY ) {
int n = omp_get_num_procs();
ncores = n > MAX_CORES ? MAX_CORES : n;
}
int nparts = 2 + ncores;
int32_t acc_a[ nparts ];
int32_t cum_a[ nparts ];
int32_t max_a[ nparts ];
memset( acc_a, 0x00, sizeof( acc_a ));
memset( cum_a, 0x00, sizeof( cum_a ));
memset( max_a, 0xFF, sizeof( max_a ));
len --;
int32_t prefix_len = ( size_t ) buf % 32;
if ( len < prefix_len ) prefix_len = len;
int32_t suffix_len = ( len - prefix_len ) % 32;
int32_t aligned_len = len - prefix_len - suffix_len;
char* prefix_start = buf;
char* aligned_start = buf + prefix_len;
char* suffix_start = aligned_start + aligned_len;
if ( prefix_len > 0 ) {
_helper_c( prefix_start, prefix_len,
acc_a, cum_a, max_a );
}
if ( suffix_len > 0 ) {
_helper_c( suffix_start, suffix_len,
&( acc_a[ nparts - 1 ]),
&( cum_a[ nparts - 1 ]),
&( max_a[ nparts - 1 ]));
}
if ( aligned_len > 0 ) {
if ( ncores == 1 ) {
_helper_a( aligned_start, aligned_len,
&( acc_a[ 1 ]),
&( cum_a[ 1 ]),
&( max_a[ 1 ]));
}
else {
size_t psize = len / 32 / ncores * 32;
#pragma omp parallel for schedule( static, 1 ) \
num_threads( ncores )
for ( int j = 0; j < ncores; j ++ ) {
char* start = aligned_start + j * psize;
size_t len_ = ( j < ncores - 1 ) ? psize : aligned_len
+ - j * psize;
_helper_a( start, len_,
&( acc_a[ j + 1 ]),
&( cum_a[ j + 1 ]),
&( max_a[ j + 1 ]));
}
}
}
int32_t acc = acc_a[ 0 ];
int32_t max = max_a[ 0 ];
for ( int j = 1; j < nparts; j ++ ) {
acc += acc_a[ j ];
cum_a[ j ] += cum_a[ j - 1 ];
max_a[ j ] += cum_a[ j - 1 ];
if ( max_a[ j ] > max ) max = max_a[ j ];
}
return acc + max + ( buf[ len ] == '1' );
}
void _helper_c( char* buf, size_t len, // in
int32_t* acc, int32_t* cum, int32_t* max ) { // out
for( int i = 0; i < len; i ++ ) {
int one = buf[ i ] - '0';
*acc += one;
*cum += one ? -1 : 1;
if ( *cum > *max ) *max = *cum;
}
}
void _helper_a( char* buf, size_t len, // in
int32_t* acc, int32_t* cum, int32_t* max ) { // out
void* fin = buf + len;
int32_t acc_, cum_, max_;
__asm__ (
// " IDDQD \n"
" XOR %[acc], %[acc] \n"
" MOV $0x00, %%r11 \n"
" MOVQ %%r11, %%xmm0 \n"
" VPBROADCASTB %%xmm0, %%ymm0 \n" // ymm0 = cum
" MOV $0xFF, %%r11 \n"
" MOVQ %%r11, %%xmm1 \n"
" VPBROADCASTB %%xmm1, %%ymm1 \n" // ymm1 = max
" MOV $0x61, %%r11 \n"
" MOVQ %%r11, %%xmm2 \n"
" VPBROADCASTB %%xmm2, %%ymm2 \n" // ymm2 = 97 x 32
" MOV $0x0F, %%r11 \n"
" MOVQ %%r11, %%xmm6 \n"
" VPBROADCASTB %%xmm6, %%ymm6 \n" // ymm6 = 15 x 32
" MOV %[buf], %%r8 \n"
" MOV %[fin], %%r9 \n"
" MOV $32, %%r10 \n"
".L2_: \n"
" VMOVDQA (%%r8), %%ymm3 \n"
" VPADDB %%ymm3, %%ymm3, %%ymm3 \n"
" VPSUBB %%ymm3, %%ymm2, %%ymm3 \n"
" VPMOVMSKB %%ymm3, %%ecx \n"
" POPCNT %%ecx, %%ecx \n"
" ADD %%ecx, %[acc] \n"
" VPSLLDQ $1, %%ymm3, %%ymm4 \n"
" VPADDSB %%ymm3, %%ymm4, %%ymm3 \n"
" VPSLLDQ $2, %%ymm3, %%ymm4 \n"
" VPADDSB %%ymm3, %%ymm4, %%ymm3 \n"
" VPSLLDQ $4, %%ymm3, %%ymm4 \n"
" VPADDSB %%ymm3, %%ymm4, %%ymm3 \n"
" VPSLLDQ $8, %%ymm3, %%ymm4 \n"
" VPADDSB %%ymm3, %%ymm4, %%ymm3 \n"
" VPSHUFB %%ymm6, %%ymm3, %%ymm4 \n"
" VPSHUFD $0b01001110, %%ymm3, %%ymm5 \n"
" VPMAXSB %%ymm3, %%ymm5, %%ymm3 \n"
" VPMOVSXBD %%xmm3, %%ymm5 \n"
" VPADDD %%ymm0, %%ymm5, %%ymm5 \n"
" VPMAXSD %%ymm1, %%ymm5, %%ymm1 \n"
" VPMOVSXBD %%xmm4, %%ymm5 \n"
" VPADDD %%ymm0, %%ymm5, %%ymm0 \n"
" VPERMQ $0b01001110, %%ymm3, %%ymm3 \n"
" VPERMQ $0b01001110, %%ymm4, %%ymm4 \n"
" VPMOVSXBD %%xmm3, %%ymm5 \n"
" VPADDD %%ymm0, %%ymm5, %%ymm5 \n"
" VPMAXSD %%ymm1, %%ymm5, %%ymm1 \n"
" VPMOVSXBD %%xmm4, %%ymm5 \n"
" VPADDD %%ymm0, %%ymm5, %%ymm0 \n"
" ADD %%r10, %%r8 \n"
" CMP %%r9, %%r8 \n"
" JL .L2_ \n"
" VPERMQ $0b00111001, %%ymm1, %%ymm4 \n"
" VPMAXSD %%ymm1, %%ymm4, %%ymm1 \n"
" VPERMQ $0b01001110, %%ymm1, %%ymm4 \n"
" VPMAXSD %%ymm1, %%ymm4, %%ymm1 \n"
" PSHUFD $0b00111001, %%xmm1, %%xmm4 \n"
" PMAXSD %%xmm4, %%xmm1 \n"
" MOVD %%xmm1, %[max] \n"
" MOVD %%xmm0, %[cum] \n"
: [acc] "=r" ( acc_ ),
[cum] "=rm" ( cum_ ),
[max] "=rm" ( max_ )
: [buf] "m" ( buf ),
[fin] "m" ( fin )
: "cc",
"rcx", "r8", "r9", "r10", "r11",
"ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6"
);
*acc = acc_;
*cum = cum_;
*max = max_;
}
END_OF_C