#include <stdint.h>
#include <emmintrin.h>   /* SSE2: __m128i, set/xor/store, and (via xmmintrin.h) _mm_prefetch */
#include <smmintrin.h>   /* SSE4.1: _mm_mullo_epi32 */

#define H_PRIME  1000003
#define HASH_LEN 11
#define MOD_TARG 1001
#define M_TARG   1000
#define D_TARG   500
#define C_TARG   100
#define L_TARG   50
#define X_TARG   10

/* Extract the 2-bit field that starts at bit p of the 64-bit word x. */
#define GET_TWO_BITS(x, p) (((x) >> (p)) & ((uint64_t)3))

/*
 * _mm_mullo_epi32 needs SSE4.1.  GCC and Clang refuse to inline it into a
 * function compiled for a lesser target, so enable SSE4.1 per function when
 * it is not already enabled for the whole translation unit.  Other compilers
 * (e.g. MSVC) need nothing here.
 */
#if defined(__GNUC__) && !defined(__SSE4_1__)
#  define INNER_SSE41 __attribute__((target("sse4.1")))
#else
#  define INNER_SSE41
#endif

enum {
    INNER_Q_LIMIT  = 128,   /* enumerated byte values are 1 .. INNER_Q_LIMIT-1 */
    INNER_NUM_VECS = 7      /* number of bit vectors / running hashes */
};

/*
 * Index of each running hash.  The order is the order in which the bit
 * vectors are probed.  L, X, V, I must stay consecutive and in this order:
 * they are filled from SIMD lanes 0..3 with a single 16-byte store.
 */
enum {
    INNER_VEC_M, INNER_VEC_D, INNER_VEC_C,
    INNER_VEC_L, INNER_VEC_X, INNER_VEC_V, INNER_VEC_I
};

/* One 7th-byte candidate travelling through the bit-vector filters. */
struct inner_cand {
    uint32_t q7;                  /* the 7th byte value */
    uint32_t want;                /* non-zero 2-bit code read from bitvecM */
    uint32_t h[INNER_NUM_VECS];   /* hashes after q7; h[k] is valid once stage k-1 passed */
};

/*
 * One hash step: (h ^ c) * H_PRIME, reduced modulo 2^32.
 * Done in unsigned arithmetic because the product routinely exceeds 31 bits
 * and signed overflow is undefined behaviour.
 */
static inline uint32_t inner_hash_step(uint32_t h, uint32_t c)
{
    return (h ^ c) * (uint32_t)H_PRIME;
}

/* Byte values 10 (LF) and 13 (CR) are never enumerated. */
static inline int inner_is_skipped(int q)
{
    return q == 10 || q == 13;
}

/*
 * 2-bit code selected by hash h: word index is h >> 12, bit offset is the
 * even number (h >> 6) & 62, i.e. one code per 128 consecutive hash values.
 */
static inline uint32_t inner_code_at(const uint64_t *bv, uint32_t h)
{
    return (uint32_t)GET_TWO_BITS(bv[h >> 12], (h >> 6) & 62u);
}

/* Request the cache line containing p (hint T0). */
static inline INNER_SSE41 void inner_prefetch(const void *p)
{
    _mm_prefetch((const char *)p, _MM_HINT_T0);
}

/*
 * Final test for one hash: mix in the 8th byte, XOR with qqm and HASH_LEN,
 * reinterpret as signed 32-bit, reduce into [0, MOD_TARG) and compare.
 */
static inline int inner_hits(uint32_t h7, uint32_t q8, uint32_t qqm, int32_t target)
{
    const uint32_t mixed = (inner_hash_step(h7, q8) ^ qqm) ^ (uint32_t)HASH_LEN;
    int32_t r = (int32_t)mixed % MOD_TARG;

    if (r < 0) {
        r += MOD_TARG;
    }
    return r == target;
}

/*
 * Build and filter the 7th-byte candidates for one (q3..q6) prefix.
 *
 * seed[k] is hash k after q6.  Work is done in passes so that every memory
 * probe is preceded by a prefetch issued one pass earlier:
 *   pass 1: hash M for every q7, prefetch its bitvecM word;
 *   pass 2: keep q7 whose bitvecM code is non-zero, hash D, prefetch;
 *   stages D..I: keep candidates whose code in vector k equals the bitvecM
 *                code, then hash and prefetch for vector k+1.  After the
 *                last vector, prefetch the two 64-byte halves of the
 *                128-byte blocks of bytevecM / bytevecD that the caller
 *                reads next.
 *
 * Survivors are compacted to the front of cand[] in ascending q7 order.
 * Returns the number of survivors.
 */
static INNER_SSE41 int inner_filter(
    const uint64_t *const bv[INNER_NUM_VECS],
    const unsigned char *bytevecM,
    const unsigned char *bytevecD,
    const uint32_t seed[INNER_NUM_VECS],
    struct inner_cand cand[INNER_Q_LIMIT])
{
    uint32_t m7[INNER_Q_LIMIT];
    int n = 0;

    for (int q7 = 1; q7 < INNER_Q_LIMIT; ++q7) {
        if (inner_is_skipped(q7)) {
            continue;
        }
        m7[q7] = inner_hash_step(seed[INNER_VEC_M], (uint32_t)q7);
        inner_prefetch(&bv[INNER_VEC_M][m7[q7] >> 12]);
    }

    for (int q7 = 1; q7 < INNER_Q_LIMIT; ++q7) {
        if (inner_is_skipped(q7)) {
            continue;
        }
        const uint32_t want = inner_code_at(bv[INNER_VEC_M], m7[q7]);
        if (want == 0) {
            continue;
        }
        const uint32_t d7 = inner_hash_step(seed[INNER_VEC_D], (uint32_t)q7);
        inner_prefetch(&bv[INNER_VEC_D][d7 >> 12]);
        cand[n++] = (struct inner_cand){
            .q7 = (uint32_t)q7,
            .want = want,
            .h = { [INNER_VEC_M] = m7[q7], [INNER_VEC_D] = d7 }
        };
    }

    for (int k = INNER_VEC_D; k < INNER_NUM_VECS; ++k) {
        int kept = 0;

        for (int i = 0; i < n; ++i) {
            struct inner_cand c = cand[i];

            if (inner_code_at(bv[k], c.h[k]) != c.want) {
                continue;
            }
            if (k + 1 < INNER_NUM_VECS) {
                c.h[k + 1] = inner_hash_step(seed[k + 1], c.q7);
                inner_prefetch(&bv[k + 1][c.h[k + 1] >> 12]);
            } else {
                /* XOR with a value below 128 only changes the low 7 bits,
                 * so every byte read later lies in this 128-byte block. */
                const uint32_t mblk = c.h[INNER_VEC_M] & 0xffffff80u;
                const uint32_t dblk = c.h[INNER_VEC_D] & 0xffffff80u;

                inner_prefetch(&bytevecM[mblk]);
                inner_prefetch(&bytevecM[mblk + 64u]);
                inner_prefetch(&bytevecD[dblk]);
                inner_prefetch(&bytevecD[dblk + 64u]);
            }
            cand[kept++] = c;   /* kept <= i, so in-place compaction is safe */
        }
        n = kept;
    }
    return n;
}

// -------------------------------------------------------------------
/*
 * Search byte sequences q3..q8 and record those that satisfy every filter.
 *
 * Enumeration: q3 runs over [startval, endval); q4..q8 run over 1..127.
 * The values 10 and 13 are skipped at every position.
 *
 * Seven hashes (M, D, C, L, X, V, I) are advanced in lock-step with
 * h = (h ^ q) * H_PRIME (mod 2^32); m2..i2 are their states before q3 is
 * mixed in.  M, D, C are advanced as scalars; L, X, V, I as the four lanes
 * of one SSE register.
 *
 * Filters, in order, for each q7:
 *   1. the 2-bit code selected from bitvecM by the M hash is non-zero;
 *   2. the codes selected from bitvecD, C, L, X, V, I by their own hashes
 *      all equal that code;
 * then for each q8:
 *   3. qqm = bytevecM[m7 ^ q8] is non-zero and equals bytevecD[d7 ^ q8];
 *   4. for the C, L and X hashes, ((h7 ^ q8) * H_PRIME) ^ qqm ^ HASH_LEN,
 *      taken as a signed 32-bit value and reduced into [0, MOD_TARG),
 *      equals C_TARG, L_TARG and X_TARG respectively.
 *
 * Each hit is appended to ps as eight 16-bit lanes, lowest lane first:
 * q3, q4, q5, q6, q7, q8, qqm, 0.
 *
 * Parameters:
 *   bitvecM..bitvecI  tables of 2-bit codes, read as 64-bit words at word
 *                     index (hash >> 12), which can be any value below 2^20.
 *                     NOTE(review): the pointers are cast to uint64_t*, so
 *                     they are assumed to be 8-byte aligned - confirm at the
 *                     allocation site.
 *   bytevecM/D        byte tables indexed directly by a 32-bit hash value.
 *   startval, endval  half-open range of q3 values to try.
 *   m2..i2            hash states entering position 3.
 *   ps                output buffer.  No capacity is passed in, so the
 *                     caller must provide room for every hit.
 *
 * Returns the number of entries written to ps.
 */
INNER_SSE41
int inner(
    const unsigned char* bitvecM,
    const unsigned char* bitvecD,
    const unsigned char* bitvecC,
    const unsigned char* bitvecL,
    const unsigned char* bitvecX,
    const unsigned char* bitvecV,
    const unsigned char* bitvecI,
    const unsigned char* bytevecM,
    const unsigned char* bytevecD,
    int startval, int endval,
    int m2, int d2, int c2, int l2, int x2, int v2, int i2,
    __m128i* ps
)
{
    const uint64_t *const bv[INNER_NUM_VECS] = {
        (const uint64_t *)bitvecM, (const uint64_t *)bitvecD,
        (const uint64_t *)bitvecC, (const uint64_t *)bitvecL,
        (const uint64_t *)bitvecX, (const uint64_t *)bitvecV,
        (const uint64_t *)bitvecI
    };
    /* Lane 0 = L, lane 1 = X, lane 2 = V, lane 3 = I. */
    const __m128i s2 = _mm_set_epi32(i2, v2, x2, l2);
    const __m128i hp = _mm_set1_epi32(H_PRIME);
    struct inner_cand cand[INNER_Q_LIMIT];
    uint32_t seed[INNER_NUM_VECS];
    int iret = 0;

    for (int q3 = startval; q3 < endval; ++q3) {
        if (inner_is_skipped(q3)) {
            continue;
        }
        const uint32_t m3 = inner_hash_step((uint32_t)m2, (uint32_t)q3);
        const uint32_t d3 = inner_hash_step((uint32_t)d2, (uint32_t)q3);
        const uint32_t c3 = inner_hash_step((uint32_t)c2, (uint32_t)q3);
        const __m128i s3 = _mm_mullo_epi32(_mm_xor_si128(s2, _mm_set1_epi32(q3)), hp);

        for (int q4 = 1; q4 < INNER_Q_LIMIT; ++q4) {
            if (inner_is_skipped(q4)) {
                continue;
            }
            const uint32_t m4 = inner_hash_step(m3, (uint32_t)q4);
            const uint32_t d4 = inner_hash_step(d3, (uint32_t)q4);
            const uint32_t c4 = inner_hash_step(c3, (uint32_t)q4);
            const __m128i s4 = _mm_mullo_epi32(_mm_xor_si128(s3, _mm_set1_epi32(q4)), hp);

            for (int q5 = 1; q5 < INNER_Q_LIMIT; ++q5) {
                if (inner_is_skipped(q5)) {
                    continue;
                }
                const uint32_t m5 = inner_hash_step(m4, (uint32_t)q5);
                const uint32_t d5 = inner_hash_step(d4, (uint32_t)q5);
                const uint32_t c5 = inner_hash_step(c4, (uint32_t)q5);
                const __m128i s5 = _mm_mullo_epi32(_mm_xor_si128(s4, _mm_set1_epi32(q5)), hp);

                for (int q6 = 1; q6 < INNER_Q_LIMIT; ++q6) {
                    if (inner_is_skipped(q6)) {
                        continue;
                    }
                    const __m128i s6 = _mm_mullo_epi32(_mm_xor_si128(s5, _mm_set1_epi32(q6)), hp);

                    seed[INNER_VEC_M] = inner_hash_step(m5, (uint32_t)q6);
                    seed[INNER_VEC_D] = inner_hash_step(d5, (uint32_t)q6);
                    seed[INNER_VEC_C] = inner_hash_step(c5, (uint32_t)q6);
                    /* Spill lanes 0..3 into seed[L], seed[X], seed[V], seed[I]. */
                    _mm_storeu_si128((__m128i *)&seed[INNER_VEC_L], s6);

                    const int n = inner_filter(bv, bytevecM, bytevecD, seed, cand);

                    for (int i = 0; i < n; ++i) {
                        const struct inner_cand *c = &cand[i];

                        for (int q8 = 1; q8 < INNER_Q_LIMIT; ++q8) {
                            if (inner_is_skipped(q8)) {
                                continue;
                            }
                            const uint32_t b8 = (uint32_t)q8;
                            const uint32_t qqm = bytevecM[c->h[INNER_VEC_M] ^ b8];

                            if (qqm == 0 || qqm != bytevecD[c->h[INNER_VEC_D] ^ b8]) {
                                continue;
                            }
                            // Calculate instead of lookup to reduce memory footprint.
                            if (!inner_hits(c->h[INNER_VEC_C], b8, qqm, C_TARG)
                                || !inner_hits(c->h[INNER_VEC_L], b8, qqm, L_TARG)
                                || !inner_hits(c->h[INNER_VEC_X], b8, qqm, X_TARG)) {
                                continue;
                            }
                            ps[iret++] = _mm_set_epi16(
                                0, (short)qqm, (short)q8, (short)c->q7,
                                (short)q6, (short)q5, (short)q4, (short)q3);
                        }
                    }
                }
            }
        }
    }
    return iret;
}