Ror can do byteswaps all right, but it's not very remarkable at that.
#! /usr/bin/perl
use strict;
use warnings;

# Compiler settings for the Inline C section that follows.
# -mssse3 is required because the C code uses _mm_shuffle_epi8 (an SSSE3
# intrinsic). The OPTIMIZE value must be one unbroken string: a line break
# inside it would hand the compiler a mangled flag.
use Inline C => Config =>
    CC       => 'gcc',
    OPTIMIZE => '-O3 -mssse3 -funroll-all-loops';
use Inline C => <<'__CUT__', NAME => 'swab';
#include <x86intrin.h>
/*
 * swab_ror - swap the two bytes of every 16-bit word of a Perl string,
 * in place, using the x86 "rorw" instruction.
 *
 * v: scalar whose string buffer is rewritten. If the length is odd, the
 *    final byte is left untouched.
 */
void swab_ror(SV *v)
{
    STRLEN slen;
    /* SvPV_force rather than SvPV: the buffer is updated directly, so the
     * scalar must be forced into holding its own modifiable string. */
    char *s = SvPV_force(v, slen);
    uint16_t *w = (uint16_t*) s;
    size_t n = slen >> 1;               /* number of complete 16-bit words */

    /* Walk the words from last to first. */
    for (; n; n--) {
        /* Rotating a 16-bit value by 8 bits exchanges its two bytes.
         * "+r,m" lets the compiler pick a register or memory operand;
         * "cc" is listed because the rotate modifies the flags. */
        asm("rorw $8, %0" : "+r,m" (w[n-1]) : : "cc");
    }

    /* The value was changed behind Perl's back; run any set-magic. */
    SvSETMAGIC(v);
}
/*
 * swab_sse - swap the two bytes of every 16-bit word of a Perl string,
 * in place, handling 16 bytes per step with an SSSE3 byte shuffle.
 *
 * v: scalar whose string buffer is rewritten. If the length is odd, the
 *    final byte is left untouched.
 */
void swab_sse(SV *v)
{
    STRLEN slen;
    /* SvPV_force rather than SvPV: the buffer is updated directly, so the
     * scalar must be forced into holding its own modifiable string. */
    char *s = SvPV_force(v, slen);
    __m128i x, t;
    size_t n = slen & ~(size_t)1;       /* byte count rounded down to even */

    /* Peel single words off the end until the remaining byte count is a
     * multiple of 16 (bits 1-3 of n are clear). */
    for (; (n & 0xe); n -= 2) {
        uint16_t *w = (uint16_t*) &s[n-2];
        *w = __rorw(*w, 8);
    }

    /* Shuffle mask. _mm_set_epi8 takes its arguments from byte 15 down to
     * byte 0, so result byte i is taken from source byte i^1: each adjacent
     * pair of bytes is exchanged. */
    t = _mm_set_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);

    /* Remaining length is a multiple of 16; process it back to front with
     * unaligned loads and stores. */
    for (; n; n -= 16) {
        x = _mm_lddqu_si128((__m128i*)&s[n-16]);
        x = _mm_shuffle_epi8(x, t);
        _mm_storeu_si128((__m128i*)&s[n-16], x);
    }

    /* The value was changed behind Perl's back; run any set-magic. */
    SvSETMAGIC(v);
}
__CUT__
use Benchmark 'cmpthese';

# Test input: 34567 random byte values packed into a single string.
# Declared with "our" (a package variable) because the benchmark code below
# is given as strings, which cannot see a lexical declared in this file --
# NOTE(review): relies on Benchmark evaluating them in this package.
our $str = pack "C*", map { rand(256) } 1 .. 34567;

# Run each implementation for at least 5 CPU seconds and print a comparison
# table. Both routines swap in place, so $str is rewritten on every call.
cmpthese(
    -5,
    {
        swab_ror => q( swab_ror $str ),
        swab_sse => q( swab_sse $str ),
    },
);