lavu/ripemd: Fully unroll the transform function loops
author: James Almer <jamrial@gmail.com>
Mon, 9 Sep 2013 08:42:21 +0000 (05:42 -0300)
committer: Michael Niedermayer <michaelni@gmx.at>
Mon, 9 Sep 2013 09:18:43 +0000 (11:18 +0200)
crypto_bench RIPEMD-160 results on an AMD Athlon X2 7750+, built with mingw32-w64 GCC 4.8.1 (x86_64):

Before:
lavu       RIPEMD-160   size: 1048576  runs:   1024  time:   12.342 +- 0.199

After:
lavu       RIPEMD-160   size: 1048576  runs:   1024  time:   10.143 +- 0.192

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
diff --git a/libavutil/ripemd.c b/libavutil/ripemd.c
index d737c38..37b42df 100644
--- a/libavutil/ripemd.c
+++ b/libavutil/ripemd.c
@@ -128,37 +128,42 @@ static void ripemd128_transform(uint32_t *state, const uint8_t buffer[64], int e
 
     for (n = 0; n < 16; n++)
         block[n] = AV_RL32(buffer + 4 * n);
+    n = 0;
 
-    for (n = 0; n < 16;) {
-        ROUND128_0_TO_15(a,b,c,d,e,f,g,h);
-        ROUND128_0_TO_15(d,a,b,c,h,e,f,g);
-        ROUND128_0_TO_15(c,d,a,b,g,h,e,f);
-        ROUND128_0_TO_15(b,c,d,a,f,g,h,e);
-    }
+#define R128_0                         \
+    ROUND128_0_TO_15(a,b,c,d,e,f,g,h); \
+    ROUND128_0_TO_15(d,a,b,c,h,e,f,g); \
+    ROUND128_0_TO_15(c,d,a,b,g,h,e,f); \
+    ROUND128_0_TO_15(b,c,d,a,f,g,h,e)
+
+    R128_0; R128_0; R128_0; R128_0;
     SWAP(a,e)
 
-    for (; n < 32;) {
-        ROUND128_16_TO_31(a,b,c,d,e,f,g,h);
-        ROUND128_16_TO_31(d,a,b,c,h,e,f,g);
-        ROUND128_16_TO_31(c,d,a,b,g,h,e,f);
-        ROUND128_16_TO_31(b,c,d,a,f,g,h,e);
-    }
+#define R128_16                         \
+    ROUND128_16_TO_31(a,b,c,d,e,f,g,h); \
+    ROUND128_16_TO_31(d,a,b,c,h,e,f,g); \
+    ROUND128_16_TO_31(c,d,a,b,g,h,e,f); \
+    ROUND128_16_TO_31(b,c,d,a,f,g,h,e)
+
+    R128_16; R128_16; R128_16; R128_16;
     SWAP(b,f)
 
-    for (; n < 48;) {
-        ROUND128_32_TO_47(a,b,c,d,e,f,g,h);
-        ROUND128_32_TO_47(d,a,b,c,h,e,f,g);
-        ROUND128_32_TO_47(c,d,a,b,g,h,e,f);
-        ROUND128_32_TO_47(b,c,d,a,f,g,h,e);
-    }
+#define R128_32                         \
+    ROUND128_32_TO_47(a,b,c,d,e,f,g,h); \
+    ROUND128_32_TO_47(d,a,b,c,h,e,f,g); \
+    ROUND128_32_TO_47(c,d,a,b,g,h,e,f); \
+    ROUND128_32_TO_47(b,c,d,a,f,g,h,e)
+
+    R128_32; R128_32; R128_32; R128_32;
     SWAP(c,g)
 
-    for (; n < 64;) {
-        ROUND128_48_TO_63(a,b,c,d,e,f,g,h);
-        ROUND128_48_TO_63(d,a,b,c,h,e,f,g);
-        ROUND128_48_TO_63(c,d,a,b,g,h,e,f);
-        ROUND128_48_TO_63(b,c,d,a,f,g,h,e);
-    }
+#define R128_48                         \
+    ROUND128_48_TO_63(a,b,c,d,e,f,g,h); \
+    ROUND128_48_TO_63(d,a,b,c,h,e,f,g); \
+    ROUND128_48_TO_63(c,d,a,b,g,h,e,f); \
+    ROUND128_48_TO_63(b,c,d,a,f,g,h,e)
+
+    R128_48; R128_48; R128_48; R128_48;
     SWAP(d,h)
 
     if (ext) {
@@ -222,54 +227,60 @@ static void ripemd160_transform(uint32_t *state, const uint8_t buffer[64], int e
 
     for (n = 0; n < 16; n++)
         block[n] = AV_RL32(buffer + 4 * n);
+    n = 0;
 
-    for (n = 0; n < 16 - 1;) {
-        ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j);
-        ROUND160_0_TO_15(e,a,b,c,d,j,f,g,h,i);
-        ROUND160_0_TO_15(d,e,a,b,c,i,j,f,g,h);
-        ROUND160_0_TO_15(c,d,e,a,b,h,i,j,f,g);
-        ROUND160_0_TO_15(b,c,d,e,a,g,h,i,j,f);
-    }
+#define R160_0                             \
+    ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j); \
+    ROUND160_0_TO_15(e,a,b,c,d,j,f,g,h,i); \
+    ROUND160_0_TO_15(d,e,a,b,c,i,j,f,g,h); \
+    ROUND160_0_TO_15(c,d,e,a,b,h,i,j,f,g); \
+    ROUND160_0_TO_15(b,c,d,e,a,g,h,i,j,f)
+
+    R160_0; R160_0; R160_0;
     ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j);
     SWAP(a,f)
 
-    for (; n < 32 - 1;) {
-        ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i);
-        ROUND160_16_TO_31(d,e,a,b,c,i,j,f,g,h);
-        ROUND160_16_TO_31(c,d,e,a,b,h,i,j,f,g);
-        ROUND160_16_TO_31(b,c,d,e,a,g,h,i,j,f);
-        ROUND160_16_TO_31(a,b,c,d,e,f,g,h,i,j);
-    }
+#define R160_16                             \
+    ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i); \
+    ROUND160_16_TO_31(d,e,a,b,c,i,j,f,g,h); \
+    ROUND160_16_TO_31(c,d,e,a,b,h,i,j,f,g); \
+    ROUND160_16_TO_31(b,c,d,e,a,g,h,i,j,f); \
+    ROUND160_16_TO_31(a,b,c,d,e,f,g,h,i,j)
+
+    R160_16; R160_16; R160_16;
     ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i);
     SWAP(b,g)
 
-    for (; n < 48 - 1;) {
-        ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h);
-        ROUND160_32_TO_47(c,d,e,a,b,h,i,j,f,g);
-        ROUND160_32_TO_47(b,c,d,e,a,g,h,i,j,f);
-        ROUND160_32_TO_47(a,b,c,d,e,f,g,h,i,j);
-        ROUND160_32_TO_47(e,a,b,c,d,j,f,g,h,i);
-    }
+#define R160_32                             \
+    ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h); \
+    ROUND160_32_TO_47(c,d,e,a,b,h,i,j,f,g); \
+    ROUND160_32_TO_47(b,c,d,e,a,g,h,i,j,f); \
+    ROUND160_32_TO_47(a,b,c,d,e,f,g,h,i,j); \
+    ROUND160_32_TO_47(e,a,b,c,d,j,f,g,h,i)
+
+    R160_32; R160_32; R160_32;
     ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h);
     SWAP(c,h)
 
-    for (; n < 64 - 1;) {
-        ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g);
-        ROUND160_48_TO_63(b,c,d,e,a,g,h,i,j,f);
-        ROUND160_48_TO_63(a,b,c,d,e,f,g,h,i,j);
-        ROUND160_48_TO_63(e,a,b,c,d,j,f,g,h,i);
-        ROUND160_48_TO_63(d,e,a,b,c,i,j,f,g,h);
-    }
+#define R160_48                             \
+    ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g); \
+    ROUND160_48_TO_63(b,c,d,e,a,g,h,i,j,f); \
+    ROUND160_48_TO_63(a,b,c,d,e,f,g,h,i,j); \
+    ROUND160_48_TO_63(e,a,b,c,d,j,f,g,h,i); \
+    ROUND160_48_TO_63(d,e,a,b,c,i,j,f,g,h)
+
+    R160_48; R160_48; R160_48;
     ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g);
     SWAP(d,i)
 
-    for (; n < 75;) {
-        ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f);
-        ROUND160_64_TO_79(a,b,c,d,e,f,g,h,i,j);
-        ROUND160_64_TO_79(e,a,b,c,d,j,f,g,h,i);
-        ROUND160_64_TO_79(d,e,a,b,c,i,j,f,g,h);
-        ROUND160_64_TO_79(c,d,e,a,b,h,i,j,f,g);
-    }
+#define R160_64                             \
+    ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f); \
+    ROUND160_64_TO_79(a,b,c,d,e,f,g,h,i,j); \
+    ROUND160_64_TO_79(e,a,b,c,d,j,f,g,h,i); \
+    ROUND160_64_TO_79(d,e,a,b,c,i,j,f,g,h); \
+    ROUND160_64_TO_79(c,d,e,a,b,h,i,j,f,g)
+
+    R160_64; R160_64; R160_64;
     ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f);
     SWAP(e,j)