00001
00016 #ifndef SFMT_SSE2_H
00017 #define SFMT_SSE2_H
00018
00019 PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b, __m128i c,
00020 __m128i d, __m128i mask) ALWAYSINLINE;
00021
00031 PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b,
00032 __m128i c, __m128i d, __m128i mask) {
00033 __m128i v, x, y, z;
00034
00035 x = _mm_load_si128(a);
00036 y = _mm_srli_epi32(*b, SR1);
00037 z = _mm_srli_si128(c, SR2);
00038 v = _mm_slli_epi32(d, SL1);
00039 z = _mm_xor_si128(z, x);
00040 z = _mm_xor_si128(z, v);
00041 x = _mm_slli_si128(x, SL2);
00042 y = _mm_and_si128(y, mask);
00043 z = _mm_xor_si128(z, x);
00044 z = _mm_xor_si128(z, y);
00045 return z;
00046 }
00047
00052 inline static void gen_rand_all(void) {
00053 int i;
00054 __m128i r, r1, r2, mask;
00055 mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
00056
00057 r1 = _mm_load_si128(&sfmt[N - 2].si);
00058 r2 = _mm_load_si128(&sfmt[N - 1].si);
00059 for (i = 0; i < N - POS1; i++) {
00060 r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask);
00061 _mm_store_si128(&sfmt[i].si, r);
00062 r1 = r2;
00063 r2 = r;
00064 }
00065 for (; i < N; i++) {
00066 r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1 - N].si, r1, r2, mask);
00067 _mm_store_si128(&sfmt[i].si, r);
00068 r1 = r2;
00069 r2 = r;
00070 }
00071 }
00072
00080 inline static void gen_rand_array(w128_t *array, int size) {
00081 int i, j;
00082 __m128i r, r1, r2, mask;
00083 mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
00084
00085 r1 = _mm_load_si128(&sfmt[N - 2].si);
00086 r2 = _mm_load_si128(&sfmt[N - 1].si);
00087 for (i = 0; i < N - POS1; i++) {
00088 r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask);
00089 _mm_store_si128(&array[i].si, r);
00090 r1 = r2;
00091 r2 = r;
00092 }
00093 for (; i < N; i++) {
00094 r = mm_recursion(&sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask);
00095 _mm_store_si128(&array[i].si, r);
00096 r1 = r2;
00097 r2 = r;
00098 }
00099
00100 for (; i < size - N; i++) {
00101 r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2,
00102 mask);
00103 _mm_store_si128(&array[i].si, r);
00104 r1 = r2;
00105 r2 = r;
00106 }
00107 for (j = 0; j < 2 * N - size; j++) {
00108 r = _mm_load_si128(&array[j + size - N].si);
00109 _mm_store_si128(&sfmt[j].si, r);
00110 }
00111 for (; i < size; i++) {
00112 r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2,
00113 mask);
00114 _mm_store_si128(&array[i].si, r);
00115 _mm_store_si128(&sfmt[j++].si, r);
00116 r1 = r2;
00117 r2 = r;
00118 }
00119 }
00120
00121 #endif