35 #include <m4ri/m4ri_config.h>
42 #include <emmintrin.h>
56 #define __M4RI_SSE2_CUTOFF 10
65 #define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27)
75 #define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L3_CACHE))) / 2, 2048)
162 #if 0 // Commented out in order to keep the size of mzd_t 64 bytes (one cache line). This could be added back if rows was ever removed.
204 static uint8_t
const mzd_flag_nonzero_offset = 0x1;
205 static uint8_t
const mzd_flag_nonzero_excess = 0x2;
206 static uint8_t
const mzd_flag_windowed_zerooffset = 0x4;
207 static uint8_t
const mzd_flag_windowed_zeroexcess = 0x8;
208 static uint8_t
const mzd_flag_windowed_ownsblocks = 0x10;
209 static uint8_t
const mzd_flag_multiple_blocks = 0x20;
219 return M->
flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset);
243 assert(M->
nrows == 0 || result == M->
rows[0]);
299 return n ? 0 : M->
nrows;
324 return n ? 0 : M->
nrows - r;
338 word* result = M->
blocks[0].begin + big_vector;
341 result = M->
blocks[n].begin + big_vector - n * (M->
blocks[0].size /
sizeof(
word));
343 assert(result == M->
rows[row]);
365 void mzd_free(
mzd_t *A);
399 return mzd_init_window((
mzd_t*)M, lowr, lowc, highr, highc);
408 #define mzd_free_window mzd_free
420 if ((rowa == rowb) || (startblock >= M->
width))
429 word *a = M->
rows[rowa] + startblock;
430 word *b = M->
rows[rowb] + startblock;
435 for(
wi_t i = 0; i < width; ++i) {
441 tmp = (a[width] ^ b[width]) & mask_end;
445 __M4RI_DD_ROW(M, rowa);
446 __M4RI_DD_ROW(M, rowb);
467 word tmp = (a[0] ^ b[0]) & mask_begin;
472 for(
wi_t i = 1; i < width; ++i) {
477 tmp = (a[width] ^ b[width]) & mask_end;
487 __M4RI_DD_ROW(M, rowa);
488 __M4RI_DD_ROW(M, rowb);
539 int max_bit =
MAX(a_bit, b_bit);
540 int count_remaining = stop_row - start_row;
541 int min_bit = a_bit + b_bit - max_bit;
543 int offset = max_bit - min_bit;
551 if (a_word == b_word) {
553 count_remaining -= count;
555 int fast_count = count / 4;
556 int rest_count = count - 4 * fast_count;
559 while (fast_count--) {
561 xor_v[1] = ptr[rowstride];
562 xor_v[2] = ptr[2 * rowstride];
563 xor_v[3] = ptr[3 * rowstride];
564 xor_v[0] ^= xor_v[0] >> offset;
565 xor_v[1] ^= xor_v[1] >> offset;
566 xor_v[2] ^= xor_v[2] >> offset;
567 xor_v[3] ^= xor_v[3] >> offset;
572 xor_v[0] |= xor_v[0] << offset;
573 xor_v[1] |= xor_v[1] << offset;
574 xor_v[2] |= xor_v[2] << offset;
575 xor_v[3] |= xor_v[3] << offset;
577 ptr[rowstride] ^= xor_v[1];
578 ptr[2 * rowstride] ^= xor_v[2];
579 ptr[3 * rowstride] ^= xor_v[3];
580 ptr += 4 * rowstride;
582 while (rest_count--) {
584 xor_v ^= xor_v >> offset;
586 *ptr ^= xor_v | (xor_v << offset);
597 if (min_bit == a_bit) {
598 min_ptr = ptr + a_word;
599 max_offset = b_word - a_word;
601 min_ptr = ptr + b_word;
602 max_offset = a_word - b_word;
605 count_remaining -= count;
608 word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
610 min_ptr[max_offset] ^= xor_v << offset;
611 min_ptr += rowstride;
617 if (min_bit == a_bit)
618 min_ptr = ptr + a_word;
620 min_ptr = ptr + b_word;
672 M->
rows[x][block] ^= values << spot;
675 M->
rows[x][block + 1] ^= values >> space;
694 M->
rows[x][block] &= values << spot;
697 M->
rows[x][block + 1] &= values >> space;
714 M->
rows[x][block] &= ~(values << spot);
717 M->
rows[x][block + 1] &= ~(values >> space);
733 assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols);
737 word *src = M->
rows[srcrow] + startblock;
738 word *dst = M->
rows[dstrow] + startblock;
742 *dst++ ^= *src++ & mask_begin;
747 if (wide > not_aligned + 1)
754 __m128i* __src = (__m128i*)src;
755 __m128i* __dst = (__m128i*)dst;
756 __m128i*
const eof = (__m128i*)((
unsigned long)(src + wide) & ~0xFUL);
759 __m128i xmm1 = _mm_xor_si128(*__dst, *__src);
762 while(++__src < eof);
765 wide = ((
sizeof(
word)*wide)%16)/
sizeof(
word);
778 dst[i - 1] ^= src[i - 1] & ~mask_end;
780 __M4RI_DD_ROW(M, dstrow);
794 void mzd_row_add(
mzd_t *M,
rci_t const sourcerow,
rci_t const destrow);
878 void mzd_randomize(
mzd_t *M);
894 void mzd_set_ui(
mzd_t *M,
unsigned int const value);
911 rci_t mzd_gauss_delayed(
mzd_t *M,
rci_t const startcol,
int const full);
928 rci_t mzd_echelonize_naive(
mzd_t *M,
int const full);
939 int mzd_equal(
mzd_t const *A,
mzd_t const *B);
1072 #define mzd_sub mzd_add
1084 #define _mzd_sub _mzd_add
1101 word temp = (spill <= 0) ? M->
rows[x][block] << -spill : (M->
rows[x][block + 1] << (m4ri_radix - spill)) | (M->
rows[x][block] >> spill);
1102 return temp >> (m4ri_radix - n);
1125 void mzd_combine(
mzd_t *DST,
rci_t const row3,
wi_t const startblock3,
1186 wi_t wide = A->
width - a_startblock - 1;
1188 word *a = A->
rows[a_row] + a_startblock;
1189 word *b = B->
rows[b_row] + b_startblock;
1191 #if __M4RI_HAVE_SSE2
1192 if(wide > __M4RI_SSE2_CUTOFF) {
1200 __m128i *a128 = (__m128i*)a;
1201 __m128i *b128 = (__m128i*)b;
1202 const __m128i *eof = (__m128i*)((
unsigned long)(a + wide) & ~0xFUL);
1205 *a128 = _mm_xor_si128(*a128, *b128);
1208 }
while(a128 < eof);
1212 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
1215 #endif // __M4RI_HAVE_SSE2
1218 wi_t n = (wide + 7) / 8;
1220 case 0:
do { *(a++) ^= *(b++);
1221 case 7: *(a++) ^= *(b++);
1222 case 6: *(a++) ^= *(b++);
1223 case 5: *(a++) ^= *(b++);
1224 case 4: *(a++) ^= *(b++);
1225 case 3: *(a++) ^= *(b++);
1226 case 2: *(a++) ^= *(b++);
1227 case 1: *(a++) ^= *(b++);
1261 wi_t wide = A->
width - a_startblock - 1;
1262 word *a = A->
rows[a_row] + a_startblock;
1263 word *b = B->
rows[b_row] + b_startblock;
1264 word *c = C->
rows[c_row] + c_startblock;
1275 #if __M4RI_HAVE_SSE2
1276 if(wide > __M4RI_SSE2_CUTOFF) {
1284 __m128i *a128 = (__m128i*)a;
1285 __m128i *b128 = (__m128i*)b;
1286 __m128i *c128 = (__m128i*)c;
1287 const __m128i *eof = (__m128i*)((
unsigned long)(a + wide) & ~0xFUL);
1290 *c128 = _mm_xor_si128(*a128, *b128);
1294 }
while(a128 < eof);
1299 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
1302 #endif // __M4RI_HAVE_SSE2
1305 wi_t n = (wide + 7) / 8;
1307 case 0:
do { *(c++) = *(a++) ^ *(b++);
1308 case 7: *(c++) = *(a++) ^ *(b++);
1309 case 6: *(c++) = *(a++) ^ *(b++);
1310 case 5: *(c++) = *(a++) ^ *(b++);
1311 case 4: *(c++) = *(a++) ^ *(b++);
1312 case 3: *(c++) = *(a++) ^ *(b++);
1313 case 2: *(c++) = *(a++) ^ *(b++);
1314 case 1: *(c++) = *(a++) ^ *(b++);
1344 int mzd_is_zero(
mzd_t const *A);
1354 void mzd_row_clear_offset(
mzd_t *M,
rci_t const row,
rci_t const coloffset);
1387 double mzd_density(
mzd_t const *A,
wi_t res);