50 return a >= b ? a : b;
59 return (R)(n) *
fak(n - 1);
74 for (k = 0; k <= m - r; k++)
76 sum +=
binom(m + k, k) * POW((xx + K(1.0)) / K(2.0), (R) k);
78 return sum * POW((xx + K(1.0)), (R) r) * POW(K(1.0) - xx, (R) (m + 1))
79 / (R)(1 << (m + 1)) /
fak(r);
83 C
regkern(kernel k, R xx,
int p,
const R *param, R a, R b)
92 if ((xx >= -K(0.5) + b && xx <= -a) || (xx >= a && xx <= K(0.5) - b))
94 return k(xx, 0, param);
96 else if (xx < -K(0.5) + b)
98 sum = (k(-K(0.5), 0, param) + k(K(0.5), 0, param)) / K(2.0)
99 *
BasisPoly(p - 1, 0, K(2.0) * xx / b + (K(1.0) - b) / b);
100 for (r = 0; r < p; r++)
102 sum += POW(-b / K(2.0), (R) r) * k(-K(0.5) + b, r, param)
103 *
BasisPoly(p - 1, r, -K(2.0) * xx / b + (b - K(1.0)) / b);
107 else if ((xx > -a) && (xx < a))
109 for (r = 0; r < p; r++)
113 * (k(-a, r, param) *
BasisPoly(p - 1, r, xx / a)
114 + k(a, r, param) *
BasisPoly(p - 1, r, -xx / a)
119 else if (xx > K(0.5) - b)
121 sum = (k(-K(0.5), 0, param) + k(K(0.5), 0, param)) / K(2.0)
122 *
BasisPoly(p - 1, 0, -K(2.0) * xx / b + (K(1.0) - b) / b);
123 for (r = 0; r < p; r++)
125 sum += POW(b / K(2.0), (R) r) * k(K(0.5) - b, r, param)
126 *
BasisPoly(p - 1, r, K(2.0) * xx / b - (K(1.0) - b) / b);
130 return k(xx, 0, param);
136 static C
regkern1(kernel k, R xx,
int p,
const R *param, R a, R b)
145 if ((xx >= -K(0.5) + b && xx <= -a) || (xx >= a && xx <= K(0.5) - b))
147 return k(xx, 0, param);
149 else if ((xx > -a) && (xx < a))
151 for (r = 0; r < p; r++)
155 * (k(-a, r, param) *
BasisPoly(p - 1, r, xx / a)
156 + k(a, r, param) *
BasisPoly(p - 1, r, -xx / a)
161 else if (xx < -K(0.5) + b)
163 for (r = 0; r < p; r++)
166 * (k(K(0.5) - b, r, param) *
BasisPoly(p - 1, r, (xx + K(0.5)) / b)
167 + k(-K(0.5) + b, r, param) *
BasisPoly(p - 1, r, -(xx + K(0.5)) / b)
172 else if (xx > K(0.5) - b)
174 for (r = 0; r < p; r++)
177 * (k(K(0.5) - b, r, param) *
BasisPoly(p - 1, r, (xx - K(0.5)) / b)
178 + k(-K(0.5) + b, r, param) *
BasisPoly(p - 1, r, -(xx - K(0.5)) / b)
183 return k(xx, 0, param);
232 static C
regkern3(kernel k, R xx,
int p,
const R *param, R a, R b)
245 if ((a <= xx) && (xx <= K(0.5) - b))
247 return k(xx, 0, param);
251 for (r = 0; r < p; r++)
253 sum += POW(-a, (R) r) * k(a, r, param)
259 else if ((K(0.5) - b < xx) && (xx <= K(0.5)))
261 sum = k(K(0.5), 0, param) *
BasisPoly(p - 1, 0, -K(2.0) * xx / b + (K(1.0) - b) / b);
263 for (r = 0; r < p; r++)
265 sum += POW(b / K(2.0), (R) r) * k(K(0.5) - b, r, param)
266 *
BasisPoly(p - 1, r, K(2.0) * xx / b - (K(1.0) - b) / b);
320 C
kubintkern(
const R x,
const C *Add,
const int Ad,
const R a)
349 return (-f0 * c1 * c3 * c4 / K(6.0) + f1 * c2 * c3 * c4 / K(2.0)
350 - f2 * c2 * c1 * c4 / K(2.0) + f3 * c2 * c1 * c3 / K(6.0));
354 static C
kubintkern1(
const R x,
const C *Add,
const int Ad,
const R a)
360 c = (x + a) * (R)(Ad) / K(2.0) / a;
378 return (-f0 * c1 * c3 * c4 / K(6.0) + f1 * c2 * c3 * c4 / K(2.0)
379 - f2 * c2 * c1 * c4 / K(2.0) + f3 * c2 * c1 * c3 / K(6.0));
383 static void quicksort(
int d,
int t, R *x, C *alpha,
int N)
388 R pivot = x[(N / 2) * d + t];
396 while (x[lpos * d + t] < pivot)
398 while (x[rpos * d + t] > pivot)
402 for (k = 0; k < d; k++)
404 temp1 = x[lpos * d + k];
405 x[lpos * d + k] = x[rpos * d + k];
406 x[rpos * d + k] = temp1;
409 alpha[lpos] = alpha[rpos];
419 quicksort(d, t, x + lpos * d, alpha + lpos, N - lpos);
429 box_index = (
int *) NFFT(malloc)((size_t)(ths->box_count) *
sizeof(int));
430 for (t = 0; t < ths->box_count; t++)
433 for (l = 0; l < ths->
N_total; l++)
436 for (t = 0; t < ths->
d; t++)
438 val[t] = ths->
x[ths->
d * l + t] + K(0.25) - ths->
eps_B / K(2.0);
439 ind *= ths->box_count_per_dim;
440 ind += (int) (val[t] / ths->
eps_I);
445 ths->box_offset[0] = 0;
446 for (t = 1; t <= ths->box_count; t++)
448 ths->box_offset[t] = ths->box_offset[t - 1] + box_index[t - 1];
449 box_index[t - 1] = ths->box_offset[t - 1];
452 for (l = 0; l < ths->
N_total; l++)
455 for (t = 0; t < ths->
d; t++)
457 val[t] = ths->
x[ths->
d * l + t] + K(0.25) - ths->
eps_B / K(2.0);
458 ind *= ths->box_count_per_dim;
459 ind += (int) (val[t] / ths->
eps_I);
462 ths->box_alpha[box_index[ind]] = ths->
alpha[l];
464 for (t = 0; t < ths->
d; t++)
466 ths->box_x[ths->
d * box_index[ind] + t] = ths->
x[ths->
d * l + t];
470 NFFT(free)(box_index);
475 int end_lt,
const C *Add,
const int Ad,
int p, R a,
const kernel k,
476 const R *param,
const unsigned flags)
483 for (m = start; m < end_lt; m++)
492 for (l = 0; l < d; l++)
493 r += (y[l] - x[m * d + l]) * (y[l] - x[m * d + l]);
498 result += alpha[m] * k(r, 0, param);
502 result -= alpha[m] *
regkern1(k, r, p, param, a, K(1.0) / K(16.0));
509 result -= alpha[m] *
regkern(k, r, p, param, a, K(1.0) / K(16.0));
512 result -= alpha[m] *
kubintkern(r, Add, Ad, a);
513 #elif defined(NF_QUADR)
514 result -= alpha[m]*quadrintkern(r,Add,Ad,a);
515 #elif defined(NF_LIN)
516 result -= alpha[m]*linintkern(r,Add,Ad,a);
518 #error define interpolation method
531 int y_multiind[ths->
d];
532 int multiindex[ths->
d];
535 for (t = 0; t < ths->
d; t++)
537 y_multiind[t] = (int)(LRINT((y[t] + K(0.25) - ths->
eps_B / K(2.0)) / ths->
eps_I));
542 for (y_ind =
max_i(0, y_multiind[0] - 1);
543 y_ind < ths->box_count_per_dim && y_ind <= y_multiind[0] + 1; y_ind++)
546 ths->box_offset[y_ind], ths->box_offset[y_ind + 1], ths->
Add, ths->
Ad,
550 else if (ths->
d == 2)
552 for (multiindex[0] =
max_i(0, y_multiind[0] - 1);
553 multiindex[0] < ths->box_count_per_dim
554 && multiindex[0] <= y_multiind[0] + 1; multiindex[0]++)
555 for (multiindex[1] =
max_i(0, y_multiind[1] - 1);
556 multiindex[1] < ths->box_count_per_dim
557 && multiindex[1] <= y_multiind[1] + 1; multiindex[1]++)
559 y_ind = (ths->box_count_per_dim * multiindex[0]) + multiindex[1];
561 ths->box_offset[y_ind], ths->box_offset[y_ind + 1], ths->
Add,
565 else if (ths->
d == 3)
567 for (multiindex[0] =
max_i(0, y_multiind[0] - 1);
568 multiindex[0] < ths->box_count_per_dim
569 && multiindex[0] <= y_multiind[0] + 1; multiindex[0]++)
570 for (multiindex[1] =
max_i(0, y_multiind[1] - 1);
571 multiindex[1] < ths->box_count_per_dim
572 && multiindex[1] <= y_multiind[1] + 1; multiindex[1]++)
573 for (multiindex[2] =
max_i(0, y_multiind[2] - 1);
574 multiindex[2] < ths->box_count_per_dim
575 && multiindex[2] <= y_multiind[2] + 1; multiindex[2]++)
577 y_ind = ((ths->box_count_per_dim * multiindex[0]) + multiindex[1])
578 * ths->box_count_per_dim + multiindex[2];
580 ths->box_offset[y_ind], ths->box_offset[y_ind + 1], ths->
Add,
593 static void BuildTree(
int d,
int t, R *x, C *alpha,
int N)
602 BuildTree(d, (t + 1) % d, x + (m + 1) * d, alpha + (m + 1), N - m - 1);
607 static C
SearchTree(
const int d,
const int t,
const R *x,
const C *alpha,
608 const R *xmin,
const R *xmax,
const int N,
const kernel k,
const R *param,
609 const int Ad,
const C *Add,
const int p,
const unsigned flags)
620 R Median = x[m * d + t];
621 R a = FABS(Max - Min) / 2;
627 return SearchTree(d, (t + 1) % d, x + (m + 1) * d, alpha + (m + 1), xmin,
628 xmax, N - m - 1, k, param, Ad, Add, p, flags);
629 else if (Max < Median)
630 return SearchTree(d, (t + 1) % d, x, alpha, xmin, xmax, m, k, param, Ad,
637 for (l = 0; l < d; l++)
639 if (x[m * d + l] > xmin[l] && x[m * d + l] < xmax[l])
647 r = xmin[0] + a - x[m];
652 for (l = 0; l < d; l++)
653 r += (xmin[l] + a - x[m * d + l]) * (xmin[l] + a - x[m * d + l]);
658 result += alpha[m] * k(r, 0, param);
662 result -= alpha[m] *
regkern1(k, r, p, param, a, K(1.0) / K(16.0));
669 result -= alpha[m] *
regkern(k, r, p, param, a, K(1.0) / K(16.0));
672 result -= alpha[m] *
kubintkern(r, Add, Ad, a);
673 #elif defined(NF_QUADR)
674 result -= alpha[m]*quadrintkern(r,Add,Ad,a);
675 #elif defined(NF_LIN)
676 result -= alpha[m]*linintkern(r,Add,Ad,a);
678 #error define interpolation method
683 result +=
SearchTree(d, (t + 1) % d, x + (m + 1) * d, alpha + (m + 1), xmin,
684 xmax, N - m - 1, k, param, Ad, Add, p, flags)
685 +
SearchTree(d, (t + 1) % d, x, alpha, xmin, xmax, m, k, param, Ad, Add,
694 kernel k, R *param,
unsigned flags,
int nn,
int m,
int p, R eps_I, R eps_B)
699 unsigned sort_flags_trafo = 0U;
700 unsigned sort_flags_adjoint = 0U;
702 int nthreads = NFFT(get_num_threads)();
707 sort_flags_trafo = NFFT_SORT_NODES;
709 sort_flags_adjoint = NFFT_SORT_NODES | NFFT_OMP_BLOCKWISE_ADJOINT;
711 sort_flags_adjoint = NFFT_SORT_NODES;
720 ths->
x = (R *) NFFT(malloc)((size_t)(d * N_total) * (
sizeof(R)));
721 ths->
alpha = (C *) NFFT(malloc)((size_t)(N_total) * (
sizeof(C)));
723 ths->
y = (R *) NFFT(malloc)((size_t)(d * M_total) * (
sizeof(R)));
724 ths->
f = (C *) NFFT(malloc)((size_t)(M_total) * (
sizeof(C)));
740 ths->
Ad = 4 * (ths->
p) * (ths->
p);
741 ths->
Add = (C *) NFFT(malloc)((size_t)(ths->
Ad + 5) * (
sizeof(C)));
745 if (ths->
k == one_over_x)
773 ths->
Ad =
max_i(10, (
int)(LRINT(CEIL(K(1.4) / POW(delta, K(1.0) / K(4.0))))));
774 ths->
Add = (C *) NFFT(malloc)((size_t)(ths->
Ad + 3) * (
sizeof(C)));
775 #elif defined(NF_QUADR)
776 ths->
Ad = (int)(LRINT(CEIL(K(2.2)/POW(delta,K(1.0)/K(3.0)))));
777 ths->
Add = (C *)NFFT(malloc)((size_t)(ths->
Ad+3)*(
sizeof(C)));
778 #elif defined(NF_LIN)
779 ths->
Ad = (int)(LRINT(CEIL(K(1.7)/pow(delta,K(1.0)/K(2.0)))));
780 ths->
Add = (C *)NFFT(malloc)((size_t)(ths->
Ad+3)*(
sizeof(C)));
782 #error define NF_LIN or NF_QUADR or NF_KUB
787 ths->
Ad = 2 * (ths->
p) * (ths->
p);
788 ths->
Add = (C *) NFFT(malloc)((size_t)(ths->
Ad + 3) * (
sizeof(C)));
795 for (t = 0; t < d; t++)
800 NFFT(init_guru)(&(ths->mv1), d, N, N_total, n, m,
802 PRE_PHI_HUT | PRE_PSI | MALLOC_X | MALLOC_F_HAT | MALLOC_F | FFTW_INIT
804 FFTW_MEASURE | FFTW_DESTROY_INPUT);
805 NFFT(init_guru)(&(ths->mv2), d, N, M_total, n, m,
807 PRE_PHI_HUT | PRE_PSI | MALLOC_X | MALLOC_F_HAT | MALLOC_F | FFTW_INIT
809 FFTW_MEASURE | FFTW_DESTROY_INPUT);
813 for (t = 0; t < d; t++)
816 ths->
b = (C*) NFFT(malloc)((size_t)(n_total) *
sizeof(C));
818 #pragma omp critical (nfft_omp_critical_fftw_plan)
820 FFTW(plan_with_nthreads)(nthreads);
823 ths->fft_plan = FFTW(plan_dft)(d, N, ths->
b, ths->
b, FFTW_FORWARD,
830 if (ths->
flags & NEARFIELD_BOXES)
832 ths->box_count_per_dim = (int)(LRINT(FLOOR((K(0.5) - ths->
eps_B) / ths->
eps_I))) + 1;
834 for (t = 0; t < ths->
d; t++)
835 ths->box_count *= ths->box_count_per_dim;
837 ths->box_offset = (
int *) NFFT(malloc)((size_t)(ths->box_count + 1) *
sizeof(int));
839 ths->box_alpha = (C *) NFFT(malloc)((size_t)(ths->
N_total) * (
sizeof(C)));
841 ths->box_x = (R *) NFFT(malloc)((size_t)(ths->
d * ths->
N_total) *
sizeof(R));
849 NFFT(free)(ths->
alpha);
854 NFFT(free)(ths->
Add);
856 NFFT(finalize)(&(ths->mv1));
857 NFFT(finalize)(&(ths->mv2));
860 #pragma omp critical (nfft_omp_critical_fftw_plan)
863 FFTW(destroy_plan)(ths->fft_plan);
870 if (ths->
flags & NEARFIELD_BOXES)
872 NFFT(free)(ths->box_offset);
873 NFFT(free)(ths->box_alpha);
874 NFFT(free)(ths->box_x);
886 #pragma omp parallel for default(shared) private(j,k,t,r)
888 for (j = 0; j < ths->
M_total; j++)
891 for (k = 0; k < ths->
N_total; k++)
894 r = ths->
y[j] - ths->
x[k];
898 for (t = 0; t < ths->
d; t++)
899 r += (ths->
y[j * ths->
d + t] - ths->
x[k * ths->
d + t])
900 * (ths->
y[j * ths->
d + t] - ths->
x[k * ths->
d + t]);
926 if (ths->
flags & NEARFIELD_BOXES)
949 #pragma omp parallel for default(shared) private(k)
951 for (k = -ths->
Ad / 2 - 2; k <= ths->Ad / 2 + 2; k++)
957 #pragma omp parallel for default(shared) private(k)
959 for (k = 0; k <= ths->
Ad + 2; k++)
972 for (k = 0; k < ths->mv1.
M_total; k++)
973 for (t = 0; t < ths->mv1.
d; t++)
974 ths->mv1.
x[ths->mv1.
d * k + t] = -ths->
x[ths->mv1.
d * k + t];
977 if (ths->mv1.
flags & PRE_LIN_PSI)
978 NFFT(precompute_lin_psi)(&(ths->mv1));
980 if (ths->mv1.
flags & PRE_PSI)
981 NFFT(precompute_psi)(&(ths->mv1));
983 if (ths->mv1.
flags & PRE_FULL_PSI)
984 NFFT(precompute_full_psi)(&(ths->mv1));
991 for (k = 0; k < ths->mv1.
M_total; k++)
992 ths->mv1.
f[k] = ths->
alpha[k];
998 for (j = 0; j < ths->mv2.
M_total; j++)
999 for (t = 0; t < ths->mv2.
d; t++)
1000 ths->mv2.
x[ths->mv2.
d * j + t] = -ths->
y[ths->mv2.
d * j + t];
1003 if (ths->mv2.
flags & PRE_LIN_PSI)
1004 NFFT(precompute_lin_psi)(&(ths->mv2));
1006 if (ths->mv2.
flags & PRE_PSI)
1007 NFFT(precompute_psi)(&(ths->mv2));
1009 if (ths->mv2.
flags & PRE_FULL_PSI)
1010 NFFT(precompute_full_psi)(&(ths->mv2));
1021 for (t = 0; t < ths->
d; t++)
1025 #pragma omp parallel
for default(shared)
private(j,k,t)
1027 for (j = 0; j < n_total; j++)
1030 ths->
b[j] =
regkern1(ths->
k, (R) j / (R)(ths->
n) - K(0.5), ths->
p,
1036 for (t = 0; t < ths->
d; t++)
1038 ths->
b[j] += ((R) (k % (ths->
n)) / (R)(ths->
n) - K(0.5))
1039 * ((R) (k % (ths->
n)) / (R)(ths->
n) - K(0.5));
1047 NFFT(fftshift_complex)(ths->
b, (int)(ths->mv1.
d), ths->mv1.N);
1048 FFTW(execute)(ths->fft_plan);
1049 NFFT(fftshift_complex)(ths->
b, (int)(ths->mv1.
d), ths->mv1.N);
1073 NFFT(adjoint)(&(ths->mv1));
1084 #pragma omp parallel for default(shared) private(k)
1086 for (k = 0; k < ths->mv2.
N_total; k++)
1087 ths->mv2.f_hat[k] = ths->
b[k] * ths->mv1.f_hat[k];
1097 NFFT(trafo)(&(ths->mv2));
1108 #pragma omp parallel for default(shared) private(j,k,t)
1110 for (j = 0; j < ths->
M_total; j++)
1112 R ymin[ths->
d], ymax[ths->
d];
1114 if (ths->
flags & NEARFIELD_BOXES)
1116 ths->
f[j] = ths->mv2.
f[j] +
SearchBox(ths->
y + ths->
d * j, ths);
1120 for (t = 0; t < ths->
d; t++)
1122 ymin[t] = ths->
y[ths->
d * j + t] - ths->
eps_I;
1123 ymax[t] = ths->
y[ths->
d * j + t] + ths->
eps_I;
1125 ths->
f[j] = ths->mv2.
f[j]
int M_total
number of target knots
static int max_i(int a, int b)
max
C regkern(kernel k, R xx, int p, const R *param, R a, R b)
regularized kernel with K_I arbitrary and K_B smooth to zero
static R fak(int n)
factorial
#define EXACT_NEARFIELD
Constant symbols.
static R BasisPoly(int m, int r, R xx)
basis polynomial for regularized kernel
R nfft_elapsed_seconds(ticks t1, ticks t0)
Return number of elapsed seconds between two time points.
Header file with predefined kernels for the fast summation algorithm.
static void BuildTree(int d, int t, R *x, C *alpha, int N)
recursive sort of source knots dimension by dimension to get tree structure
R * kernel_param
parameters for kernel function
static C SearchBox(R *y, fastsum_plan *ths)
box-based near field correction
static C calc_SearchBox(int d, R *y, R *x, C *alpha, int start, int end_lt, const C *Add, const int Ad, int p, R a, const kernel k, const R *param, const unsigned flags)
inner computation function for box-based near field correction
plan for fast summation algorithm
C * b
expansion coefficients
static R binom(int n, int m)
binomial coefficient
Header file for the fast NFFT-based summation algorithm.
C * alpha
source coefficients
unsigned flags
flags precomp.
void fastsum_trafo(fastsum_plan *ths)
fast NFFT-based summation
static C regkern1(kernel k, R xx, int p, const R *param, R a, R b)
regularized kernel with K_I arbitrary and K_B periodized (used in 1D)
void fastsum_precompute(fastsum_plan *ths)
precomputation for fastsum
int N_total
number of source knots
static C kubintkern1(const R x, const C *Add, const int Ad, const R a)
cubic spline interpolation in near field with arbitrary kernels
R MEASURE_TIME_t[8]
Measured time for each step if MEASURE_TIME is set.
static C regkern3(kernel k, R xx, int p, const R *param, R a, R b)
regularized kernel for even kernels with K_I even and K_B mirrored
void fastsum_init_guru(fastsum_plan *ths, int d, int N_total, int M_total, kernel k, R *param, unsigned flags, int nn, int m, int p, R eps_I, R eps_B)
initialization of fastsum plan
static void quicksort(int d, int t, R *x, C *alpha, int N)
quicksort algorithm for source knots and associated coefficients
static C SearchTree(const int d, const int t, const R *x, const C *alpha, const R *xmin, const R *xmax, const int N, const kernel k, const R *param, const int Ad, const C *Add, const int p, const unsigned flags)
fast search in tree of source knots for near field computation
R * y
target knots in d-ball with radius 1/4-eps_b/2
C kubintkern(const R x, const C *Add, const int Ad, const R a)
linear spline interpolation in near field with even kernels
int n
FS__ - fast summation.
int p
degree of smoothness of regularization
static void BuildBox(fastsum_plan *ths)
initialize box-based search data structures
void fastsum_finalize(fastsum_plan *ths)
finalization of fastsum plan
void fastsum_exact(fastsum_plan *ths)
direct computation of sums
R * x
source knots in d-ball with radius 1/4-eps_b/2