NFFT  3.3.0
fastsum.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2002, 2015 Jens Keiner, Stefan Kunis, Daniel Potts
3  *
4  * This program is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation; either version 2 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc., 51
16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18 
19 /* $Id$ */
20 
27 #include "config.h"
28 
29 #include <stdlib.h>
30 #include <math.h>
31 #ifdef HAVE_COMPLEX_H
32 #include <complex.h>
33 #endif
34 
35 #include "nfft3.h"
36 #include "fastsum.h"
37 #include "infft.h"
38 
39 // Required for test if (ths->k == one_over_x)
40 #include "kernels.h"
41 
48 static int max_i(int a, int b)
49 {
50  return a >= b ? a : b;
51 }
52 
54 static R fak(int n)
55 {
56  if (n <= 1)
57  return K(1.0);
58  else
59  return (R)(n) * fak(n - 1);
60 }
61 
63 static R binom(int n, int m)
64 {
65  return fak(n) / fak(m) / fak(n - m);
66 }
67 
69 static R BasisPoly(int m, int r, R xx)
70 {
71  int k;
72  R sum = K(0.0);
73 
74  for (k = 0; k <= m - r; k++)
75  {
76  sum += binom(m + k, k) * POW((xx + K(1.0)) / K(2.0), (R) k);
77  }
78  return sum * POW((xx + K(1.0)), (R) r) * POW(K(1.0) - xx, (R) (m + 1))
79  / (R)(1 << (m + 1)) / fak(r); /* 1<<(m+1) = 2^(m+1) */
80 }
81 
83 C regkern(kernel k, R xx, int p, const R *param, R a, R b)
84 {
85  int r;
86  C sum = K(0.0);
87 
88  if (xx < -K(0.5))
89  xx = -K(0.5);
90  if (xx > K(0.5))
91  xx = K(0.5);
92  if ((xx >= -K(0.5) + b && xx <= -a) || (xx >= a && xx <= K(0.5) - b))
93  {
94  return k(xx, 0, param);
95  }
96  else if (xx < -K(0.5) + b)
97  {
98  sum = (k(-K(0.5), 0, param) + k(K(0.5), 0, param)) / K(2.0)
99  * BasisPoly(p - 1, 0, K(2.0) * xx / b + (K(1.0) - b) / b);
100  for (r = 0; r < p; r++)
101  {
102  sum += POW(-b / K(2.0), (R) r) * k(-K(0.5) + b, r, param)
103  * BasisPoly(p - 1, r, -K(2.0) * xx / b + (b - K(1.0)) / b);
104  }
105  return sum;
106  }
107  else if ((xx > -a) && (xx < a))
108  {
109  for (r = 0; r < p; r++)
110  {
111  sum +=
112  POW(a, (R) r)
113  * (k(-a, r, param) * BasisPoly(p - 1, r, xx / a)
114  + k(a, r, param) * BasisPoly(p - 1, r, -xx / a)
115  * (r & 1 ? -1 : 1));
116  }
117  return sum;
118  }
119  else if (xx > K(0.5) - b)
120  {
121  sum = (k(-K(0.5), 0, param) + k(K(0.5), 0, param)) / K(2.0)
122  * BasisPoly(p - 1, 0, -K(2.0) * xx / b + (K(1.0) - b) / b);
123  for (r = 0; r < p; r++)
124  {
125  sum += POW(b / K(2.0), (R) r) * k(K(0.5) - b, r, param)
126  * BasisPoly(p - 1, r, K(2.0) * xx / b - (K(1.0) - b) / b);
127  }
128  return sum;
129  }
130  return k(xx, 0, param);
131 }
132 
136 static C regkern1(kernel k, R xx, int p, const R *param, R a, R b)
137 {
138  int r;
139  C sum = K(0.0);
140 
141  if (xx < -K(0.5))
142  xx = -K(0.5);
143  if (xx > K(0.5))
144  xx = K(0.5);
145  if ((xx >= -K(0.5) + b && xx <= -a) || (xx >= a && xx <= K(0.5) - b))
146  {
147  return k(xx, 0, param);
148  }
149  else if ((xx > -a) && (xx < a))
150  {
151  for (r = 0; r < p; r++)
152  {
153  sum +=
154  POW(a, (R) r)
155  * (k(-a, r, param) * BasisPoly(p - 1, r, xx / a)
156  + k(a, r, param) * BasisPoly(p - 1, r, -xx / a)
157  * (r & 1 ? -1 : 1));
158  }
159  return sum;
160  }
161  else if (xx < -K(0.5) + b)
162  {
163  for (r = 0; r < p; r++)
164  {
165  sum += POW(b, (R) r)
166  * (k(K(0.5) - b, r, param) * BasisPoly(p - 1, r, (xx + K(0.5)) / b)
167  + k(-K(0.5) + b, r, param) * BasisPoly(p - 1, r, -(xx + K(0.5)) / b)
168  * (r & 1 ? -1 : 1));
169  }
170  return sum;
171  }
172  else if (xx > K(0.5) - b)
173  {
174  for (r = 0; r < p; r++)
175  {
176  sum += POW(b, (R) r)
177  * (k(K(0.5) - b, r, param) * BasisPoly(p - 1, r, (xx - K(0.5)) / b)
178  + k(-K(0.5) + b, r, param) * BasisPoly(p - 1, r, -(xx - K(0.5)) / b)
179  * (r & 1 ? -1 : 1));
180  }
181  return sum;
182  }
183  return k(xx, 0, param);
184 }
185 
187 //static C regkern2(kernel k, R xx, int p, const R *param, R a, R b)
188 //{
189 // int r;
190 // C sum = K(0.0);
191 //
192 // xx = FABS(xx);
193 //
194 // if (xx > K(0.5))
195 // {
196 // for (r = 0; r < p; r++)
197 // {
198 // sum += POW(b, (R) r) * k(K(0.5) - b, r, param)
199 // * (BasisPoly(p - 1, r, 0) + BasisPoly(p - 1, r, 0));
200 // }
201 // return sum;
202 // }
203 // else if ((a <= xx) && (xx <= K(0.5) - b))
204 // {
205 // return k(xx, 0, param);
206 // }
207 // else if (xx < a)
208 // {
209 // for (r = 0; r < p; r++)
210 // {
211 // sum += POW(-a, (R) r) * k(a, r, param)
212 // * (BasisPoly(p - 1, r, xx / a) + BasisPoly(p - 1, r, -xx / a));
213 // }
214 // return sum;
215 // }
216 // else if ((K(0.5) - b < xx) && (xx <= K(0.5)))
217 // {
218 // for (r = 0; r < p; r++)
219 // {
220 // sum += POW(b, (R) r) * k(K(0.5) - b, r, param)
221 // * (BasisPoly(p - 1, r, (xx - K(0.5)) / b)
222 // + BasisPoly(p - 1, r, -(xx - K(0.5)) / b));
223 // }
224 // return sum;
225 // }
226 // return K(0.0);
227 //}
228 
232 static C regkern3(kernel k, R xx, int p, const R *param, R a, R b)
233 {
234  int r;
235  C sum = K(0.0);
236 
237  xx = FABS(xx);
238 
239  if (xx >= K(0.5))
240  {
241  /*return kern(typ,c,0,K(0.5));*/
242  xx = K(0.5);
243  }
244  /* else */
245  if ((a <= xx) && (xx <= K(0.5) - b))
246  {
247  return k(xx, 0, param);
248  }
249  else if (xx < a)
250  {
251  for (r = 0; r < p; r++)
252  {
253  sum += POW(-a, (R) r) * k(a, r, param)
254  * (BasisPoly(p - 1, r, xx / a) + BasisPoly(p - 1, r, -xx / a));
255  }
256  /*sum=kern(typ,c,0,xx); */
257  return sum;
258  }
259  else if ((K(0.5) - b < xx) && (xx <= K(0.5)))
260  {
261  sum = k(K(0.5), 0, param) * BasisPoly(p - 1, 0, -K(2.0) * xx / b + (K(1.0) - b) / b);
262  /* sum=regkern2(typ,c,p,a,b, K(0.5))*BasisPoly(p-1,0,-K(2.0)*xx/b+(K(1.0)-b)/b); */
263  for (r = 0; r < p; r++)
264  {
265  sum += POW(b / K(2.0), (R) r) * k(K(0.5) - b, r, param)
266  * BasisPoly(p - 1, r, K(2.0) * xx / b - (K(1.0) - b) / b);
267  }
268  return sum;
269  }
270  return K(0.0);
271 }
272 
274 //static C linintkern(const R x, const C *Add, const int Ad, const R a)
275 //{
276 // R c, c1, c3;
277 // int r;
278 // C f1, f2;
279 //
280 // c = x * Ad / a;
281 // r = (int)(LRINT(c));
282 // r = abs(r);
283 // f1 = Add[r];
284 // f2 = Add[r + 1];
285 // c = FABS(c);
286 // c1 = c - r;
287 // c3 = c1 - K(1.0);
288 // return (-f1 * c3 + f2 * c1);
289 //}
290 //
291 //static C quadrintkern(const R x, const C *Add, const int Ad, const R a)
292 //{
293 // R c, c1, c2, c3;
294 // int r;
295 // C f0, f1, f2;
296 //
297 // c = x * Ad / a;
298 // r = (int)(LRINT(c));
299 // r = abs(r);
300 // if (r == 0)
301 // {
302 // f0 = Add[r + 1];
303 // f1 = Add[r];
304 // f2 = Add[r + 1];
305 // }
306 // else
307 // {
308 // f0 = Add[r - 1];
309 // f1 = Add[r];
310 // f2 = Add[r + 1];
311 // }
312 // c = FABS(c);
313 // c1 = c - r;
314 // c2 = c1 + K(1.0);
315 // c3 = c1 - K(1.0);
316 // return (f0 * c1 * c3 / K(2.0) - f1 * c2 * c3 + f2 * c2 * c1 / K(2.0));
317 //}
318 
320 C kubintkern(const R x, const C *Add, const int Ad, const R a)
321 {
322  R c, c1, c2, c3, c4;
323  int r;
324  C f0, f1, f2, f3;
325  c = x * (R)(Ad) / a;
326  r = (int)(LRINT(c));
327  r = abs(r);
328  if (r == 0)
329  {
330  f0 = Add[r + 1];
331  f1 = Add[r];
332  f2 = Add[r + 1];
333  f3 = Add[r + 2];
334  }
335  else
336  {
337  f0 = Add[r - 1];
338  f1 = Add[r];
339  f2 = Add[r + 1];
340  f3 = Add[r + 2];
341  }
342  c = FABS(c);
343  c1 = c - (R)(r);
344  c2 = c1 + K(1.0);
345  c3 = c1 - K(1.0);
346  c4 = c1 - K(2.0);
347  /* return(-f0*(c-r)*(c-r-K(1.0))*(c-r-K(2.0))/K(6.0)+f1*(c-r+K(1.0))*(c-r-K(1.0))*(c-r-K(2.0))/2-
348  f2*(c-r+K(1.0))*(c-r)*(c-r-K(2.0))/2+f3*(c-r+K(1.0))*(c-r)*(c-r-K(1.0))/K(6.0)); */
349  return (-f0 * c1 * c3 * c4 / K(6.0) + f1 * c2 * c3 * c4 / K(2.0)
350  - f2 * c2 * c1 * c4 / K(2.0) + f3 * c2 * c1 * c3 / K(6.0));
351 }
352 
354 static C kubintkern1(const R x, const C *Add, const int Ad, const R a)
355 {
356  R c, c1, c2, c3, c4;
357  int r;
358  C f0, f1, f2, f3;
359  Add += 2;
360  c = (x + a) * (R)(Ad) / K(2.0) / a;
361  r = (int)(LRINT(c));
362  r = abs(r);
363  /*if (r==0) {f0=Add[r];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
364  else */
365  {
366  f0 = Add[r - 1];
367  f1 = Add[r];
368  f2 = Add[r + 1];
369  f3 = Add[r + 2];
370  }
371  c = FABS(c);
372  c1 = c - (R)(r);
373  c2 = c1 + K(1.0);
374  c3 = c1 - K(1.0);
375  c4 = c1 - K(2.0);
376  /* return(-f0*(c-r)*(c-r-K(1.0))*(c-r-K(2.0))/K(6.0)+f1*(c-r+K(1.0))*(c-r-K(1.0))*(c-r-K(2.0))/2-
377  f2*(c-r+K(1.0))*(c-r)*(c-r-K(2.0))/2+f3*(c-r+K(1.0))*(c-r)*(c-r-K(1.0))/K(6.0)); */
378  return (-f0 * c1 * c3 * c4 / K(6.0) + f1 * c2 * c3 * c4 / K(2.0)
379  - f2 * c2 * c1 * c4 / K(2.0) + f3 * c2 * c1 * c3 / K(6.0));
380 }
381 
383 static void quicksort(int d, int t, R *x, C *alpha, int N)
384 {
385  int lpos = 0;
386  int rpos = N - 1;
387  /*R pivot=x[((N-1)/2)*d+t];*/
388  R pivot = x[(N / 2) * d + t];
389 
390  int k;
391  R temp1;
392  C temp2;
393 
394  while (lpos <= rpos)
395  {
396  while (x[lpos * d + t] < pivot)
397  lpos++;
398  while (x[rpos * d + t] > pivot)
399  rpos--;
400  if (lpos <= rpos)
401  {
402  for (k = 0; k < d; k++)
403  {
404  temp1 = x[lpos * d + k];
405  x[lpos * d + k] = x[rpos * d + k];
406  x[rpos * d + k] = temp1;
407  }
408  temp2 = alpha[lpos];
409  alpha[lpos] = alpha[rpos];
410  alpha[rpos] = temp2;
411 
412  lpos++;
413  rpos--;
414  }
415  }
416  if (0 < rpos)
417  quicksort(d, t, x, alpha, rpos + 1);
418  if (lpos < N - 1)
419  quicksort(d, t, x + lpos * d, alpha + lpos, N - lpos);
420 }
421 
423 static void BuildBox(fastsum_plan *ths)
424 {
425  int t, l;
426  int *box_index;
427  R val[ths->d];
428 
429  box_index = (int *) NFFT(malloc)((size_t)(ths->box_count) * sizeof(int));
430  for (t = 0; t < ths->box_count; t++)
431  box_index[t] = 0;
432 
433  for (l = 0; l < ths->N_total; l++)
434  {
435  int ind = 0;
436  for (t = 0; t < ths->d; t++)
437  {
438  val[t] = ths->x[ths->d * l + t] + K(0.25) - ths->eps_B / K(2.0);
439  ind *= ths->box_count_per_dim;
440  ind += (int) (val[t] / ths->eps_I);
441  }
442  box_index[ind]++;
443  }
444 
445  ths->box_offset[0] = 0;
446  for (t = 1; t <= ths->box_count; t++)
447  {
448  ths->box_offset[t] = ths->box_offset[t - 1] + box_index[t - 1];
449  box_index[t - 1] = ths->box_offset[t - 1];
450  }
451 
452  for (l = 0; l < ths->N_total; l++)
453  {
454  int ind = 0;
455  for (t = 0; t < ths->d; t++)
456  {
457  val[t] = ths->x[ths->d * l + t] + K(0.25) - ths->eps_B / K(2.0);
458  ind *= ths->box_count_per_dim;
459  ind += (int) (val[t] / ths->eps_I);
460  }
461 
462  ths->box_alpha[box_index[ind]] = ths->alpha[l];
463 
464  for (t = 0; t < ths->d; t++)
465  {
466  ths->box_x[ths->d * box_index[ind] + t] = ths->x[ths->d * l + t];
467  }
468  box_index[ind]++;
469  }
470  NFFT(free)(box_index);
471 }
472 
474 static inline C calc_SearchBox(int d, R *y, R *x, C *alpha, int start,
475  int end_lt, const C *Add, const int Ad, int p, R a, const kernel k,
476  const R *param, const unsigned flags)
477 {
478  C result = K(0.0);
479 
480  int m, l;
481  R r;
482 
483  for (m = start; m < end_lt; m++)
484  {
485  if (d == 1)
486  {
487  r = y[0] - x[m];
488  }
489  else
490  {
491  r = K(0.0);
492  for (l = 0; l < d; l++)
493  r += (y[l] - x[m * d + l]) * (y[l] - x[m * d + l]);
494  r = SQRT(r);
495  }
496  if (FABS(r) < a)
497  {
498  result += alpha[m] * k(r, 0, param); /* alpha*(kern-regkern) */
499  if (d == 1)
500  {
501  if (flags & EXACT_NEARFIELD)
502  result -= alpha[m] * regkern1(k, r, p, param, a, K(1.0) / K(16.0)); /* exact value (in 1D) */
503  else
504  result -= alpha[m] * kubintkern1(r, Add, Ad, a); /* spline approximation */
505  }
506  else
507  {
508  if (flags & EXACT_NEARFIELD)
509  result -= alpha[m] * regkern(k, r, p, param, a, K(1.0) / K(16.0)); /* exact value (in dD) */
510  else
511 #if defined(NF_KUB)
512  result -= alpha[m] * kubintkern(r, Add, Ad, a); /* spline approximation */
513 #elif defined(NF_QUADR)
514  result -= alpha[m]*quadrintkern(r,Add,Ad,a); /* spline approximation */
515 #elif defined(NF_LIN)
516  result -= alpha[m]*linintkern(r,Add,Ad,a); /* spline approximation */
517 #else
518 #error define interpolation method
519 #endif
520  }
521  }
522  }
523  return result;
524 }
525 
527 static C SearchBox(R *y, fastsum_plan *ths)
528 {
529  C val = K(0.0);
530  int t;
531  int y_multiind[ths->d];
532  int multiindex[ths->d];
533  int y_ind;
534 
535  for (t = 0; t < ths->d; t++)
536  {
537  y_multiind[t] = (int)(LRINT((y[t] + K(0.25) - ths->eps_B / K(2.0)) / ths->eps_I));
538  }
539 
540  if (ths->d == 1)
541  {
542  for (y_ind = max_i(0, y_multiind[0] - 1);
543  y_ind < ths->box_count_per_dim && y_ind <= y_multiind[0] + 1; y_ind++)
544  {
545  val += calc_SearchBox(ths->d, y, ths->box_x, ths->box_alpha,
546  ths->box_offset[y_ind], ths->box_offset[y_ind + 1], ths->Add, ths->Ad,
547  ths->p, ths->eps_I, ths->k, ths->kernel_param, ths->flags);
548  }
549  }
550  else if (ths->d == 2)
551  {
552  for (multiindex[0] = max_i(0, y_multiind[0] - 1);
553  multiindex[0] < ths->box_count_per_dim
554  && multiindex[0] <= y_multiind[0] + 1; multiindex[0]++)
555  for (multiindex[1] = max_i(0, y_multiind[1] - 1);
556  multiindex[1] < ths->box_count_per_dim
557  && multiindex[1] <= y_multiind[1] + 1; multiindex[1]++)
558  {
559  y_ind = (ths->box_count_per_dim * multiindex[0]) + multiindex[1];
560  val += calc_SearchBox(ths->d, y, ths->box_x, ths->box_alpha,
561  ths->box_offset[y_ind], ths->box_offset[y_ind + 1], ths->Add,
562  ths->Ad, ths->p, ths->eps_I, ths->k, ths->kernel_param, ths->flags);
563  }
564  }
565  else if (ths->d == 3)
566  {
567  for (multiindex[0] = max_i(0, y_multiind[0] - 1);
568  multiindex[0] < ths->box_count_per_dim
569  && multiindex[0] <= y_multiind[0] + 1; multiindex[0]++)
570  for (multiindex[1] = max_i(0, y_multiind[1] - 1);
571  multiindex[1] < ths->box_count_per_dim
572  && multiindex[1] <= y_multiind[1] + 1; multiindex[1]++)
573  for (multiindex[2] = max_i(0, y_multiind[2] - 1);
574  multiindex[2] < ths->box_count_per_dim
575  && multiindex[2] <= y_multiind[2] + 1; multiindex[2]++)
576  {
577  y_ind = ((ths->box_count_per_dim * multiindex[0]) + multiindex[1])
578  * ths->box_count_per_dim + multiindex[2];
579  val += calc_SearchBox(ths->d, y, ths->box_x, ths->box_alpha,
580  ths->box_offset[y_ind], ths->box_offset[y_ind + 1], ths->Add,
581  ths->Ad, ths->p, ths->eps_I, ths->k, ths->kernel_param,
582  ths->flags);
583  }
584  }
585  else
586  {
587  exit(EXIT_FAILURE);
588  }
589  return val;
590 }
591 
593 static void BuildTree(int d, int t, R *x, C *alpha, int N)
594 {
595  if (N > 1)
596  {
597  int m = N / 2;
598 
599  quicksort(d, t, x, alpha, N);
600 
601  BuildTree(d, (t + 1) % d, x, alpha, m);
602  BuildTree(d, (t + 1) % d, x + (m + 1) * d, alpha + (m + 1), N - m - 1);
603  }
604 }
605 
607 static C SearchTree(const int d, const int t, const R *x, const C *alpha,
608  const R *xmin, const R *xmax, const int N, const kernel k, const R *param,
609  const int Ad, const C *Add, const int p, const unsigned flags)
610 {
611  if (N == 0)
612  {
613  return K(0.0);
614  }
615  else
616  {
617  int m = N / 2;
618  R Min = xmin[t];
619  R Max = xmax[t];
620  R Median = x[m * d + t];
621  R a = FABS(Max - Min) / 2;
622  int l;
623  int E = 0;
624  R r;
625 
626  if (Min > Median)
627  return SearchTree(d, (t + 1) % d, x + (m + 1) * d, alpha + (m + 1), xmin,
628  xmax, N - m - 1, k, param, Ad, Add, p, flags);
629  else if (Max < Median)
630  return SearchTree(d, (t + 1) % d, x, alpha, xmin, xmax, m, k, param, Ad,
631  Add, p, flags);
632  else
633  {
634  C result = K(0.0);
635  E = 0;
636 
637  for (l = 0; l < d; l++)
638  {
639  if (x[m * d + l] > xmin[l] && x[m * d + l] < xmax[l])
640  E++;
641  }
642 
643  if (E == d)
644  {
645  if (d == 1)
646  {
647  r = xmin[0] + a - x[m]; /* remember: xmin+a = y */
648  }
649  else
650  {
651  r = K(0.0);
652  for (l = 0; l < d; l++)
653  r += (xmin[l] + a - x[m * d + l]) * (xmin[l] + a - x[m * d + l]); /* remember: xmin+a = y */
654  r = SQRT(r);
655  }
656  if (FABS(r) < a)
657  {
658  result += alpha[m] * k(r, 0, param); /* alpha*(kern-regkern) */
659  if (d == 1)
660  {
661  if (flags & EXACT_NEARFIELD)
662  result -= alpha[m] * regkern1(k, r, p, param, a, K(1.0) / K(16.0)); /* exact value (in 1D) */
663  else
664  result -= alpha[m] * kubintkern1(r, Add, Ad, a); /* spline approximation */
665  }
666  else
667  {
668  if (flags & EXACT_NEARFIELD)
669  result -= alpha[m] * regkern(k, r, p, param, a, K(1.0) / K(16.0)); /* exact value (in dD) */
670  else
671 #if defined(NF_KUB)
672  result -= alpha[m] * kubintkern(r, Add, Ad, a); /* spline approximation */
673 #elif defined(NF_QUADR)
674  result -= alpha[m]*quadrintkern(r,Add,Ad,a); /* spline approximation */
675 #elif defined(NF_LIN)
676  result -= alpha[m]*linintkern(r,Add,Ad,a); /* spline approximation */
677 #else
678 #error define interpolation method
679 #endif
680  }
681  }
682  }
683  result += SearchTree(d, (t + 1) % d, x + (m + 1) * d, alpha + (m + 1), xmin,
684  xmax, N - m - 1, k, param, Ad, Add, p, flags)
685  + SearchTree(d, (t + 1) % d, x, alpha, xmin, xmax, m, k, param, Ad, Add,
686  p, flags);
687  return result;
688  }
689  }
690 }
691 
693 void fastsum_init_guru(fastsum_plan *ths, int d, int N_total, int M_total,
694  kernel k, R *param, unsigned flags, int nn, int m, int p, R eps_I, R eps_B)
695 {
696  int t;
697  int N[d], n[d];
698  int n_total;
699  unsigned sort_flags_trafo = 0U;
700  unsigned sort_flags_adjoint = 0U;
701 #ifdef _OPENMP
702  int nthreads = NFFT(get_num_threads)();
703 #endif
704 
705  if (d > 1)
706  {
707  sort_flags_trafo = NFFT_SORT_NODES;
708 #ifdef _OPENMP
709  sort_flags_adjoint = NFFT_SORT_NODES | NFFT_OMP_BLOCKWISE_ADJOINT;
710 #else
711  sort_flags_adjoint = NFFT_SORT_NODES;
712 #endif
713  }
714 
715  ths->d = d;
716 
717  ths->N_total = N_total;
718  ths->M_total = M_total;
719 
720  ths->x = (R *) NFFT(malloc)((size_t)(d * N_total) * (sizeof(R)));
721  ths->alpha = (C *) NFFT(malloc)((size_t)(N_total) * (sizeof(C)));
722 
723  ths->y = (R *) NFFT(malloc)((size_t)(d * M_total) * (sizeof(R)));
724  ths->f = (C *) NFFT(malloc)((size_t)(M_total) * (sizeof(C)));
725 
726  ths->k = k;
727  ths->kernel_param = param;
728 
729  ths->flags = flags;
730 
731  ths->p = p;
732  ths->eps_I = eps_I; /* =(R)ths->p/(R)nn; */
733  ths->eps_B = eps_B; /* =K(1.0)/K(16.0); */
736  if (!(ths->flags & EXACT_NEARFIELD))
737  {
738  if (ths->d == 1)
739  {
740  ths->Ad = 4 * (ths->p) * (ths->p);
741  ths->Add = (C *) NFFT(malloc)((size_t)(ths->Ad + 5) * (sizeof(C)));
742  }
743  else
744  {
745  if (ths->k == one_over_x)
746  {
747  R delta = K(1e-8);
748  switch (p)
749  {
750  case 2:
751  delta = K(1e-3);
752  break;
753  case 3:
754  delta = K(1e-4);
755  break;
756  case 4:
757  delta = K(1e-5);
758  break;
759  case 5:
760  delta = K(1e-6);
761  break;
762  case 6:
763  delta = K(1e-6);
764  break;
765  case 7:
766  delta = K(1e-7);
767  break;
768  default:
769  delta = K(1e-8);
770  }
771 
772 #if defined(NF_KUB)
773  ths->Ad = max_i(10, (int)(LRINT(CEIL(K(1.4) / POW(delta, K(1.0) / K(4.0))))));
774  ths->Add = (C *) NFFT(malloc)((size_t)(ths->Ad + 3) * (sizeof(C)));
775 #elif defined(NF_QUADR)
776  ths->Ad = (int)(LRINT(CEIL(K(2.2)/POW(delta,K(1.0)/K(3.0)))));
777  ths->Add = (C *)NFFT(malloc)((size_t)(ths->Ad+3)*(sizeof(C)));
778 #elif defined(NF_LIN)
779  ths->Ad = (int)(LRINT(CEIL(K(1.7)/pow(delta,K(1.0)/K(2.0)))));
780  ths->Add = (C *)NFFT(malloc)((size_t)(ths->Ad+3)*(sizeof(C)));
781 #else
782 #error define NF_LIN or NF_QUADR or NF_KUB
783 #endif
784  }
785  else
786  {
787  ths->Ad = 2 * (ths->p) * (ths->p);
788  ths->Add = (C *) NFFT(malloc)((size_t)(ths->Ad + 3) * (sizeof(C)));
789  }
790  }
791  }
792 
794  ths->n = nn;
795  for (t = 0; t < d; t++)
796  {
797  N[t] = nn;
798  n[t] = 2 * nn;
799  }
800  NFFT(init_guru)(&(ths->mv1), d, N, N_total, n, m,
801  sort_flags_adjoint |
802  PRE_PHI_HUT | PRE_PSI | MALLOC_X | MALLOC_F_HAT | MALLOC_F | FFTW_INIT
803  | FFT_OUT_OF_PLACE,
804  FFTW_MEASURE | FFTW_DESTROY_INPUT);
805  NFFT(init_guru)(&(ths->mv2), d, N, M_total, n, m,
806  sort_flags_trafo |
807  PRE_PHI_HUT | PRE_PSI | MALLOC_X | MALLOC_F_HAT | MALLOC_F | FFTW_INIT
808  | FFT_OUT_OF_PLACE,
809  FFTW_MEASURE | FFTW_DESTROY_INPUT);
810 
812  n_total = 1;
813  for (t = 0; t < d; t++)
814  n_total *= nn;
815 
816  ths->b = (C*) NFFT(malloc)((size_t)(n_total) * sizeof(C));
817 #ifdef _OPENMP
818  #pragma omp critical (nfft_omp_critical_fftw_plan)
819  {
820  FFTW(plan_with_nthreads)(nthreads);
821 #endif
822 
823  ths->fft_plan = FFTW(plan_dft)(d, N, ths->b, ths->b, FFTW_FORWARD,
824  FFTW_ESTIMATE);
825 
826 #ifdef _OPENMP
827 }
828 #endif
829 
830  if (ths->flags & NEARFIELD_BOXES)
831  {
832  ths->box_count_per_dim = (int)(LRINT(FLOOR((K(0.5) - ths->eps_B) / ths->eps_I))) + 1;
833  ths->box_count = 1;
834  for (t = 0; t < ths->d; t++)
835  ths->box_count *= ths->box_count_per_dim;
836 
837  ths->box_offset = (int *) NFFT(malloc)((size_t)(ths->box_count + 1) * sizeof(int));
838 
839  ths->box_alpha = (C *) NFFT(malloc)((size_t)(ths->N_total) * (sizeof(C)));
840 
841  ths->box_x = (R *) NFFT(malloc)((size_t)(ths->d * ths->N_total) * sizeof(R));
842  }
843 }
844 
847 {
848  NFFT(free)(ths->x);
849  NFFT(free)(ths->alpha);
850  NFFT(free)(ths->y);
851  NFFT(free)(ths->f);
852 
853  if (!(ths->flags & EXACT_NEARFIELD))
854  NFFT(free)(ths->Add);
855 
856  NFFT(finalize)(&(ths->mv1));
857  NFFT(finalize)(&(ths->mv2));
858 
859 #ifdef _OPENMP
860  #pragma omp critical (nfft_omp_critical_fftw_plan)
861  {
862 #endif
863  FFTW(destroy_plan)(ths->fft_plan);
864 #ifdef _OPENMP
865 }
866 #endif
867 
868  NFFT(free)(ths->b);
869 
870  if (ths->flags & NEARFIELD_BOXES)
871  {
872  NFFT(free)(ths->box_offset);
873  NFFT(free)(ths->box_alpha);
874  NFFT(free)(ths->box_x);
875  }
876 }
877 
880 {
881  int j, k;
882  int t;
883  R r;
884 
885 #ifdef _OPENMP
886  #pragma omp parallel for default(shared) private(j,k,t,r)
887 #endif
888  for (j = 0; j < ths->M_total; j++)
889  {
890  ths->f[j] = K(0.0);
891  for (k = 0; k < ths->N_total; k++)
892  {
893  if (ths->d == 1)
894  r = ths->y[j] - ths->x[k];
895  else
896  {
897  r = K(0.0);
898  for (t = 0; t < ths->d; t++)
899  r += (ths->y[j * ths->d + t] - ths->x[k * ths->d + t])
900  * (ths->y[j * ths->d + t] - ths->x[k * ths->d + t]);
901  r = SQRT(r);
902  }
903  ths->f[j] += ths->alpha[k] * ths->k(r, 0, ths->kernel_param);
904  }
905  }
906 }
907 
910 {
911  int j, k, t;
912  int n_total;
913 #ifdef MEASURE_TIME
914  ticks t0, t1;
915 #endif
916 
917  ths->MEASURE_TIME_t[0] = K(0.0);
918  ths->MEASURE_TIME_t[1] = K(0.0);
919  ths->MEASURE_TIME_t[2] = K(0.0);
920  ths->MEASURE_TIME_t[3] = K(0.0);
921 
922 #ifdef MEASURE_TIME
923  t0 = getticks();
924 #endif
925 
926  if (ths->flags & NEARFIELD_BOXES)
927  {
928  BuildBox(ths);
929  }
930  else
931  {
933  BuildTree(ths->d, 0, ths->x, ths->alpha, ths->N_total);
934  }
935 
936 #ifdef MEASURE_TIME
937  t1 = getticks();
938  ths->MEASURE_TIME_t[3] += nfft_elapsed_seconds(t1,t0);
939 #endif
940 
941 #ifdef MEASURE_TIME
942  t0 = getticks();
943 #endif
944 
945  if (!(ths->flags & EXACT_NEARFIELD))
946  {
947  if (ths->d == 1)
948 #ifdef _OPENMP
949  #pragma omp parallel for default(shared) private(k)
950 #endif
951  for (k = -ths->Ad / 2 - 2; k <= ths->Ad / 2 + 2; k++)
952  ths->Add[k + ths->Ad / 2 + 2] = regkern1(ths->k,
953  ths->eps_I * (R) k / (R)(ths->Ad) * K(2.0), ths->p, ths->kernel_param,
954  ths->eps_I, ths->eps_B);
955  else
956 #ifdef _OPENMP
957  #pragma omp parallel for default(shared) private(k)
958 #endif
959  for (k = 0; k <= ths->Ad + 2; k++)
960  ths->Add[k] = regkern3(ths->k, ths->eps_I * (R) k / (R)(ths->Ad), ths->p,
961  ths->kernel_param, ths->eps_I, ths->eps_B);
962  }
963 #ifdef MEASURE_TIME
964  t1 = getticks();
965  ths->MEASURE_TIME_t[0] += NFFT(elapsed_seconds)(t1,t0);
966 #endif
967 
968 #ifdef MEASURE_TIME
969  t0 = getticks();
970 #endif
971 
972  for (k = 0; k < ths->mv1.M_total; k++)
973  for (t = 0; t < ths->mv1.d; t++)
974  ths->mv1.x[ths->mv1.d * k + t] = -ths->x[ths->mv1.d * k + t]; /* note the factor -1 for transposed transform instead of adjoint*/
975 
977  if (ths->mv1.flags & PRE_LIN_PSI)
978  NFFT(precompute_lin_psi)(&(ths->mv1));
979 
980  if (ths->mv1.flags & PRE_PSI)
981  NFFT(precompute_psi)(&(ths->mv1));
982 
983  if (ths->mv1.flags & PRE_FULL_PSI)
984  NFFT(precompute_full_psi)(&(ths->mv1));
985 #ifdef MEASURE_TIME
986  t1 = getticks();
987  ths->MEASURE_TIME_t[1] += nfft_elapsed_seconds(t1,t0);
988 #endif
989 
991  for (k = 0; k < ths->mv1.M_total; k++)
992  ths->mv1.f[k] = ths->alpha[k];
993 
994 #ifdef MEASURE_TIME
995  t0 = getticks();
996 #endif
997 
998  for (j = 0; j < ths->mv2.M_total; j++)
999  for (t = 0; t < ths->mv2.d; t++)
1000  ths->mv2.x[ths->mv2.d * j + t] = -ths->y[ths->mv2.d * j + t]; /* note the factor -1 for conjugated transform instead of standard*/
1001 
1003  if (ths->mv2.flags & PRE_LIN_PSI)
1004  NFFT(precompute_lin_psi)(&(ths->mv2));
1005 
1006  if (ths->mv2.flags & PRE_PSI)
1007  NFFT(precompute_psi)(&(ths->mv2));
1008 
1009  if (ths->mv2.flags & PRE_FULL_PSI)
1010  NFFT(precompute_full_psi)(&(ths->mv2));
1011 #ifdef MEASURE_TIME
1012  t1 = getticks();
1013  ths->MEASURE_TIME_t[2] += NFFT(elapsed_seconds)(t1,t0);
1014 #endif
1015 
1016 #ifdef MEASURE_TIME
1017  t0 = getticks();
1018 #endif
1019 
1020  n_total = 1;
1021  for (t = 0; t < ths->d; t++)
1022  n_total *= ths->n;
1023 
1024 #ifdef _OPENMP
1025  #pragma omp parallel for default(shared) private(j,k,t)
1026 #endif
1027  for (j = 0; j < n_total; j++)
1028  {
1029  if (ths->d == 1)
1030  ths->b[j] = regkern1(ths->k, (R) j / (R)(ths->n) - K(0.5), ths->p,
1031  ths->kernel_param, ths->eps_I, ths->eps_B) / (R)(n_total);
1032  else
1033  {
1034  k = j;
1035  ths->b[j] = K(0.0);
1036  for (t = 0; t < ths->d; t++)
1037  {
1038  ths->b[j] += ((R) (k % (ths->n)) / (R)(ths->n) - K(0.5))
1039  * ((R) (k % (ths->n)) / (R)(ths->n) - K(0.5));
1040  k = k / (ths->n);
1041  }
1042  ths->b[j] = regkern3(ths->k, SQRT(CREAL(ths->b[j])), ths->p, ths->kernel_param,
1043  ths->eps_I, ths->eps_B) / (R)(n_total);
1044  }
1045  }
1046 
1047  NFFT(fftshift_complex)(ths->b, (int)(ths->mv1.d), ths->mv1.N);
1048  FFTW(execute)(ths->fft_plan);
1049  NFFT(fftshift_complex)(ths->b, (int)(ths->mv1.d), ths->mv1.N);
1050 #ifdef MEASURE_TIME
1051  t1 = getticks();
1052  ths->MEASURE_TIME_t[0] += nfft_elapsed_seconds(t1,t0);
1053 #endif
1054 }
1055 
1058 {
1059  int j, k, t;
1060 #ifdef MEASURE_TIME
1061  ticks t0, t1;
1062 #endif
1063 
1064  ths->MEASURE_TIME_t[4] = K(0.0);
1065  ths->MEASURE_TIME_t[5] = K(0.0);
1066  ths->MEASURE_TIME_t[6] = K(0.0);
1067  ths->MEASURE_TIME_t[7] = K(0.0);
1068 
1069 #ifdef MEASURE_TIME
1070  t0 = getticks();
1071 #endif
1072 
1073  NFFT(adjoint)(&(ths->mv1));
1074 #ifdef MEASURE_TIME
1075  t1 = getticks();
1076  ths->MEASURE_TIME_t[4] += NFFT(elapsed_seconds)(t1,t0);
1077 #endif
1078 
1079 #ifdef MEASURE_TIME
1080  t0 = getticks();
1081 #endif
1082 
1083 #ifdef _OPENMP
1084  #pragma omp parallel for default(shared) private(k)
1085 #endif
1086  for (k = 0; k < ths->mv2.N_total; k++)
1087  ths->mv2.f_hat[k] = ths->b[k] * ths->mv1.f_hat[k];
1088 #ifdef MEASURE_TIME
1089  t1 = getticks();
1090  ths->MEASURE_TIME_t[5] += nfft_elapsed_seconds(t1,t0);
1091 #endif
1092 
1093 #ifdef MEASURE_TIME
1094  t0 = getticks();
1095 #endif
1096 
1097  NFFT(trafo)(&(ths->mv2));
1098 #ifdef MEASURE_TIME
1099  t1 = getticks();
1100  ths->MEASURE_TIME_t[6] += nfft_elapsed_seconds(t1,t0);
1101 #endif
1102 
1103 #ifdef MEASURE_TIME
1104  t0 = getticks();
1105 #endif
1106 
1107 #ifdef _OPENMP
1108  #pragma omp parallel for default(shared) private(j,k,t)
1109 #endif
1110  for (j = 0; j < ths->M_total; j++)
1111  {
1112  R ymin[ths->d], ymax[ths->d];
1114  if (ths->flags & NEARFIELD_BOXES)
1115  {
1116  ths->f[j] = ths->mv2.f[j] + SearchBox(ths->y + ths->d * j, ths);
1117  }
1118  else
1119  {
1120  for (t = 0; t < ths->d; t++)
1121  {
1122  ymin[t] = ths->y[ths->d * j + t] - ths->eps_I;
1123  ymax[t] = ths->y[ths->d * j + t] + ths->eps_I;
1124  }
1125  ths->f[j] = ths->mv2.f[j]
1126  + SearchTree(ths->d, 0, ths->x, ths->alpha, ymin, ymax, ths->N_total,
1127  ths->k, ths->kernel_param, ths->Ad, ths->Add, ths->p, ths->flags);
1128  }
1129  /* ths->f[j] = ths->mv2.f[j]; */
1130  /* ths->f[j] = SearchTree(ths->d,0, ths->x, ths->alpha, ymin, ymax, ths->N_total, ths->k, ths->kernel_param, ths->Ad, ths->Add, ths->p, ths->flags); */
1131  }
1132 
1133 #ifdef MEASURE_TIME
1134  t1 = getticks();
1135  ths->MEASURE_TIME_t[7] += NFFT(elapsed_seconds)(t1,t0);
1136 #endif
1137 }
1138 /* \} */
1139 
1140 /* fastsum.c */
int M_total
number of target knots
Definition: fastsum.h:81
static int max_i(int a, int b)
max
Definition: fastsum.c:48
C regkern(kernel k, R xx, int p, const R *param, R a, R b)
regularized kernel with K_I arbitrary and K_B smooth to zero
Definition: fastsum.c:83
static R fak(int n)
factorial
Definition: fastsum.c:54
R eps_B
outer boundary
Definition: fastsum.h:105
#define EXACT_NEARFIELD
Constant symbols.
Definition: fastsum.h:69
static R BasisPoly(int m, int r, R xx)
basis polynomial for regularized kernel
Definition: fastsum.c:69
R nfft_elapsed_seconds(ticks t1, ticks t0)
Return number of elapsed seconds between two time points.
Header file with predefined kernels for the fast summation algorithm.
static void BuildTree(int d, int t, R *x, C *alpha, int N)
recursive sort of source knots dimension by dimension to get tree structure
Definition: fastsum.c:593
R * kernel_param
parameters for kernel function
Definition: fastsum.h:90
C * Add
spline values
Definition: fastsum.h:112
static C SearchBox(R *y, fastsum_plan *ths)
box-based near field correction
Definition: fastsum.c:527
int Ad
near field
Definition: fastsum.h:111
static C calc_SearchBox(int d, R *y, R *x, C *alpha, int start, int end_lt, const C *Add, const int Ad, int p, R a, const kernel k, const R *param, const unsigned flags)
inner computation function for box-based near field correction
Definition: fastsum.c:474
plan for fast summation algorithm
Definition: fastsum.h:74
C * b
expansion coefficients
Definition: fastsum.h:101
static R binom(int n, int m)
binomial coefficient
Definition: fastsum.c:63
Header file for the fast NFFT-based summation algorithm.
C * alpha
source coefficients
Definition: fastsum.h:83
unsigned flags
flags precomp.
Definition: fastsum.h:92
void fastsum_trafo(fastsum_plan *ths)
fast NFFT-based summation
Definition: fastsum.c:1057
int d
api
Definition: fastsum.h:78
static C regkern1(kernel k, R xx, int p, const R *param, R a, R b)
regularized kernel with K_I arbitrary and K_B periodized (used in 1D)
Definition: fastsum.c:136
void fastsum_precompute(fastsum_plan *ths)
precomputation for fastsum
Definition: fastsum.c:909
int N_total
number of source knots
Definition: fastsum.h:80
static C kubintkern1(const R x, const C *Add, const int Ad, const R a)
cubic spline interpolation in near field with arbitrary kernels
Definition: fastsum.c:354
R MEASURE_TIME_t[8]
Measured time for each step if MEASURE_TIME is set.
Definition: fastsum.h:123
static C regkern3(kernel k, R xx, int p, const R *param, R a, R b)
regularized kernel for even kernels with K_I even and K_B mirrored
Definition: fastsum.c:232
void fastsum_init_guru(fastsum_plan *ths, int d, int N_total, int M_total, kernel k, R *param, unsigned flags, int nn, int m, int p, R eps_I, R eps_B)
initialization of fastsum plan
Definition: fastsum.c:693
C * f
target evaluations
Definition: fastsum.h:84
static void quicksort(int d, int t, R *x, C *alpha, int N)
quicksort algorithm for source knots and associated coefficients
Definition: fastsum.c:383
static C SearchTree(const int d, const int t, const R *x, const C *alpha, const R *xmin, const R *xmax, const int N, const kernel k, const R *param, const int Ad, const C *Add, const int p, const unsigned flags)
fast search in tree of source knots for near field computation
Definition: fastsum.c:607
kernel k
kernel function
Definition: fastsum.h:89
R * y
target knots in d-ball with radius 1/4-eps_b/2
Definition: fastsum.h:87
C kubintkern(const R x, const C *Add, const int Ad, const R a)
linear spline interpolation in near field with even kernels
Definition: fastsum.c:320
int n
FS__ - fast summation.
Definition: fastsum.h:100
int p
degree of smoothness of regularization
Definition: fastsum.h:103
R eps_I
inner boundary
Definition: fastsum.h:104
static void BuildBox(fastsum_plan *ths)
initialize box-based search data structures
Definition: fastsum.c:423
void fastsum_finalize(fastsum_plan *ths)
finalization of fastsum plan
Definition: fastsum.c:846
void fastsum_exact(fastsum_plan *ths)
direct computation of sums
Definition: fastsum.c:879
R * x
source knots in d-ball with radius 1/4-eps_b/2
Definition: fastsum.h:86