NFFT  3.5.0
nfft.c
1 /*
2  * Copyright (c) 2002, 2017 Jens Keiner, Stefan Kunis, Daniel Potts
3  *
4  * This program is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation; either version 2 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc., 51
16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18 
19 /* Nonequispaced FFT */
20 
21 /* Authors: D. Potts, S. Kunis 2002-2009, Jens Keiner 2009, Toni Volkmer 2012 */
22 
23 /* configure header */
24 #include "config.h"
25 
26 /* complex datatype (maybe) */
27 #ifdef HAVE_COMPLEX_H
28 #include<complex.h>
29 #endif
30 
31 /* NFFT headers */
32 #include "nfft3.h"
33 #include "infft.h"
34 
35 #ifdef _OPENMP
36 #include <omp.h>
37 #endif
38 
39 #ifdef OMP_ASSERT
40 #include <assert.h>
41 #endif
42 
/* X(name) expands to NFFT(name). NFFT is not defined in this file; it
 * presumably adds the library's precision-specific symbol prefix (TODO
 * confirm in nfft3.h). Any earlier definition of X is dropped first. */
43 #undef X
44 #define X(name) NFFT(name)
45 
47 static inline INT intprod(const INT *vec, const INT a, const INT d)
48 {
49  INT t, p;
50 
51  p = 1;
52  for (t = 0; t < d; t++)
53  p *= vec[t] - a;
54 
55  return p;
56 }
57 
58 /* handy shortcuts */
/* BASE(x) forwards its argument unchanged to CEXP(x) (defined outside this
 * file; presumably the complex exponential of the configured precision).
 * The direct transforms below call it as BASE(-II * omega) and
 * BASE(II * omega). */
59 #define BASE(x) CEXP(x)
60 
75 static inline void sort0(const INT d, const INT *n, const INT m,
76  const INT local_x_num, const R *local_x, INT *ar_x)
77 {
78  INT u_j[d], i, j, help, rhigh;
79  INT *ar_x_temp;
80  INT nprod;
81 
82  for (i = 0; i < local_x_num; i++)
83  {
84  ar_x[2 * i] = 0;
85  ar_x[2 *i + 1] = i;
86  for (j = 0; j < d; j++)
87  {
88  help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
89  u_j[j] = (help % n[j] + n[j]) % n[j];
90 
91  ar_x[2 * i] += u_j[j];
92  if (j + 1 < d)
93  ar_x[2 * i] *= n[j + 1];
94  }
95  }
96 
97  for (j = 0, nprod = 1; j < d; j++)
98  nprod *= n[j];
99 
100  rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
101 
102  ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) * sizeof(INT));
103  Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
104 #ifdef OMP_ASSERT
105  for (i = 1; i < local_x_num; i++)
106  assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
107 #endif
108  Y(free)(ar_x_temp);
109 }
110 
119 static inline void sort(const X(plan) *ths)
120 {
121  if (ths->flags & NFFT_SORT_NODES)
122  sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
123 }
124 
145 void X(trafo_direct)(const X(plan) *ths)
146 {
147  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
148 
149  memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
150 
151  if (ths->d == 1)
152  {
153  /* specialize for univariate case, rationale: faster */
154  INT j;
155 #ifdef _OPENMP
156  #pragma omp parallel for default(shared) private(j)
157 #endif
158  for (j = 0; j < ths->M_total; j++)
159  {
160  INT k_L;
161  for (k_L = 0; k_L < ths->N_total; k_L++)
162  {
163  R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
164  f[j] += f_hat[k_L] * BASE(-II * omega);
165  }
166  }
167  }
168  else
169  {
170  /* multivariate case */
171  INT j;
172 #ifdef _OPENMP
173  #pragma omp parallel for default(shared) private(j)
174 #endif
175  for (j = 0; j < ths->M_total; j++)
176  {
177  R x[ths->d], omega, Omega[ths->d + 1];
178  INT t, t2, k_L, k[ths->d];
179  Omega[0] = K(0.0);
180  for (t = 0; t < ths->d; t++)
181  {
182  k[t] = -ths->N[t]/2;
183  x[t] = K2PI * ths->x[j * ths->d + t];
184  Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
185  }
186  omega = Omega[ths->d];
187 
188  for (k_L = 0; k_L < ths->N_total; k_L++)
189  {
190  f[j] += f_hat[k_L] * BASE(-II * omega);
191  {
192  for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
193  k[t]-= ths->N[t]-1;
194 
195  k[t]++;
196 
197  for (t2 = t; t2 < ths->d; t2++)
198  Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
199 
200  omega = Omega[ths->d];
201  }
202  }
203  }
204  }
205 }
206 
207 void X(adjoint_direct)(const X(plan) *ths)
208 {
209  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
210 
211  memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
212 
213  if (ths->d == 1)
214  {
215  /* specialize for univariate case, rationale: faster */
216 #ifdef _OPENMP
217  INT k_L;
218  #pragma omp parallel for default(shared) private(k_L)
219  for (k_L = 0; k_L < ths->N_total; k_L++)
220  {
221  INT j;
222  for (j = 0; j < ths->M_total; j++)
223  {
224  R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
225  f_hat[k_L] += f[j] * BASE(II * omega);
226  }
227  }
228 #else
229  INT j;
230  for (j = 0; j < ths->M_total; j++)
231  {
232  INT k_L;
233  for (k_L = 0; k_L < ths->N_total; k_L++)
234  {
235  R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
236  f_hat[k_L] += f[j] * BASE(II * omega);
237  }
238  }
239 #endif
240  }
241  else
242  {
243  /* multivariate case */
244  INT j, k_L;
245 #ifdef _OPENMP
246  #pragma omp parallel for default(shared) private(j, k_L)
247  for (k_L = 0; k_L < ths->N_total; k_L++)
248  {
249  INT k[ths->d], k_temp, t;
250 
251  k_temp = k_L;
252 
253  for (t = ths->d - 1; t >= 0; t--)
254  {
255  k[t] = k_temp % ths->N[t] - ths->N[t]/2;
256  k_temp /= ths->N[t];
257  }
258 
259  for (j = 0; j < ths->M_total; j++)
260  {
261  R omega = K(0.0);
262  for (t = 0; t < ths->d; t++)
263  omega += k[t] * K2PI * ths->x[j * ths->d + t];
264  f_hat[k_L] += f[j] * BASE(II * omega);
265  }
266  }
267 #else
268  for (j = 0; j < ths->M_total; j++)
269  {
270  R x[ths->d], omega, Omega[ths->d+1];
271  INT t, t2, k[ths->d];
272  Omega[0] = K(0.0);
273  for (t = 0; t < ths->d; t++)
274  {
275  k[t] = -ths->N[t]/2;
276  x[t] = K2PI * ths->x[j * ths->d + t];
277  Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
278  }
279  omega = Omega[ths->d];
280  for (k_L = 0; k_L < ths->N_total; k_L++)
281  {
282  f_hat[k_L] += f[j] * BASE(II * omega);
283 
284  for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
285  k[t]-= ths->N[t]-1;
286 
287  k[t]++;
288 
289  for (t2 = t; t2 < ths->d; t2++)
290  Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
291 
292  omega = Omega[ths->d];
293  }
294  }
295 #endif
296  }
297 }
298 
324 static inline void uo(const X(plan) *ths, const INT j, INT *up, INT *op,
325  const INT act_dim)
326 {
327  const R xj = ths->x[j * ths->d + act_dim];
328  INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
329 
330  (*up) = c - (ths->m);
331  (*op) = c + 1 + (ths->m);
332 }
333 
334 static inline void uo2(INT *u, INT *o, const R x, const INT n, const INT m)
335 {
336  INT c = LRINT(FLOOR(x * (R)(n)));
337 
338  *u = (c - m + n) % n;
339  *o = (c + 1 + m + n) % n;
340 }
341 
/* Forward (A) kernel of the D step: copies one coefficient from f_hat into
 * g_hat, multiplied by the accumulated factor c_phi_inv_k[ths->d].
 * Relies on ths, f_hat, g_hat, k_plain, ks_plain and c_phi_inv_k being
 * declared at the expansion site. */
342 #define MACRO_D_compute_A \
343 { \
344  g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
345 }
346 
/* Transposed (T) kernel of the D step: same indices and factor as the A
 * kernel, but the copy goes from g_hat into f_hat. */
347 #define MACRO_D_compute_T \
348 { \
349  f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
350 }
351 
/* Zero the output array of the D step before it is filled: g_hat
 * (n_total entries) for A, f_hat (N_total entries) for T. Both expansions
 * already end in ';'. */
352 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
353 
354 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
355 
/* Right-hand fragments pasted after "c_phi_inv_k[t2]" by
 * MACRO_update_c_phi_inv_k. Each one starts with its operator and already
 * carries the terminating ';'.
 * with_PRE_PHI_HUT: multiply by the table entry ths->c_phi_inv[t2][ks[t2]]. */
356 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
357 
/* without_PRE_PHI_HUT: divide by PHI_HUT evaluated on the spot at the
 * centred frequency ks[t2] - N[t2]/2. */
358 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2));
359 
/* Resets the three multi-indices used by the serial D step for every
 * dimension: kp[t] = k[t] = 0 and ks[t] = N[t]/2. The loop leaves t at -1,
 * so the final t++ sets t to 0, which makes the first
 * MACRO_update_c_phi_inv_k recompute all dimensions. */
360 #define MACRO_init_k_ks \
361 { \
362  for (t = ths->d-1; 0 <= t; t--) \
363  { \
364  kp[t] = k[t] = 0; \
365  ks[t] = ths->N[t]/2; \
366  } \
367  t++; \
368 }
369 
/* Refreshes the running ("postfix") quantities from dimension t onwards:
 *  - c_phi_inv_k[t2+1]: c_phi_inv_k[t2] combined with the fragment
 *    MACRO_<which_one> (with_PRE_PHI_HUT or without_PRE_PHI_HUT);
 *  - ks_plain[t2+1]: plain index built from ks with strides N[t2];
 *  - k_plain[t2+1]:  plain index built from k with strides n[t2].
 * The pasted fragment brings its own ';', so the first statement ends in
 * ";;" after expansion (an empty statement, harmless). */
370 #define MACRO_update_c_phi_inv_k(which_one) \
371 { \
372  for (t2 = t; t2 < ths->d; t2++) \
373  { \
374  c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
375  ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
376  k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
377  } \
378 }
379 
/* Advances the multi-indices kp, k and ks by one step, odometer style:
 * trailing dimensions (t > 0) whose kp[t] already equals N[t]-1 are reset
 * to their start values, then the first dimension that is not exhausted is
 * incremented. When kp[t] reaches N[t]/2, k[t] jumps to n[t] - N[t]/2 and
 * ks[t] restarts at 0. On exit t is the lowest dimension that changed,
 * which is where the next MACRO_update_c_phi_inv_k starts. */
380 #define MACRO_count_k_ks \
381 { \
382  for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
383  { \
384  kp[t] = k[t] = 0; \
385  ks[t]= ths->N[t]/2; \
386  } \
387 \
388  kp[t]++; k[t]++; ks[t]++; \
389  if(kp[t] == ths->N[t]/2) \
390  { \
391  k[t] = ths->n[t] - ths->N[t]/2; \
392  ks[t] = 0; \
393  } \
394 } \
395 
396 /* sub routines for the fast transforms matrix vector multiplication with D, D^T */
/* MACRO_D(which_one) defines the serial routine D_serial_<which_one>(ths),
 * with which_one being A or T.
 *
 * The routine zeroes its output (MACRO_D_init_result_<which_one>), then
 * visits all N_total coefficient positions. For each position it refreshes
 * the running factor and plain indices (MACRO_update_c_phi_inv_k), performs
 * the scaled copy (MACRO_D_compute_<which_one>) and advances the
 * multi-indices (MACRO_count_k_ks).
 *
 * The two loops differ only in where the factor comes from: the table
 * ths->c_phi_inv when the PRE_PHI_HUT flag is set, PHI_HUT evaluated on the
 * fly otherwise. */
397 #define MACRO_D(which_one) \
398 static inline void D_serial_ ## which_one (X(plan) *ths) \
399 { \
400  C *f_hat, *g_hat; /* local copy */ \
401  R c_phi_inv_k[ths->d+1]; /* postfix product of PHI_HUT */ \
402  INT t, t2; /* index dimensions */ \
403  INT k_L; /* plain index */ \
404  INT kp[ths->d]; /* multi index (simple) */ \
405  INT k[ths->d]; /* multi index in g_hat */ \
406  INT ks[ths->d]; /* multi index in f_hat, c_phi_inv*/ \
407  INT k_plain[ths->d+1]; /* postfix plain index */ \
408  INT ks_plain[ths->d+1]; /* postfix plain index */ \
409  \
410  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
411  MACRO_D_init_result_ ## which_one; \
412 \
413  c_phi_inv_k[0] = K(1.0); \
414  k_plain[0] = 0; \
415  ks_plain[0] = 0; \
416 \
417  MACRO_init_k_ks; \
418 \
419  if (ths->flags & PRE_PHI_HUT) \
420  { \
421  for (k_L = 0; k_L < ths->N_total; k_L++) \
422  { \
423  MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
424  MACRO_D_compute_ ## which_one; \
425  MACRO_count_k_ks; \
426  } \
427  } \
428  else \
429  { \
430  for (k_L = 0; k_L < ths->N_total; k_L++) \
431  { \
432  MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
433  MACRO_D_compute_ ## which_one; \
434  MACRO_count_k_ks; \
435  } \
436  } \
437 }
438 
439 #ifdef _OPENMP
440 static inline void D_openmp_A(X(plan) *ths)
441 {
442  C *f_hat, *g_hat;
443  INT k_L;
445  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
446  memset(g_hat, 0, ths->n_total * sizeof(C));
447 
448  if (ths->flags & PRE_PHI_HUT)
449  {
450  #pragma omp parallel for default(shared) private(k_L)
451  for (k_L = 0; k_L < ths->N_total; k_L++)
452  {
453  INT kp[ths->d]; //0..N-1
454  INT k[ths->d];
455  INT ks[ths->d];
456  R c_phi_inv_k_val = K(1.0);
457  INT k_plain_val = 0;
458  INT ks_plain_val = 0;
459  INT t;
460  INT k_temp = k_L;
461 
462  for (t = ths->d-1; t >= 0; t--)
463  {
464  kp[t] = k_temp % ths->N[t];
465  if (kp[t] >= ths->N[t]/2)
466  k[t] = ths->n[t] - ths->N[t] + kp[t];
467  else
468  k[t] = kp[t];
469  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
470  k_temp /= ths->N[t];
471  }
472 
473  for (t = 0; t < ths->d; t++)
474  {
475  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
476  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
477  k_plain_val = k_plain_val*ths->n[t] + k[t];
478  }
479 
480  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
481  } /* for(k_L) */
482  } /* if(PRE_PHI_HUT) */
483  else
484  {
485  #pragma omp parallel for default(shared) private(k_L)
486  for (k_L = 0; k_L < ths->N_total; k_L++)
487  {
488  INT kp[ths->d]; //0..N-1
489  INT k[ths->d];
490  INT ks[ths->d];
491  R c_phi_inv_k_val = K(1.0);
492  INT k_plain_val = 0;
493  INT ks_plain_val = 0;
494  INT t;
495  INT k_temp = k_L;
496 
497  for (t = ths->d-1; t >= 0; t--)
498  {
499  kp[t] = k_temp % ths->N[t];
500  if (kp[t] >= ths->N[t]/2)
501  k[t] = ths->n[t] - ths->N[t] + kp[t];
502  else
503  k[t] = kp[t];
504  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
505  k_temp /= ths->N[t];
506  }
507 
508  for (t = 0; t < ths->d; t++)
509  {
510  c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
511  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
512  k_plain_val = k_plain_val*ths->n[t] + k[t];
513  }
514 
515  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
516  } /* for(k_L) */
517  } /* else(PRE_PHI_HUT) */
518 }
519 #endif
520 
/* Without OpenMP, instantiate the serial kernel D_serial_A from MACRO_D. */
521 #ifndef _OPENMP
522 MACRO_D(A)
523 #endif
524 
525 static inline void D_A(X(plan) *ths)
526 {
527 #ifdef _OPENMP
528  D_openmp_A(ths);
529 #else
530  D_serial_A(ths);
531 #endif
532 }
533 
534 #ifdef _OPENMP
535 static void D_openmp_T(X(plan) *ths)
536 {
537  C *f_hat, *g_hat;
538  INT k_L;
540  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
541  memset(f_hat, 0, ths->N_total * sizeof(C));
542 
543  if (ths->flags & PRE_PHI_HUT)
544  {
545  #pragma omp parallel for default(shared) private(k_L)
546  for (k_L = 0; k_L < ths->N_total; k_L++)
547  {
548  INT kp[ths->d]; //0..N-1
549  INT k[ths->d];
550  INT ks[ths->d];
551  R c_phi_inv_k_val = K(1.0);
552  INT k_plain_val = 0;
553  INT ks_plain_val = 0;
554  INT t;
555  INT k_temp = k_L;
556 
557  for (t = ths->d - 1; t >= 0; t--)
558  {
559  kp[t] = k_temp % ths->N[t];
560  if (kp[t] >= ths->N[t]/2)
561  k[t] = ths->n[t] - ths->N[t] + kp[t];
562  else
563  k[t] = kp[t];
564  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
565  k_temp /= ths->N[t];
566  }
567 
568  for (t = 0; t < ths->d; t++)
569  {
570  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
571  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
572  k_plain_val = k_plain_val*ths->n[t] + k[t];
573  }
574 
575  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
576  } /* for(k_L) */
577  } /* if(PRE_PHI_HUT) */
578  else
579  {
580  #pragma omp parallel for default(shared) private(k_L)
581  for (k_L = 0; k_L < ths->N_total; k_L++)
582  {
583  INT kp[ths->d]; //0..N-1
584  INT k[ths->d];
585  INT ks[ths->d];
586  R c_phi_inv_k_val = K(1.0);
587  INT k_plain_val = 0;
588  INT ks_plain_val = 0;
589  INT t;
590  INT k_temp = k_L;
591 
592  for (t = ths->d-1; t >= 0; t--)
593  {
594  kp[t] = k_temp % ths->N[t];
595  if (kp[t] >= ths->N[t]/2)
596  k[t] = ths->n[t] - ths->N[t] + kp[t];
597  else
598  k[t] = kp[t];
599  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
600  k_temp /= ths->N[t];
601  }
602 
603  for (t = 0; t < ths->d; t++)
604  {
605  c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
606  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
607  k_plain_val = k_plain_val*ths->n[t] + k[t];
608  }
609 
610  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
611  } /* for(k_L) */
612  } /* else(PRE_PHI_HUT) */
613 }
614 #endif
615 
/* Without OpenMP, instantiate the serial kernel D_serial_T from MACRO_D. */
616 #ifndef _OPENMP
617 MACRO_D(T)
618 #endif
619 
620 static void D_T(X(plan) *ths)
621 {
622 #ifdef _OPENMP
623  D_openmp_T(ths);
624 #else
625  D_serial_T(ths);
626 #endif
627 }
628 
629 /* sub routines for the fast transforms matrix vector multiplication with B, B^T */
/* Zero the output array of the B step before accumulation starts: ths->f
 * (M_total entries) for A, ths->g (n_total entries) for T. Both expansions
 * already end in ';'. */
630 #define MACRO_B_init_result_A memset(ths->f, 0, (size_t)(ths->M_total) * sizeof(C));
631 #define MACRO_B_init_result_T memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C));
632 
/* PRE_FULL_PSI kernels: ths->psi[ix] is the stored weight and
 * ths->psi_index_g[ix] the matching position in g. They rely on ix, g and
 * fj (pointer to the current f entry) from the expansion site.
 * A: accumulate weight * g entry into *fj. */
633 #define MACRO_B_PRE_FULL_PSI_compute_A \
634 { \
635  (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
636 }
637 
/* T: accumulate weight * (*fj) into the g entry. */
638 #define MACRO_B_PRE_FULL_PSI_compute_T \
639 { \
640  g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
641 }
642 
/* Generic B kernels: phi_prod[ths->d] is the full product of the
 * per-dimension window values and ll_plain[ths->d] the plain index into
 * ths->g. They rely on j, phi_prod and ll_plain from the expansion site.
 * A: accumulate weight * g entry into ths->f[j]. */
643 #define MACRO_B_compute_A \
644 { \
645  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \
646 }
647 
/* T: accumulate weight * ths->f[j] into the g entry. */
648 #define MACRO_B_compute_T \
649 { \
650  ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \
651 }
652 
/* Expressions for the window value in dimension t2 at offset lj[t2]; one
 * of them is pasted into the phi_prod update depending on how psi is
 * provided. All rely on t2, lj and j from the expansion site.
 * with_FG_PSI: value from the per-node table fg_psi. */
653 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
654 
/* with_PRE_PSI: value from ths->psi, which is indexed with 2*m+2 slots per
 * (node, dimension) pair. */
655 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
656 
/* without_PRE_PSI_improved: value from the per-node buffer psij_const that
 * the caller fills with PHI evaluations beforehand. */
657 #define MACRO_without_PRE_PSI_improved psij_const[t2 * (2*ths->m+2) + lj[t2]]
658 
/* without_PRE_PSI: evaluate PHI directly at x_j minus the grid point
 * (lj[t2]+u[t2])/n[t2]. No use of this variant is visible in this part of
 * the file. */
659 #define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \
660  - ((R) (lj[t2]+u[t2]))/((R)ths->n[t2]), t2)
661 
/* Per-node setup for the B step. Declares l_all (2*m+2 slots per
 * dimension) in the enclosing scope, then for every dimension t:
 *  - calls uo() to obtain the window bounds u[t] and o[t] of node j;
 *  - stores the wrapped grid indices (u[t] + lj_t + n[t]) % n[t] in l_all;
 *  - resets the offset lj[t] to 0.
 * The loop leaves t at -1, so the final t++ sets t to 0.
 * Note: the expansion starts with a declaration, so it must appear where a
 * declaration is allowed. */
662 #define MACRO_init_uo_l_lj_t \
663 INT l_all[ths->d*(2*ths->m+2)]; \
664 { \
665  for (t = ths->d-1; t >= 0; t--) \
666  { \
667  uo(ths,j,&u[t],&o[t],t); \
668  INT lj_t; \
669  for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \
670  l_all[t*(2*ths->m+2) + lj_t] = (u[t] + lj_t + ths->n[t]) % ths->n[t]; \
671  lj[t] = 0; \
672  } \
673  t++; \
674 }
675 
/* Refreshes the running quantities from dimension t onwards:
 *  - phi_prod[t2+1]: phi_prod[t2] times the window value given by
 *    MACRO_<which_one>;
 *  - ll_plain[t2+1]: plain index into g, built from the wrapped grid
 *    indices in l_all with strides n[t2]. */
676 #define MACRO_update_phi_prod_ll_plain(which_one) { \
677  for (t2 = t; t2 < ths->d; t2++) \
678  { \
679  phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \
680  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
681  } \
682 }
683 
/* Advances the offset multi-index lj by one step, odometer style: trailing
 * dimensions (t > 0) whose offset already equals o[t]-u[t] are reset to 0,
 * then the first dimension that is not exhausted is incremented. On exit t
 * is the lowest dimension that changed. */
684 #define MACRO_count_uo_l_lj_t \
685 { \
686  for (t = ths->d-1; (t > 0) && (lj[t] == o[t]-u[t]); t--) \
687  { \
688  lj[t] = 0; \
689  } \
690  \
691  lj[t]++; \
692 }
693 
/* Maps each precomputation variant name to the window-value expression used
 * by the unrolled d == 4 and d == 5 loops. PRE_FG_PSI, FG_PSI and
 * PRE_LIN_PSI all read the per-node table fg_psi; both without_PRE_PSI
 * names read the per-node buffer psij_const. */
694 #define MACRO_COMPUTE_with_PRE_PSI MACRO_with_PRE_PSI
695 #define MACRO_COMPUTE_with_PRE_FG_PSI MACRO_with_FG_PSI
696 #define MACRO_COMPUTE_with_FG_PSI MACRO_with_FG_PSI
697 #define MACRO_COMPUTE_with_PRE_LIN_PSI MACRO_with_FG_PSI
698 #define MACRO_COMPUTE_without_PRE_PSI MACRO_without_PRE_PSI_improved
699 #define MACRO_COMPUTE_without_PRE_PSI_improved MACRO_without_PRE_PSI_improved
700 
/* Applies the B kernel MACRO_B_compute_<whichone_AT> to every grid point in
 * the window of the current node j.
 *
 * whichone_AT    selects the kernel (A or T).
 * whichone_FLAGS selects the window-value expression
 *                (MACRO_COMPUTE_<...> / MACRO_<...>).
 *
 * For d == 4 and d == 5 the window is walked with explicitly nested loops
 * of 2*m+2 iterations per dimension; phi_prod and ll_plain are updated only
 * for the dimension whose offset just changed. For any other d a single
 * loop of lprod iterations is used together with
 * MACRO_update_phi_prod_ll_plain and MACRO_count_uo_l_lj_t.
 *
 * Relies on lj, t, t2, l_L, lprod, phi_prod, ll_plain, l_all, u, o and j
 * from the expansion site. */
701 #define MACRO_B_COMPUTE_ONE_NODE(whichone_AT,whichone_FLAGS) \
702  if (ths->d == 4) \
703  { \
704  INT l0, l1, l2, l3; \
705  for (l0 = 0; l0 < 2*ths->m+2; l0++) \
706  { \
707  lj[0] = l0; \
708  t2 = 0; \
709  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
710  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
711  for (l1 = 0; l1 < 2*ths->m+2; l1++) \
712  { \
713  lj[1] = l1; \
714  t2 = 1; \
715  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
716  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
717  for (l2 = 0; l2 < 2*ths->m+2; l2++) \
718  { \
719  lj[2] = l2; \
720  t2 = 2; \
721  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
722  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
723  for (l3 = 0; l3 < 2*ths->m+2; l3++) \
724  { \
725  lj[3] = l3; \
726  t2 = 3; \
727  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
728  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
729  \
730  MACRO_B_compute_ ## whichone_AT; \
731  } \
732  } \
733  } \
734  } \
735  } /* if(d==4) */ \
736  else if (ths->d == 5) \
737  { \
738  INT l0, l1, l2, l3, l4; \
739  for (l0 = 0; l0 < 2*ths->m+2; l0++) \
740  { \
741  lj[0] = l0; \
742  t2 = 0; \
743  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
744  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
745  for (l1 = 0; l1 < 2*ths->m+2; l1++) \
746  { \
747  lj[1] = l1; \
748  t2 = 1; \
749  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
750  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
751  for (l2 = 0; l2 < 2*ths->m+2; l2++) \
752  { \
753  lj[2] = l2; \
754  t2 = 2; \
755  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
756  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
757  for (l3 = 0; l3 < 2*ths->m+2; l3++) \
758  { \
759  lj[3] = l3; \
760  t2 = 3; \
761  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
762  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
763  for (l4 = 0; l4 < 2*ths->m+2; l4++) \
764  { \
765  lj[4] = l4; \
766  t2 = 4; \
767  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
768  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
769  \
770  MACRO_B_compute_ ## whichone_AT; \
771  } \
772  } \
773  } \
774  } \
775  } \
776  } /* if(d==5) */ \
777  else { \
778  for (l_L = 0; l_L < lprod; l_L++) \
779  { \
780  MACRO_update_phi_prod_ll_plain(whichone_FLAGS); \
781  \
782  MACRO_B_compute_ ## whichone_AT; \
783  \
784  MACRO_count_uo_l_lj_t; \
785  } /* for(l_L) */ \
786  }
787 
/**
 * MACRO_B(which_one) defines the serial routine B_serial_<which_one>(ths),
 * with which_one being A or T:
 *   A: ths->f[j] += weight * ths->g[index]   (ths->f is zeroed first)
 *   T: ths->g[index] += weight * ths->f[j]   (ths->g is zeroed first)
 *
 * The weights are products of per-dimension window values. Where they come
 * from is decided by the first matching flag, in this order:
 *   PRE_FULL_PSI  weights and g indices are read from ths->psi and
 *                 ths->psi_index_g, ths->psi_index_f[j] entries per node;
 *   PRE_PSI       per-dimension values are read from ths->psi;
 *   PRE_FG_PSI    per-dimension values are rebuilt from two stored numbers
 *                 per (node, dimension) and the table fg_exp_l;
 *   FG_PSI        as PRE_FG_PSI, but the two numbers are computed here
 *                 with PHI and EXP;
 *   PRE_LIN_PSI   per-dimension values are linearly interpolated from the
 *                 table in ths->psi (K+1 entries per dimension);
 *   none          per-dimension values are evaluated with PHI.
 * Except for PRE_FULL_PSI, sort(ths) is called first and nodes are visited
 * in the order given by ths->index_x when NFFT_SORT_NODES is set.
 *
 * Fix relative to the previous version: fg_exp_l has 2*m+2 entries per row
 * (valid indices 0 .. 2*m+1), but its fill loops ran up to and including
 * index 2*m+2, writing one element past the end of the array for the last
 * dimension. The loops now stop at 2*m+1. Index 2*m+2 was never read (the
 * consumers use lj_fg <= o[t2]-u[t2] == 2*m+1), so results are unchanged.
 */
#define MACRO_B(which_one) \
static inline void B_serial_ ## which_one (X(plan) *ths) \
{ \
  INT lprod; /* 'regular bandwidth' of matrix B */ \
  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
  INT t, t2; /* index dimensions */ \
  INT k; /* index nodes */ \
  INT l_L, ix; /* index one row of B */ \
  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
  INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
  R phi_prod[ths->d+1]; /* postfix product of PHI */ \
  R y[ths->d]; \
  R fg_psi[ths->d][2*ths->m+2]; \
  R fg_exp_l[ths->d][2*ths->m+2]; \
  INT l_fg,lj_fg; \
  R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
  R ip_w; \
  INT ip_u; \
  INT ip_s = ths->K/(ths->m+2); \
  \
  MACRO_B_init_result_ ## which_one; \
  \
  if (ths->flags & PRE_FULL_PSI) \
  { \
    INT j; \
    C *f, *g; /* local copy */ \
    C *fj; /* local copy */ \
    f = (C*)ths->f; g = (C*)ths->g; \
    \
    /* ix runs through psi / psi_index_g without being reset per node */ \
    for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
    { \
      for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
      { \
        MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
      } \
    } \
    return; \
  } \
  \
  phi_prod[0] = K(1.0); \
  ll_plain[0] = 0; \
  \
  /* lprod = (2*m+2)^d, the number of grid points per node */ \
  for (t = 0, lprod = 1; t < ths->d; t++) \
    lprod *= (2 * ths->m + 2); \
  \
  if (ths->flags & PRE_PSI) \
  { \
    sort(ths); \
    \
    for (k = 0; k < ths->M_total; k++) \
    { \
      INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
      \
      MACRO_init_uo_l_lj_t; \
      \
      MACRO_B_COMPUTE_ONE_NODE(which_one,with_PRE_PSI); \
    } /* for(j) */ \
    return; \
  } /* if(PRE_PSI) */ \
  \
  if (ths->flags & PRE_FG_PSI) \
  { \
    sort(ths); \
    \
    for(t2 = 0; t2 < ths->d; t2++) \
    { \
      tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \
      tmpEXP2sq = tmpEXP2*tmpEXP2; \
      tmp2 = K(1.0); \
      tmp3 = K(1.0); \
      fg_exp_l[t2][0] = K(1.0); \
      /* fill indices 1 .. 2*m+1 only: the row has 2*m+2 entries */ \
      for (lj_fg = 1; lj_fg < (2 * ths->m + 2); lj_fg++) \
      { \
        tmp3 = tmp2*tmpEXP2; \
        tmp2 *= tmpEXP2sq; \
        fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \
      } \
    } \
    for (k = 0; k < ths->M_total; k++) \
    { \
      INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
      \
      MACRO_init_uo_l_lj_t; \
      \
      for (t2 = 0; t2 < ths->d; t2++) \
      { \
        fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
        tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
        tmp1 = K(1.0); \
        for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
        { \
          tmp1 *= tmpEXP1; \
          fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
        } \
      } \
      \
      MACRO_B_COMPUTE_ONE_NODE(which_one,with_FG_PSI); \
    } /* for(j) */ \
    return; \
  } /* if(PRE_FG_PSI) */ \
  \
  if (ths->flags & FG_PSI) \
  { \
    sort(ths); \
    \
    for (t2 = 0; t2 < ths->d; t2++) \
    { \
      tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
      tmpEXP2sq = tmpEXP2*tmpEXP2; \
      tmp2 = K(1.0); \
      tmp3 = K(1.0); \
      fg_exp_l[t2][0] = K(1.0); \
      /* fill indices 1 .. 2*m+1 only: the row has 2*m+2 entries */ \
      for (lj_fg = 1; lj_fg < (2*ths->m+2); lj_fg++) \
      { \
        tmp3 = tmp2*tmpEXP2; \
        tmp2 *= tmpEXP2sq; \
        fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
      } \
    } \
    for (k = 0; k < ths->M_total; k++) \
    { \
      INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
      \
      MACRO_init_uo_l_lj_t; \
      \
      for (t2 = 0; t2 < ths->d; t2++) \
      { \
        fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\
        \
        tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
          /ths->b[t2]); \
        tmp1 = K(1.0); \
        for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
        { \
          tmp1 *= tmpEXP1; \
          fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
        } \
      } \
      \
      MACRO_B_COMPUTE_ONE_NODE(which_one,with_FG_PSI); \
    } /* for(j) */ \
    return; \
  } /* if(FG_PSI) */ \
  \
  if (ths->flags & PRE_LIN_PSI) \
  { \
    sort(ths); \
    \
    for (k = 0; k<ths->M_total; k++) \
    { \
      INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
      \
      MACRO_init_uo_l_lj_t; \
      \
      for (t2 = 0; t2 < ths->d; t2++) \
      { \
        y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
          * ((R)(ths->K))) / (R)(ths->m + 2); \
        ip_u = LRINT(FLOOR(y[t2])); \
        ip_w = y[t2]-ip_u; \
        /* linear interpolation between two neighbouring table entries */ \
        for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
        { \
          fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
            * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
            * (ip_w); \
        } \
      } \
      \
      MACRO_B_COMPUTE_ONE_NODE(which_one,with_FG_PSI); \
    } /* for(j) */ \
    return; \
  } /* if(PRE_LIN_PSI) */ \
  \
  sort(ths); \
  \
  /* no precomputed psi at all */ \
  for (k = 0; k < ths->M_total; k++) \
  { \
    INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
    \
    R psij_const[ths->d * (2*ths->m+2)]; \
    \
    MACRO_init_uo_l_lj_t; \
    \
    for (t2 = 0; t2 < ths->d; t2++) \
    { \
      INT lj_t; \
      for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \
        psij_const[t2 * (2*ths->m+2) + lj_t] = PHI(ths->n[t2], ths->x[j*ths->d+t2] \
          - ((R) (lj_t+u[t2]))/((R)ths->n[t2]), t2); \
    } \
    \
    MACRO_B_COMPUTE_ONE_NODE(which_one,without_PRE_PSI_improved); \
  } /* for(j) */ \
} /* nfft_B */
983 
/* Without OpenMP, instantiate the serial kernel B_serial_A from MACRO_B. */
984 #ifndef _OPENMP
985 MACRO_B(A)
986 #endif
987 
988 #ifdef _OPENMP
/* Hooks used by MACRO_B_openmp_A_COMPUTE(with_PRE_PSI).
 * BEFORE_LOOP is empty: no per-node preparation is done for PRE_PSI.
 * UPDATE refreshes phi_prod / ll_plain with values read from ths->psi; the
 * expansion already ends in ';'. */
989 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_PRE_PSI
990 #define MACRO_B_openmp_A_COMPUTE_UPDATE_with_PRE_PSI \
991  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
992 
/* Fills the table fg_exp_l (declared by the user of this macro with
 * 2*m+2 entries per dimension) once, before the parallel loop over the
 * nodes: fg_exp_l[t2][0] = 1 and every further entry is the previous one
 * times tmp3, where tmp3 starts at EXP(-1/b[t2]) and is multiplied by its
 * square, EXP(-2/b[t2]), in every step.
 *
 * Fix relative to the previous version: the inner loop ran up to and
 * including index 2*m+2, i.e. one element past the end of a row, and past
 * the end of the whole array for the last dimension. It now stops at
 * 2*m+1, the last valid index. The BEFORE_LOOP hooks that read the table
 * stop at lj_fg == o[t2]-u[t2] == 2*m+1, so results are unchanged.
 *
 * Relies on ths, t2 and fg_exp_l from the expansion site. */
#define MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI \
  for (t2 = 0; t2 < ths->d; t2++) \
  { \
    INT lj_fg; \
    R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
    R tmpEXP2sq = tmpEXP2*tmpEXP2; \
    R tmp2 = K(1.0); \
    R tmp3 = K(1.0); \
    fg_exp_l[t2][0] = K(1.0); \
    for(lj_fg = 1; lj_fg < (2*ths->m+2); lj_fg++) \
    { \
      tmp3 = tmp2*tmpEXP2; \
      tmp2 *= tmpEXP2sq; \
      fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
    } \
  }
/* Hooks used by MACRO_B_openmp_A_COMPUTE(with_PRE_FG_PSI).
 * BEFORE_LOOP builds the per-node table fg_psi: for each dimension,
 * fg_psi[t2][0] and the ratio tmpEXP1 are read from ths->psi (two values
 * per (node, dimension) pair); every further entry is
 * fg_psi[t2][0] * tmpEXP1^lj_fg * fg_exp_l[t2][lj_fg], for lj_fg up to
 * o[t2]-u[t2].
 * Relies on j, u, o, t2, fg_psi, fg_exp_l, tmpEXP1, tmp1, l_fg and lj_fg
 * from the expansion site. */
1009 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_PRE_FG_PSI \
1010  for (t2 = 0; t2 < ths->d; t2++) \
1011  { \
1012  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
1013  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
1014  tmp1 = K(1.0); \
1015  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
1016  { \
1017  tmp1 *= tmpEXP1; \
1018  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
1019  } \
1020  }
/* UPDATE refreshes phi_prod / ll_plain with values read from fg_psi; the
 * expansion already ends in ';'. */
1021 #define MACRO_B_openmp_A_COMPUTE_UPDATE_with_PRE_FG_PSI \
1022  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1023 
/* Hooks used by MACRO_B_openmp_A_COMPUTE(with_FG_PSI).
 * BEFORE_LOOP builds the per-node table fg_psi without any stored values:
 * fg_psi[t2][0] is PHI evaluated at x_j - u[t2]/n[t2], the ratio tmpEXP1 is
 * EXP(2*(n[t2]*x_j - u[t2])/b[t2]); every further entry is
 * fg_psi[t2][0] * tmpEXP1^lj_fg * fg_exp_l[t2][lj_fg], for lj_fg up to
 * o[t2]-u[t2].
 * Relies on j, u, o, t2, fg_psi, fg_exp_l, tmpEXP1, tmp1, l_fg and lj_fg
 * from the expansion site. */
1024 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_FG_PSI \
1025  for (t2 = 0; t2 < ths->d; t2++) \
1026  { \
1027  fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/((R)ths->n[t2])),t2)); \
1028  \
1029  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2]) \
1030  /ths->b[t2]); \
1031  tmp1 = K(1.0); \
1032  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
1033  { \
1034  tmp1 *= tmpEXP1; \
1035  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
1036  } \
1037  }
/* UPDATE refreshes phi_prod / ll_plain with values read from fg_psi; the
 * expansion already ends in ';'. */
1038 #define MACRO_B_openmp_A_COMPUTE_UPDATE_with_FG_PSI \
1039  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1040 
/* Hooks used by MACRO_B_openmp_A_COMPUTE(with_PRE_LIN_PSI).
 * BEFORE_LOOP builds the per-node table fg_psi by linear interpolation in
 * the table ths->psi (K+1 entries per dimension): y[t2] is the position of
 * the node in table units, ip_u its integer part and ip_w the fractional
 * part; each entry mixes the two neighbouring table values with weights
 * (1-ip_w) and ip_w. ABS() folds negative table positions back onto
 * non-negative indices.
 * Relies on j, u, o, t2, y, ip_u, ip_w, ip_s, fg_psi, l_fg and lj_fg from
 * the expansion site. */
1041 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_PRE_LIN_PSI \
1042  for (t2 = 0; t2 < ths->d; t2++) \
1043  { \
1044  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2]) \
1045  * ((R)ths->K))/(ths->m+2); \
1046  ip_u = LRINT(FLOOR(y[t2])); \
1047  ip_w = y[t2]-ip_u; \
1048  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
1049  { \
1050  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
1051  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
1052  * (ip_w); \
1053  } \
1054  }
/* UPDATE refreshes phi_prod / ll_plain with values read from fg_psi; the
 * expansion already ends in ';'. */
1055 #define MACRO_B_openmp_A_COMPUTE_UPDATE_with_PRE_LIN_PSI \
1056  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1057 
/* Hooks used by MACRO_B_openmp_A_COMPUTE(without_PRE_PSI).
 * BEFORE_LOOP fills the per-node buffer psij_const with PHI evaluated at
 * x_j - (lj_t + u[t2])/n[t2] for all 2*m+2 offsets of every dimension.
 * Relies on j, u, t2 and psij_const from the expansion site. */
1058 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_without_PRE_PSI \
1059  for (t2 = 0; t2 < ths->d; t2++) \
1060  { \
1061  INT lj_t; \
1062  for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \
1063  psij_const[t2 * (2*ths->m+2) + lj_t] = PHI(ths->n[t2], ths->x[j*ths->d+t2] \
1064  - ((R) (lj_t+u[t2]))/((R)ths->n[t2]), t2); \
1065  }
/* UPDATE refreshes phi_prod / ll_plain with values read from psij_const;
 * the expansion already ends in ';'. */
1066 #define MACRO_B_openmp_A_COMPUTE_UPDATE_without_PRE_PSI \
1067  MACRO_update_phi_prod_ll_plain(without_PRE_PSI_improved);
1068 
/* Body of one iteration of the OpenMP node loop of the forward B step:
 * accumulates into ths->f[j] the contributions of all grid points in the
 * window of node j.
 *
 * whichone names the precomputation variant; it selects
 *   MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_<whichone>  per-node preparation,
 *   MACRO_COMPUTE_<whichone>                         window value used by
 *                                                    the unrolled loops,
 *   MACRO_B_openmp_A_COMPUTE_UPDATE_<whichone>       update used by the
 *                                                    generic loop.
 *
 * All working arrays (u, o, lj, ll_plain, phi_prod, and l_all via
 * MACRO_init_uo_l_lj_t) are declared inside the block, so every iteration
 * has its own copies. The node index j is taken from ths->index_x when
 * NFFT_SORT_NODES is set and equals k otherwise.
 *
 * For d == 4 and d == 5 the window is walked with explicitly nested loops
 * of 2*m+2 iterations per dimension; for any other d a single loop of
 * lprod iterations is used together with MACRO_count_uo_l_lj_t.
 *
 * Relies on ths, k, t, t2 and lprod (plus whatever the selected BEFORE_LOOP
 * hook needs) from the expansion site. */
1069 #define MACRO_B_openmp_A_COMPUTE(whichone) \
1070 { \
1071  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
1072  INT l_L; /* index one row of B */ \
1073  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
1074  INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
1075  R phi_prod[ths->d+1]; /* postfix product of PHI */ \
1076  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
1077  \
1078  phi_prod[0] = K(1.0); \
1079  ll_plain[0] = 0; \
1080  \
1081  MACRO_init_uo_l_lj_t; \
1082  \
1083  MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_ ##whichone \
1084  \
1085  if (ths->d == 4) \
1086  { \
1087  INT l0, l1, l2, l3; \
1088  for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1089  { \
1090  lj[0] = l0; \
1091  t2 = 0; \
1092  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1093  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1094  for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1095  { \
1096  lj[1] = l1; \
1097  t2 = 1; \
1098  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1099  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1100  for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1101  { \
1102  lj[2] = l2; \
1103  t2 = 2; \
1104  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1105  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1106  for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1107  { \
1108  lj[3] = l3; \
1109  t2 = 3; \
1110  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1111  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1112  \
1113  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \
1114  } \
1115  } \
1116  } \
1117  } \
1118  } /* if(d==4) */ \
1119  else if (ths->d == 5) \
1120  { \
1121  INT l0, l1, l2, l3, l4; \
1122  for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1123  { \
1124  lj[0] = l0; \
1125  t2 = 0; \
1126  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1127  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1128  for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1129  { \
1130  lj[1] = l1; \
1131  t2 = 1; \
1132  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1133  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1134  for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1135  { \
1136  lj[2] = l2; \
1137  t2 = 2; \
1138  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1139  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1140  for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1141  { \
1142  lj[3] = l3; \
1143  t2 = 3; \
1144  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1145  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1146  for (l4 = 0; l4 < 2*ths->m+2; l4++) \
1147  { \
1148  lj[4] = l4; \
1149  t2 = 4; \
1150  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1151  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1152  \
1153  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \
1154  } \
1155  } \
1156  } \
1157  } \
1158  } \
1159  } /* if(d==5) */ \
1160  else { \
1161  for (l_L = 0; l_L < lprod; l_L++) \
1162  { \
1163  MACRO_B_openmp_A_COMPUTE_UPDATE_ ##whichone \
1164  \
1165  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \
1166  \
1167  MACRO_count_uo_l_lj_t; \
1168  } /* for(l_L) */ \
1169  } \
1170 }
1171 
/*
 * OpenMP implementation of the (non-adjoint) multiplication with B.
 *
 * Clears ths->f and then, for every node index k, accumulates weighted
 * entries of ths->g into ths->f[j].  Which weights are used depends on the
 * precomputation flag found in ths->flags; the flags are tested in the order
 * PRE_FULL_PSI, PRE_PSI, PRE_FG_PSI, FG_PSI, PRE_LIN_PSI, and the first
 * matching branch returns after its parallel loop.  Without any of these
 * flags the window is evaluated on the fly.
 *
 * Except for PRE_FULL_PSI, the per-node work is the body of
 * MACRO_B_openmp_A_COMPUTE (defined above this function); the locals declared
 * in each loop body are the ones that macro variant refers to.
 */
1172 static inline void B_openmp_A (X(plan) *ths)
1173 {
1174  INT lprod; /* 'regular bandwidth' of matrix B */
1175  INT k;
1176 
1177  memset(ths->f, 0, ths->M_total * sizeof(C));
1178 
 /* lprod = (2*m+2)^d: number of entries of g combined for one node */
1179  for (k = 0, lprod = 1; k < ths->d; k++)
1180  lprod *= (2*ths->m+2);
1181 
 /* fully precomputed: lprod weights (psi) and g indices (psi_index_g) per node */
1182  if (ths->flags & PRE_FULL_PSI)
1183  {
1184  #pragma omp parallel for default(shared) private(k)
1185  for (k = 0; k < ths->M_total; k++)
1186  {
1187  INT l;
 /* with NFFT_SORT_NODES the nodes are visited in the order stored in index_x */
1188  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
 /* f was already cleared by the memset above; this resets the entry again */
1189  ths->f[j] = K(0.0);
1190  for (l = 0; l < lprod; l++)
1191  ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
1192  }
1193  return;
1194  }
1195 
1196  if (ths->flags & PRE_PSI)
1197  {
1198  #pragma omp parallel for default(shared) private(k)
1199  for (k = 0; k < ths->M_total; k++)
1200  {
1201  INT t, t2; /* index dimensions */
1202  MACRO_B_openmp_A_COMPUTE(with_PRE_PSI);
1203  } /* for(j) */
1204  return;
1205  } /* if(PRE_PSI) */
1206 
1207  if (ths->flags & PRE_FG_PSI)
1208  {
1209  INT t, t2; /* index dimensions */
 /* shared table, filled once by MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI below */
1210  R fg_exp_l[ths->d][2*ths->m+2];
1211 
1212  MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI
1213 
 /* t and t2 live outside the loop here, hence they are listed as private */
1214  #pragma omp parallel for default(shared) private(k,t,t2)
1215  for (k = 0; k < ths->M_total; k++)
1216  {
1217  R fg_psi[ths->d][2*ths->m+2];
1218  R tmpEXP1, tmp1;
1219  INT l_fg,lj_fg;
1220 
1221  MACRO_B_openmp_A_COMPUTE(with_PRE_FG_PSI);
1222  } /* for(j) */
1223  return;
1224  } /* if(PRE_FG_PSI) */
1225 
1226  if (ths->flags & FG_PSI)
1227  {
1228  INT t, t2; /* index dimensions */
1229  R fg_exp_l[ths->d][2*ths->m+2];
1230 
 /* sort() is called in this and the following branches, not in the ones above */
1231  sort(ths);
1232 
1233  MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI
1234 
1235  #pragma omp parallel for default(shared) private(k,t,t2)
1236  for (k = 0; k < ths->M_total; k++)
1237  {
1238  R fg_psi[ths->d][2*ths->m+2];
1239  R tmpEXP1, tmp1;
1240  INT l_fg,lj_fg;
1241 
1242  MACRO_B_openmp_A_COMPUTE(with_FG_PSI);
1243  } /* for(j) */
1244  return;
1245  } /* if(FG_PSI) */
1246 
1247  if (ths->flags & PRE_LIN_PSI)
1248  {
1249  sort(ths);
1250 
1251  #pragma omp parallel for default(shared) private(k)
1252  for (k = 0; k<ths->M_total; k++)
1253  {
1254  INT t, t2; /* index dimensions */
1255  R y[ths->d];
1256  R fg_psi[ths->d][2*ths->m+2];
1257  INT l_fg,lj_fg;
1258  R ip_w;
1259  INT ip_u;
1260  INT ip_s = ths->K/(ths->m+2);
1261 
1262  MACRO_B_openmp_A_COMPUTE(with_PRE_LIN_PSI);
1263  } /* for(j) */
1264  return;
1265  } /* if(PRE_LIN_PSI) */
1266 
1267  /* no precomputed psi at all */
1268  sort(ths);
1269 
1270  #pragma omp parallel for default(shared) private(k)
1271  for (k = 0; k < ths->M_total; k++)
1272  {
1273  INT t, t2; /* index dimensions */
 /* per-thread scratch: (2*m+2) window values for each of the d dimensions */
1274  R psij_const[ths->d * (2*ths->m+2)];
1275 
1276  MACRO_B_openmp_A_COMPUTE(without_PRE_PSI);
1277  } /* for(j) */
1278 }
1279 #endif
1280 
1281 static void B_A(X(plan) *ths)
1282 {
1283 #ifdef _OPENMP
1284  B_openmp_A(ths);
1285 #else
1286  B_serial_A(ths);
1287 #endif
1288 }
1289 
1290 #ifdef _OPENMP
1291 
1306 static inline INT index_x_binary_search(const INT *ar_x, const INT len, const INT key)
1307 {
1308  INT left = 0, right = len - 1;
1309 
1310  if (len == 1)
1311  return 0;
1312 
1313  while (left < right - 1)
1314  {
1315  INT i = (left + right) / 2;
1316  if (ar_x[2*i] >= key)
1317  right = i;
1318  else if (ar_x[2*i] < key)
1319  left = i;
1320  }
1321 
1322  if (ar_x[2*left] < key && left != len-1)
1323  return left+1;
1324 
1325  return left;
1326 }
1327 #endif
1328 
1329 #ifdef _OPENMP
1330 
1345 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
1346  INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b, const INT d,
1347  const INT *n, const INT m)
1348 {
1349  const INT n0 = n[0];
1350  INT k;
1351  INT nthreads = omp_get_num_threads();
1352  INT nthreads_used = MIN(nthreads, n0);
1353  INT size_per_thread = n0 / nthreads_used;
1354  INT size_left = n0 - size_per_thread * nthreads_used;
1355  INT size_g[nthreads_used];
1356  INT offset_g[nthreads_used];
1357  INT my_id = omp_get_thread_num();
1358  INT n_prod_rest = 1;
1359 
1360  for (k = 1; k < d; k++)
1361  n_prod_rest *= n[k];
1362 
1363  *min_u_a = -1;
1364  *max_u_a = -1;
1365  *min_u_b = -1;
1366  *max_u_b = -1;
1367  *my_u0 = -1;
1368  *my_o0 = -1;
1369 
1370  if (my_id < nthreads_used)
1371  {
1372  const INT m22 = 2 * m + 2;
1373 
1374  offset_g[0] = 0;
1375  for (k = 0; k < nthreads_used; k++)
1376  {
1377  if (k > 0)
1378  offset_g[k] = offset_g[k-1] + size_g[k-1];
1379  size_g[k] = size_per_thread;
1380  if (size_left > 0)
1381  {
1382  size_g[k]++;
1383  size_left--;
1384  }
1385  }
1386 
1387  *my_u0 = offset_g[my_id];
1388  *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1389 
1390  if (nthreads_used > 1)
1391  {
1392  *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1393  *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1394  }
1395  else
1396  {
1397  *min_u_a = 0;
1398  *max_u_a = n_prod_rest * n0 - 1;
1399  }
1400 
1401  if (*min_u_a < 0)
1402  {
1403  *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1404  *max_u_b = n_prod_rest * n0 - 1;
1405  *min_u_a = 0;
1406  }
1407 
1408  if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1409  {
1410  *max_u_a = *max_u_b;
1411  *min_u_b = -1;
1412  *max_u_b = -1;
1413  }
1414 #ifdef OMP_ASSERT
1415  assert(*min_u_a <= *max_u_a);
1416  assert(*min_u_b <= *max_u_b);
1417  assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1418 #endif
1419  }
1420 }
1421 #endif
1422 
/*
 * Adjoint multiplication with B for fully precomputed window values.
 *
 * For every node j and every l in [0, lprod), lprod = (2*m+2)^d, performs
 *   g[psi_index_g[j*lprod + l]] += psi[j*lprod + l] * f[j].
 * g is only accumulated into, never cleared, by this function.
 *
 * Three execution paths:
 *  - OpenMP with NFFT_OMP_BLOCKWISE_ADJOINT set in flags: every thread gets
 *    its slab and index ranges from nfft_adjoint_B_omp_blockwise_init(),
 *    locates the matching nodes in the sorted array index_x and only applies
 *    those groups of updates whose first g index lies in its own slab; the
 *    updates are plain +=.
 *  - OpenMP without that flag: parallel loop over all nodes, the real and the
 *    imaginary part of each g entry are updated with separate omp atomics.
 *  - no OpenMP: plain serial loop (n is unused in this configuration).
 *
 * index_x holds pairs (sort key, node index); it is read in the blockwise
 * path and, in the other paths, only when NFFT_SORT_NODES is set in flags.
 */
1431 static void nfft_adjoint_B_compute_full_psi(C *g, const INT *psi_index_g,
1432  const R *psi, const C *f, const INT M, const INT d, const INT *n,
1433  const INT m, const unsigned flags, const INT *index_x)
1434 {
1435  INT k;
1436  INT lprod;
1437 #ifdef _OPENMP
1438  INT lprod_m1;
1439 #endif
1440 #ifndef _OPENMP
1441  UNUSED(n);
1442 #endif
1443  {
1444  INT t;
 /* lprod = (2*m+2)^d */
1445  for(t = 0, lprod = 1; t < d; t++)
1446  lprod *= 2 * m + 2;
1447  }
1448 #ifdef _OPENMP
 /* (2*m+2)^(d-1): number of entries that share one offset l0 in dimension 0 */
1449  lprod_m1 = lprod / (2 * m + 2);
1450 #endif
1451 
1452 #ifdef _OPENMP
1453  if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1454  {
1455  #pragma omp parallel private(k)
1456  {
1457  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1458  const INT *ar_x = index_x;
1459  INT n_prod_rest = 1;
1460 
 /* n_prod_rest = n[1] * ... * n[d-1], the length of one row in dimension 0 */
1461  for (k = 1; k < d; k++)
1462  n_prod_rest *= n[k];
1463 
1464  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1465 
 /* first range of sort keys handled by this thread (-1: thread has no work) */
1466  if (min_u_a != -1)
1467  {
1468  k = index_x_binary_search(ar_x, M, min_u_a);
1469 #ifdef OMP_ASSERT
1470  assert(ar_x[2*k] >= min_u_a || k == M-1);
1471  if (k > 0)
1472  assert(ar_x[2*k-2] < min_u_a);
1473 #endif
1474  while (k < M)
1475  {
1476  INT l0, lrest;
1477  INT u_prod = ar_x[2*k];
1478  INT j = ar_x[2*k+1];
1479 
 /* keys are sorted: the first key outside the range ends the scan */
1480  if (u_prod < min_u_a || u_prod > max_u_a)
1481  break;
1482 
1483  for (l0 = 0; l0 < 2 * m + 2; l0++)
1484  {
1485  const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1486 
 /* skip the whole group if its first g index is outside this thread's slab */
1487  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1488  continue;
1489 
1490  for (lrest = 0; lrest < lprod_m1; lrest++)
1491  {
1492  const INT l = l0 * lprod_m1 + lrest;
1493  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1494  }
1495  }
1496 
1497  k++;
1498  }
1499  }
1500 
 /* optional second range, processed exactly like the first one */
1501  if (min_u_b != -1)
1502  {
1503  k = index_x_binary_search(ar_x, M, min_u_b);
1504 #ifdef OMP_ASSERT
1505  assert(ar_x[2*k] >= min_u_b || k == M-1);
1506  if (k > 0)
1507  assert(ar_x[2*k-2] < min_u_b);
1508 #endif
1509  while (k < M)
1510  {
1511  INT l0, lrest;
1512  INT u_prod = ar_x[2*k];
1513  INT j = ar_x[2*k+1];
1514 
1515  if (u_prod < min_u_b || u_prod > max_u_b)
1516  break;
1517 
1518  for (l0 = 0; l0 < 2 * m + 2; l0++)
1519  {
1520  const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1521 
1522  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1523  continue;
1524  for (lrest = 0; lrest < lprod_m1; lrest++)
1525  {
1526  const INT l = l0 * lprod_m1 + lrest;
1527  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1528  }
1529  }
1530 
1531  k++;
1532  }
1533  }
1534  } /* omp parallel */
1535  return;
1536  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */
1537 #endif
1538 
 /* generic path: serial loop, or parallel loop with atomic updates under OpenMP */
1539 #ifdef _OPENMP
1540  #pragma omp parallel for default(shared) private(k)
1541 #endif
1542  for (k = 0; k < M; k++)
1543  {
1544  INT l;
1545  INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1546 
1547  for (l = 0; l < lprod; l++)
1548  {
1549 #ifdef _OPENMP
1550  C val = psi[j * lprod + l] * f[j];
1551  C *gref = g + psi_index_g[j * lprod + l];
 /* view the complex entry as two reals so each part can be updated atomically */
1552  R *gref_real = (R*) gref;
1553 
1554  #pragma omp atomic
1555  gref_real[0] += CREAL(val);
1556 
1557  #pragma omp atomic
1558  gref_real[1] += CIMAG(val);
1559 #else
1560  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1561 #endif
1562  }
1563  }
1564 }
1565 
/*
 * Builds without OpenMP only: instantiate MACRO_B for the argument T.
 * MACRO_B is defined outside this part of the file; presumably this
 * expansion provides B_serial_T(), which B_T() calls when _OPENMP is not
 * defined -- TODO confirm against the macro definition.
 */
1566 #ifndef _OPENMP
1567 MACRO_B(T)
1568 #endif
1569 
1570 
1571 #ifdef _OPENMP
1572 
/*
 * Debug checks for the start index k returned by index_x_binary_search() in
 * the blockwise adjoint.  Variant A checks against min_u_a, variant B
 * against min_u_b: entry k must have a key >= the lower bound (or be the
 * last entry), and the entry before k must have a smaller key.
 * Both macros expect ar_x, k and M (plus the respective bound) to be visible
 * at the point of expansion.  Without OMP_ASSERT they expand to nothing.
 */
1573 #ifdef OMP_ASSERT
1574 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
1575 { \
1576  assert(ar_x[2*k] >= min_u_a || k == M-1); \
1577  if (k > 0) \
1578  assert(ar_x[2*k-2] < min_u_a); \
1579 }
1580 #else
1581 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A
1582 #endif
1583 
1584 #ifdef OMP_ASSERT
1585 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
1586 { \
1587  assert(ar_x[2*k] >= min_u_b || k == M-1); \
1588  if (k > 0) \
1589  assert(ar_x[2*k-2] < min_u_b); \
1590 }
1591 #else
1592 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B
1593 #endif
1594 
/*
 * Per-variant building blocks for the OpenMP adjoint macros
 * MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE and MACRO_adjoint_nd_B_OMP_COMPUTE.
 * For each precomputation variant there is
 *   ..._BEFORE_LOOP_<variant>  declarations and per-node setup that run once
 *                              before the loop over the window entries, and
 *   ..._UPDATE_<variant>       the update of phi_prod / ll_plain used in the
 *                              generic-dimension loop; it forwards to
 *                              MACRO_update_phi_prod_ll_plain (defined
 *                              elsewhere in this file).
 * The fragments refer to ths, j, t2, u and o, which must exist where the
 * macros are expanded.
 */

/* PRE_PSI: nothing to prepare before the loop. */
1595 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_PRE_PSI
1596 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_PRE_PSI \
1597  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1598 
/*
 * PRE_FG_PSI: ths->psi holds two values per node and dimension.  Row t2 of
 * fg_psi is rebuilt from them as
 *   fg_psi[t2][l] = psi[2*(j*d+t2)] * psi[2*(j*d+t2)+1]^l * fg_exp_l[t2][l].
 * fg_exp_l must have been filled by the code that expands the macro.
 * The update step reuses the with_FG_PSI variant, which reads fg_psi.
 */
1599 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_PRE_FG_PSI \
1600  R fg_psi[ths->d][2*ths->m+2]; \
1601  R tmpEXP1, tmp1; \
1602  INT l_fg,lj_fg; \
1603  for (t2 = 0; t2 < ths->d; t2++) \
1604  { \
1605  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
1606  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
1607  tmp1 = K(1.0); \
1608  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
1609  { \
1610  tmp1 *= tmpEXP1; \
1611  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
1612  } \
1613  }
1614 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_PRE_FG_PSI \
1615  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1616 
/*
 * FG_PSI: same recurrence as PRE_FG_PSI, but the two per-node factors are
 * computed here: the first entry through PHI and the ratio through EXP of
 * 2*(n[t2]*x - u[t2]) / b[t2].
 */
1617 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_FG_PSI \
1618  R fg_psi[ths->d][2*ths->m+2]; \
1619  R tmpEXP1, tmp1; \
1620  INT l_fg,lj_fg; \
1621  for (t2 = 0; t2 < ths->d; t2++) \
1622  { \
1623  fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/((R)ths->n[t2])),t2)); \
1624  \
1625  tmpEXP1 = EXP(K(2.0)*((R)ths->n[t2]*ths->x[j*ths->d+t2] - (R)u[t2]) \
1626  /ths->b[t2]); \
1627  tmp1 = K(1.0); \
1628  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
1629  { \
1630  tmp1 *= tmpEXP1; \
1631  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
1632  } \
1633  }
1634 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_FG_PSI \
1635  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1636 
/*
 * PRE_LIN_PSI: every fg_psi entry is a linear interpolation, with weight
 * ip_w, between two neighbouring entries of the lookup table ths->psi
 * (K+1 entries per dimension).  The update step again reads fg_psi through
 * the with_FG_PSI variant.
 */
1637 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_PRE_LIN_PSI \
1638  R y[ths->d]; \
1639  R fg_psi[ths->d][2*ths->m+2]; \
1640  INT l_fg,lj_fg; \
1641  R ip_w; \
1642  INT ip_u; \
1643  INT ip_s = ths->K/(ths->m+2); \
1644  for (t2 = 0; t2 < ths->d; t2++) \
1645  { \
1646  y[t2] = ((((R)ths->n[t2])*ths->x[j*ths->d+t2]-(R)u[t2]) \
1647  * ((R)ths->K))/((R)ths->m+2); \
1648  ip_u = LRINT(FLOOR(y[t2])); \
1649  ip_w = y[t2]-ip_u; \
1650  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
1651  { \
1652  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
1653  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
1654  * (ip_w); \
1655  } \
1656  }
1657 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_PRE_LIN_PSI \
1658  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1659 
/*
 * No precomputation: evaluate PHI for all 2*m+2 window positions of every
 * dimension once per node and cache the values in psij_const.
 */
1660 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_without_PRE_PSI \
1661  R psij_const[ths->d * (2*ths->m+2)]; \
1662  for (t2 = 0; t2 < ths->d; t2++) \
1663  { \
1664  INT lj_t; \
1665  for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \
1666  psij_const[t2 * (2*ths->m+2) + lj_t] = PHI(ths->n[t2], ths->x[j*ths->d+t2] \
1667  - ((R) (lj_t+u[t2]))/((R)ths->n[t2]), t2); \
1668  }
1669 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_without_PRE_PSI \
1670  MACRO_update_phi_prod_ll_plain(without_PRE_PSI_improved);
1671 
/*
 * Blockwise adjoint update for one node j (statement macro, no atomics).
 *
 * Adds phi_prod[d] * ths->f[j] to ths->g[ll_plain[d]] for every window entry
 * of node j whose grid index in dimension 0, l_all[lj[0]], lies inside the
 * calling thread's slab [my_u0, my_o0]; all other entries are skipped.
 *
 * Expects in scope: ths, j, lprod, lprodrest, my_u0, my_o0.  l_all is not
 * declared here; it presumably comes from MACRO_init_uo_l_lj_t, which is
 * defined elsewhere in this file -- TODO confirm.  `whichone` selects the
 * BEFORE_LOOP / UPDATE / MACRO_COMPUTE_ variant.
 *
 * d == 4 and d == 5 use explicitly nested loops in which the slab test is
 * done once per l0.  Any other d uses a single loop over l_L: when t == 0
 * and the current dimension-0 index is outside the slab, lj[0] is advanced
 * and l_L jumps ahead by lprodrest, i.e. by all entries sharing that lj[0].
 * t is presumably maintained by MACRO_init_uo_l_lj_t / MACRO_count_uo_l_lj_t.
 */
1672 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE(whichone) \
1673 { \
1674  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
1675  INT t, t2; /* index dimensions */ \
1676  INT l_L; /* index one row of B */ \
1677  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
1678  INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
1679  R phi_prod[ths->d+1]; /* postfix product of PHI */ \
1680  \
1681  phi_prod[0] = K(1.0); \
1682  ll_plain[0] = 0; \
1683  \
1684  MACRO_init_uo_l_lj_t; \
1685  \
1686  MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_ ##whichone \
1687  \
1688  if (ths->d == 4) \
1689  { \
1690  INT l0, l1, l2, l3; \
1691  for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1692  { \
1693  lj[0] = l0; \
1694  t2 = 0; \
1695  if (l_all[lj[0]] < my_u0 || l_all[lj[0]] > my_o0) \
1696  continue; \
1697  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1698  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1699  for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1700  { \
1701  lj[1] = l1; \
1702  t2 = 1; \
1703  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1704  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1705  for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1706  { \
1707  lj[2] = l2; \
1708  t2 = 2; \
1709  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1710  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1711  for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1712  { \
1713  lj[3] = l3; \
1714  t2 = 3; \
1715  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1716  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1717  \
1718  ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \
1719  } \
1720  } \
1721  } \
1722  } \
1723  } /* if(d==4) */ \
1724  else if (ths->d == 5) \
1725  { \
1726  INT l0, l1, l2, l3, l4; \
1727  for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1728  { \
1729  lj[0] = l0; \
1730  t2 = 0; \
1731  if (l_all[lj[0]] < my_u0 || l_all[lj[0]] > my_o0) \
1732  continue; \
1733  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1734  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1735  for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1736  { \
1737  lj[1] = l1; \
1738  t2 = 1; \
1739  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1740  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1741  for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1742  { \
1743  lj[2] = l2; \
1744  t2 = 2; \
1745  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1746  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1747  for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1748  { \
1749  lj[3] = l3; \
1750  t2 = 3; \
1751  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1752  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1753  for (l4 = 0; l4 < 2*ths->m+2; l4++) \
1754  { \
1755  lj[4] = l4; \
1756  t2 = 4; \
1757  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1758  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1759  \
1760  ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \
1761  } \
1762  } \
1763  } \
1764  } \
1765  } \
1766  } /* if(d==5) */ \
1767  else { \
1768  l_L = 0; \
1769  while (l_L < lprod) \
1770  { \
1771  if (t == 0 && (l_all[lj[0]] < my_u0 || l_all[lj[0]] > my_o0)) \
1772  { \
1773  lj[0]++; \
1774  l_L += lprodrest; \
1775  continue; \
1776  } \
1777  MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_ ##whichone \
1778  ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \
1779  MACRO_count_uo_l_lj_t; \
1780  l_L++; \
1781  } /* for(l_L) */ \
1782  } \
1783 }
1784 
/*
 * Blockwise OpenMP adjoint for the variant `whichone`.
 *
 * Does nothing unless NFFT_OMP_BLOCKWISE_ADJOINT is set in ths->flags.  If it
 * is set, an OpenMP parallel region is opened in which every thread
 *   1. obtains its slab [my_u0, my_o0] and up to two ranges of sort keys
 *      from nfft_adjoint_B_omp_blockwise_init(),
 *   2. locates the first node of each range in the sorted array ths->index_x
 *      by binary search, and
 *   3. runs MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE for every node whose key
 *      lies in the range,
 * and afterwards the macro RETURNS from the enclosing function.
 *
 * Expects in scope: ths, lprod and an INT variable k (made thread-private
 * here).  Both ranges use that same private k; no inner variable of the same
 * name is declared.
 */
#define MACRO_adjoint_nd_B_OMP_BLOCKWISE(whichone) \
{ \
  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
  { \
    /* lprodrest = (2*m+2)^(d-1): window entries per fixed lj[0] */ \
    INT lprodrest = 1; \
    for (k = 1; k < ths->d; k++) \
      lprodrest *= (2*ths->m+2); \
    _Pragma("omp parallel private(k)") \
    { \
      INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
      INT *ar_x = ths->index_x; \
      \
      nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
          &min_u_b, &max_u_b, ths->d, ths->n, ths->m); \
      \
      /* first key range (-1: this thread has no work) */ \
      if (min_u_a != -1) \
      { \
        k = index_x_binary_search(ar_x, ths->M_total, min_u_a); \
        \
        MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
        \
        while (k < ths->M_total) \
        { \
          INT u_prod = ar_x[2*k]; \
          INT j = ar_x[2*k+1]; \
          \
          if (u_prod < min_u_a || u_prod > max_u_a) \
            break; \
          \
          MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE(whichone) \
          \
          k++; \
        } \
      } \
      \
      /* optional second key range, handled like the first */ \
      if (min_u_b != -1) \
      { \
        k = index_x_binary_search(ar_x, ths->M_total, min_u_b); \
        \
        MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
        \
        while (k < ths->M_total) \
        { \
          INT u_prod = ar_x[2*k]; \
          INT j = ar_x[2*k+1]; \
          \
          if (u_prod < min_u_b || u_prod > max_u_b) \
            break; \
          \
          MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE(whichone) \
          \
          k++; \
        } \
      } \
    } /* omp parallel */ \
    return; \
  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
}
1843 
/*
 * Adjoint update for one node j with atomic accumulation (statement macro).
 *
 * For every window entry of node j, adds phi_prod[d] * ths->f[j] to
 * ths->g[ll_plain[d]].  The real and the imaginary part of the g entry are
 * updated by two separate "omp atomic" statements through an R* view of the
 * complex value, so the macro can be used inside a parallel loop over nodes
 * in which different nodes may hit the same g entry.
 *
 * Expects in scope: ths, j, lprod and the index variables t and t2 (unlike
 * the blockwise macro, this one does not declare them).  l_all presumably
 * comes from MACRO_init_uo_l_lj_t, defined elsewhere -- TODO confirm.
 * d == 4 and d == 5 use explicitly nested loops, any other d the generic
 * loop over l_L driven by MACRO_count_uo_l_lj_t.
 */
1844 #define MACRO_adjoint_nd_B_OMP_COMPUTE(whichone) \
1845 { \
1846  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
1847  INT l_L; /* index one row of B */ \
1848  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
1849  INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
1850  R phi_prod[ths->d+1]; /* postfix product of PHI */ \
1851  \
1852  phi_prod[0] = K(1.0); \
1853  ll_plain[0] = 0; \
1854  \
1855  MACRO_init_uo_l_lj_t; \
1856  \
1857  MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_ ## whichone \
1858  \
1859  if (ths->d == 4) \
1860  { \
1861  INT l0, l1, l2, l3; \
1862  for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1863  { \
1864  lj[0] = l0; \
1865  t2 = 0; \
1866  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1867  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1868  for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1869  { \
1870  lj[1] = l1; \
1871  t2 = 1; \
1872  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1873  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1874  for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1875  { \
1876  lj[2] = l2; \
1877  t2 = 2; \
1878  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1879  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1880  for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1881  { \
1882  lj[3] = l3; \
1883  t2 = 3; \
1884  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1885  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1886  \
1887  C *lhs = ths->g + ll_plain[ths->d]; \
1888  R *lhs_real = (R*)lhs; \
1889  C val = phi_prod[ths->d] * ths->f[j]; \
1890  \
1891  _Pragma("omp atomic") \
1892  lhs_real[0] += CREAL(val); \
1893  \
1894  _Pragma("omp atomic") \
1895  lhs_real[1] += CIMAG(val); \
1896  } \
1897  } \
1898  } \
1899  } \
1900  } /* if(d==4) */ \
1901  else if (ths->d == 5) \
1902  { \
1903  INT l0, l1, l2, l3, l4; \
1904  for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1905  { \
1906  lj[0] = l0; \
1907  t2 = 0; \
1908  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1909  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1910  for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1911  { \
1912  lj[1] = l1; \
1913  t2 = 1; \
1914  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1915  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1916  for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1917  { \
1918  lj[2] = l2; \
1919  t2 = 2; \
1920  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1921  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1922  for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1923  { \
1924  lj[3] = l3; \
1925  t2 = 3; \
1926  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1927  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1928  for (l4 = 0; l4 < 2*ths->m+2; l4++) \
1929  { \
1930  lj[4] = l4; \
1931  t2 = 4; \
1932  phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1933  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1934  \
1935  C *lhs = ths->g + ll_plain[ths->d]; \
1936  R *lhs_real = (R*)lhs; \
1937  C val = phi_prod[ths->d] * ths->f[j]; \
1938  \
1939  _Pragma("omp atomic") \
1940  lhs_real[0] += CREAL(val); \
1941  \
1942  _Pragma("omp atomic") \
1943  lhs_real[1] += CIMAG(val); \
1944  } \
1945  } \
1946  } \
1947  } \
1948  } \
1949  } /* if(d==5) */ \
1950  else { \
1951  for (l_L = 0; l_L < lprod; l_L++) \
1952  { \
1953  C *lhs; \
1954  R *lhs_real; \
1955  C val; \
1956  \
1957  MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_ ## whichone \
1958  \
1959  lhs = ths->g + ll_plain[ths->d]; \
1960  lhs_real = (R*)lhs; \
1961  val = phi_prod[ths->d] * ths->f[j]; \
1962  \
1963  _Pragma("omp atomic") \
1964  lhs_real[0] += CREAL(val); \
1965  \
1966  _Pragma("omp atomic") \
1967  lhs_real[1] += CIMAG(val); \
1968  \
1969  MACRO_count_uo_l_lj_t; \
1970  } /* for(l_L) */ \
1971  } \
1972 }
1973 
1974 static inline void B_openmp_T(X(plan) *ths)
1975 {
1976  INT lprod; /* 'regular bandwidth' of matrix B */
1977  INT k;
1978 
1979  memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C));
1980 
1981  for (k = 0, lprod = 1; k < ths->d; k++)
1982  lprod *= (2*ths->m+2);
1983 
1984  if (ths->flags & PRE_FULL_PSI)
1985  {
1986  nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
1987  ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
1988  return;
1989  }
1990 
1991  if (ths->flags & PRE_PSI)
1992  {
1993  MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_PRE_PSI);
1994 
1995  #pragma omp parallel for default(shared) private(k)
1996  for (k = 0; k < ths->M_total; k++)
1997  {
1998  INT t, t2; /* index dimensions */ \
1999  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2000  MACRO_adjoint_nd_B_OMP_COMPUTE(with_PRE_PSI);
2001  } /* for(j) */
2002  return;
2003  } /* if(PRE_PSI) */
2004 
2005  if (ths->flags & PRE_FG_PSI)
2006  {
2007  INT t, t2; /* index dimensions */
2008  R fg_exp_l[ths->d][2*ths->m+2];
2009  for(t2 = 0; t2 < ths->d; t2++)
2010  {
2011  INT lj_fg;
2012  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
2013  R tmpEXP2sq = tmpEXP2*tmpEXP2;
2014  R tmp2 = K(1.0);
2015  R tmp3 = K(1.0);
2016  fg_exp_l[t2][0] = K(1.0);
2017  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
2018  {
2019  tmp3 = tmp2*tmpEXP2;
2020  tmp2 *= tmpEXP2sq;
2021  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
2022  }
2023  }
2024 
2025  MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_PRE_FG_PSI);
2026 
2027  #pragma omp parallel for default(shared) private(k,t,t2)
2028  for (k = 0; k < ths->M_total; k++)
2029  {
2030  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2031  MACRO_adjoint_nd_B_OMP_COMPUTE(with_PRE_FG_PSI);
2032  } /* for(j) */
2033  return;
2034  } /* if(PRE_FG_PSI) */
2035 
2036  if (ths->flags & FG_PSI)
2037  {
2038  INT t, t2; /* index dimensions */
2039  R fg_exp_l[ths->d][2*ths->m+2];
2040 
2041  sort(ths);
2042 
2043  for (t2 = 0; t2 < ths->d; t2++)
2044  {
2045  INT lj_fg;
2046  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
2047  R tmpEXP2sq = tmpEXP2*tmpEXP2;
2048  R tmp2 = K(1.0);
2049  R tmp3 = K(1.0);
2050  fg_exp_l[t2][0] = K(1.0);
2051  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
2052  {
2053  tmp3 = tmp2*tmpEXP2;
2054  tmp2 *= tmpEXP2sq;
2055  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
2056  }
2057  }
2058 
2059  MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_FG_PSI);
2060 
2061  #pragma omp parallel for default(shared) private(k,t,t2)
2062  for (k = 0; k < ths->M_total; k++)
2063  {
2064  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2065  MACRO_adjoint_nd_B_OMP_COMPUTE(with_FG_PSI);
2066  } /* for(j) */
2067  return;
2068  } /* if(FG_PSI) */
2069 
2070  if (ths->flags & PRE_LIN_PSI)
2071  {
2072  sort(ths);
2073 
2074  MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_PRE_LIN_PSI);
2075 
2076  #pragma omp parallel for default(shared) private(k)
2077  for (k = 0; k<ths->M_total; k++)
2078  {
2079  INT t, t2; /* index dimensions */
2080  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2081  MACRO_adjoint_nd_B_OMP_COMPUTE(with_PRE_LIN_PSI);
2082  } /* for(j) */
2083  return;
2084  } /* if(PRE_LIN_PSI) */
2085 
2086  /* no precomputed psi at all */
2087  sort(ths);
2088 
2089  MACRO_adjoint_nd_B_OMP_BLOCKWISE(without_PRE_PSI);
2090 
2091  #pragma omp parallel for default(shared) private(k)
2092  for (k = 0; k < ths->M_total; k++)
2093  {
2094  INT t, t2; /* index dimensions */
2095  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2096  MACRO_adjoint_nd_B_OMP_COMPUTE(without_PRE_PSI);
2097  } /* for(j) */
2098 }
2099 #endif
2100 
2101 static void B_T(X(plan) *ths)
2102 {
2103 #ifdef _OPENMP
2104  B_openmp_T(ths);
2105 #else
2106  B_serial_T(ths);
2107 #endif
2108 }
2109 
2110 /* ## specialized version for d=1 ########################################### */
2111 
2112 static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
2113 {
2114  const INT tmp2 = 2*m+2;
2115  INT l;
2116  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2117 
2118  fg_exp_b0 = EXP(K(-1.0)/b);
2119  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2120  fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
2121 
2122  for (l = 1; l < tmp2; l++)
2123  {
2124  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2125  fg_exp_b1 *= fg_exp_b0_sq;
2126  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2127  }
2128 }
2129 
2130 
2131 static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const,
2132  const R *xj, const INT n, const INT m)
2133 {
2134  INT u, o, l;
2135  const C *gj;
2136  const R *psij;
2137  psij = psij_const;
2138 
2139  uo2(&u, &o, *xj, n, m);
2140 
2141  if (u < o)
2142  {
2143  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
2144  (*fj) += (*psij++) * (*gj++);
2145  }
2146  else
2147  {
2148  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
2149  (*fj) += (*psij++) * (*gj++);
2150  for (l = 0, gj = g; l <= o; l++)
2151  (*fj) += (*psij++) * (*gj++);
2152  }
2153 }
2154 
2155 #ifndef _OPENMP
2156 static void nfft_adjoint_1d_compute_serial(const C *fj, C *g,
2157  const R *psij_const, const R *xj, const INT n, const INT m)
2158 {
2159  INT u,o,l;
2160  C *gj;
2161  const R *psij;
2162  psij = psij_const;
2163 
2164  uo2(&u,&o,*xj, n, m);
2165 
2166  if (u < o)
2167  {
2168  for (l = 0, gj = g+u; l <= 2*m+1; l++)
2169  (*gj++) += (*psij++) * (*fj);
2170  }
2171  else
2172  {
2173  for (l = 0, gj = g+u; l < 2*m+1-o; l++)
2174  (*gj++) += (*psij++) * (*fj);
2175  for (l = 0, gj = g; l <= o; l++)
2176  (*gj++) += (*psij++) * (*fj);
2177  }
2178 }
2179 #endif
2180 
2181 #ifdef _OPENMP
2182 /* adjoint NFFT one-dimensional case with OpenMP atomic operations */
2183 static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g,
2184  const R *psij_const, const R *xj, const INT n, const INT m)
2185 {
2186  INT u,o,l;
2187  C *gj;
2188  INT index_temp[2*m+2];
2189 
2190  uo2(&u,&o,*xj, n, m);
2191 
2192  for (l=0; l<=2*m+1; l++)
2193  index_temp[l] = (l+u)%n;
2194 
2195  for (l = 0, gj = g+u; l <= 2*m+1; l++)
2196  {
2197  INT i = index_temp[l];
2198  C *lhs = g+i;
2199  R *lhs_real = (R*)lhs;
2200  C val = psij_const[l] * f;
2201  #pragma omp atomic
2202  lhs_real[0] += CREAL(val);
2203 
2204  #pragma omp atomic
2205  lhs_real[1] += CIMAG(val);
2206  }
2207 }
2208 #endif
2209 
2210 #ifdef _OPENMP
2211 
2226 static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g,
2227  const R *psij_const, const R *xj, const INT n, const INT m,
2228  const INT my_u0, const INT my_o0)
2229 {
2230  INT ar_u,ar_o,l;
2231 
2232  uo2(&ar_u,&ar_o,*xj, n, m);
2233 
2234  if (ar_u < ar_o)
2235  {
2236  INT u = MAX(my_u0,ar_u);
2237  INT o = MIN(my_o0,ar_o);
2238  INT offset_psij = u-ar_u;
2239 #ifdef OMP_ASSERT
2240  assert(offset_psij >= 0);
2241  assert(o-u <= 2*m+1);
2242  assert(offset_psij+o-u <= 2*m+1);
2243 #endif
2244 
2245  for (l = 0; l <= o-u; l++)
2246  g[u+l] += psij_const[offset_psij+l] * f;
2247  }
2248  else
2249  {
2250  INT u = MAX(my_u0,ar_u);
2251  INT o = my_o0;
2252  INT offset_psij = u-ar_u;
2253 #ifdef OMP_ASSERT
2254  assert(offset_psij >= 0);
2255  assert(o-u <= 2*m+1);
2256  assert(offset_psij+o-u <= 2*m+1);
2257 #endif
2258 
2259  for (l = 0; l <= o-u; l++)
2260  g[u+l] += psij_const[offset_psij+l] * f;
2261 
2262  u = my_u0;
2263  o = MIN(my_o0,ar_o);
2264  offset_psij += my_u0-ar_u+n;
2265 
2266 #ifdef OMP_ASSERT
2267  if (u <= o)
2268  {
2269  assert(o-u <= 2*m+1);
2270  if (offset_psij+o-u > 2*m+1)
2271  {
2272  fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
2273  }
2274  assert(offset_psij+o-u <= 2*m+1);
2275  }
2276 #endif
2277  for (l = 0; l <= o-u; l++)
2278  g[u+l] += psij_const[offset_psij+l] * f;
2279  }
2280 }
2281 #endif
2282 
/*
 * Multiplication with B specialised for d = 1.
 *
 * For every node j, ths->f[j] is computed from 2*m+2 window values and the
 * grid ths->g.  The branch is chosen by the first matching flag in
 * ths->flags (PRE_FULL_PSI, PRE_PSI, PRE_FG_PSI, FG_PSI, PRE_LIN_PSI); the
 * final else branch evaluates the window directly.  Except for PRE_FULL_PSI,
 * each branch produces the window values of the node (psij_const) and hands
 * them to nfft_trafo_1d_compute(), which overwrites ths->f[j].
 *
 * With OpenMP the loop over the nodes is parallel in every branch; each
 * iteration writes only its own ths->f[j] and its own local arrays.
 * If NFFT_SORT_NODES is set, the nodes are visited in the order stored in
 * ths->index_x.
 */
2283 static void nfft_trafo_1d_B(X(plan) *ths)
2284 {
2285  const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
2286  const C *g = (C*)ths->g;
2287 
 /* fully precomputed: weights in psi and g indices in psi_index_g, m2p2 per node */
2288  if (ths->flags & PRE_FULL_PSI)
2289  {
2290  INT k;
2291 #ifdef _OPENMP
2292  #pragma omp parallel for default(shared) private(k)
2293 #endif
2294  for (k = 0; k < M; k++)
2295  {
2296  INT l;
2297  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2298  ths->f[j] = K(0.0);
2299  for (l = 0; l < m2p2; l++)
2300  ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
2301  }
2302  return;
2303  } /* if(PRE_FULL_PSI) */
2304 
 /* window values precomputed per node: 2*m+2 consecutive entries of psi */
2305  if (ths->flags & PRE_PSI)
2306  {
2307  INT k;
2308 #ifdef _OPENMP
2309  #pragma omp parallel for default(shared) private(k)
2310 #endif
2311  for (k = 0; k < M; k++)
2312  {
2313  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2314  nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
2315  &ths->x[j], n, m);
2316  }
2317  return;
2318  } /* if(PRE_PSI) */
2319 
 /* two precomputed values per node; the window is rebuilt as
    psi[2j] * psi[2j+1]^l * fg_exp_l[l] */
2320  if (ths->flags & PRE_FG_PSI)
2321  {
2322  INT k;
2323  R fg_exp_l[m2p2];
2324 
2325  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2326 
2327 #ifdef _OPENMP
2328  #pragma omp parallel for default(shared) private(k)
2329 #endif
2330  for (k = 0; k < M; k++)
2331  {
2332  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2333  const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2334  R fg_psij2 = K(1.0);
2335  R psij_const[m2p2];
2336  INT l;
2337 
2338  psij_const[0] = fg_psij0;
2339 
2340  for (l = 1; l < m2p2; l++)
2341  {
2342  fg_psij2 *= fg_psij1;
2343  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2344  }
2345 
2346  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2347  }
2348 
2349  return;
2350  } /* if(PRE_FG_PSI) */
2351 
 /* same recurrence as PRE_FG_PSI, but the two per-node factors are computed
    here via PHI and EXP */
2352  if (ths->flags & FG_PSI)
2353  {
2354  INT k;
2355  R fg_exp_l[m2p2];
2356 
2357  sort(ths);
2358 
2359  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2360 
2361 #ifdef _OPENMP
2362  #pragma omp parallel for default(shared) private(k)
2363 #endif
2364  for (k = 0; k < M; k++)
2365  {
2366  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2367  INT u, o, l;
2368  R fg_psij0, fg_psij1, fg_psij2;
2369  R psij_const[m2p2];
2370 
2371  uo(ths, (INT)j, &u, &o, (INT)0);
2372  fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
2373  fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
2374  fg_psij2 = K(1.0);
2375 
2376  psij_const[0] = fg_psij0;
2377 
2378  for (l = 1; l < m2p2; l++)
2379  {
2380  fg_psij2 *= fg_psij1;
2381  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2382  }
2383 
2384  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2385  }
2386  return;
2387  } /* if(FG_PSI) */
2388 
 /* lookup table in psi: each window value is a linear interpolation, with
    weight ip_w, between two neighbouring table entries */
2389  if (ths->flags & PRE_LIN_PSI)
2390  {
 /* the local constant K (table size) coexists with the function-like macro
    K(...) used below, which is only expanded when followed by '(' */
2391  const INT K = ths->K, ip_s = K / (m + 2);
2392  INT k;
2393 
2394  sort(ths);
2395 
2396 #ifdef _OPENMP
2397  #pragma omp parallel for default(shared) private(k)
2398 #endif
2399  for (k = 0; k < M; k++)
2400  {
2401  INT u, o, l;
2402  R ip_y, ip_w;
2403  INT ip_u;
2404  R psij_const[m2p2];
2405  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2406 
2407  uo(ths, (INT)j, &u, &o, (INT)0);
2408 
2409  ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2410  ip_u = (INT)(LRINT(FLOOR(ip_y)));
2411  ip_w = ip_y - (R)(ip_u);
2412 
2413  for (l = 0; l < m2p2; l++)
2414  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2415  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2416 
2417  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2418  }
2419  return;
2420  } /* if(PRE_LIN_PSI) */
2421  else
2422  {
2423  /* no precomputed psi at all */
2424  INT k;
2425 
2426  sort(ths);
2427 
2428 #ifdef _OPENMP
2429  #pragma omp parallel for default(shared) private(k)
2430 #endif
2431  for (k = 0; k < M; k++)
2432  {
2433  R psij_const[m2p2];
2434  INT u, o, l;
2435  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2436 
2437  uo(ths, (INT)j, &u, &o, (INT)0);
2438 
 /* evaluate the window at the m2p2 grid points u, u+1, ... */
2439  for (l = 0; l < m2p2; l++)
2440  psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
2441 
2442  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2443  }
2444  }
2445 }
2446 
2447 
/*
 * Blockwise adjoint, d=1, PRE_PSI variant: hands the 2m+2 stored window
 * values of node j (ths->psi + j*(2m+2)) directly to
 * nfft_adjoint_1d_compute_omp_blockwise together with the bounds
 * my_u0/my_o0.
 * Expands in the caller's scope and uses ths, j, g, n, m, my_u0 and my_o0
 * from there.
 */
2448 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2449 { \
2450  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2451  ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2452 }
2453 
/*
 * Blockwise adjoint, d=1, PRE_FG_PSI variant: rebuilds the 2m+2 window
 * values of node j from the two stored factors ths->psi[2j] and
 * ths->psi[2j+1] via
 *   psij_const[l] = psi[2j] * psi[2j+1]^l * fg_exp_l[l],  l = 0..2m+1
 * (psij_const[0] is psi[2j] itself), then calls
 * nfft_adjoint_1d_compute_omp_blockwise.
 * Expands in the caller's scope and uses ths, j, g, n, m, fg_exp_l, my_u0
 * and my_o0 from there.
 */
2454 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2455 { \
2456  R psij_const[2 * m + 2]; \
2457  INT l; \
2458  R fg_psij0 = ths->psi[2 * j]; \
2459  R fg_psij1 = ths->psi[2 * j + 1]; \
2460  R fg_psij2 = K(1.0); \
2461  \
2462  psij_const[0] = fg_psij0; \
2463  for (l = 1; l <= 2 * m + 1; l++) \
2464  { \
2465  fg_psij2 *= fg_psij1; \
2466  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2467  } \
2468  \
2469  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2470  ths->x + j, n, m, my_u0, my_o0); \
2471 }
2472 
/*
 * Blockwise adjoint, d=1, FG_PSI variant: same recurrence as the
 * PRE_FG_PSI variant, but the two per-node factors are computed here:
 *   fg_psij0 = PHI(n, x[j] - u/n, 0)
 *   fg_psij1 = EXP(2 * (n*x[j] - u) / b[0])
 * with u obtained from uo(ths, j, &u, &o, 0).
 * Expands in the caller's scope and uses ths, j, g, n, m, fg_exp_l, my_u0
 * and my_o0 from there.
 */
2473 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
2474 { \
2475  R psij_const[2 * m + 2]; \
2476  R fg_psij0, fg_psij1, fg_psij2; \
2477  INT u, o, l; \
2478  \
2479  uo(ths, j, &u, &o, (INT)0); \
2480  fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/((R)n),0)); \
2481  fg_psij1 = EXP(K(2.0) * (((R)n) * (ths->x[j]) - (R)u) / ths->b[0]); \
2482  fg_psij2 = K(1.0); \
2483  psij_const[0] = fg_psij0; \
2484  for (l = 1; l <= 2 * m + 1; l++) \
2485  { \
2486  fg_psij2 *= fg_psij1; \
2487  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2488  } \
2489  \
2490  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2491  ths->x + j, n, m, my_u0, my_o0); \
2492 }
2493 
/*
 * Blockwise adjoint, d=1, PRE_LIN_PSI variant: obtains each of the 2m+2
 * window values by linear interpolation between two neighbouring entries
 * of the table ths->psi.  ip_y = |n*x[j] - u| * ip_s is the position in
 * table units, ip_u its integer part and ip_w the fractional weight; entry
 * l interpolates between psi[|ip_u - l*ip_s|] and psi[|ip_u - l*ip_s + 1|].
 * Expands in the caller's scope and uses ths, j, g, n, m, ip_s, my_u0 and
 * my_o0 from there.
 */
2494 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
2495 { \
2496  R psij_const[2 * m + 2]; \
2497  INT ip_u; \
2498  R ip_y, ip_w; \
2499  INT u, o, l; \
2500  \
2501  uo(ths, j, &u, &o, (INT)0); \
2502  \
2503  ip_y = FABS(((R)n) * ths->x[j] - (R)u) * ((R)ip_s); \
2504  ip_u = LRINT(FLOOR(ip_y)); \
2505  ip_w = ip_y - ip_u; \
2506  for (l = 0; l < 2 * m + 2; l++) \
2507  psij_const[l] \
2508  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
2509  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
2510  \
2511  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2512  ths->x + j, n, m, my_u0, my_o0); \
2513 }
2514 
/*
 * Blockwise adjoint, d=1, variant without any precomputed window data:
 * evaluates PHI(n, x[j] - (u+l)/n, 0) for l = 0..2m+1, with u taken from
 * uo(ths, j, &u, &o, 0), and passes the values to
 * nfft_adjoint_1d_compute_omp_blockwise.
 * Expands in the caller's scope and uses ths, j, g, n, m, my_u0 and my_o0
 * from there.
 */
2515 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
2516 { \
2517  R psij_const[2 * m + 2]; \
2518  INT u, o, l; \
2519  \
2520  uo(ths, j, &u, &o, (INT)0); \
2521  \
2522  for (l = 0; l <= 2 * m + 1; l++) \
2523  psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/((R)n),0)); \
2524  \
2525  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2526  ths->x + j, n, m, my_u0, my_o0); \
2527 }
2528 
/*
 * Blockwise OpenMP driver for the d=1 adjoint.  The argument whichone is
 * token-pasted onto MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ to select the
 * per-node compute body.
 *
 * Does nothing unless NFFT_OMP_BLOCKWISE_ADJOINT is set in ths->flags.
 * Otherwise it opens a parallel region (k private) in which every thread
 *  - obtains my_u0/my_o0 and two key ranges [min_u_a, max_u_a] and
 *    [min_u_b, max_u_b] from nfft_adjoint_B_omp_blockwise_init; a range
 *    whose minimum is -1 is skipped,
 *  - for each range, finds the starting position in ths->index_x with
 *    index_x_binary_search and walks forward while the key ar_x[2*k] lies
 *    inside the range, running the compute body for node j = ar_x[2*k+1].
 * After the parallel region the macro returns from the ENCLOSING function.
 *
 * Expands in the caller's scope: ths, k, M, n, m (and whatever the selected
 * compute body uses) must be defined there.
 */
2529 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
2530 { \
2531  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
2532  { \
2533  _Pragma("omp parallel private(k)") \
2534  { \
2535  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
2536  INT *ar_x = ths->index_x; \
2537  \
2538  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
2539  &min_u_b, &max_u_b, 1, &n, m); \
2540  \
2541  if (min_u_a != -1) \
2542  { \
2543  k = index_x_binary_search(ar_x, M, min_u_a); \
2544  \
2545  MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
2546  \
2547  while (k < M) \
2548  { \
2549  INT u_prod = ar_x[2*k]; \
2550  INT j = ar_x[2*k+1]; \
2551  \
2552  if (u_prod < min_u_a || u_prod > max_u_a) \
2553  break; \
2554  \
2555  MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2556  \
2557  k++; \
2558  } \
2559  } \
2560  \
2561  if (min_u_b != -1) \
2562  { \
2563  k = index_x_binary_search(ar_x, M, min_u_b); \
2564  \
2565  MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
2566  \
2567  while (k < M) \
2568  { \
2569  INT u_prod = ar_x[2*k]; \
2570  INT j = ar_x[2*k+1]; \
2571  \
2572  if (u_prod < min_u_b || u_prod > max_u_b) \
2573  break; \
2574  \
2575  MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2576  \
2577  k++; \
2578  } \
2579  } \
2580  } /* omp parallel */ \
2581  return; \
2582  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
2583 }
2584 
/*
 * Adjoint of the sparse convolution step for d=1: clears the grid ths->g
 * and accumulates every sample ths->f[j], weighted by 2m+2 window values,
 * into it.
 *
 * The first matching plan flag decides where the window values come from:
 *   PRE_FULL_PSI  delegated to nfft_adjoint_B_compute_full_psi,
 *   PRE_PSI       read from ths->psi (2m+2 values per node),
 *   PRE_FG_PSI    rebuilt from two stored factors per node and fg_exp_l,
 *   FG_PSI        same recurrence, factors computed on the fly,
 *   PRE_LIN_PSI   linear interpolation in the table ths->psi,
 *   otherwise     direct evaluation of PHI.
 *
 * With OpenMP every variant first expands MACRO_adjoint_1d_B_OMP_BLOCKWISE,
 * which returns from this function when NFFT_OMP_BLOCKWISE_ADJOINT is set;
 * otherwise a parallel loop calling the atomic kernel follows.  Without
 * OpenMP the serial kernel is called.
 *
 * Nodes are visited as j = ths->index_x[2*k+1] when NFFT_SORT_NODES is set,
 * else j = k.
 */
2585 static void nfft_adjoint_1d_B(X(plan) *ths)
2586 {
2587  const INT n = ths->n[0], M = ths->M_total, m = ths->m;
2588  INT k;
2589  C *g = (C*)ths->g;
2590 
/* start from an all-zero grid; every variant below only adds to it */
2591  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
2592 
2593  if (ths->flags & PRE_FULL_PSI)
2594  {
2595  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2596  (INT)1, ths->n, m, ths->flags, ths->index_x);
2597  return;
2598  } /* if(PRE_FULL_PSI) */
2599 
2600  if (ths->flags & PRE_PSI)
2601  {
2602 #ifdef _OPENMP
/* returns from this function if the blockwise path is enabled */
2603  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
2604 #endif
2605 
2606 #ifdef _OPENMP
2607  #pragma omp parallel for default(shared) private(k)
2608 #endif
2609  for (k = 0; k < M; k++)
2610  {
2611  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2612 #ifdef _OPENMP
2613  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2614 #else
2615  nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2616 #endif
2617  }
2618 
2619  return;
2620  } /* if(PRE_PSI) */
2621 
2622  if (ths->flags & PRE_FG_PSI)
2623  {
2624  R fg_exp_l[2 * m + 2];
2625 
2626  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2627 
2628 #ifdef _OPENMP
/* returns from this function if the blockwise path is enabled */
2629  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
2630 #endif
2631 
2632 
2633 #ifdef _OPENMP
2634  #pragma omp parallel for default(shared) private(k)
2635 #endif
2636  for (k = 0; k < M; k++)
2637  {
2638  R psij_const[2 * m + 2];
2639  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2640  INT l;
2641  R fg_psij0 = ths->psi[2 * j];
2642  R fg_psij1 = ths->psi[2 * j + 1];
2643  R fg_psij2 = K(1.0);
2644 
/* psij_const[l] = psi[2j] * psi[2j+1]^l * fg_exp_l[l] */
2645  psij_const[0] = fg_psij0;
2646  for (l = 1; l <= 2 * m + 1; l++)
2647  {
2648  fg_psij2 *= fg_psij1;
2649  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2650  }
2651 
2652 #ifdef _OPENMP
2653  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2654 #else
2655  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2656 #endif
2657  }
2658 
2659  return;
2660  } /* if(PRE_FG_PSI) */
2661 
2662  if (ths->flags & FG_PSI)
2663  {
2664  R fg_exp_l[2 * m + 2];
2665 
2666  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2667 
2668  sort(ths);
2669 
2670 #ifdef _OPENMP
/* returns from this function if the blockwise path is enabled */
2671  MACRO_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
2672 #endif
2673 
2674 #ifdef _OPENMP
2675  #pragma omp parallel for default(shared) private(k)
2676 #endif
2677  for (k = 0; k < M; k++)
2678  {
2679  INT u,o,l;
2680  R psij_const[2 * m + 2];
2681  R fg_psij0, fg_psij1, fg_psij2;
2682  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2683 
/* same recurrence as PRE_FG_PSI, but both factors are computed here */
2684  uo(ths, j, &u, &o, (INT)0);
2685  fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
2686  fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
2687  fg_psij2 = K(1.0);
2688  psij_const[0] = fg_psij0;
2689  for (l = 1; l <= 2 * m + 1; l++)
2690  {
2691  fg_psij2 *= fg_psij1;
2692  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2693  }
2694 
2695 #ifdef _OPENMP
2696  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2697 #else
2698  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2699 #endif
2700  }
2701 
2702  return;
2703  } /* if(FG_PSI) */
2704 
2705  if (ths->flags & PRE_LIN_PSI)
2706  {
2707  const INT K = ths->K;
/* table step per unit grid distance, used to scale ip_y below */
2708  const INT ip_s = K / (m + 2);
2709 
2710  sort(ths);
2711 
2712 #ifdef _OPENMP
/* returns from this function if the blockwise path is enabled */
2713  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
2714 #endif
2715 
2716 #ifdef _OPENMP
2717  #pragma omp parallel for default(shared) private(k)
2718 #endif
2719  for (k = 0; k < M; k++)
2720  {
2721  INT u,o,l;
2722  INT ip_u;
2723  R ip_y, ip_w;
2724  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2725  R psij_const[2 * m + 2];
2726 
2727  uo(ths, j, &u, &o, (INT)0);
2728 
/* ip_u: integer table position, ip_w: interpolation weight in [0,1) */
2729  ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2730  ip_u = (INT)(LRINT(FLOOR(ip_y)));
2731  ip_w = ip_y - (R)(ip_u);
2732  for (l = 0; l < 2 * m + 2; l++)
2733  psij_const[l]
2734  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2735  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2736 
2737 #ifdef _OPENMP
2738  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2739 #else
2740  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2741 #endif
2742  }
2743  return;
2744  } /* if(PRE_LIN_PSI) */
2745 
2746  /* no precomputed psi at all */
2747  sort(ths);
2748 
2749 #ifdef _OPENMP
/* returns from this function if the blockwise path is enabled */
2750  MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2751 #endif
2752 
2753 #ifdef _OPENMP
2754  #pragma omp parallel for default(shared) private(k)
2755 #endif
2756  for (k = 0; k < M; k++)
2757  {
2758  INT u,o,l;
2759  R psij_const[2 * m + 2];
2760  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2761 
2762  uo(ths, j, &u, &o, (INT)0);
2763 
2764  for (l = 0; l <= 2 * m + 1; l++)
2765  psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
2766 
2767 #ifdef _OPENMP
2768  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2769 #else
2770  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2771 #endif
2772  }
2773 }
2774 
2775 void X(trafo_1d)(X(plan) *ths)
2776 {
2777  if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2778  {
2779  X(trafo_direct)(ths);
2780  return;
2781  }
2782 
2783  const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
2784  C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2785 
2786  ths->g_hat = ths->g1;
2787  ths->g = ths->g2;
2788 
2789  {
2790  C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2791  R *c_phi_inv1, *c_phi_inv2;
2792 
2793  TIC(0)
2794 #ifdef _OPENMP
2795  {
2796  INT k;
2797  #pragma omp parallel for default(shared) private(k)
2798  for (k = 0; k < ths->n_total; k++)
2799  ths->g_hat[k] = 0.0;
2800  }
2801 #else
2802  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
2803 #endif
2804  if(ths->flags & PRE_PHI_HUT)
2805  {
2806  INT k;
2807  c_phi_inv1 = ths->c_phi_inv[0];
2808  c_phi_inv2 = &ths->c_phi_inv[0][N2];
2809 
2810 #ifdef _OPENMP
2811  #pragma omp parallel for default(shared) private(k)
2812 #endif
2813  for (k = 0; k < N2; k++)
2814  {
2815  g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2816  g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2817  }
2818  }
2819  else
2820  {
2821  INT k;
2822 #ifdef _OPENMP
2823  #pragma omp parallel for default(shared) private(k)
2824 #endif
2825  for (k = 0; k < N2; k++)
2826  {
2827  g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
2828  g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2829  }
2830  }
2831  TOC(0)
2832 
2833  TIC_FFTW(1)
2834  FFTW(execute)(ths->my_fftw_plan1);
2835  TOC_FFTW(1);
2836 
2837  TIC(2);
2838  nfft_trafo_1d_B(ths);
2839  TOC(2);
2840  }
2841 }
2842 
2843 void X(adjoint_1d)(X(plan) *ths)
2844 {
2845  if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2846  {
2847  X(adjoint_direct)(ths);
2848  return;
2849  }
2850 
2851  INT n,N;
2852  C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2853  R *c_phi_inv1, *c_phi_inv2;
2854 
2855  N=ths->N[0];
2856  n=ths->n[0];
2857 
2858  ths->g_hat=ths->g1;
2859  ths->g=ths->g2;
2860 
2861  f_hat1=(C*)ths->f_hat;
2862  f_hat2=(C*)&ths->f_hat[N/2];
2863  g_hat1=(C*)&ths->g_hat[n-N/2];
2864  g_hat2=(C*)ths->g_hat;
2865 
2866  TIC(2)
2867  nfft_adjoint_1d_B(ths);
2868  TOC(2)
2869 
2870  TIC_FFTW(1)
2871  FFTW(execute)(ths->my_fftw_plan2);
2872  TOC_FFTW(1);
2873 
2874  TIC(0)
2875  if(ths->flags & PRE_PHI_HUT)
2876  {
2877  INT k;
2878  c_phi_inv1=ths->c_phi_inv[0];
2879  c_phi_inv2=&ths->c_phi_inv[0][N/2];
2880 
2881 #ifdef _OPENMP
2882  #pragma omp parallel for default(shared) private(k)
2883 #endif
2884  for (k = 0; k < N/2; k++)
2885  {
2886  f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2887  f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2888  }
2889  }
2890  else
2891  {
2892  INT k;
2893 
2894 #ifdef _OPENMP
2895  #pragma omp parallel for default(shared) private(k)
2896 #endif
2897  for (k = 0; k < N/2; k++)
2898  {
2899  f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
2900  f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2901  }
2902  }
2903  TOC(0)
2904 }
2905 
2906 
2907 /* ################################################ SPECIFIC VERSIONS FOR d=2 */
2908 
2909 static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
2910 {
2911  INT l;
2912  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2913 
2914  fg_exp_b0 = EXP(K(-1.0)/b);
2915  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2916  fg_exp_b1 = K(1.0);
2917  fg_exp_b2 = K(1.0);
2918  fg_exp_l[0] = K(1.0);
2919  for(l=1; l <= 2*m+1; l++)
2920  {
2921  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2922  fg_exp_b1 *= fg_exp_b0_sq;
2923  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2924  }
2925 }
2926 
2927 static void nfft_trafo_2d_compute(C *fj, const C *g, const R *psij_const0,
2928  const R *psij_const1, const R *xj0, const R *xj1, const INT n0,
2929  const INT n1, const INT m)
2930 {
2931  INT u0,o0,l0,u1,o1,l1;
2932  const C *gj;
2933  const R *psij0,*psij1;
2934 
2935  psij0=psij_const0;
2936  psij1=psij_const1;
2937 
2938  uo2(&u0,&o0,*xj0, n0, m);
2939  uo2(&u1,&o1,*xj1, n1, m);
2940 
2941  *fj=0;
2942 
2943  if (u0 < o0)
2944  if(u1 < o1)
2945  for(l0=0; l0<=2*m+1; l0++,psij0++)
2946  {
2947  psij1=psij_const1;
2948  gj=g+(u0+l0)*n1+u1;
2949  for(l1=0; l1<=2*m+1; l1++)
2950  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2951  }
2952  else
2953  for(l0=0; l0<=2*m+1; l0++,psij0++)
2954  {
2955  psij1=psij_const1;
2956  gj=g+(u0+l0)*n1+u1;
2957  for(l1=0; l1<2*m+1-o1; l1++)
2958  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2959  gj=g+(u0+l0)*n1;
2960  for(l1=0; l1<=o1; l1++)
2961  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2962  }
2963  else
2964  if(u1<o1)
2965  {
2966  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2967  {
2968  psij1=psij_const1;
2969  gj=g+(u0+l0)*n1+u1;
2970  for(l1=0; l1<=2*m+1; l1++)
2971  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2972  }
2973  for(l0=0; l0<=o0; l0++,psij0++)
2974  {
2975  psij1=psij_const1;
2976  gj=g+l0*n1+u1;
2977  for(l1=0; l1<=2*m+1; l1++)
2978  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2979  }
2980  }
2981  else
2982  {
2983  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2984  {
2985  psij1=psij_const1;
2986  gj=g+(u0+l0)*n1+u1;
2987  for(l1=0; l1<2*m+1-o1; l1++)
2988  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2989  gj=g+(u0+l0)*n1;
2990  for(l1=0; l1<=o1; l1++)
2991  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2992  }
2993  for(l0=0; l0<=o0; l0++,psij0++)
2994  {
2995  psij1=psij_const1;
2996  gj=g+l0*n1+u1;
2997  for(l1=0; l1<2*m+1-o1; l1++)
2998  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2999  gj=g+l0*n1;
3000  for(l1=0; l1<=o1; l1++)
3001  (*fj) += (*psij0) * (*psij1++) * (*gj++);
3002  }
3003  }
3004 }
3005 
#ifdef _OPENMP
/*
 * Adjoint accumulation for one node in d=2, OpenMP-atomic variant.
 *
 * For l0, l1 = 0..2m+1 the value psij_const0[l0] * psij_const1[l1] * f is
 * added to g[((u0+l0) % n0) * n1 + ((u1+l1) % n1)], where u0 and u1 are the
 * first indices reported by uo2 for *xj0 and *xj1.  Real and imaginary part
 * of each grid entry are updated by two separate atomic additions.
 */
static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g,
    const R *psij_const0, const R *psij_const1, const R *xj0,
    const R *xj1, const INT n0, const INT n1, const INT m)
{
  const INT taps = 2*m+2;
  INT first0, last0, first1, last1;
  INT r, c;
  INT col[taps];

  uo2(&first0, &last0, *xj0, n0, m);
  uo2(&first1, &last1, *xj1, n1, m);

  /* wrapped column indices are shared by all rows */
  for (c = 0; c < taps; c++)
    col[c] = (first1+c)%n1;

  for (r = 0; r < taps; r++)
  {
    const INT row_start = ((first0+r)%n0) * n1;

    for (c = 0; c < taps; c++)
    {
      /* view the complex grid entry as two consecutive reals */
      R *cell = (R*)(g + (row_start + col[c]));
      const C add = psij_const0[r] * psij_const1[c] * f;

      #pragma omp atomic
      cell[0] += CREAL(add);

      #pragma omp atomic
      cell[1] += CIMAG(add);
    }
  }
}
#endif
3044 
#ifdef _OPENMP

/*
 * Helper for nfft_adjoint_2d_compute_omp_blockwise: adds the contribution
 * of the grid rows u0 .. o0 (inclusive; nothing happens when o0 < u0).
 * Row u0+l0 is weighted by psij_const0[offset_psij+l0]; the columns are
 * taken from the 2m+2 precomputed (already wrapped) indices index_temp1.
 *
 * The row weight is kept as a real number, so each update is
 * (real * real) * complex, exactly as in the atomic and serial kernels.
 */
static inline void nfft_adjoint_2d_omp_blockwise_add_rows(const C f, C *g,
    const R *psij_const0, const R *psij_const1, const INT *index_temp1,
    const INT offset_psij, const INT u0, const INT o0, const INT n1,
    const INT m)
{
  INT l0, l1;

  for (l0 = 0; l0 <= o0-u0; l0++)
  {
    const INT i0 = (u0+l0) * n1;
    const R val0 = psij_const0[offset_psij+l0];

    for (l1 = 0; l1 <= 2*m+1; l1++)
      g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
  }
}

/*
 * Adjoint accumulation for one node in d=2, blockwise OpenMP variant.
 *
 * Only the grid rows inside [my_u0, my_o0] are updated, without atomics;
 * the caller passes these bounds per thread.  The node's own row window
 * (ar_u0, ar_o0) comes from uo2 for *xj0, the column window from uo2 for
 * *xj1.
 *
 *   f            sample value of the node
 *   g            grid that is accumulated into (n1 entries per row)
 *   psij_const0  2m+2 window values for dimension 0
 *   psij_const1  2m+2 window values for dimension 1
 *   xj0, xj1     node coordinates
 *   n0, n1       grid sizes
 *   m            window cut-off parameter
 *   my_u0,my_o0  first and last row this call may touch
 *
 * If ar_u0 < ar_o0 the row window is one run and is clipped against
 * [my_u0, my_o0].  Otherwise it is treated as two runs: rows from ar_u0 up
 * to my_o0, and rows from my_u0 up to MIN(my_o0, ar_o0).
 */
static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g,
    const R *psij_const0, const R *psij_const1, const R *xj0,
    const R *xj1, const INT n0, const INT n1, const INT m,
    const INT my_u0, const INT my_o0)
{
  INT ar_u0, ar_o0, u1, o1, l1;
  INT u0, o0, offset_psij;
  INT index_temp1[2*m+2];

  uo2(&ar_u0, &ar_o0, *xj0, n0, m);
  uo2(&u1, &o1, *xj1, n1, m);

  for (l1 = 0; l1 <= 2*m+1; l1++)
    index_temp1[l1] = (u1+l1)%n1;

  /* first (or only) run: rows starting at the later of my_u0 and ar_u0 */
  u0 = MAX(my_u0,ar_u0);
  offset_psij = u0-ar_u0;

  if (ar_u0 < ar_o0)
  {
    o0 = MIN(my_o0,ar_o0);
#ifdef OMP_ASSERT
    assert(offset_psij >= 0);
    assert(o0-u0 <= 2*m+1);
    assert(offset_psij+o0-u0 <= 2*m+1);
#endif

    nfft_adjoint_2d_omp_blockwise_add_rows(f, g, psij_const0, psij_const1,
        index_temp1, offset_psij, u0, o0, n1, m);
  }
  else
  {
    o0 = my_o0;
#ifdef OMP_ASSERT
    assert(offset_psij >= 0);
    assert(o0-u0 <= 2*m+1);
    assert(offset_psij+o0-u0 <= 2*m+1);
#endif

    nfft_adjoint_2d_omp_blockwise_add_rows(f, g, psij_const0, psij_const1,
        index_temp1, offset_psij, u0, o0, n1, m);

    /* second run: rows my_u0 .. MIN(my_o0, ar_o0).  The offset is added to
     * the previous one on purpose (kept as in the original code); the run
     * is non-empty only if my_u0 <= ar_o0, and since ar_o0 <= ar_u0 in this
     * branch the previous offset is then 0. */
    u0 = my_u0;
    o0 = MIN(my_o0,ar_o0);
    offset_psij += my_u0-ar_u0+n0;

#ifdef OMP_ASSERT
    if (u0<=o0)
    {
      assert(o0-u0 <= 2*m+1);
      assert(offset_psij+o0-u0 <= 2*m+1);
    }
#endif

    nfft_adjoint_2d_omp_blockwise_add_rows(f, g, psij_const0, psij_const1,
        index_temp1, offset_psij, u0, o0, n1, m);
  }
}
#endif
3141 
3142 #ifndef _OPENMP
3143 static void nfft_adjoint_2d_compute_serial(const C *fj, C *g,
3144  const R *psij_const0, const R *psij_const1, const R *xj0,
3145  const R *xj1, const INT n0, const INT n1, const INT m)
3146 {
3147  INT u0,o0,l0,u1,o1,l1;
3148  C *gj;
3149  const R *psij0,*psij1;
3150 
3151  psij0=psij_const0;
3152  psij1=psij_const1;
3153 
3154  uo2(&u0,&o0,*xj0, n0, m);
3155  uo2(&u1,&o1,*xj1, n1, m);
3156 
3157  if(u0<o0)
3158  if(u1<o1)
3159  for(l0=0; l0<=2*m+1; l0++,psij0++)
3160  {
3161  psij1=psij_const1;
3162  gj=g+(u0+l0)*n1+u1;
3163  for(l1=0; l1<=2*m+1; l1++)
3164  (*gj++) += (*psij0) * (*psij1++) * (*fj);
3165  }
3166  else
3167  for(l0=0; l0<=2*m+1; l0++,psij0++)
3168  {
3169  psij1=psij_const1;
3170  gj=g+(u0+l0)*n1+u1;
3171  for(l1=0; l1<2*m+1-o1; l1++)
3172  (*gj++) += (*psij0) * (*psij1++) * (*fj);
3173  gj=g+(u0+l0)*n1;
3174  for(l1=0; l1<=o1; l1++)
3175  (*gj++) += (*psij0) * (*psij1++) * (*fj);
3176  }
3177  else
3178  if(u1<o1)
3179  {
3180  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3181  {
3182  psij1=psij_const1;
3183  gj=g+(u0+l0)*n1+u1;
3184  for(l1=0; l1<=2*m+1; l1++)
3185  (*gj++) += (*psij0) * (*psij1++) * (*fj);
3186  }
3187  for(l0=0; l0<=o0; l0++,psij0++)
3188  {
3189  psij1=psij_const1;
3190  gj=g+l0*n1+u1;
3191  for(l1=0; l1<=2*m+1; l1++)
3192  (*gj++) += (*psij0) * (*psij1++) * (*fj);
3193  }
3194  }
3195  else
3196  {
3197  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3198  {
3199  psij1=psij_const1;
3200  gj=g+(u0+l0)*n1+u1;
3201  for(l1=0; l1<2*m+1-o1; l1++)
3202  (*gj++) += (*psij0) * (*psij1++) * (*fj);
3203  gj=g+(u0+l0)*n1;
3204  for(l1=0; l1<=o1; l1++)
3205  (*gj++) += (*psij0) * (*psij1++) * (*fj);
3206  }
3207  for(l0=0; l0<=o0; l0++,psij0++)
3208  {
3209  psij1=psij_const1;
3210  gj=g+l0*n1+u1;
3211  for(l1=0; l1<2*m+1-o1; l1++)
3212  (*gj++) += (*psij0) * (*psij1++) * (*fj);
3213  gj=g+l0*n1;
3214  for(l1=0; l1<=o1; l1++)
3215  (*gj++) += (*psij0) * (*psij1++) * (*fj);
3216  }
3217  }
3218 }
3219 #endif
3220 
/*
 * Sparse convolution step of the d=2 transform: computes every sample
 * ths->f[j] from the grid ths->g.
 *
 * The first matching plan flag decides where the window values come from:
 *   PRE_FULL_PSI  (2m+2)^2 stored products per node in ths->psi, with the
 *                 matching grid indices in ths->psi_index_g,
 *   PRE_PSI       2m+2 stored values per node and dimension in ths->psi,
 *   PRE_FG_PSI    rebuilt from two stored factors per node and dimension
 *                 and the table fg_exp_l,
 *   FG_PSI        same recurrence, factors computed on the fly,
 *   PRE_LIN_PSI   linear interpolation in the table ths->psi,
 *   otherwise     direct evaluation of PHI.
 * All but PRE_FULL_PSI finish by calling nfft_trafo_2d_compute.
 *
 * Per-node scratch arrays hold dimension 0 in entries [0, 2m+1] and
 * dimension 1 in entries [2m+2, 4m+3].
 *
 * Nodes are visited as j = ths->index_x[2*k+1] when NFFT_SORT_NODES is set,
 * else j = k.  With OpenMP the node loops run in parallel.
 */
3221 static void nfft_trafo_2d_B(X(plan) *ths)
3222 {
3223  const C *g = (C*)ths->g;
3224  const INT n0 = ths->n[0];
3225  const INT n1 = ths->n[1];
3226  const INT M = ths->M_total;
3227  const INT m = ths->m;
3228 
3229  INT k;
3230 
3231  if(ths->flags & PRE_FULL_PSI)
3232  {
/* number of stored window products (and grid indices) per node */
3233  const INT lprod = (2*m+2) * (2*m+2);
3234 #ifdef _OPENMP
3235  #pragma omp parallel for default(shared) private(k)
3236 #endif
3237  for (k = 0; k < M; k++)
3238  {
3239  INT l;
3240  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3241  ths->f[j] = K(0.0);
3242  for (l = 0; l < lprod; l++)
3243  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
3244  }
3245  return;
3246  } /* if(PRE_FULL_PSI) */
3247 
3248  if(ths->flags & PRE_PSI)
3249  {
3250 #ifdef _OPENMP
3251  #pragma omp parallel for default(shared) private(k)
3252 #endif
3253  for (k = 0; k < M; k++)
3254  {
3255  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3256  nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3257  }
3258 
3259  return;
3260  } /* if(PRE_PSI) */
3261 
3262  if(ths->flags & PRE_FG_PSI)
3263  {
/* one table of 2m+2 entries per dimension, stored back to back */
3264  R fg_exp_l[2*(2*m+2)];
3265 
3266  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3267  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3268 
3269 #ifdef _OPENMP
3270  #pragma omp parallel for default(shared) private(k)
3271 #endif
3272  for (k = 0; k < M; k++)
3273  {
3274  R psij_const[2*(2*m+2)];
3275  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3276  INT l;
3277  R fg_psij0 = ths->psi[2*j*2];
3278  R fg_psij1 = ths->psi[2*j*2+1];
3279  R fg_psij2 = K(1.0);
3280 
/* dimension 0: psij_const[l] = factor0 * factor1^l * fg_exp_l[l] */
3281  psij_const[0] = fg_psij0;
3282  for (l = 1; l <= 2*m+1; l++)
3283  {
3284  fg_psij2 *= fg_psij1;
3285  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3286  }
3287 
/* dimension 1: same recurrence with the second pair of stored factors */
3288  fg_psij0 = ths->psi[2*(j*2+1)];
3289  fg_psij1 = ths->psi[2*(j*2+1)+1];
3290  fg_psij2 = K(1.0);
3291  psij_const[2*m+2] = fg_psij0;
3292  for (l = 1; l <= 2*m+1; l++)
3293  {
3294  fg_psij2 *= fg_psij1;
3295  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3296  }
3297 
3298  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3299  }
3300 
3301  return;
3302  } /* if(PRE_FG_PSI) */
3303 
3304  if(ths->flags & FG_PSI)
3305  {
3306  R fg_exp_l[2*(2*m+2)];
3307 
3308  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3309  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3310 
3311  sort(ths);
3312 
3313 #ifdef _OPENMP
3314  #pragma omp parallel for default(shared) private(k)
3315 #endif
3316  for (k = 0; k < M; k++)
3317  {
3318  INT u, o, l;
3319  R fg_psij0, fg_psij1, fg_psij2;
3320  R psij_const[2*(2*m+2)];
3321  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3322 
/* dimension 0: factors computed from PHI and EXP instead of read from psi */
3323  uo(ths, j, &u, &o, (INT)0);
3324  fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
3325  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3326  fg_psij2 = K(1.0);
3327  psij_const[0] = fg_psij0;
3328  for (l = 1; l <= 2*m+1; l++)
3329  {
3330  fg_psij2 *= fg_psij1;
3331  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3332  }
3333 
/* dimension 1 */
3334  uo(ths,j,&u,&o, (INT)1);
3335  fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3336  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3337  fg_psij2 = K(1.0);
3338  psij_const[2*m+2] = fg_psij0;
3339  for(l=1; l<=2*m+1; l++)
3340  {
3341  fg_psij2 *= fg_psij1;
3342  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3343  }
3344 
3345  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3346  }
3347 
3348  return;
3349  } /* if(FG_PSI) */
3350 
3351  if(ths->flags & PRE_LIN_PSI)
3352  {
3353  const INT K = ths->K, ip_s = K / (m + 2);
3354 
3355  sort(ths);
3356 
3357 #ifdef _OPENMP
3358  #pragma omp parallel for default(shared) private(k)
3359 #endif
3360  for (k = 0; k < M; k++)
3361  {
3362  INT u, o, l;
3363  R ip_y, ip_w;
3364  INT ip_u;
3365  R psij_const[2*(2*m+2)];
3366  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3367 
/* dimension 0: interpolate in the first table, psi[0 ..] */
3368  uo(ths,j,&u,&o,(INT)0);
3369  ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
3370  ip_u = (INT)LRINT(FLOOR(ip_y));
3371  ip_w = ip_y - (R)(ip_u);
3372  for (l = 0; l < 2*m+2; l++)
3373  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3374 
/* dimension 1: the second table starts at offset K+1 in psi */
3375  uo(ths,j,&u,&o,(INT)1);
3376  ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
3377  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3378  ip_w = ip_y - (R)(ip_u);
3379  for (l = 0; l < 2*m+2; l++)
3380  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3381 
3382  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3383  }
3384  return;
3385  } /* if(PRE_LIN_PSI) */
3386 
3387  /* no precomputed psi at all */
3388 
3389  sort(ths);
3390 
3391 #ifdef _OPENMP
3392  #pragma omp parallel for default(shared) private(k)
3393 #endif
3394  for (k = 0; k < M; k++)
3395  {
3396  R psij_const[2*(2*m+2)];
3397  INT u, o, l;
3398  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3399 
/* evaluate the window directly at the 2m+2 grid points of each dimension */
3400  uo(ths,j,&u,&o,(INT)0);
3401  for (l = 0; l <= 2*m+1; l++)
3402  psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3403 
3404  uo(ths,j,&u,&o,(INT)1);
3405  for (l = 0; l <= 2*m+1; l++)
3406  psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
3407 
3408  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3409  }
3410 }
3411 
/*
 * Blockwise adjoint, d=2, PRE_PSI variant: passes the stored window values
 * of node j (2m+2 per dimension, dimension 0 at ths->psi + j*2*(2m+2) and
 * dimension 1 directly after) to nfft_adjoint_2d_compute_omp_blockwise.
 * Unlike the other compute macros this one is a single statement that
 * already ends in a semicolon and is not wrapped in braces.
 * Expands in the caller's scope and uses ths, j, g, n0, n1, m, my_u0 and
 * my_o0 from there.
 */
3412 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
3413  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3414  ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
3415  ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
3416 
/*
 * Blockwise adjoint, d=2, PRE_FG_PSI variant: for each dimension rebuilds
 * the 2m+2 window values of node j from two stored factors via
 *   value[l] = factor0 * factor1^l * fg_exp_l[l],  l = 0..2m+1.
 * Dimension 0 uses ths->psi[4j], ths->psi[4j+1] and fills
 * psij_const[0 .. 2m+1]; dimension 1 uses ths->psi[4j+2], ths->psi[4j+3],
 * the second half of fg_exp_l, and fills psij_const[2m+2 .. 4m+3].
 * Expands in the caller's scope and uses ths, j, g, n0, n1, m, fg_exp_l,
 * my_u0 and my_o0 from there.
 */
3417 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
3418 { \
3419  R psij_const[2*(2*m+2)]; \
3420  INT l; \
3421  R fg_psij0 = ths->psi[2*j*2]; \
3422  R fg_psij1 = ths->psi[2*j*2+1]; \
3423  R fg_psij2 = K(1.0); \
3424  \
3425  psij_const[0] = fg_psij0; \
3426  for(l=1; l<=2*m+1; l++) \
3427  { \
3428  fg_psij2 *= fg_psij1; \
3429  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3430  } \
3431  \
3432  fg_psij0 = ths->psi[2*(j*2+1)]; \
3433  fg_psij1 = ths->psi[2*(j*2+1)+1]; \
3434  fg_psij2 = K(1.0); \
3435  psij_const[2*m+2] = fg_psij0; \
3436  for(l=1; l<=2*m+1; l++) \
3437  { \
3438  fg_psij2 *= fg_psij1; \
3439  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3440  } \
3441  \
3442  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3443  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3444  n0, n1, m, my_u0, my_o0); \
3445 }
3446 
/*
 * Blockwise adjoint, d=2, FG_PSI variant: same recurrence as the
 * PRE_FG_PSI variant, but for each dimension t the two factors are
 * computed here:
 *   fg_psij0 = PHI(n_t, x - u/n_t, t)
 *   fg_psij1 = EXP(2 * (n_t*x - u) / b[t])
 * with u obtained from uo(ths, j, &u, &o, t) and x the node's coordinate
 * in dimension t.
 * Expands in the caller's scope and uses ths, j, g, n0, n1, m, fg_exp_l,
 * my_u0 and my_o0 from there.
 */
3447 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
3448 { \
3449  R psij_const[2*(2*m+2)]; \
3450  R fg_psij0, fg_psij1, fg_psij2; \
3451  INT u, o, l; \
3452  \
3453  uo(ths,j,&u,&o,(INT)0); \
3454  fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/((R)n0),0)); \
3455  fg_psij1 = EXP(K(2.0)*(((R)n0)*(ths->x[2*j]) - (R)u)/ths->b[0]); \
3456  fg_psij2 = K(1.0); \
3457  psij_const[0] = fg_psij0; \
3458  for(l=1; l<=2*m+1; l++) \
3459  { \
3460  fg_psij2 *= fg_psij1; \
3461  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3462  } \
3463  \
3464  uo(ths,j,&u,&o,(INT)1); \
3465  fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/((R)n1),1)); \
3466  fg_psij1 = EXP(K(2.0)*(((R)n1)*(ths->x[2*j+1]) - (R)u)/ths->b[1]); \
3467  fg_psij2 = K(1.0); \
3468  psij_const[2*m+2] = fg_psij0; \
3469  for(l=1; l<=2*m+1; l++) \
3470  { \
3471  fg_psij2 *= fg_psij1; \
3472  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3473  } \
3474  \
3475  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3476  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3477  n0, n1, m, my_u0, my_o0); \
3478 }
3479 
/*
 * Blockwise adjoint, d=2, PRE_LIN_PSI variant: obtains the 2m+2 window
 * values of each dimension by linear interpolation between neighbouring
 * entries of the table ths->psi.  ip_y is the position in table units,
 * ip_u its integer part and ip_w the fractional weight.  Dimension 0 reads
 * the table from index 0, dimension 1 from index K+1.
 * Expands in the caller's scope and uses ths, j, g, n0, n1, m, K, ip_s,
 * my_u0 and my_o0 from there.
 */
3480 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
3481 { \
3482  R psij_const[2*(2*m+2)]; \
3483  INT u, o, l; \
3484  INT ip_u; \
3485  R ip_y, ip_w; \
3486  \
3487  uo(ths,j,&u,&o,(INT)0); \
3488  ip_y = FABS(((R)n0)*(ths->x[2*j]) - (R)u)*((R)ip_s); \
3489  ip_u = LRINT(FLOOR(ip_y)); \
3490  ip_w = ip_y-ip_u; \
3491  for(l=0; l < 2*m+2; l++) \
3492  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3493  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
3494  \
3495  uo(ths,j,&u,&o,(INT)1); \
3496  ip_y = FABS(((R)n1)*(ths->x[2*j+1]) - (R)u)*((R)ip_s); \
3497  ip_u = LRINT(FLOOR(ip_y)); \
3498  ip_w = ip_y-ip_u; \
3499  for(l=0; l < 2*m+2; l++) \
3500  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3501  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
3502  \
3503  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3504  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3505  n0, n1, m, my_u0, my_o0); \
3506 }
3507 
/*
 * Blockwise adjoint, d=2, variant without any precomputed window data:
 * for each dimension t evaluates PHI(n_t, x - (u+l)/n_t, t) for
 * l = 0..2m+1, with u taken from uo(ths, j, &u, &o, t), and passes both
 * sets of values to nfft_adjoint_2d_compute_omp_blockwise.
 * Expands in the caller's scope and uses ths, j, g, n0, n1, m, my_u0 and
 * my_o0 from there.
 */
3508 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
3509 { \
3510  R psij_const[2*(2*m+2)]; \
3511  INT u, o, l; \
3512  \
3513  uo(ths,j,&u,&o,(INT)0); \
3514  for(l=0;l<=2*m+1;l++) \
3515  psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/((R)n0),0)); \
3516  \
3517  uo(ths,j,&u,&o,(INT)1); \
3518  for(l=0;l<=2*m+1;l++) \
3519  psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/((R)n1),1)); \
3520  \
3521  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3522  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3523  n0, n1, m, my_u0, my_o0); \
3524 }
3525 
/*
 * Blockwise OpenMP driver for the d=2 adjoint.  The argument whichone is
 * token-pasted onto MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ to select the
 * per-node compute body.
 *
 * Does nothing unless NFFT_OMP_BLOCKWISE_ADJOINT is set in ths->flags.
 * Otherwise it opens a parallel region (k private) in which every thread
 *  - obtains my_u0/my_o0 and two key ranges [min_u_a, max_u_a] and
 *    [min_u_b, max_u_b] from nfft_adjoint_B_omp_blockwise_init; a range
 *    whose minimum is -1 is skipped,
 *  - for each range, finds the starting position in ths->index_x with
 *    index_x_binary_search and walks forward while the key ar_x[2*k] lies
 *    inside the range, running the compute body for node j = ar_x[2*k+1].
 * After the parallel region the macro returns from the ENCLOSING function.
 *
 * Both ranges use the caller's thread-private k; the second range no longer
 * declares a shadowing local k.
 *
 * Expands in the caller's scope: ths, k, M, m (and whatever the selected
 * compute body uses) must be defined there.
 */
#define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
{ \
  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
  { \
    _Pragma("omp parallel private(k)") \
    { \
      INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
      INT *ar_x = ths->index_x; \
      \
      nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
          &min_u_b, &max_u_b, 2, ths->n, m); \
      \
      if (min_u_a != -1) \
      { \
        k = index_x_binary_search(ar_x, M, min_u_a); \
        \
        MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
        \
        while (k < M) \
        { \
          INT u_prod = ar_x[2*k]; \
          INT j = ar_x[2*k+1]; \
          \
          if (u_prod < min_u_a || u_prod > max_u_a) \
            break; \
          \
          MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
          \
          k++; \
        } \
      } \
      \
      if (min_u_b != -1) \
      { \
        k = index_x_binary_search(ar_x, M, min_u_b); \
        \
        MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
        \
        while (k < M) \
        { \
          INT u_prod = ar_x[2*k]; \
          INT j = ar_x[2*k+1]; \
          \
          if (u_prod < min_u_b || u_prod > max_u_b) \
            break; \
          \
          MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
          \
          k++; \
        } \
      } \
    } /* omp parallel */ \
    return; \
  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
}
3581 
3582 
3583 static void nfft_adjoint_2d_B(X(plan) *ths)
3584 {
3585  const INT n0 = ths->n[0];
3586  const INT n1 = ths->n[1];
3587  const INT M = ths->M_total;
3588  const INT m = ths->m;
3589  C* g = (C*) ths->g;
3590  INT k;
3591 
3592  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
3593 
3594  if(ths->flags & PRE_FULL_PSI)
3595  {
3596  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3597  (INT)2, ths->n, m, ths->flags, ths->index_x);
3598  return;
3599  } /* if(PRE_FULL_PSI) */
3600 
3601  if(ths->flags & PRE_PSI)
3602  {
3603 #ifdef _OPENMP
3604  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
3605 #endif
3606 
3607 #ifdef _OPENMP
3608  #pragma omp parallel for default(shared) private(k)
3609 #endif
3610  for (k = 0; k < M; k++)
3611  {
3612  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3613 #ifdef _OPENMP
3614  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3615 #else
3616  nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3617 #endif
3618  }
3619  return;
3620  } /* if(PRE_PSI) */
3621 
3622  if(ths->flags & PRE_FG_PSI)
3623  {
3624  R fg_exp_l[2*(2*m+2)];
3625 
3626  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3627  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3628 
3629 #ifdef _OPENMP
3630  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
3631 #endif
3632 
3633 
3634 #ifdef _OPENMP
3635  #pragma omp parallel for default(shared) private(k)
3636 #endif
3637  for (k = 0; k < M; k++)
3638  {
3639  R psij_const[2*(2*m+2)];
3640  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3641  INT l;
3642  R fg_psij0 = ths->psi[2*j*2];
3643  R fg_psij1 = ths->psi[2*j*2+1];
3644  R fg_psij2 = K(1.0);
3645 
3646  psij_const[0] = fg_psij0;
3647  for(l=1; l<=2*m+1; l++)
3648  {
3649  fg_psij2 *= fg_psij1;
3650  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3651  }
3652 
3653  fg_psij0 = ths->psi[2*(j*2+1)];
3654  fg_psij1 = ths->psi[2*(j*2+1)+1];
3655  fg_psij2 = K(1.0);
3656  psij_const[2*m+2] = fg_psij0;
3657  for(l=1; l<=2*m+1; l++)
3658  {
3659  fg_psij2 *= fg_psij1;
3660  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3661  }
3662 
3663 #ifdef _OPENMP
3664  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3665 #else
3666  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3667 #endif
3668  }
3669 
3670  return;
3671  } /* if(PRE_FG_PSI) */
3672 
3673  if(ths->flags & FG_PSI)
3674  {
3675  R fg_exp_l[2*(2*m+2)];
3676 
3677  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3678  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3679 
3680  sort(ths);
3681 
3682 #ifdef _OPENMP
3683  MACRO_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
3684 #endif
3685 
3686 #ifdef _OPENMP
3687  #pragma omp parallel for default(shared) private(k)
3688 #endif
3689  for (k = 0; k < M; k++)
3690  {
3691  INT u, o, l;
3692  R fg_psij0, fg_psij1, fg_psij2;
3693  R psij_const[2*(2*m+2)];
3694  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3695 
3696  uo(ths,j,&u,&o,(INT)0);
3697  fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
3698  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3699  fg_psij2 = K(1.0);
3700  psij_const[0] = fg_psij0;
3701  for(l=1; l<=2*m+1; l++)
3702  {
3703  fg_psij2 *= fg_psij1;
3704  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3705  }
3706 
3707  uo(ths,j,&u,&o,(INT)1);
3708  fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3709  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3710  fg_psij2 = K(1.0);
3711  psij_const[2*m+2] = fg_psij0;
3712  for(l=1; l<=2*m+1; l++)
3713  {
3714  fg_psij2 *= fg_psij1;
3715  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3716  }
3717 
3718 #ifdef _OPENMP
3719  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3720 #else
3721  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3722 #endif
3723  }
3724 
3725  return;
3726  } /* if(FG_PSI) */
3727 
3728  if(ths->flags & PRE_LIN_PSI)
3729  {
3730  const INT K = ths->K;
3731  const INT ip_s = K / (m + 2);
3732 
3733  sort(ths);
3734 
3735 #ifdef _OPENMP
3736  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
3737 #endif
3738 
3739 #ifdef _OPENMP
3740  #pragma omp parallel for default(shared) private(k)
3741 #endif
3742  for (k = 0; k < M; k++)
3743  {
3744  INT u,o,l;
3745  INT ip_u;
3746  R ip_y, ip_w;
3747  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3748  R psij_const[2*(2*m+2)];
3749 
3750  uo(ths,j,&u,&o,(INT)0);
3751  ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
3752  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3753  ip_w = ip_y - (R)(ip_u);
3754  for(l=0; l < 2*m+2; l++)
3755  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3756  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3757 
3758  uo(ths,j,&u,&o,(INT)1);
3759  ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
3760  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3761  ip_w = ip_y - (R)(ip_u);
3762  for(l=0; l < 2*m+2; l++)
3763  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3764  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3765 
3766 #ifdef _OPENMP
3767  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3768 #else
3769  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3770 #endif
3771  }
3772  return;
3773  } /* if(PRE_LIN_PSI) */
3774 
3775  /* no precomputed psi at all */
3776  sort(ths);
3777 
3778 #ifdef _OPENMP
3779  MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3780 #endif
3781 
3782 #ifdef _OPENMP
3783  #pragma omp parallel for default(shared) private(k)
3784 #endif
3785  for (k = 0; k < M; k++)
3786  {
3787  INT u,o,l;
3788  R psij_const[2*(2*m+2)];
3789  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3790 
3791  uo(ths,j,&u,&o,(INT)0);
3792  for(l=0;l<=2*m+1;l++)
3793  psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3794 
3795  uo(ths,j,&u,&o,(INT)1);
3796  for(l=0;l<=2*m+1;l++)
3797  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
3798 
3799 #ifdef _OPENMP
3800  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3801 #else
3802  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3803 #endif
3804  }
3805 }
3806 
3807 
3808 void X(trafo_2d)(X(plan) *ths)
3809 {
3810  if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3811  {
3812  X(trafo_direct)(ths);
3813  return;
3814  }
3815 
3816  INT k0,k1,n0,n1,N0,N1;
3817  C *g_hat,*f_hat;
3818  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3819  R ck01, ck02, ck11, ck12;
3820  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3821 
3822  ths->g_hat=ths->g1;
3823  ths->g=ths->g2;
3824 
3825  N0=ths->N[0];
3826  N1=ths->N[1];
3827  n0=ths->n[0];
3828  n1=ths->n[1];
3829 
3830  f_hat=(C*)ths->f_hat;
3831  g_hat=(C*)ths->g_hat;
3832 
3833  TIC(0)
3834 #ifdef _OPENMP
3835  #pragma omp parallel for default(shared) private(k0)
3836  for (k0 = 0; k0 < ths->n_total; k0++)
3837  ths->g_hat[k0] = 0.0;
3838 #else
3839  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
3840 #endif
3841  if(ths->flags & PRE_PHI_HUT)
3842  {
3843  c_phi_inv01=ths->c_phi_inv[0];
3844  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3845 
3846 #ifdef _OPENMP
3847  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3848 #endif
3849  for(k0=0;k0<N0/2;k0++)
3850  {
3851  ck01=c_phi_inv01[k0];
3852  ck02=c_phi_inv02[k0];
3853 
3854  c_phi_inv11=ths->c_phi_inv[1];
3855  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3856 
3857  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3858  f_hat11=f_hat + k0*N1;
3859  g_hat21=g_hat + k0*n1+n1-(N1/2);
3860  f_hat21=f_hat + ((N0/2)+k0)*N1;
3861  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3862  f_hat12=f_hat + k0*N1+(N1/2);
3863  g_hat22=g_hat + k0*n1;
3864  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3865 
3866  for(k1=0;k1<N1/2;k1++)
3867  {
3868  ck11=c_phi_inv11[k1];
3869  ck12=c_phi_inv12[k1];
3870 
3871  g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3872  g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3873  g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3874  g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3875  }
3876  }
3877  }
3878  else
3879 #ifdef _OPENMP
3880  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3881 #endif
3882  for(k0=0;k0<N0/2;k0++)
3883  {
3884  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3885  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3886  for(k1=0;k1<N1/2;k1++)
3887  {
3888  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3889  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3890  g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3891  g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3892  g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3893  g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3894  }
3895  }
3896 
3897  TOC(0)
3898 
3899  TIC_FFTW(1)
3900  FFTW(execute)(ths->my_fftw_plan1);
3901  TOC_FFTW(1);
3902 
3903  TIC(2);
3904  nfft_trafo_2d_B(ths);
3905  TOC(2);
3906 }
3907 
3908 void X(adjoint_2d)(X(plan) *ths)
3909 {
3910  if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3911  {
3912  X(adjoint_direct)(ths);
3913  return;
3914  }
3915 
3916  INT k0,k1,n0,n1,N0,N1;
3917  C *g_hat,*f_hat;
3918  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3919  R ck01, ck02, ck11, ck12;
3920  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3921 
3922  ths->g_hat=ths->g1;
3923  ths->g=ths->g2;
3924 
3925  N0=ths->N[0];
3926  N1=ths->N[1];
3927  n0=ths->n[0];
3928  n1=ths->n[1];
3929 
3930  f_hat=(C*)ths->f_hat;
3931  g_hat=(C*)ths->g_hat;
3932 
3933  TIC(2);
3934  nfft_adjoint_2d_B(ths);
3935  TOC(2);
3936 
3937  TIC_FFTW(1)
3938  FFTW(execute)(ths->my_fftw_plan2);
3939  TOC_FFTW(1);
3940 
3941  TIC(0)
3942  if(ths->flags & PRE_PHI_HUT)
3943  {
3944  c_phi_inv01=ths->c_phi_inv[0];
3945  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3946 
3947 #ifdef _OPENMP
3948  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3949 #endif
3950  for(k0=0;k0<N0/2;k0++)
3951  {
3952  ck01=c_phi_inv01[k0];
3953  ck02=c_phi_inv02[k0];
3954 
3955  c_phi_inv11=ths->c_phi_inv[1];
3956  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3957 
3958  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3959  f_hat11=f_hat + k0*N1;
3960  g_hat21=g_hat + k0*n1+n1-(N1/2);
3961  f_hat21=f_hat + ((N0/2)+k0)*N1;
3962  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3963  f_hat12=f_hat + k0*N1+(N1/2);
3964  g_hat22=g_hat + k0*n1;
3965  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3966 
3967  for(k1=0;k1<N1/2;k1++)
3968  {
3969  ck11=c_phi_inv11[k1];
3970  ck12=c_phi_inv12[k1];
3971 
3972  f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3973  f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3974  f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3975  f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3976  }
3977  }
3978  }
3979  else
3980 #ifdef _OPENMP
3981  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3982 #endif
3983  for(k0=0;k0<N0/2;k0++)
3984  {
3985  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3986  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3987  for(k1=0;k1<N1/2;k1++)
3988  {
3989  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3990  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3991  f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3992  f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3993  f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3994  f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3995  }
3996  }
3997  TOC(0)
3998 }
3999 
4000 /* ################################################ SPECIFIC VERSIONS FOR d=3 */
4001 
4002 static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
4003 {
4004  INT l;
4005  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
4006 
4007  fg_exp_b0 = EXP(-K(1.0) / b);
4008  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
4009  fg_exp_b1 = K(1.0);
4010  fg_exp_b2 = K(1.0);
4011  fg_exp_l[0] = K(1.0);
4012  for(l=1; l <= 2*m+1; l++)
4013  {
4014  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
4015  fg_exp_b1 *= fg_exp_b0_sq;
4016  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
4017  }
4018 }
4019 
4020 static void nfft_trafo_3d_compute(C *fj, const C *g, const R *psij_const0,
4021  const R *psij_const1, const R *psij_const2, const R *xj0, const R *xj1,
4022  const R *xj2, const INT n0, const INT n1, const INT n2, const INT m)
4023 {
4024  INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4025  const C *gj;
4026  const R *psij0, *psij1, *psij2;
4027 
4028  psij0 = psij_const0;
4029  psij1 = psij_const1;
4030  psij2 = psij_const2;
4031 
4032  uo2(&u0, &o0, *xj0, n0, m);
4033  uo2(&u1, &o1, *xj1, n1, m);
4034  uo2(&u2, &o2, *xj2, n2, m);
4035 
4036  *fj = 0;
4037 
4038  if (u0 < o0)
4039  if (u1 < o1)
4040  if (u2 < o2)
4041  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4042  {
4043  psij1 = psij_const1;
4044  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4045  {
4046  psij2 = psij_const2;
4047  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4048  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4049  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4050  }
4051  }
4052  else
4053  /* asserts (u2>o2)*/
4054  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4055  {
4056  psij1 = psij_const1;
4057  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4058  {
4059  psij2 = psij_const2;
4060  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4061  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4062  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4063  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4064  for (l2 = 0; l2 <= o2; l2++)
4065  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4066  }
4067  }
4068  else /* asserts (u1>o1)*/
4069  if (u2 < o2)
4070  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4071  {
4072  psij1 = psij_const1;
4073  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4074  {
4075  psij2 = psij_const2;
4076  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4077  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4078  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4079  }
4080  for (l1 = 0; l1 <= o1; l1++, psij1++)
4081  {
4082  psij2 = psij_const2;
4083  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4084  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4085  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4086  }
4087  }
4088  else/* asserts (u2>o2) */
4089  {
4090  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4091  {
4092  psij1 = psij_const1;
4093  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4094  {
4095  psij2 = psij_const2;
4096  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4097  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4098  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4099  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4100  for (l2 = 0; l2 <= o2; l2++)
4101  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4102  }
4103  for (l1 = 0; l1 <= o1; l1++, psij1++)
4104  {
4105  psij2 = psij_const2;
4106  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4107  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4108  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4109  gj = g + ((u0 + l0) * n1 + l1) * n2;
4110  for (l2 = 0; l2 <= o2; l2++)
4111  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4112  }
4113  }
4114  }
4115  else /* asserts (u0>o0) */
4116  if (u1 < o1)
4117  if (u2 < o2)
4118  {
4119  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4120  {
4121  psij1 = psij_const1;
4122  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4123  {
4124  psij2 = psij_const2;
4125  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4126  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4127  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4128  }
4129  }
4130 
4131  for (l0 = 0; l0 <= o0; l0++, psij0++)
4132  {
4133  psij1 = psij_const1;
4134  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4135  {
4136  psij2 = psij_const2;
4137  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4138  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4139  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4140  }
4141  }
4142  } else/* asserts (u2>o2) */
4143  {
4144  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4145  {
4146  psij1 = psij_const1;
4147  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4148  {
4149  psij2 = psij_const2;
4150  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4151  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4152  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4153  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4154  for (l2 = 0; l2 <= o2; l2++)
4155  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4156  }
4157  }
4158 
4159  for (l0 = 0; l0 <= o0; l0++, psij0++)
4160  {
4161  psij1 = psij_const1;
4162  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4163  {
4164  psij2 = psij_const2;
4165  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4166  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4167  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4168  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4169  for (l2 = 0; l2 <= o2; l2++)
4170  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4171  }
4172  }
4173  }
4174  else /* asserts (u1>o1) */
4175  if (u2 < o2)
4176  {
4177  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4178  {
4179  psij1 = psij_const1;
4180  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4181  {
4182  psij2 = psij_const2;
4183  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4184  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4185  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4186  }
4187  for (l1 = 0; l1 <= o1; l1++, psij1++)
4188  {
4189  psij2 = psij_const2;
4190  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4191  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4192  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4193  }
4194  }
4195  for (l0 = 0; l0 <= o0; l0++, psij0++)
4196  {
4197  psij1 = psij_const1;
4198  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4199  {
4200  psij2 = psij_const2;
4201  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4202  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4203  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4204  }
4205  for (l1 = 0; l1 <= o1; l1++, psij1++)
4206  {
4207  psij2 = psij_const2;
4208  gj = g + (l0 * n1 + l1) * n2 + u2;
4209  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4210  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4211  }
4212  }
4213  } else/* asserts (u2>o2) */
4214  {
4215  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4216  {
4217  psij1 = psij_const1;
4218  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4219  {
4220  psij2 = psij_const2;
4221  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4222  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4223  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4224  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4225  for (l2 = 0; l2 <= o2; l2++)
4226  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4227  }
4228  for (l1 = 0; l1 <= o1; l1++, psij1++)
4229  {
4230  psij2 = psij_const2;
4231  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4232  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4233  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4234  gj = g + ((u0 + l0) * n1 + l1) * n2;
4235  for (l2 = 0; l2 <= o2; l2++)
4236  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4237  }
4238  }
4239 
4240  for (l0 = 0; l0 <= o0; l0++, psij0++)
4241  {
4242  psij1 = psij_const1;
4243  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4244  {
4245  psij2 = psij_const2;
4246  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4247  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4248  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4249  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4250  for (l2 = 0; l2 <= o2; l2++)
4251  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4252  }
4253  for (l1 = 0; l1 <= o1; l1++, psij1++)
4254  {
4255  psij2 = psij_const2;
4256  gj = g + (l0 * n1 + l1) * n2 + u2;
4257  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4258  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4259  gj = g + (l0 * n1 + l1) * n2;
4260  for (l2 = 0; l2 <= o2; l2++)
4261  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4262  }
4263  }
4264  }
4265 }
4266 
4267 #ifdef _OPENMP
4268 
4289 static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g,
4290  const R *psij_const0, const R *psij_const1, const R *psij_const2,
4291  const R *xj0, const R *xj1, const R *xj2,
4292  const INT n0, const INT n1, const INT n2, const INT m,
4293  const INT my_u0, const INT my_o0)
4294 {
4295  INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4296 
4297  INT index_temp1[2*m+2];
4298  INT index_temp2[2*m+2];
4299 
4300  uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4301  uo2(&u1,&o1,*xj1, n1, m);
4302  uo2(&u2,&o2,*xj2, n2, m);
4303 
4304  for (l1=0; l1<=2*m+1; l1++)
4305  index_temp1[l1] = (u1+l1)%n1;
4306 
4307  for (l2=0; l2<=2*m+1; l2++)
4308  index_temp2[l2] = (u2+l2)%n2;
4309 
4310  if(ar_u0<ar_o0)
4311  {
4312  INT u0 = MAX(my_u0,ar_u0);
4313  INT o0 = MIN(my_o0,ar_o0);
4314  INT offset_psij = u0-ar_u0;
4315 #ifdef OMP_ASSERT
4316  assert(offset_psij >= 0);
4317  assert(o0-u0 <= 2*m+1);
4318  assert(offset_psij+o0-u0 <= 2*m+1);
4319 #endif
4320 
4321  for (l0 = 0; l0 <= o0-u0; l0++)
4322  {
4323  const INT i0 = (u0+l0) * n1;
4324  const C val0 = psij_const0[offset_psij+l0];
4325 
4326  for(l1=0; l1<=2*m+1; l1++)
4327  {
4328  const INT i1 = (i0 + index_temp1[l1]) * n2;
4329  const C val1 = psij_const1[l1];
4330 
4331  for(l2=0; l2<=2*m+1; l2++)
4332  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4333  }
4334  }
4335  }
4336  else
4337  {
4338  INT u0 = MAX(my_u0,ar_u0);
4339  INT o0 = my_o0;
4340  INT offset_psij = u0-ar_u0;
4341 #ifdef OMP_ASSERT
4342  assert(offset_psij >= 0);
4343  assert(o0-u0 <= 2*m+1);
4344  assert(offset_psij+o0-u0 <= 2*m+1);
4345 #endif
4346 
4347  for (l0 = 0; l0 <= o0-u0; l0++)
4348  {
4349  INT i0 = (u0+l0) * n1;
4350  const C val0 = psij_const0[offset_psij+l0];
4351 
4352  for(l1=0; l1<=2*m+1; l1++)
4353  {
4354  const INT i1 = (i0 + index_temp1[l1]) * n2;
4355  const C val1 = psij_const1[l1];
4356 
4357  for(l2=0; l2<=2*m+1; l2++)
4358  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4359  }
4360  }
4361 
4362  u0 = my_u0;
4363  o0 = MIN(my_o0,ar_o0);
4364  offset_psij += my_u0-ar_u0+n0;
4365 
4366 #ifdef OMP_ASSERT
4367  if (u0<=o0)
4368  {
4369  assert(o0-u0 <= 2*m+1);
4370  assert(offset_psij+o0-u0 <= 2*m+1);
4371  }
4372 #endif
4373  for (l0 = 0; l0 <= o0-u0; l0++)
4374  {
4375  INT i0 = (u0+l0) * n1;
4376  const C val0 = psij_const0[offset_psij+l0];
4377 
4378  for(l1=0; l1<=2*m+1; l1++)
4379  {
4380  const INT i1 = (i0 + index_temp1[l1]) * n2;
4381  const C val1 = psij_const1[l1];
4382 
4383  for(l2=0; l2<=2*m+1; l2++)
4384  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4385  }
4386  }
4387  }
4388 }
4389 #endif
4390 
4391 #ifdef _OPENMP
4392 /* adjoint NFFT three-dimensional case with OpenMP atomic operations */
4393 static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g,
4394  const R *psij_const0, const R *psij_const1, const R *psij_const2,
4395  const R *xj0, const R *xj1, const R *xj2,
4396  const INT n0, const INT n1, const INT n2, const INT m)
4397 {
4398  INT u0,o0,l0,u1,o1,l1,u2,o2,l2;
4399 
4400  INT index_temp0[2*m+2];
4401  INT index_temp1[2*m+2];
4402  INT index_temp2[2*m+2];
4403 
4404  uo2(&u0,&o0,*xj0, n0, m);
4405  uo2(&u1,&o1,*xj1, n1, m);
4406  uo2(&u2,&o2,*xj2, n2, m);
4407 
4408  for (l0=0; l0<=2*m+1; l0++)
4409  index_temp0[l0] = (u0+l0)%n0;
4410 
4411  for (l1=0; l1<=2*m+1; l1++)
4412  index_temp1[l1] = (u1+l1)%n1;
4413 
4414  for (l2=0; l2<=2*m+1; l2++)
4415  index_temp2[l2] = (u2+l2)%n2;
4416 
4417  for(l0=0; l0<=2*m+1; l0++)
4418  {
4419  for(l1=0; l1<=2*m+1; l1++)
4420  {
4421  for(l2=0; l2<=2*m+1; l2++)
4422  {
4423  INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4424  C *lhs = g+i;
4425  R *lhs_real = (R*)lhs;
4426  C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4427 
4428 #pragma omp atomic
4429  lhs_real[0] += CREAL(val);
4430 
4431 #pragma omp atomic
4432  lhs_real[1] += CIMAG(val);
4433  }
4434  }
4435  }
4436 }
4437 #endif
4438 
4439 #ifndef _OPENMP
4440 static void nfft_adjoint_3d_compute_serial(const C *fj, C *g,
4441  const R *psij_const0, const R *psij_const1, const R *psij_const2, const R *xj0,
4442  const R *xj1, const R *xj2, const INT n0, const INT n1, const INT n2,
4443  const INT m)
4444 {
4445  INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4446  C *gj;
4447  const R *psij0, *psij1, *psij2;
4448 
4449  psij0 = psij_const0;
4450  psij1 = psij_const1;
4451  psij2 = psij_const2;
4452 
4453  uo2(&u0, &o0, *xj0, n0, m);
4454  uo2(&u1, &o1, *xj1, n1, m);
4455  uo2(&u2, &o2, *xj2, n2, m);
4456 
4457  if (u0 < o0)
4458  if (u1 < o1)
4459  if (u2 < o2)
4460  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4461  {
4462  psij1 = psij_const1;
4463  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4464  {
4465  psij2 = psij_const2;
4466  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4467  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4468  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4469  }
4470  }
4471  else
4472  /* asserts (u2>o2)*/
4473  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4474  {
4475  psij1 = psij_const1;
4476  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4477  {
4478  psij2 = psij_const2;
4479  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4480  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4481  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4482  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4483  for (l2 = 0; l2 <= o2; l2++)
4484  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4485  }
4486  }
4487  else /* asserts (u1>o1)*/
4488  if (u2 < o2)
4489  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4490  {
4491  psij1 = psij_const1;
4492  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4493  {
4494  psij2 = psij_const2;
4495  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4496  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4497  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4498  }
4499  for (l1 = 0; l1 <= o1; l1++, psij1++)
4500  {
4501  psij2 = psij_const2;
4502  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4503  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4504  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4505  }
4506  }
4507  else/* asserts (u2>o2) */
4508  {
4509  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4510  {
4511  psij1 = psij_const1;
4512  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4513  {
4514  psij2 = psij_const2;
4515  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4516  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4517  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4518  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4519  for (l2 = 0; l2 <= o2; l2++)
4520  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4521  }
4522  for (l1 = 0; l1 <= o1; l1++, psij1++)
4523  {
4524  psij2 = psij_const2;
4525  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4526  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4527  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4528  gj = g + ((u0 + l0) * n1 + l1) * n2;
4529  for (l2 = 0; l2 <= o2; l2++)
4530  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4531  }
4532  }
4533  }
4534  else /* asserts (u0>o0) */
4535  if (u1 < o1)
4536  if (u2 < o2)
4537  {
4538  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4539  {
4540  psij1 = psij_const1;
4541  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4542  {
4543  psij2 = psij_const2;
4544  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4545  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4546  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4547  }
4548  }
4549 
4550  for (l0 = 0; l0 <= o0; l0++, psij0++)
4551  {
4552  psij1 = psij_const1;
4553  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4554  {
4555  psij2 = psij_const2;
4556  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4557  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4558  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4559  }
4560  }
4561  } else/* asserts (u2>o2) */
4562  {
4563  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4564  {
4565  psij1 = psij_const1;
4566  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4567  {
4568  psij2 = psij_const2;
4569  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4570  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4571  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4572  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4573  for (l2 = 0; l2 <= o2; l2++)
4574  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4575  }
4576  }
4577 
4578  for (l0 = 0; l0 <= o0; l0++, psij0++)
4579  {
4580  psij1 = psij_const1;
4581  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4582  {
4583  psij2 = psij_const2;
4584  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4585  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4586  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4587  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4588  for (l2 = 0; l2 <= o2; l2++)
4589  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4590  }
4591  }
4592  }
4593  else /* asserts (u1>o1) */
4594  if (u2 < o2)
4595  {
4596  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4597  {
4598  psij1 = psij_const1;
4599  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4600  {
4601  psij2 = psij_const2;
4602  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4603  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4604  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4605  }
4606  for (l1 = 0; l1 <= o1; l1++, psij1++)
4607  {
4608  psij2 = psij_const2;
4609  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4610  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4611  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4612  }
4613  }
4614  for (l0 = 0; l0 <= o0; l0++, psij0++)
4615  {
4616  psij1 = psij_const1;
4617  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4618  {
4619  psij2 = psij_const2;
4620  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4621  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4622  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4623  }
4624  for (l1 = 0; l1 <= o1; l1++, psij1++)
4625  {
4626  psij2 = psij_const2;
4627  gj = g + (l0 * n1 + l1) * n2 + u2;
4628  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4629  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4630  }
4631  }
4632  } else/* asserts (u2>o2) */
4633  {
4634  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4635  {
4636  psij1 = psij_const1;
4637  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4638  {
4639  psij2 = psij_const2;
4640  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4641  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4642  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4643  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4644  for (l2 = 0; l2 <= o2; l2++)
4645  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4646  }
4647  for (l1 = 0; l1 <= o1; l1++, psij1++)
4648  {
4649  psij2 = psij_const2;
4650  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4651  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4652  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4653  gj = g + ((u0 + l0) * n1 + l1) * n2;
4654  for (l2 = 0; l2 <= o2; l2++)
4655  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4656  }
4657  }
4658 
4659  for (l0 = 0; l0 <= o0; l0++, psij0++)
4660  {
4661  psij1 = psij_const1;
4662  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4663  {
4664  psij2 = psij_const2;
4665  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4666  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4667  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4668  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4669  for (l2 = 0; l2 <= o2; l2++)
4670  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4671  }
4672  for (l1 = 0; l1 <= o1; l1++, psij1++)
4673  {
4674  psij2 = psij_const2;
4675  gj = g + (l0 * n1 + l1) * n2 + u2;
4676  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4677  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4678  gj = g + (l0 * n1 + l1) * n2;
4679  for (l2 = 0; l2 <= o2; l2++)
4680  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4681  }
4682  }
4683  }
4684 }
4685 #endif
4686 
/*
 * 3-d "B" step of the forward transform: produces ths->f for all
 * M = ths->M_total nodes from the grid ths->g.
 *
 * The first flag found set in ths->flags, tested in the order
 * PRE_FULL_PSI, PRE_PSI, PRE_FG_PSI, FG_PSI, PRE_LIN_PSI, selects how the
 * window values are obtained; if none is set they are evaluated with PHI.
 * Except for PRE_FULL_PSI, every branch provides three arrays of 2*m+2
 * values (one per dimension) and passes them, together with the node
 * coordinates, to nfft_trafo_3d_compute (defined outside this view).
 *
 * In every loop the index k is mapped to the node number j through
 * ths->index_x[2*k+1] when NFFT_SORT_NODES is set; otherwise j == k.
 */
4687 static void nfft_trafo_3d_B(X(plan) *ths)
4688 {
4689  const INT n0 = ths->n[0];
4690  const INT n1 = ths->n[1];
4691  const INT n2 = ths->n[2];
4692  const INT M = ths->M_total;
4693  const INT m = ths->m;
4694 
4695  const C* g = (C*) ths->g;
4696 
4697  INT k;
4698 
 /* Case 1: lprod = (2m+2)^3 weights (ths->psi) and grid indices
  * (ths->psi_index_g) are stored per node; f[j] is their weighted sum. */
4699  if(ths->flags & PRE_FULL_PSI)
4700  {
4701  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4702 #ifdef _OPENMP
4703  #pragma omp parallel for default(shared) private(k)
4704 #endif
4705  for (k = 0; k < M; k++)
4706  {
4707  INT l;
4708  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4709  ths->f[j] = K(0.0);
4710  for (l = 0; l < lprod; l++)
4711  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4712  }
4713  return;
4714  } /* if(PRE_FULL_PSI) */
4715 
 /* Case 2: ths->psi is addressed as 2m+2 values per node and dimension,
  * the block for node j and dimension t starting at (j*3+t)*(2*m+2). */
4716  if(ths->flags & PRE_PSI)
4717  {
4718 #ifdef _OPENMP
4719  #pragma omp parallel for default(shared) private(k)
4720 #endif
4721  for (k = 0; k < M; k++)
4722  {
4723  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4724  nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4725  }
4726  return;
4727  } /* if(PRE_PSI) */
4728 
 /* Case 3: ths->psi holds two values per node and dimension, read from
  * indices 2*(j*3+t) and 2*(j*3+t)+1. The 2m+2 window values are rebuilt
  * as psi0 * psi1^l * fg_exp_l[l], with psi1^l kept as a running product. */
4729  if(ths->flags & PRE_FG_PSI)
4730  {
 /* One table of 2m+2 factors per dimension, filled by
  * nfft_3d_init_fg_exp_l from m and ths->b[t]. */
4731  R fg_exp_l[3*(2*m+2)];
4732 
4733  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4734  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4735  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4736 
4737 #ifdef _OPENMP
4738  #pragma omp parallel for default(shared) private(k)
4739 #endif
4740  for (k = 0; k < M; k++)
4741  {
4742  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4743  INT l;
4744  R psij_const[3*(2*m+2)];
4745  R fg_psij0 = ths->psi[2*j*3];
4746  R fg_psij1 = ths->psi[2*j*3+1];
4747  R fg_psij2 = K(1.0);
4748 
 /* dimension 0 */
4749  psij_const[0] = fg_psij0;
4750  for(l=1; l<=2*m+1; l++)
4751  {
4752  fg_psij2 *= fg_psij1;
4753  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4754  }
4755 
 /* dimension 1 */
4756  fg_psij0 = ths->psi[2*(j*3+1)];
4757  fg_psij1 = ths->psi[2*(j*3+1)+1];
4758  fg_psij2 = K(1.0);
4759  psij_const[2*m+2] = fg_psij0;
4760  for(l=1; l<=2*m+1; l++)
4761  {
4762  fg_psij2 *= fg_psij1;
4763  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4764  }
4765 
 /* dimension 2 */
4766  fg_psij0 = ths->psi[2*(j*3+2)];
4767  fg_psij1 = ths->psi[2*(j*3+2)+1];
4768  fg_psij2 = K(1.0);
4769  psij_const[2*(2*m+2)] = fg_psij0;
4770  for(l=1; l<=2*m+1; l++)
4771  {
4772  fg_psij2 *= fg_psij1;
4773  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4774  }
4775 
4776  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4777  }
4778 
4779  return;
4780  } /* if(PRE_FG_PSI) */
4781 
 /* Case 4: same recurrence as case 3, but the two per-node values are
  * computed here: psi0 with PHI at the index u returned by uo(), psi1 with
  * EXP. sort(ths) is called first (unlike in the PRE_* branches above). */
4782  if(ths->flags & FG_PSI)
4783  {
4784  R fg_exp_l[3*(2*m+2)];
4785 
4786  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4787  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4788  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4789 
4790  sort(ths);
4791 
4792 #ifdef _OPENMP
4793  #pragma omp parallel for default(shared) private(k)
4794 #endif
4795  for (k = 0; k < M; k++)
4796  {
4797  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4798  INT u, o, l;
4799  R psij_const[3*(2*m+2)];
4800  R fg_psij0, fg_psij1, fg_psij2;
4801 
 /* dimension 0; only u from uo() is used below, o is not read here */
4802  uo(ths,j,&u,&o,(INT)0);
4803  fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4804  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
4805  fg_psij2 = K(1.0);
4806  psij_const[0] = fg_psij0;
4807  for(l=1; l<=2*m+1; l++)
4808  {
4809  fg_psij2 *= fg_psij1;
4810  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4811  }
4812 
 /* dimension 1 */
4813  uo(ths,j,&u,&o,(INT)1);
4814  fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4815  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
4816  fg_psij2 = K(1.0);
4817  psij_const[2*m+2] = fg_psij0;
4818  for(l=1; l<=2*m+1; l++)
4819  {
4820  fg_psij2 *= fg_psij1;
4821  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4822  }
4823 
 /* dimension 2 */
4824  uo(ths,j,&u,&o,(INT)2);
4825  fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
4826  fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
4827  fg_psij2 = K(1.0);
4828  psij_const[2*(2*m+2)] = fg_psij0;
4829  for(l=1; l<=2*m+1; l++)
4830  {
4831  fg_psij2 *= fg_psij1;
4832  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4833  }
4834 
4835  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4836  }
4837 
4838  return;
4839  } /* if(FG_PSI) */
4840 
 /* Case 5: each window value is a weighted combination of two entries of
  * the table ths->psi with weights (1-ip_w) and ip_w. Dimension t is
  * addressed at offset t*(K+1); ip_s = K/(m+2) is the index step per l. */
4841  if(ths->flags & PRE_LIN_PSI)
4842  {
 /* Local K is the plan's table parameter ths->K; K(1.0) below is still
  * the function-like macro, since it is followed by a parenthesis. */
4843  const INT K = ths->K, ip_s = K / (m + 2);
4844 
4845  sort(ths);
4846 
4847 #ifdef _OPENMP
4848  #pragma omp parallel for default(shared) private(k)
4849 #endif
4850  for (k = 0; k < M; k++)
4851  {
4852  INT u, o, l;
4853  R ip_y, ip_w;
4854  INT ip_u;
4855  R psij_const[3*(2*m+2)];
4856  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4857 
 /* dimension 0: ip_u is the integer part of ip_y, ip_w the fraction */
4858  uo(ths,j,&u,&o,(INT)0);
4859  ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
4860  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4861  ip_w = ip_y - (R)(ip_u);
4862  for(l=0; l < 2*m+2; l++)
4863  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4864  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4865 
 /* dimension 1 */
4866  uo(ths,j,&u,&o,(INT)1);
4867  ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
4868  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4869  ip_w = ip_y - (R)(ip_u);
4870  for(l=0; l < 2*m+2; l++)
4871  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4872  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4873 
 /* dimension 2 */
4874  uo(ths,j,&u,&o,(INT)2);
4875  ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
4876  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4877  ip_w = ip_y - (R)(ip_u);
4878  for(l=0; l < 2*m+2; l++)
4879  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4880  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4881 
4882  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4883  }
4884  return;
4885  } /* if(PRE_LIN_PSI) */
4886 
4887  /* no precomputed psi at all */
 /* Case 6: all 2m+2 window values per dimension are evaluated with PHI at
  * the grid indices u, u+1, ..., u+2m+1, where u comes from uo(). */
4888 
4889  sort(ths);
4890 
4891 #ifdef _OPENMP
4892  #pragma omp parallel for default(shared) private(k)
4893 #endif
4894  for (k = 0; k < M; k++)
4895  {
4896  R psij_const[3*(2*m+2)];
4897  INT u, o, l;
4898  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4899 
4900  uo(ths,j,&u,&o,(INT)0);
4901  for(l=0;l<=2*m+1;l++)
4902  psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
4903 
4904  uo(ths,j,&u,&o,(INT)1);
4905  for(l=0;l<=2*m+1;l++)
4906  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
4907 
4908  uo(ths,j,&u,&o,(INT)2);
4909  for(l=0;l<=2*m+1;l++)
4910  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
4911 
4912  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4913  }
4914 }
4915 
/*
 * Per-node body for MACRO_adjoint_3d_B_OMP_BLOCKWISE, PRE_PSI variant.
 * Expands to a single call of nfft_adjoint_3d_compute_omp_blockwise for
 * node j, passing the three stored window arrays of 2m+2 values found at
 * ths->psi + (j*3+t)*(2*m+2), t = 0,1,2.
 * Not self-contained: ths, g, j, m, n0, n1, n2, my_u0 and my_o0 must be in
 * scope at the expansion site. The expansion already ends with ';'.
 */
4916 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4917  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4918  ths->psi+j*3*(2*m+2), \
4919  ths->psi+(j*3+1)*(2*m+2), \
4920  ths->psi+(j*3+2)*(2*m+2), \
4921  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4922  n0, n1, n2, m, my_u0, my_o0);
4923 
/*
 * Per-node body for MACRO_adjoint_3d_B_OMP_BLOCKWISE, PRE_FG_PSI variant.
 * For each dimension t it reads two stored values, psi0 = ths->psi[2*(j*3+t)]
 * and psi1 = ths->psi[2*(j*3+t)+1], and rebuilds the 2m+2 window values as
 * psi0 * psi1^l * fg_exp_l[t*(2m+2)+l] (psi1^l kept as a running product in
 * fg_psij2). The result is passed to nfft_adjoint_3d_compute_omp_blockwise.
 * Not self-contained: ths, g, j, m, n0, n1, n2, my_u0, my_o0 and the table
 * fg_exp_l must be in scope at the expansion site.
 */
4924 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4925 { \
4926  INT l; \
4927  R psij_const[3*(2*m+2)]; \
4928  R fg_psij0 = ths->psi[2*j*3]; \
4929  R fg_psij1 = ths->psi[2*j*3+1]; \
4930  R fg_psij2 = K(1.0); \
4931  \
4932  psij_const[0] = fg_psij0; \
4933  for(l=1; l<=2*m+1; l++) \
4934  { \
4935  fg_psij2 *= fg_psij1; \
4936  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4937  } \
4938  \
4939  fg_psij0 = ths->psi[2*(j*3+1)]; \
4940  fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4941  fg_psij2 = K(1.0); \
4942  psij_const[2*m+2] = fg_psij0; \
4943  for(l=1; l<=2*m+1; l++) \
4944  { \
4945  fg_psij2 *= fg_psij1; \
4946  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4947  } \
4948  \
4949  fg_psij0 = ths->psi[2*(j*3+2)]; \
4950  fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4951  fg_psij2 = K(1.0); \
4952  psij_const[2*(2*m+2)] = fg_psij0; \
4953  for(l=1; l<=2*m+1; l++) \
4954  { \
4955  fg_psij2 *= fg_psij1; \
4956  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4957  } \
4958  \
4959  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4960  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4961  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4962  n0, n1, n2, m, my_u0, my_o0); \
4963 }
4964 
/*
 * Per-node body for MACRO_adjoint_3d_B_OMP_BLOCKWISE, FG_PSI variant.
 * For each dimension t it obtains u from uo(), computes psi0 with PHI and
 * psi1 with EXP, and builds the 2m+2 window values as
 * psi0 * psi1^l * fg_exp_l[t*(2m+2)+l] (psi1^l kept as a running product in
 * fg_psij2). The result is passed to nfft_adjoint_3d_compute_omp_blockwise.
 * Not self-contained: ths, g, j, m, n0, n1, n2, my_u0, my_o0 and the table
 * fg_exp_l must be in scope at the expansion site.
 */
4965 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4966 { \
4967  INT u, o, l; \
4968  R psij_const[3*(2*m+2)]; \
4969  R fg_psij0, fg_psij1, fg_psij2; \
4970  \
4971  uo(ths,j,&u,&o,(INT)0); \
4972  fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/((R)n0),0)); \
4973  fg_psij1 = EXP(K(2.0)*(((R)n0)*(ths->x[3*j]) - (R)u)/ths->b[0]); \
4974  fg_psij2 = K(1.0); \
4975  psij_const[0] = fg_psij0; \
4976  for(l=1; l<=2*m+1; l++) \
4977  { \
4978  fg_psij2 *= fg_psij1; \
4979  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4980  } \
4981  \
4982  uo(ths,j,&u,&o,(INT)1); \
4983  fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/((R)n1),1)); \
4984  fg_psij1 = EXP(K(2.0)*(((R)n1)*(ths->x[3*j+1]) - (R)u)/ths->b[1]); \
4985  fg_psij2 = K(1.0); \
4986  psij_const[2*m+2] = fg_psij0; \
4987  for(l=1; l<=2*m+1; l++) \
4988  { \
4989  fg_psij2 *= fg_psij1; \
4990  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4991  } \
4992  \
4993  uo(ths,j,&u,&o,(INT)2); \
4994  fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/((R)n2),2)); \
4995  fg_psij1 = EXP(K(2.0)*(((R)n2)*(ths->x[3*j+2]) - (R)u)/ths->b[2]); \
4996  fg_psij2 = K(1.0); \
4997  psij_const[2*(2*m+2)] = fg_psij0; \
4998  for(l=1; l<=2*m+1; l++) \
4999  { \
5000  fg_psij2 *= fg_psij1; \
5001  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
5002  } \
5003  \
5004  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
5005  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
5006  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
5007  n0, n1, n2, m, my_u0, my_o0); \
5008 }
5009 
/*
 * Per-node body for MACRO_adjoint_3d_B_OMP_BLOCKWISE, PRE_LIN_PSI variant.
 * For each dimension t it obtains u from uo(), splits ip_y into integer part
 * ip_u and fraction ip_w, and forms every window value as a combination of
 * two entries of the table ths->psi (addressed at offset t*(K+1)) with
 * weights (1-ip_w) and ip_w. The result is passed to
 * nfft_adjoint_3d_compute_omp_blockwise.
 * Not self-contained: ths, g, j, m, n0, n1, n2, my_u0, my_o0 and the integer
 * variables K and ip_s must be in scope at the expansion site.
 */
5010 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
5011 { \
5012  INT u, o, l; \
5013  R psij_const[3*(2*m+2)]; \
5014  INT ip_u; \
5015  R ip_y, ip_w; \
5016  \
5017  uo(ths,j,&u,&o,(INT)0); \
5018  ip_y = FABS(((R)n0)*ths->x[3*j+0] - (R)u)*((R)ip_s); \
5019  ip_u = LRINT(FLOOR(ip_y)); \
5020  ip_w = ip_y-ip_u; \
5021  for(l=0; l < 2*m+2; l++) \
5022  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
5023  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
5024  \
5025  uo(ths,j,&u,&o,(INT)1); \
5026  ip_y = FABS(((R)n1)*ths->x[3*j+1] - (R)u)*((R)ip_s); \
5027  ip_u = LRINT(FLOOR(ip_y)); \
5028  ip_w = ip_y-ip_u; \
5029  for(l=0; l < 2*m+2; l++) \
5030  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
5031  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
5032  \
5033  uo(ths,j,&u,&o,(INT)2); \
5034  ip_y = FABS(((R)n2)*ths->x[3*j+2] - (R)u)*((R)ip_s); \
5035  ip_u = LRINT(FLOOR(ip_y)); \
5036  ip_w = ip_y-ip_u; \
5037  for(l=0; l < 2*m+2; l++) \
5038  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
5039  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
5040  \
5041  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
5042  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
5043  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
5044  n0, n1, n2, m, my_u0, my_o0); \
5045 }
5046 
/*
 * Per-node body for MACRO_adjoint_3d_B_OMP_BLOCKWISE when nothing is
 * precomputed. For each dimension t it obtains u from uo() and evaluates the
 * 2m+2 window values with PHI at the grid indices u, u+1, ..., u+2m+1, then
 * passes them to nfft_adjoint_3d_compute_omp_blockwise.
 * Not self-contained: ths, g, j, m, n0, n1, n2, my_u0 and my_o0 must be in
 * scope at the expansion site.
 */
5047 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
5048 { \
5049  INT u, o, l; \
5050  R psij_const[3*(2*m+2)]; \
5051  \
5052  uo(ths,j,&u,&o,(INT)0); \
5053  for(l=0;l<=2*m+1;l++) \
5054  psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/((R) n0),0)); \
5055  \
5056  uo(ths,j,&u,&o,(INT)1); \
5057  for(l=0;l<=2*m+1;l++) \
5058  psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/((R) n1),1)); \
5059  \
5060  uo(ths,j,&u,&o,(INT)2); \
5061  for(l=0;l<=2*m+1;l++) \
5062  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/((R) n2),2)); \
5063  \
5064  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
5065  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
5066  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
5067  n0, n1, n2, m, my_u0, my_o0); \
5068 }
5069 
/*
 * Blockwise OpenMP driver for the 3-d adjoint "B" step.
 *
 * whichone is pasted onto MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ to pick
 * the per-node body (PRE_PSI, PRE_FG_PSI, FG_PSI, PRE_LIN_PSI or NO_PSI).
 *
 * If ths->flags has NFFT_OMP_BLOCKWISE_ADJOINT set, the expansion opens an
 * "omp parallel" region in which every thread
 *  - calls nfft_adjoint_B_omp_blockwise_init to obtain my_u0, my_o0 and the
 *    two ranges [min_u_a, max_u_a] and [min_u_b, max_u_b];
 *  - for each range whose minimum is not -1, locates a start position in
 *    ths->index_x with index_x_binary_search and advances k, running the
 *    per-node body for j = ar_x[2*k+1], until ar_x[2*k] leaves the range or
 *    k reaches M;
 * and then RETURNS from the enclosing function. If the flag is not set the
 * expansion does nothing and execution continues after it.
 *
 * The first range uses the enclosing function's k (made private by the
 * pragma); the second range declares its own block-local INT k that shadows
 * it. Requires ths, k, M and m, plus whatever the chosen per-node body
 * needs, at the expansion site. The MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A/B
 * macros are defined outside this view.
 */
5070 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
5071 { \
5072  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
5073  { \
5074  _Pragma("omp parallel private(k)") \
5075  { \
5076  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
5077  INT *ar_x = ths->index_x; \
5078  \
5079  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
5080  &min_u_b, &max_u_b, 3, ths->n, m); \
5081  \
5082  if (min_u_a != -1) \
5083  { \
5084  k = index_x_binary_search(ar_x, M, min_u_a); \
5085  \
5086  MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
5087  \
5088  while (k < M) \
5089  { \
5090  INT u_prod = ar_x[2*k]; \
5091  INT j = ar_x[2*k+1]; \
5092  \
5093  if (u_prod < min_u_a || u_prod > max_u_a) \
5094  break; \
5095  \
5096  MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
5097  \
5098  k++; \
5099  } \
5100  } \
5101  \
5102  if (min_u_b != -1) \
5103  { \
5104  INT k = index_x_binary_search(ar_x, M, min_u_b); \
5105  \
5106  MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
5107  \
5108  while (k < M) \
5109  { \
5110  INT u_prod = ar_x[2*k]; \
5111  INT j = ar_x[2*k+1]; \
5112  \
5113  if (u_prod < min_u_b || u_prod > max_u_b) \
5114  break; \
5115  \
5116  MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
5117  \
5118  k++; \
5119  } \
5120  } \
5121  } /* omp parallel */ \
5122  return; \
5123  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
5124 }
5125 
/*
 * 3-d "B" step of the adjoint transform: clears the grid ths->g and then
 * hands every sample ths->f[j], together with its window values, to one of
 * the nfft_adjoint_3d_compute_* routines (defined outside this view;
 * presumably they accumulate into g).
 *
 * The window values are obtained exactly as in the forward B step, selected
 * by the first flag set among PRE_FULL_PSI, PRE_PSI, PRE_FG_PSI, FG_PSI,
 * PRE_LIN_PSI, or evaluated with PHI if none is set.
 *
 * With _OPENMP defined, each branch (except PRE_FULL_PSI) first expands
 * MACRO_adjoint_3d_B_OMP_BLOCKWISE, which handles everything and returns
 * when NFFT_OMP_BLOCKWISE_ADJOINT is set; otherwise a "parallel for" loop
 * calls the _omp_atomic variant. Without _OPENMP the _serial variant is
 * used. Note the differing first argument: the atomic variant receives the
 * value ths->f[j], the serial variant the pointer ths->f+j.
 *
 * In the loops k is mapped to the node number j through
 * ths->index_x[2*k+1] when NFFT_SORT_NODES is set; otherwise j == k.
 */
5126 static void nfft_adjoint_3d_B(X(plan) *ths)
5127 {
5128  INT k;
5129  const INT n0 = ths->n[0];
5130  const INT n1 = ths->n[1];
5131  const INT n2 = ths->n[2];
5132  const INT M = ths->M_total;
5133  const INT m = ths->m;
5134 
5135  C* g = (C*) ths->g;
5136 
 /* start from an all-zero grid of ths->n_total entries */
5137  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
5138 
 /* Case 1: fully precomputed weights and indices; delegated entirely. */
5139  if(ths->flags & PRE_FULL_PSI)
5140  {
5141  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
5142  (INT)3, ths->n, m, ths->flags, ths->index_x);
5143  return;
5144  } /* if(PRE_FULL_PSI) */
5145 
 /* Case 2: 2m+2 stored window values per node and dimension, the block
  * for node j and dimension t starting at ths->psi + (j*3+t)*(2*m+2). */
5146  if(ths->flags & PRE_PSI)
5147  {
5148 #ifdef _OPENMP
5149  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
5150 #endif
5151 
5152 #ifdef _OPENMP
5153  #pragma omp parallel for default(shared) private(k)
5154 #endif
5155  for (k = 0; k < M; k++)
5156  {
5157  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5158 #ifdef _OPENMP
5159  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5160 #else
5161  nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5162 #endif
5163  }
5164  return;
5165  } /* if(PRE_PSI) */
5166 
 /* Case 3: two stored values per node and dimension; the 2m+2 window
  * values are rebuilt as psi0 * psi1^l * fg_exp_l[l], with psi1^l kept as
  * a running product in fg_psij2. */
5167  if(ths->flags & PRE_FG_PSI)
5168  {
 /* One table of 2m+2 factors per dimension, filled by
  * nfft_3d_init_fg_exp_l from m and ths->b[t]. */
5169  R fg_exp_l[3*(2*m+2)];
5170 
5171  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
5172  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
5173  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
5174 
5175 #ifdef _OPENMP
5176  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
5177 #endif
5178 
5179 #ifdef _OPENMP
5180  #pragma omp parallel for default(shared) private(k)
5181 #endif
5182  for (k = 0; k < M; k++)
5183  {
5184  R psij_const[3*(2*m+2)];
5185  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5186  INT l;
5187  R fg_psij0 = ths->psi[2*j*3];
5188  R fg_psij1 = ths->psi[2*j*3+1];
5189  R fg_psij2 = K(1.0);
5190 
 /* dimension 0 */
5191  psij_const[0] = fg_psij0;
5192  for(l=1; l<=2*m+1; l++)
5193  {
5194  fg_psij2 *= fg_psij1;
5195  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
5196  }
5197 
 /* dimension 1 */
5198  fg_psij0 = ths->psi[2*(j*3+1)];
5199  fg_psij1 = ths->psi[2*(j*3+1)+1];
5200  fg_psij2 = K(1.0);
5201  psij_const[2*m+2] = fg_psij0;
5202  for(l=1; l<=2*m+1; l++)
5203  {
5204  fg_psij2 *= fg_psij1;
5205  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5206  }
5207 
 /* dimension 2 */
5208  fg_psij0 = ths->psi[2*(j*3+2)];
5209  fg_psij1 = ths->psi[2*(j*3+2)+1];
5210  fg_psij2 = K(1.0);
5211  psij_const[2*(2*m+2)] = fg_psij0;
5212  for(l=1; l<=2*m+1; l++)
5213  {
5214  fg_psij2 *= fg_psij1;
5215  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5216  }
5217 
5218 #ifdef _OPENMP
5219  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5220 #else
5221  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5222 #endif
5223  }
5224 
5225  return;
5226  } /* if(PRE_FG_PSI) */
5227 
 /* Case 4: same recurrence as case 3, but psi0 (PHI) and psi1 (EXP) are
  * computed per node here. sort(ths) is called before the loops. */
5228  if(ths->flags & FG_PSI)
5229  {
5230  R fg_exp_l[3*(2*m+2)];
5231 
5232  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
5233  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
5234  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
5235 
5236  sort(ths);
5237 
5238 #ifdef _OPENMP
5239  MACRO_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
5240 #endif
5241 
5242 #ifdef _OPENMP
5243  #pragma omp parallel for default(shared) private(k)
5244 #endif
5245  for (k = 0; k < M; k++)
5246  {
5247  INT u,o,l;
5248  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5249  R psij_const[3*(2*m+2)];
5250  R fg_psij0, fg_psij1, fg_psij2;
5251 
 /* dimension 0; only u from uo() is used below, o is not read here */
5252  uo(ths,j,&u,&o,(INT)0);
5253  fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
5254  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
5255  fg_psij2 = K(1.0);
5256  psij_const[0] = fg_psij0;
5257  for(l=1; l<=2*m+1; l++)
5258  {
5259  fg_psij2 *= fg_psij1;
5260  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
5261  }
5262 
 /* dimension 1 */
5263  uo(ths,j,&u,&o,(INT)1);
5264  fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
5265  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
5266  fg_psij2 = K(1.0);
5267  psij_const[2*m+2] = fg_psij0;
5268  for(l=1; l<=2*m+1; l++)
5269  {
5270  fg_psij2 *= fg_psij1;
5271  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5272  }
5273 
 /* dimension 2 */
5274  uo(ths,j,&u,&o,(INT)2);
5275  fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
5276  fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
5277  fg_psij2 = K(1.0);
5278  psij_const[2*(2*m+2)] = fg_psij0;
5279  for(l=1; l<=2*m+1; l++)
5280  {
5281  fg_psij2 *= fg_psij1;
5282  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5283  }
5284 
5285 #ifdef _OPENMP
5286  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5287 #else
5288  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5289 #endif
5290  }
5291 
5292  return;
5293  } /* if(FG_PSI) */
5294 
 /* Case 5: each window value is a combination of two entries of the table
  * ths->psi with weights (1-ip_w) and ip_w; dimension t is addressed at
  * offset t*(K+1). K and ip_s are also used by the blockwise macro. */
5295  if(ths->flags & PRE_LIN_PSI)
5296  {
5297  const INT K = ths->K;
5298  const INT ip_s = K / (m + 2);
5299 
5300  sort(ths);
5301 
5302 #ifdef _OPENMP
5303  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
5304 #endif
5305 
5306 #ifdef _OPENMP
5307  #pragma omp parallel for default(shared) private(k)
5308 #endif
5309  for (k = 0; k < M; k++)
5310  {
5311  INT u,o,l;
5312  INT ip_u;
5313  R ip_y, ip_w;
5314  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5315  R psij_const[3*(2*m+2)];
5316 
 /* dimension 0: ip_u is the integer part of ip_y, ip_w the fraction */
5317  uo(ths,j,&u,&o,(INT)0);
5318  ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
5319  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5320  ip_w = ip_y - (R)(ip_u);
5321  for(l=0; l < 2*m+2; l++)
5322  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5323  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5324 
 /* dimension 1 */
5325  uo(ths,j,&u,&o,(INT)1);
5326  ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
5327  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5328  ip_w = ip_y - (R)(ip_u);
5329  for(l=0; l < 2*m+2; l++)
5330  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5331  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5332 
 /* dimension 2 */
5333  uo(ths,j,&u,&o,(INT)2);
5334  ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
5335  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5336  ip_w = ip_y - (R)(ip_u);
5337  for(l=0; l < 2*m+2; l++)
5338  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5339  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5340 
5341 #ifdef _OPENMP
5342  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5343 #else
5344  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5345 #endif
5346  }
5347  return;
5348  } /* if(PRE_LIN_PSI) */
5349 
5350  /* no precomputed psi at all */
 /* Case 6: all 2m+2 window values per dimension are evaluated with PHI at
  * the grid indices u, u+1, ..., u+2m+1, where u comes from uo(). */
5351  sort(ths);
5352 
5353 #ifdef _OPENMP
5354  MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5355 #endif
5356 
5357 #ifdef _OPENMP
5358  #pragma omp parallel for default(shared) private(k)
5359 #endif
5360  for (k = 0; k < M; k++)
5361  {
5362  INT u,o,l;
5363  R psij_const[3*(2*m+2)];
5364  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5365 
5366  uo(ths,j,&u,&o,(INT)0);
5367  for(l=0;l<=2*m+1;l++)
5368  psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
5369 
5370  uo(ths,j,&u,&o,(INT)1);
5371  for(l=0;l<=2*m+1;l++)
5372  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
5373 
5374  uo(ths,j,&u,&o,(INT)2);
5375  for(l=0;l<=2*m+1;l++)
5376  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
5377 
5378 #ifdef _OPENMP
5379  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5380 #else
5381  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5382 #endif
5383  }
5384 }
5385 
5386 
/*
 * Forward transform for a 3-d plan.
 *
 * Falls back to X(trafo_direct) when, in any dimension t, N[t] <= m or
 * n[t] <= 2*m+2. Otherwise it runs three stages:
 *   1. zero ths->g_hat and fill it with the entries of ths->f_hat, each
 *      multiplied by one factor per dimension (stored in ths->c_phi_inv when
 *      PRE_PHI_HUT is set, else computed as 1/PHI_HUT);
 *   2. execute ths->my_fftw_plan1;
 *   3. call nfft_trafo_3d_B.
 *
 * Index mapping of stage 1, per dimension t with k in [0, N[t]/2):
 *   f_hat index k          -> g_hat index n[t] - N[t]/2 + k   (factor "..1")
 *   f_hat index N[t]/2 + k -> g_hat index k                   (factor "..2")
 * In the pointer names g_hatABC / f_hatABC the digits A, B, C (1 or 2) say
 * which of the two cases applies in dimension 0, 1, 2 respectively.
 *
 * TIC/TOC/TIC_FFTW/TOC_FFTW are macros defined outside this view
 * (presumably timing hooks).
 */
5387 void X(trafo_3d)(X(plan) *ths)
5388 {
5389  if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5390  {
5391  X(trafo_direct)(ths);
5392  return;
5393  }
5394 
5395  INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5396  C *g_hat,*f_hat;
5397  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5398  R ck01, ck02, ck11, ck12, ck21, ck22;
5399  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5400  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5401 
 /* select the work buffers used by this transform direction */
5402  ths->g_hat=ths->g1;
5403  ths->g=ths->g2;
5404 
5405  N0=ths->N[0];
5406  N1=ths->N[1];
5407  N2=ths->N[2];
5408  n0=ths->n[0];
5409  n1=ths->n[1];
5410  n2=ths->n[2];
5411 
5412  f_hat=(C*)ths->f_hat;
5413  g_hat=(C*)ths->g_hat;
5414 
 /* Stage 1a: clear all n_total entries of g_hat (element loop under
  * OpenMP, memset otherwise); stage 1b below writes only part of them. */
5415  TIC(0)
5416 #ifdef _OPENMP
5417  #pragma omp parallel for default(shared) private(k0)
5418  for (k0 = 0; k0 < ths->n_total; k0++)
5419  ths->g_hat[k0] = 0.0;
5420 #else
5421  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
5422 #endif
5423 
 /* Stage 1b, stored factors: c_phi_inv[t][k] is used for the "1" case
  * and c_phi_inv[t][N[t]/2 + k] for the "2" case of dimension t. */
5424  if(ths->flags & PRE_PHI_HUT)
5425  {
5426  c_phi_inv01=ths->c_phi_inv[0];
5427  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5428 
5429 #ifdef _OPENMP
5430  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5431 #endif
5432  for(k0=0;k0<N0/2;k0++)
5433  {
5434  ck01=c_phi_inv01[k0];
5435  ck02=c_phi_inv02[k0];
5436  c_phi_inv11=ths->c_phi_inv[1];
5437  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5438 
5439  for(k1=0;k1<N1/2;k1++)
5440  {
5441  ck11=c_phi_inv11[k1];
5442  ck12=c_phi_inv12[k1];
5443  c_phi_inv21=ths->c_phi_inv[2];
5444  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5445 
 /* row base pointers for the eight (dim0, dim1, dim2) half combinations;
  * the innermost loop then only indexes them with k2 */
5446  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5447  f_hat111=f_hat + (k0*N1+k1)*N2;
5448  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5449  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5450  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5451  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5452  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5453  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5454 
5455  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5456  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5457  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5458  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5459  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5460  f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5461  g_hat222=g_hat + (k0*n1+k1)*n2;
5462  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5463 
5464  for(k2=0;k2<N2/2;k2++)
5465  {
5466  ck21=c_phi_inv21[k2];
5467  ck22=c_phi_inv22[k2];
5468 
5469  g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5470  g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5471  g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5472  g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5473 
5474  g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5475  g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5476  g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5477  g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5478  }
5479  }
5480  }
5481  }
5482  else
5483 #ifdef _OPENMP
5484  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5485 #endif
5486  for(k0=0;k0<N0/2;k0++)
5487  {
 /* Stage 1b, factors computed on the fly: "1" case uses frequency
  * k - N[t]/2, "2" case uses frequency k. Same index mapping as above. */
5488  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5489  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5490  for(k1=0;k1<N1/2;k1++)
5491  {
5492  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5493  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5494 
5495  for(k2=0;k2<N2/2;k2++)
5496  {
5497  ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5498  ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5499 
5500  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5501  g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5502  g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5503  g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5504 
5505  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5506  g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5507  g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5508  g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5509  }
5510  }
5511  }
5512 
5513  TOC(0)
5514 
 /* Stage 2: run the FFTW plan stored in my_fftw_plan1 */
5515  TIC_FFTW(1)
5516  FFTW(execute)(ths->my_fftw_plan1);
5517  TOC_FFTW(1);
5518 
 /* Stage 3: compute ths->f from the grid ths->g */
5519  TIC(2);
5520  nfft_trafo_3d_B(ths);
5521  TOC(2);
5522 }
5523 
/*
 * Adjoint transform for a 3-d plan.
 *
 * Falls back to X(adjoint_direct) when, in any dimension t, N[t] <= m or
 * n[t] <= 2*m+2. Otherwise it runs the three stages of the forward
 * transform in reverse order:
 *   1. call nfft_adjoint_3d_B;
 *   2. execute ths->my_fftw_plan2;
 *   3. fill ths->f_hat from ths->g_hat, each entry multiplied by one factor
 *      per dimension (stored in ths->c_phi_inv when PRE_PHI_HUT is set, else
 *      computed as 1/PHI_HUT).
 *
 * Index mapping of stage 3, per dimension t with k in [0, N[t]/2):
 *   g_hat index n[t] - N[t]/2 + k -> f_hat index k            (factor "..1")
 *   g_hat index k                 -> f_hat index N[t]/2 + k   (factor "..2")
 * In the pointer names g_hatABC / f_hatABC the digits A, B, C (1 or 2) say
 * which of the two cases applies in dimension 0, 1, 2 respectively.
 *
 * TIC/TOC/TIC_FFTW/TOC_FFTW are macros defined outside this view
 * (presumably timing hooks).
 */
5524 void X(adjoint_3d)(X(plan) *ths)
5525 {
5526  if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5527  {
5528  X(adjoint_direct)(ths);
5529  return;
5530  }
5531 
5532  INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5533  C *g_hat,*f_hat;
5534  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5535  R ck01, ck02, ck11, ck12, ck21, ck22;
5536  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5537  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5538 
 /* select the work buffers used by this transform direction */
5539  ths->g_hat=ths->g1;
5540  ths->g=ths->g2;
5541 
5542  N0=ths->N[0];
5543  N1=ths->N[1];
5544  N2=ths->N[2];
5545  n0=ths->n[0];
5546  n1=ths->n[1];
5547  n2=ths->n[2];
5548 
5549  f_hat=(C*)ths->f_hat;
5550  g_hat=(C*)ths->g_hat;
5551 
 /* Stage 1: process the samples ths->f into the grid ths->g */
5552  TIC(2);
5553  nfft_adjoint_3d_B(ths);
5554  TOC(2);
5555 
 /* Stage 2: run the FFTW plan stored in my_fftw_plan2 */
5556  TIC_FFTW(1)
5557  FFTW(execute)(ths->my_fftw_plan2);
5558  TOC_FFTW(1);
5559 
 /* Stage 3, stored factors: c_phi_inv[t][k] is used for the "1" case and
  * c_phi_inv[t][N[t]/2 + k] for the "2" case of dimension t. */
5560  TIC(0)
5561  if(ths->flags & PRE_PHI_HUT)
5562  {
5563  c_phi_inv01=ths->c_phi_inv[0];
5564  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5565 
5566 #ifdef _OPENMP
5567  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5568 #endif
5569  for(k0=0;k0<N0/2;k0++)
5570  {
5571  ck01=c_phi_inv01[k0];
5572  ck02=c_phi_inv02[k0];
5573  c_phi_inv11=ths->c_phi_inv[1];
5574  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5575 
5576  for(k1=0;k1<N1/2;k1++)
5577  {
5578  ck11=c_phi_inv11[k1];
5579  ck12=c_phi_inv12[k1];
5580  c_phi_inv21=ths->c_phi_inv[2];
5581  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5582 
 /* row base pointers for the eight (dim0, dim1, dim2) half combinations;
  * the innermost loop then only indexes them with k2 */
5583  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5584  f_hat111=f_hat + (k0*N1+k1)*N2;
5585  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5586  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5587  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5588  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5589  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5590  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5591 
5592  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5593  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5594  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5595  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5596  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5597  f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5598  g_hat222=g_hat + (k0*n1+k1)*n2;
5599  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5600 
5601  for(k2=0;k2<N2/2;k2++)
5602  {
5603  ck21=c_phi_inv21[k2];
5604  ck22=c_phi_inv22[k2];
5605 
5606  f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5607  f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5608  f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5609  f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5610 
5611  f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5612  f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5613  f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5614  f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5615  }
5616  }
5617  }
5618  }
5619  else
5620 #ifdef _OPENMP
5621  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5622 #endif
5623  for(k0=0;k0<N0/2;k0++)
5624  {
 /* Stage 3, factors computed on the fly: "1" case uses frequency
  * k - N[t]/2, "2" case uses frequency k. Same index mapping as above. */
5625  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5626  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5627  for(k1=0;k1<N1/2;k1++)
5628  {
5629  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5630  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5631 
5632  for(k2=0;k2<N2/2;k2++)
5633  {
5634  ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5635  ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5636 
5637  f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5638  f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5639  f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5640  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5641 
5642  f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5643  f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5644  f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5645  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5646  }
5647  }
5648  }
5649 
5650  TOC(0)
5651 }
5652 
5655 void X(trafo)(X(plan) *ths)
5656 {
5657  /* use direct transform if degree N is too low */
5658  for (int j = 0; j < ths->d; j++)
5659  {
5660  if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5661  {
5662  X(trafo_direct)(ths);
5663  return;
5664  }
5665  }
5666 
5667  switch(ths->d)
5668  {
5669  case 1: X(trafo_1d)(ths); break;
5670  case 2: X(trafo_2d)(ths); break;
5671  case 3: X(trafo_3d)(ths); break;
5672  default:
5673  {
5674  /* use ths->my_fftw_plan1 */
5675  ths->g_hat = ths->g1;
5676  ths->g = ths->g2;
5677 
5681  TIC(0)
5682  D_A(ths);
5683  TOC(0)
5684 
5689  TIC_FFTW(1)
5690  FFTW(execute)(ths->my_fftw_plan1);
5691  TOC_FFTW(1)
5692 
5696  TIC(2)
5697  B_A(ths);
5698  TOC(2)
5699  }
5700  }
5701 } /* nfft_trafo */
5702 
5703 void X(adjoint)(X(plan) *ths)
5704 {
5705  /* use direct transform if degree N is too low */
5706  for (int j = 0; j < ths->d; j++)
5707  {
5708  if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5709  {
5710  X(adjoint_direct)(ths);
5711  return;
5712  }
5713  }
5714 
5715  switch(ths->d)
5716  {
5717  case 1: X(adjoint_1d)(ths); break;
5718  case 2: X(adjoint_2d)(ths); break;
5719  case 3: X(adjoint_3d)(ths); break;
5720  default:
5721  {
5722  /* use ths->my_fftw_plan2 */
5723  ths->g_hat=ths->g1;
5724  ths->g=ths->g2;
5725 
5729  TIC(2)
5730  B_T(ths);
5731  TOC(2)
5732 
5737  TIC_FFTW(1)
5738  FFTW(execute)(ths->my_fftw_plan2);
5739  TOC_FFTW(1)
5740 
5744  TIC(0)
5745  D_T(ths);
5746  TOC(0)
5747  }
5748  }
5749 } /* nfft_adjoint */
5750 
5751 
5754 static void precompute_phi_hut(X(plan) *ths)
5755 {
5756  INT ks[ths->d]; /* index over all frequencies */
5757  INT t; /* index over all dimensions */
5758 
5759  ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) * sizeof(R*));
5760 
5761  for (t = 0; t < ths->d; t++)
5762  {
5763  ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) * sizeof(R));
5764 
5765  for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
5766  {
5767  ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
5768  }
5769  }
5770 } /* nfft_phi_hut */
5771 
5776 void X(precompute_lin_psi)(X(plan) *ths)
5777 {
5778  INT t;
5779  INT j;
5780  R step;
5782  for (t=0; t<ths->d; t++)
5783  {
5784  step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
5785  for(j = 0;j <= ths->K; j++)
5786  {
5787  ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
5788  } /* for(j) */
5789  } /* for(t) */
5790 }
5791 
5792 void X(precompute_fg_psi)(X(plan) *ths)
5793 {
5794  INT t;
5795  INT u, o;
5797  sort(ths);
5798 
5799  for (t=0; t<ths->d; t++)
5800  {
5801  INT j;
5802 #ifdef _OPENMP
5803  #pragma omp parallel for default(shared) private(j,u,o)
5804 #endif
5805  for (j = 0; j < ths->M_total; j++)
5806  {
5807  uo(ths,j,&u,&o,t);
5808 
5809  ths->psi[2*(j*ths->d+t)]=
5810  (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
5811 
5812  ths->psi[2*(j*ths->d+t)+1]=
5813  EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
5814  } /* for(j) */
5815  }
5816  /* for(t) */
5817 } /* nfft_precompute_fg_psi */
5818 
5819 void X(precompute_psi)(X(plan) *ths)
5820 {
5821  INT t; /* index over all dimensions */
5822  INT l; /* index u<=l<=o */
5823  INT lj; /* index 0<=lj<u+o+1 */
5824  INT u, o; /* depends on x_j */
5825 
5826  sort(ths);
5827 
5828  for (t=0; t<ths->d; t++)
5829  {
5830  INT j;
5831 #ifdef _OPENMP
5832  #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5833 #endif
5834  for (j = 0; j < ths->M_total; j++)
5835  {
5836  uo(ths,j,&u,&o,t);
5837 
5838  for(l = u, lj = 0; l <= o; l++, lj++)
5839  ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
5840  (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
5841  } /* for(j) */
5842  }
5843  /* for(t) */
5844 } /* nfft_precompute_psi */
5845 
#ifdef _OPENMP
/**
 * OpenMP variant of the PRE_FULL_PSI precomputation.
 *
 * Every node j owns the lprod = (2m+2)^d consecutive slots starting at
 * j*lprod in ths->psi and ths->psi_index_g, so the nodes can be processed
 * independently; ths->psi_index_f[j] is set to lprod.
 *
 * NOTE(review): t, t2, lj, ll_plain, u, o, phi_prod and j are only touched
 * through the MACRO_* helpers defined outside this function, so their names
 * are presumably part of the macro contract and must not be changed.
 *
 * \arg ths the plan whose tables are filled
 */
static void nfft_precompute_full_psi_omp(X(plan) *ths)
{
  INT j;         /* index over all nodes */
  INT lprod = 1; /* number of entries per node */
  INT dim;

  for (dim = 0; dim < ths->d; dim++)
    lprod *= 2*ths->m+2;

  #pragma omp parallel for default(shared) private(j)
  for (j = 0; j < ths->M_total; j++)
  {
    INT t, t2;
    INT l_L;
    INT lj[ths->d];
    INT ll_plain[ths->d+1];
    INT u[ths->d], o[ths->d];
    R phi_prod[ths->d+1];
    INT ix = j*lprod; /* first slot owned by node j */

    phi_prod[0] = K(1.0);
    ll_plain[0] = 0;

    MACRO_init_uo_l_lj_t;

    for (l_L = 0; l_L < lprod; l_L++)
    {
      MACRO_update_phi_prod_ll_plain(without_PRE_PSI);

      ths->psi_index_g[ix] = ll_plain[ths->d];
      ths->psi[ix] = phi_prod[ths->d];
      ix++;

      MACRO_count_uo_l_lj_t;
    } /* for(l_L) */

    ths->psi_index_f[j] = lprod;
  } /* for(j) */
}
#endif
5890 
5891 void X(precompute_full_psi)(X(plan) *ths)
5892 {
5893 #ifdef _OPENMP
5894  sort(ths);
5895 
5896  nfft_precompute_full_psi_omp(ths);
5897 #else
5898  INT t, t2; /* index over all dimensions */
5899  INT j; /* index over all nodes */
5900  INT l_L; /* plain index 0 <= l_L < lprod */
5901  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
5902  INT ll_plain[ths->d+1]; /* postfix plain index */
5903  INT lprod; /* 'bandwidth' of matrix B */
5904  INT u[ths->d], o[ths->d]; /* depends on x_j */
5905 
5906  R phi_prod[ths->d+1];
5907 
5908  INT ix, ix_old;
5909 
5910  sort(ths);
5911 
5912  phi_prod[0] = K(1.0);
5913  ll_plain[0] = 0;
5914 
5915  for (t = 0, lprod = 1; t < ths->d; t++)
5916  lprod *= 2 * ths->m + 2;
5917 
5918  for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
5919  {
5920  MACRO_init_uo_l_lj_t;
5921 
5922  for (l_L = 0; l_L < lprod; l_L++, ix++)
5923  {
5924  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5925 
5926  ths->psi_index_g[ix] = ll_plain[ths->d];
5927  ths->psi[ix] = phi_prod[ths->d];
5928 
5929  MACRO_count_uo_l_lj_t;
5930  } /* for(l_L) */
5931 
5932  ths->psi_index_f[j] = ix - ix_old;
5933  ix_old = ix;
5934  } /* for(j) */
5935 #endif
5936 }
5937 
5938 void X(precompute_one_psi)(X(plan) *ths)
5939 {
5940  if(ths->flags & PRE_LIN_PSI)
5941  X(precompute_lin_psi)(ths);
5942  if(ths->flags & PRE_FG_PSI)
5943  X(precompute_fg_psi)(ths);
5944  if(ths->flags & PRE_PSI)
5945  X(precompute_psi)(ths);
5946  if(ths->flags & PRE_FULL_PSI)
5947  X(precompute_full_psi)(ths);
5948 }
5949 
5950 static void init_help(X(plan) *ths)
5951 {
5952  INT t; /* index over all dimensions */
5953  INT lprod; /* 'bandwidth' of matrix B */
5954 
5955  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5956  ths->flags |= NFFT_SORT_NODES;
5957 
5958  ths->N_total = intprod(ths->N, 0, ths->d);
5959  ths->n_total = intprod(ths->n, 0, ths->d);
5960 
5961  ths->sigma = (R*) Y(malloc)((size_t)(ths->d) * sizeof(R));
5962 
5963  for(t = 0;t < ths->d; t++)
5964  ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
5965 
5966  WINDOW_HELP_INIT;
5967 
5968  if(ths->flags & MALLOC_X)
5969  ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) * sizeof(R));
5970 
5971  if(ths->flags & MALLOC_F_HAT)
5972  ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) * sizeof(C));
5973 
5974  if(ths->flags & MALLOC_F)
5975  ths->f = (C*)Y(malloc)((size_t)(ths->M_total) * sizeof(C));
5976 
5977  if(ths->flags & PRE_PHI_HUT)
5978  precompute_phi_hut(ths);
5979 
5980  if (ths->flags & PRE_LIN_PSI)
5981  {
5982  if (ths->K == 0)
5983  {
5984  ths->K = Y(m2K)(ths->m);
5985  }
5986  ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) * sizeof(R));
5987  }
5988 
5989  if(ths->flags & PRE_FG_PSI)
5990  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) * sizeof(R));
5991 
5992  if(ths->flags & PRE_PSI)
5993  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) * sizeof(R));
5994 
5995  if(ths->flags & PRE_FULL_PSI)
5996  {
5997  for (t = 0, lprod = 1; t < ths->d; t++)
5998  lprod *= 2 * ths->m + 2;
5999 
6000  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(R));
6001 
6002  ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) * sizeof(INT));
6003  ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(INT));
6004  }
6005 
6006  if(ths->flags & FFTW_INIT)
6007  {
6008 #ifdef _OPENMP
6009  INT nthreads = Y(get_num_threads)();
6010 #endif
6011 
6012  ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) * sizeof(C));
6013 
6014  if(ths->flags & FFT_OUT_OF_PLACE)
6015  ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) * sizeof(C));
6016  else
6017  ths->g2 = ths->g1;
6018 
6019 #ifdef _OPENMP
6020 #pragma omp critical (nfft_omp_critical_fftw_plan)
6021 {
6022  FFTW(plan_with_nthreads)(nthreads);
6023 #endif
6024  {
6025  int *_n = Y(malloc)((size_t)(ths->d) * sizeof(int));
6026 
6027  for (t = 0; t < ths->d; t++)
6028  _n[t] = (int)(ths->n[t]);
6029 
6030  ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
6031  ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
6032  Y(free)(_n);
6033  }
6034 #ifdef _OPENMP
6035 }
6036 #endif
6037  }
6038 
6039  if(ths->flags & NFFT_SORT_NODES)
6040  ths->index_x = (INT*) Y(malloc)(sizeof(INT) * 2U * (size_t)(ths->M_total));
6041  else
6042  ths->index_x = NULL;
6043 
6044  ths->mv_trafo = (void (*) (void* ))X(trafo);
6045  ths->mv_adjoint = (void (*) (void* ))X(adjoint);
6046 }
6047 
6048 void X(init)(X(plan) *ths, int d, int *N, int M_total)
6049 {
6050  INT t; /* index over all dimensions */
6051 
6052  ths->d = (INT)d;
6053 
6054  ths->N = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
6055 
6056  for (t = 0; t < d; t++)
6057  ths->N[t] = (INT)N[t];
6058 
6059  ths->M_total = (INT)M_total;
6060 
6061  ths->n = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
6062 
6063  for (t = 0; t < d; t++)
6064  ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
6065 
6066  ths->m = WINDOW_HELP_ESTIMATE_m;
6067 
6068  if (d > 1)
6069  {
6070 #ifdef _OPENMP
6071  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
6072  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
6073  NFFT_OMP_BLOCKWISE_ADJOINT;
6074 #else
6075  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
6076  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
6077 #endif
6078  }
6079  else
6080  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
6082 
6083  ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
6084 
6085  ths->K = 0;
6086  init_help(ths);
6087 }
6088 
6089 void X(init_guru)(X(plan) *ths, int d, int *N, int M_total, int *n, int m,
6090  unsigned flags, unsigned fftw_flags)
6091 {
6092  INT t; /* index over all dimensions */
6093 
6094  ths->d = (INT)d;
6095  ths->M_total = (INT)M_total;
6096  ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
6097 
6098  for (t = 0; t < d; t++)
6099  ths->N[t] = (INT)N[t];
6100 
6101  ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
6102 
6103  for (t = 0; t < d; t++)
6104  ths->n[t] = (INT)n[t];
6105 
6106  ths->m = (INT)m;
6107 
6108  ths->flags = flags;
6109  ths->fftw_flags = fftw_flags;
6110 
6111  ths->K = 0;
6112  init_help(ths);
6113 }
6114 
6115 void X(init_lin)(X(plan) *ths, int d, int *N, int M_total, int *n, int m, int K,
6116  unsigned flags, unsigned fftw_flags)
6117 {
6118  INT t; /* index over all dimensions */
6119 
6120  ths->d = (INT)d;
6121  ths->M_total = (INT)M_total;
6122  ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
6123 
6124  for (t = 0; t < d; t++)
6125  ths->N[t] = (INT)N[t];
6126 
6127  ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
6128 
6129  for (t = 0; t < d; t++)
6130  ths->n[t] = (INT)n[t];
6131 
6132  ths->m = (INT)m;
6133 
6134  ths->flags = flags;
6135  ths->fftw_flags = fftw_flags;
6136 
6137  ths->K = K;
6138  init_help(ths);
6139 }
6140 
6141 void X(init_1d)(X(plan) *ths, int N1, int M_total)
6142 {
6143  int N[1];
6144 
6145  N[0] = N1;
6146 
6147  X(init)(ths, 1, N, M_total);
6148 }
6149 
6150 void X(init_2d)(X(plan) *ths, int N1, int N2, int M_total)
6151 {
6152  int N[2];
6153 
6154  N[0] = N1;
6155  N[1] = N2;
6156  X(init)(ths, 2, N, M_total);
6157 }
6158 
6159 void X(init_3d)(X(plan) *ths, int N1, int N2, int N3, int M_total)
6160 {
6161  int N[3];
6162 
6163  N[0] = N1;
6164  N[1] = N2;
6165  N[2] = N3;
6166  X(init)(ths, 3, N, M_total);
6167 }
6168 
6169 const char* X(check)(X(plan) *ths)
6170 {
6171  INT j;
6172 
6173  if (!ths->f)
6174  return "Member f not initialized.";
6175 
6176  if (!ths->x)
6177  return "Member x not initialized.";
6178 
6179  if (!ths->f_hat)
6180  return "Member f_hat not initialized.";
6181 
6182  if ((ths->flags & PRE_LIN_PSI) && ths->K < ths->M_total)
6183  return "Number of nodes too small to use PRE_LIN_PSI.";
6184 
6185  for (j = 0; j < ths->M_total * ths->d; j++)
6186  {
6187  if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
6188  {
6189  return "ths->x out of range [-0.5,0.5)";
6190  }
6191  }
6192 
6193  for (j = 0; j < ths->d; j++)
6194  {
6195  if (ths->sigma[j] <= 1)
6196  return "Oversampling factor too small";
6197 
6198  /* Automatically calls trafo_direct if
6199  if(ths->N[j] <= ths->m)
6200  return "Polynomial degree N is <= cut-off m";
6201  */
6202 
6203  if(ths->N[j]%2 == 1)
6204  return "polynomial degree N has to be even";
6205  }
6206  return 0;
6207 }
6208 
6209 void X(finalize)(X(plan) *ths)
6210 {
6211  INT t; /* index over dimensions */
6212 
6213  if(ths->flags & NFFT_SORT_NODES)
6214  Y(free)(ths->index_x);
6215 
6216  if(ths->flags & FFTW_INIT)
6217  {
6218 #ifdef _OPENMP
6219  #pragma omp critical (nfft_omp_critical_fftw_plan)
6220 #endif
6221  FFTW(destroy_plan)(ths->my_fftw_plan2);
6222 #ifdef _OPENMP
6223  #pragma omp critical (nfft_omp_critical_fftw_plan)
6224 #endif
6225  FFTW(destroy_plan)(ths->my_fftw_plan1);
6226 
6227  if(ths->flags & FFT_OUT_OF_PLACE)
6228  Y(free)(ths->g2);
6229 
6230  Y(free)(ths->g1);
6231  }
6232 
6233  if(ths->flags & PRE_FULL_PSI)
6234  {
6235  Y(free)(ths->psi_index_g);
6236  Y(free)(ths->psi_index_f);
6237  Y(free)(ths->psi);
6238  }
6239 
6240  if(ths->flags & PRE_PSI)
6241  Y(free)(ths->psi);
6242 
6243  if(ths->flags & PRE_FG_PSI)
6244  Y(free)(ths->psi);
6245 
6246  if(ths->flags & PRE_LIN_PSI)
6247  Y(free)(ths->psi);
6248 
6249  if(ths->flags & PRE_PHI_HUT)
6250  {
6251  for (t = 0; t < ths->d; t++)
6252  Y(free)(ths->c_phi_inv[t]);
6253  Y(free)(ths->c_phi_inv);
6254  }
6255 
6256  if(ths->flags & MALLOC_F)
6257  Y(free)(ths->f);
6258 
6259  if(ths->flags & MALLOC_F_HAT)
6260  Y(free)(ths->f_hat);
6261 
6262  if(ths->flags & MALLOC_X)
6263  Y(free)(ths->x);
6264 
6265  WINDOW_HELP_FINALIZE;
6266 
6267  Y(free)(ths->sigma);
6268  Y(free)(ths->n);
6269  Y(free)(ths->N);
6270 }
#define TIC(a)
Timing, method works since the inaccurate timer is updated mostly in the measured function...
Definition: infft.h:1397
#define PRE_FG_PSI
Definition: nfft3.h:196
#define MALLOC_X
Definition: nfft3.h:199
#define MALLOC_F_HAT
Definition: nfft3.h:200
#define FG_PSI
Definition: nfft3.h:194
#define FFTW_INIT
Definition: nfft3.h:203
#define MALLOC_F
Definition: nfft3.h:201
#define X(name)
Include header for C99 complex datatype.
Definition: fastsum.h:57
#define FFT_OUT_OF_PLACE
Definition: nfft3.h:202
#define PRE_LIN_PSI
Definition: nfft3.h:195
#define PRE_PSI
Definition: nfft3.h:197
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.
Definition: infft.h:1365
#define PRE_FULL_PSI
Definition: nfft3.h:198
Header file for the nfft3 library.
#define PRE_PHI_HUT
Definition: nfft3.h:193