44 #define X(name) NFFT(name) 47 static inline INT intprod(
const INT *vec,
const INT a,
const INT d)
52 for (t = 0; t < d; t++)
59 #define BASE(x) CEXP(x) 75 static inline void sort0(
const INT d,
const INT *n,
const INT m,
76 const INT local_x_num,
const R *local_x, INT *ar_x)
78 INT u_j[d], i, j, help, rhigh;
82 for (i = 0; i < local_x_num; i++)
86 for (j = 0; j < d; j++)
88 help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
89 u_j[j] = (help % n[j] + n[j]) % n[j];
91 ar_x[2 * i] += u_j[j];
93 ar_x[2 * i] *= n[j + 1];
97 for (j = 0, nprod = 1; j < d; j++)
100 rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
102 ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) *
sizeof(INT));
103 Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
105 for (i = 1; i < local_x_num; i++)
106 assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
119 static inline void sort(
const X(plan) *ths)
121 if (ths->flags & NFFT_SORT_NODES)
122 sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
145 void X(trafo_direct)(
const X(plan) *ths)
147 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
149 memset(f, 0, (
size_t)(ths->M_total) *
sizeof(C));
156 #pragma omp parallel for default(shared) private(j) 158 for (j = 0; j < ths->M_total; j++)
161 for (k_L = 0; k_L < ths->N_total; k_L++)
163 R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
164 f[j] += f_hat[k_L] * BASE(-II * omega);
173 #pragma omp parallel for default(shared) private(j) 175 for (j = 0; j < ths->M_total; j++)
177 R x[ths->d], omega, Omega[ths->d + 1];
178 INT t, t2, k_L, k[ths->d];
180 for (t = 0; t < ths->d; t++)
183 x[t] = K2PI * ths->x[j * ths->d + t];
184 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
186 omega = Omega[ths->d];
188 for (k_L = 0; k_L < ths->N_total; k_L++)
190 f[j] += f_hat[k_L] * BASE(-II * omega);
192 for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
197 for (t2 = t; t2 < ths->d; t2++)
198 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
200 omega = Omega[ths->d];
207 void X(adjoint_direct)(
const X(plan) *ths)
209 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
211 memset(f_hat, 0, (
size_t)(ths->N_total) *
sizeof(C));
218 #pragma omp parallel for default(shared) private(k_L) 219 for (k_L = 0; k_L < ths->N_total; k_L++)
222 for (j = 0; j < ths->M_total; j++)
224 R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
225 f_hat[k_L] += f[j] * BASE(II * omega);
230 for (j = 0; j < ths->M_total; j++)
233 for (k_L = 0; k_L < ths->N_total; k_L++)
235 R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
236 f_hat[k_L] += f[j] * BASE(II * omega);
246 #pragma omp parallel for default(shared) private(j, k_L) 247 for (k_L = 0; k_L < ths->N_total; k_L++)
249 INT k[ths->d], k_temp, t;
253 for (t = ths->d - 1; t >= 0; t--)
255 k[t] = k_temp % ths->N[t] - ths->N[t]/2;
259 for (j = 0; j < ths->M_total; j++)
262 for (t = 0; t < ths->d; t++)
263 omega += k[t] * K2PI * ths->x[j * ths->d + t];
264 f_hat[k_L] += f[j] * BASE(II * omega);
268 for (j = 0; j < ths->M_total; j++)
270 R x[ths->d], omega, Omega[ths->d+1];
271 INT t, t2, k[ths->d];
273 for (t = 0; t < ths->d; t++)
276 x[t] = K2PI * ths->x[j * ths->d + t];
277 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
279 omega = Omega[ths->d];
280 for (k_L = 0; k_L < ths->N_total; k_L++)
282 f_hat[k_L] += f[j] * BASE(II * omega);
284 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
289 for (t2 = t; t2 < ths->d; t2++)
290 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
292 omega = Omega[ths->d];
324 static inline void uo(
const X(plan) *ths,
const INT j, INT *up, INT *op,
327 const R xj = ths->x[j * ths->d + act_dim];
328 INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
330 (*up) = c - (ths->m);
331 (*op) = c + 1 + (ths->m);
334 static inline void uo2(INT *u, INT *o,
const R x,
const INT n,
const INT m)
336 INT c = LRINT(FLOOR(x * (R)(n)));
338 *u = (c - m + n) % n;
339 *o = (c + 1 + m + n) % n;
342 #define MACRO_D_compute_A \ 344 g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \ 347 #define MACRO_D_compute_T \ 349 f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \ 352 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C)); 354 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C)); 356 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]]; 358 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2)); 360 #define MACRO_init_k_ks \ 362 for (t = ths->d-1; 0 <= t; t--) \ 365 ks[t] = ths->N[t]/2; \ 370 #define MACRO_update_c_phi_inv_k(which_one) \ 372 for (t2 = t; t2 < ths->d; t2++) \ 374 c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \ 375 ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \ 376 k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \ 380 #define MACRO_count_k_ks \ 382 for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \ 385 ks[t]= ths->N[t]/2; \ 388 kp[t]++; k[t]++; ks[t]++; \ 389 if(kp[t] == ths->N[t]/2) \ 391 k[t] = ths->n[t] - ths->N[t]/2; \ 397 #define MACRO_D(which_one) \ 398 static inline void D_serial_ ## which_one (X(plan) *ths) \ 401 R c_phi_inv_k[ths->d+1]; \ 407 INT k_plain[ths->d+1]; \ 408 INT ks_plain[ths->d+1]; \ 410 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \ 411 MACRO_D_init_result_ ## which_one; \ 413 c_phi_inv_k[0] = K(1.0); \ 419 if (ths->flags & PRE_PHI_HUT) \ 421 for (k_L = 0; k_L < ths->N_total; k_L++) \ 423 MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \ 424 MACRO_D_compute_ ## which_one; \ 430 for (k_L = 0; k_L < ths->N_total; k_L++) \ 432 MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \ 433 MACRO_D_compute_ ## which_one; \ 440 static inline void D_openmp_A(
X(plan) *ths)
445 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
446 memset(g_hat, 0, ths->n_total *
sizeof(C));
450 #pragma omp parallel for default(shared) private(k_L) 451 for (k_L = 0; k_L < ths->N_total; k_L++)
456 R c_phi_inv_k_val = K(1.0);
458 INT ks_plain_val = 0;
462 for (t = ths->d-1; t >= 0; t--)
464 kp[t] = k_temp % ths->N[t];
465 if (kp[t] >= ths->N[t]/2)
466 k[t] = ths->n[t] - ths->N[t] + kp[t];
469 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
473 for (t = 0; t < ths->d; t++)
475 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
476 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
477 k_plain_val = k_plain_val*ths->n[t] + k[t];
480 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
485 #pragma omp parallel for default(shared) private(k_L) 486 for (k_L = 0; k_L < ths->N_total; k_L++)
491 R c_phi_inv_k_val = K(1.0);
493 INT ks_plain_val = 0;
497 for (t = ths->d-1; t >= 0; t--)
499 kp[t] = k_temp % ths->N[t];
500 if (kp[t] >= ths->N[t]/2)
501 k[t] = ths->n[t] - ths->N[t] + kp[t];
504 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
508 for (t = 0; t < ths->d; t++)
510 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
511 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
512 k_plain_val = k_plain_val*ths->n[t] + k[t];
515 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
525 static inline void D_A(
X(plan) *ths)
535 static void D_openmp_T(
X(plan) *ths)
540 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
541 memset(f_hat, 0, ths->N_total *
sizeof(C));
545 #pragma omp parallel for default(shared) private(k_L) 546 for (k_L = 0; k_L < ths->N_total; k_L++)
551 R c_phi_inv_k_val = K(1.0);
553 INT ks_plain_val = 0;
557 for (t = ths->d - 1; t >= 0; t--)
559 kp[t] = k_temp % ths->N[t];
560 if (kp[t] >= ths->N[t]/2)
561 k[t] = ths->n[t] - ths->N[t] + kp[t];
564 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
568 for (t = 0; t < ths->d; t++)
570 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
571 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
572 k_plain_val = k_plain_val*ths->n[t] + k[t];
575 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
580 #pragma omp parallel for default(shared) private(k_L) 581 for (k_L = 0; k_L < ths->N_total; k_L++)
586 R c_phi_inv_k_val = K(1.0);
588 INT ks_plain_val = 0;
592 for (t = ths->d-1; t >= 0; t--)
594 kp[t] = k_temp % ths->N[t];
595 if (kp[t] >= ths->N[t]/2)
596 k[t] = ths->n[t] - ths->N[t] + kp[t];
599 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
603 for (t = 0; t < ths->d; t++)
605 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
606 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
607 k_plain_val = k_plain_val*ths->n[t] + k[t];
610 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
620 static void D_T(
X(plan) *ths)
630 #define MACRO_B_init_result_A memset(ths->f, 0, (size_t)(ths->M_total) * sizeof(C)); 631 #define MACRO_B_init_result_T memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C)); 633 #define MACRO_B_PRE_FULL_PSI_compute_A \ 635 (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \ 638 #define MACRO_B_PRE_FULL_PSI_compute_T \ 640 g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \ 643 #define MACRO_B_compute_A \ 645 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \ 648 #define MACRO_B_compute_T \ 650 ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \ 653 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]] 655 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]] 657 #define MACRO_without_PRE_PSI_improved psij_const[t2 * (2*ths->m+2) + lj[t2]] 659 #define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \ 660 - ((R) (lj[t2]+u[t2]))/((R)ths->n[t2]), t2) 662 #define MACRO_init_uo_l_lj_t \ 663 INT l_all[ths->d*(2*ths->m+2)]; \ 665 for (t = ths->d-1; t >= 0; t--) \ 667 uo(ths,j,&u[t],&o[t],t); \ 669 for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \ 670 l_all[t*(2*ths->m+2) + lj_t] = (u[t] + lj_t + ths->n[t]) % ths->n[t]; \ 676 #define MACRO_update_phi_prod_ll_plain(which_one) { \ 677 for (t2 = t; t2 < ths->d; t2++) \ 679 phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \ 680 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 684 #define MACRO_count_uo_l_lj_t \ 686 for (t = ths->d-1; (t > 0) && (lj[t] == o[t]-u[t]); t--) \ 694 #define MACRO_COMPUTE_with_PRE_PSI MACRO_with_PRE_PSI 695 #define MACRO_COMPUTE_with_PRE_FG_PSI MACRO_with_FG_PSI 696 #define MACRO_COMPUTE_with_FG_PSI MACRO_with_FG_PSI 697 #define MACRO_COMPUTE_with_PRE_LIN_PSI MACRO_with_FG_PSI 698 #define MACRO_COMPUTE_without_PRE_PSI MACRO_without_PRE_PSI_improved 699 #define MACRO_COMPUTE_without_PRE_PSI_improved MACRO_without_PRE_PSI_improved 701 #define MACRO_B_COMPUTE_ONE_NODE(whichone_AT,whichone_FLAGS) \ 704 INT l0, l1, l2, l3; \ 705 for (l0 = 0; l0 < 2*ths->m+2; l0++) \ 709 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \ 710 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 711 for (l1 = 0; l1 < 2*ths->m+2; l1++) \ 715 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \ 716 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 717 for (l2 = 0; l2 < 2*ths->m+2; l2++) \ 721 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \ 722 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 723 for (l3 = 0; l3 < 2*ths->m+2; l3++) \ 727 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \ 728 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 730 MACRO_B_compute_ ## whichone_AT; \ 736 else if (ths->d == 5) \ 738 INT l0, l1, l2, l3, l4; \ 739 for (l0 = 0; l0 < 2*ths->m+2; l0++) \ 743 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \ 744 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 745 for (l1 = 0; l1 < 2*ths->m+2; l1++) \ 749 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \ 750 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 751 for (l2 = 0; l2 < 2*ths->m+2; l2++) \ 755 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \ 756 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 757 for (l3 = 0; l3 < 2*ths->m+2; l3++) \ 761 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \ 762 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 763 for (l4 = 0; l4 < 2*ths->m+2; l4++) \ 767 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \ 768 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 770 MACRO_B_compute_ ## whichone_AT; \ 778 for (l_L = 0; l_L < lprod; l_L++) \ 780 MACRO_update_phi_prod_ll_plain(whichone_FLAGS); \ 782 MACRO_B_compute_ ## whichone_AT; \ 784 MACRO_count_uo_l_lj_t; \ 788 #define MACRO_B(which_one) \ 789 static inline void B_serial_ ## which_one (X(plan) *ths) \ 792 INT u[ths->d], o[ths->d]; \ 797 INT ll_plain[ths->d+1]; \ 798 R phi_prod[ths->d+1]; \ 800 R fg_psi[ths->d][2*ths->m+2]; \ 801 R fg_exp_l[ths->d][2*ths->m+2]; \ 803 R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \ 806 INT ip_s = ths->K/(ths->m+2); \ 808 MACRO_B_init_result_ ## which_one; \ 810 if (ths->flags & PRE_FULL_PSI) \ 815 f = (C*)ths->f; g = (C*)ths->g; \ 817 for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \ 819 for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \ 821 MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \ 827 phi_prod[0] = K(1.0); \ 830 for (t = 0, lprod = 1; t < ths->d; t++) \ 831 lprod *= (2 * ths->m + 2); \ 833 if (ths->flags & PRE_PSI) \ 837 for (k = 0; k < ths->M_total; k++) \ 839 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \ 841 MACRO_init_uo_l_lj_t; \ 843 MACRO_B_COMPUTE_ONE_NODE(which_one,with_PRE_PSI); \ 848 if (ths->flags & PRE_FG_PSI) \ 852 for(t2 = 0; t2 < ths->d; t2++) \ 854 tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \ 855 tmpEXP2sq = tmpEXP2*tmpEXP2; \ 858 fg_exp_l[t2][0] = K(1.0); \ 859 for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \ 861 tmp3 = tmp2*tmpEXP2; \ 863 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \ 866 for (k = 0; k < ths->M_total; k++) \ 868 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \ 870 MACRO_init_uo_l_lj_t; \ 872 for (t2 = 0; t2 < ths->d; t2++) \ 874 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \ 875 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \ 877 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \ 880 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \ 884 MACRO_B_COMPUTE_ONE_NODE(which_one,with_FG_PSI); \ 889 if (ths->flags & FG_PSI) \ 893 for (t2 = 0; t2 < ths->d; t2++) \ 895 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \ 896 tmpEXP2sq = tmpEXP2*tmpEXP2; \ 899 fg_exp_l[t2][0] = K(1.0); \ 900 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \ 902 tmp3 = tmp2*tmpEXP2; \ 904 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \ 907 for (k = 0; k < ths->M_total; k++) \ 909 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \ 911 MACRO_init_uo_l_lj_t; \ 913 for (t2 = 0; t2 < ths->d; t2++) \ 915 fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\ 917 tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \ 920 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \ 923 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \ 927 MACRO_B_COMPUTE_ONE_NODE(which_one,with_FG_PSI); \ 932 if (ths->flags & PRE_LIN_PSI) \ 936 for (k = 0; k<ths->M_total; k++) \ 938 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \ 940 MACRO_init_uo_l_lj_t; \ 942 for (t2 = 0; t2 < ths->d; t2++) \ 944 y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \ 945 * ((R)(ths->K))) / (R)(ths->m + 2); \ 946 ip_u = LRINT(FLOOR(y[t2])); \ 948 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \ 950 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \ 951 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \ 956 MACRO_B_COMPUTE_ONE_NODE(which_one,with_FG_PSI); \ 964 for (k = 0; k < ths->M_total; k++) \ 966 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \ 968 R psij_const[ths->d * (2*ths->m+2)]; \ 970 MACRO_init_uo_l_lj_t; \ 972 for (t2 = 0; t2 < ths->d; t2++) \ 975 for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \ 976 psij_const[t2 * (2*ths->m+2) + lj_t] = PHI(ths->n[t2], ths->x[j*ths->d+t2] \ 977 - ((R) (lj_t+u[t2]))/((R)ths->n[t2]), t2); \ 980 MACRO_B_COMPUTE_ONE_NODE(which_one,without_PRE_PSI_improved); \ 989 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_PRE_PSI 990 #define MACRO_B_openmp_A_COMPUTE_UPDATE_with_PRE_PSI \ 991 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); 993 #define MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI \ 994 for (t2 = 0; t2 < ths->d; t2++) \ 997 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \ 998 R tmpEXP2sq = tmpEXP2*tmpEXP2; \ 1001 fg_exp_l[t2][0] = K(1.0); \ 1002 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \ 1004 tmp3 = tmp2*tmpEXP2; \ 1005 tmp2 *= tmpEXP2sq; \ 1006 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \ 1009 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_PRE_FG_PSI \ 1010 for (t2 = 0; t2 < ths->d; t2++) \ 1012 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \ 1013 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \ 1015 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \ 1018 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \ 1021 #define MACRO_B_openmp_A_COMPUTE_UPDATE_with_PRE_FG_PSI \ 1022 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 1024 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_FG_PSI \ 1025 for (t2 = 0; t2 < ths->d; t2++) \ 1027 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/((R)ths->n[t2])),t2)); \ 1029 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2]) \ 1032 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \ 1035 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \ 1038 #define MACRO_B_openmp_A_COMPUTE_UPDATE_with_FG_PSI \ 1039 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 1041 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_PRE_LIN_PSI \ 1042 for (t2 = 0; t2 < ths->d; t2++) \ 1044 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2]) \ 1045 * ((R)ths->K))/(ths->m+2); \ 1046 ip_u = LRINT(FLOOR(y[t2])); \ 1047 ip_w = y[t2]-ip_u; \ 1048 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \ 1050 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \ 1051 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \ 1055 #define MACRO_B_openmp_A_COMPUTE_UPDATE_with_PRE_LIN_PSI \ 1056 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 1058 #define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_without_PRE_PSI \ 1059 for (t2 = 0; t2 < ths->d; t2++) \ 1062 for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \ 1063 psij_const[t2 * (2*ths->m+2) + lj_t] = PHI(ths->n[t2], ths->x[j*ths->d+t2] \ 1064 - ((R) (lj_t+u[t2]))/((R)ths->n[t2]), t2); \ 1066 #define MACRO_B_openmp_A_COMPUTE_UPDATE_without_PRE_PSI \ 1067 MACRO_update_phi_prod_ll_plain(without_PRE_PSI_improved); 1069 #define MACRO_B_openmp_A_COMPUTE(whichone) \ 1071 INT u[ths->d], o[ths->d]; \ 1074 INT ll_plain[ths->d+1]; \ 1075 R phi_prod[ths->d+1]; \ 1076 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \ 1078 phi_prod[0] = K(1.0); \ 1081 MACRO_init_uo_l_lj_t; \ 1083 MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_ ##whichone \ 1087 INT l0, l1, l2, l3; \ 1088 for (l0 = 0; l0 < 2*ths->m+2; l0++) \ 1092 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1093 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1094 for (l1 = 0; l1 < 2*ths->m+2; l1++) \ 1098 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1099 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1100 for (l2 = 0; l2 < 2*ths->m+2; l2++) \ 1104 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1105 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1106 for (l3 = 0; l3 < 2*ths->m+2; l3++) \ 1110 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1111 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1113 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \ 1119 else if (ths->d == 5) \ 1121 INT l0, l1, l2, l3, l4; \ 1122 for (l0 = 0; l0 < 2*ths->m+2; l0++) \ 1126 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1127 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1128 for (l1 = 0; l1 < 2*ths->m+2; l1++) \ 1132 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1133 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1134 for (l2 = 0; l2 < 2*ths->m+2; l2++) \ 1138 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1139 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1140 for (l3 = 0; l3 < 2*ths->m+2; l3++) \ 1144 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1145 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1146 for (l4 = 0; l4 < 2*ths->m+2; l4++) \ 1150 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1151 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1153 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \ 1161 for (l_L = 0; l_L < lprod; l_L++) \ 1163 MACRO_B_openmp_A_COMPUTE_UPDATE_ ##whichone \ 1165 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \ 1167 MACRO_count_uo_l_lj_t; \ 1172 static inline void B_openmp_A (
X(plan) *ths)
1177 memset(ths->f, 0, ths->M_total *
sizeof(C));
1179 for (k = 0, lprod = 1; k < ths->d; k++)
1180 lprod *= (2*ths->m+2);
1184 #pragma omp parallel for default(shared) private(k) 1185 for (k = 0; k < ths->M_total; k++)
1188 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1190 for (l = 0; l < lprod; l++)
1191 ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
1198 #pragma omp parallel for default(shared) private(k) 1199 for (k = 0; k < ths->M_total; k++)
1202 MACRO_B_openmp_A_COMPUTE(with_PRE_PSI);
1210 R fg_exp_l[ths->d][2*ths->m+2];
1212 MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI
1214 #pragma omp parallel for default(shared) private(k,t,t2) 1215 for (k = 0; k < ths->M_total; k++)
1217 R fg_psi[ths->d][2*ths->m+2];
1221 MACRO_B_openmp_A_COMPUTE(with_PRE_FG_PSI);
1229 R fg_exp_l[ths->d][2*ths->m+2];
1233 MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI
1235 #pragma omp parallel for default(shared) private(k,t,t2) 1236 for (k = 0; k < ths->M_total; k++)
1238 R fg_psi[ths->d][2*ths->m+2];
1242 MACRO_B_openmp_A_COMPUTE(with_FG_PSI);
1251 #pragma omp parallel for default(shared) private(k) 1252 for (k = 0; k<ths->M_total; k++)
1256 R fg_psi[ths->d][2*ths->m+2];
1260 INT ip_s = ths->K/(ths->m+2);
1262 MACRO_B_openmp_A_COMPUTE(with_PRE_LIN_PSI);
1270 #pragma omp parallel for default(shared) private(k) 1271 for (k = 0; k < ths->M_total; k++)
1274 R psij_const[ths->d * (2*ths->m+2)];
1276 MACRO_B_openmp_A_COMPUTE(without_PRE_PSI);
1281 static void B_A(
X(plan) *ths)
1306 static inline INT index_x_binary_search(
const INT *ar_x,
const INT len,
const INT key)
1308 INT left = 0, right = len - 1;
1313 while (left < right - 1)
1315 INT i = (left + right) / 2;
1316 if (ar_x[2*i] >= key)
1318 else if (ar_x[2*i] < key)
1322 if (ar_x[2*left] < key && left != len-1)
1345 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
1346 INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b,
const INT d,
1347 const INT *n,
const INT m)
1349 const INT n0 = n[0];
1351 INT nthreads = omp_get_num_threads();
1352 INT nthreads_used = MIN(nthreads, n0);
1353 INT size_per_thread = n0 / nthreads_used;
1354 INT size_left = n0 - size_per_thread * nthreads_used;
1355 INT size_g[nthreads_used];
1356 INT offset_g[nthreads_used];
1357 INT my_id = omp_get_thread_num();
1358 INT n_prod_rest = 1;
1360 for (k = 1; k < d; k++)
1361 n_prod_rest *= n[k];
1370 if (my_id < nthreads_used)
1372 const INT m22 = 2 * m + 2;
1375 for (k = 0; k < nthreads_used; k++)
1378 offset_g[k] = offset_g[k-1] + size_g[k-1];
1379 size_g[k] = size_per_thread;
1387 *my_u0 = offset_g[my_id];
1388 *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1390 if (nthreads_used > 1)
1392 *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1393 *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1398 *max_u_a = n_prod_rest * n0 - 1;
1403 *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1404 *max_u_b = n_prod_rest * n0 - 1;
1408 if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1410 *max_u_a = *max_u_b;
1415 assert(*min_u_a <= *max_u_a);
1416 assert(*min_u_b <= *max_u_b);
1417 assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1431 static void nfft_adjoint_B_compute_full_psi(C *g,
const INT *psi_index_g,
1432 const R *psi,
const C *f,
const INT M,
const INT d,
const INT *n,
1433 const INT m,
const unsigned flags,
const INT *index_x)
1445 for(t = 0, lprod = 1; t < d; t++)
1449 lprod_m1 = lprod / (2 * m + 2);
1453 if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1455 #pragma omp parallel private(k) 1457 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1458 const INT *ar_x = index_x;
1459 INT n_prod_rest = 1;
1461 for (k = 1; k < d; k++)
1462 n_prod_rest *= n[k];
1464 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1468 k = index_x_binary_search(ar_x, M, min_u_a);
1470 assert(ar_x[2*k] >= min_u_a || k == M-1);
1472 assert(ar_x[2*k-2] < min_u_a);
1477 INT u_prod = ar_x[2*k];
1478 INT j = ar_x[2*k+1];
1480 if (u_prod < min_u_a || u_prod > max_u_a)
1483 for (l0 = 0; l0 < 2 * m + 2; l0++)
1485 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1487 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1490 for (lrest = 0; lrest < lprod_m1; lrest++)
1492 const INT l = l0 * lprod_m1 + lrest;
1493 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1503 k = index_x_binary_search(ar_x, M, min_u_b);
1505 assert(ar_x[2*k] >= min_u_b || k == M-1);
1507 assert(ar_x[2*k-2] < min_u_b);
1512 INT u_prod = ar_x[2*k];
1513 INT j = ar_x[2*k+1];
1515 if (u_prod < min_u_b || u_prod > max_u_b)
1518 for (l0 = 0; l0 < 2 * m + 2; l0++)
1520 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1522 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1524 for (lrest = 0; lrest < lprod_m1; lrest++)
1526 const INT l = l0 * lprod_m1 + lrest;
1527 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1540 #pragma omp parallel for default(shared) private(k) 1542 for (k = 0; k < M; k++)
1545 INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1547 for (l = 0; l < lprod; l++)
1550 C val = psi[j * lprod + l] * f[j];
1551 C *gref = g + psi_index_g[j * lprod + l];
1552 R *gref_real = (R*) gref;
1555 gref_real[0] += CREAL(val);
1558 gref_real[1] += CIMAG(val);
1560 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1574 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \ 1576 assert(ar_x[2*k] >= min_u_a || k == M-1); \ 1578 assert(ar_x[2*k-2] < min_u_a); \ 1581 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A 1585 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \ 1587 assert(ar_x[2*k] >= min_u_b || k == M-1); \ 1589 assert(ar_x[2*k-2] < min_u_b); \ 1592 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B 1595 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_PRE_PSI 1596 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_PRE_PSI \ 1597 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); 1599 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_PRE_FG_PSI \ 1600 R fg_psi[ths->d][2*ths->m+2]; \ 1603 for (t2 = 0; t2 < ths->d; t2++) \ 1605 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \ 1606 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \ 1608 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \ 1611 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \ 1614 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_PRE_FG_PSI \ 1615 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 1617 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_FG_PSI \ 1618 R fg_psi[ths->d][2*ths->m+2]; \ 1621 for (t2 = 0; t2 < ths->d; t2++) \ 1623 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/((R)ths->n[t2])),t2)); \ 1625 tmpEXP1 = EXP(K(2.0)*((R)ths->n[t2]*ths->x[j*ths->d+t2] - (R)u[t2]) \ 1628 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \ 1631 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \ 1634 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_FG_PSI \ 1635 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 1637 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_PRE_LIN_PSI \ 1639 R fg_psi[ths->d][2*ths->m+2]; \ 1643 INT ip_s = ths->K/(ths->m+2); \ 1644 for (t2 = 0; t2 < ths->d; t2++) \ 1646 y[t2] = ((((R)ths->n[t2])*ths->x[j*ths->d+t2]-(R)u[t2]) \ 1647 * ((R)ths->K))/((R)ths->m+2); \ 1648 ip_u = LRINT(FLOOR(y[t2])); \ 1649 ip_w = y[t2]-ip_u; \ 1650 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \ 1652 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \ 1653 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \ 1657 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_PRE_LIN_PSI \ 1658 MACRO_update_phi_prod_ll_plain(with_FG_PSI); 1660 #define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_without_PRE_PSI \ 1661 R psij_const[ths->d * (2*ths->m+2)]; \ 1662 for (t2 = 0; t2 < ths->d; t2++) \ 1665 for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \ 1666 psij_const[t2 * (2*ths->m+2) + lj_t] = PHI(ths->n[t2], ths->x[j*ths->d+t2] \ 1667 - ((R) (lj_t+u[t2]))/((R)ths->n[t2]), t2); \ 1669 #define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_without_PRE_PSI \ 1670 MACRO_update_phi_prod_ll_plain(without_PRE_PSI_improved); 1672 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE(whichone) \ 1674 INT u[ths->d], o[ths->d]; \ 1678 INT ll_plain[ths->d+1]; \ 1679 R phi_prod[ths->d+1]; \ 1681 phi_prod[0] = K(1.0); \ 1684 MACRO_init_uo_l_lj_t; \ 1686 MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_ ##whichone \ 1690 INT l0, l1, l2, l3; \ 1691 for (l0 = 0; l0 < 2*ths->m+2; l0++) \ 1695 if (l_all[lj[0]] < my_u0 || l_all[lj[0]] > my_o0) \ 1697 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1698 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1699 for (l1 = 0; l1 < 2*ths->m+2; l1++) \ 1703 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1704 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1705 for (l2 = 0; l2 < 2*ths->m+2; l2++) \ 1709 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1710 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1711 for (l3 = 0; l3 < 2*ths->m+2; l3++) \ 1715 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1716 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1718 ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \ 1724 else if (ths->d == 5) \ 1726 INT l0, l1, l2, l3, l4; \ 1727 for (l0 = 0; l0 < 2*ths->m+2; l0++) \ 1731 if (l_all[lj[0]] < my_u0 || l_all[lj[0]] > my_o0) \ 1733 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1734 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1735 for (l1 = 0; l1 < 2*ths->m+2; l1++) \ 1739 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1740 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1741 for (l2 = 0; l2 < 2*ths->m+2; l2++) \ 1745 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1746 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1747 for (l3 = 0; l3 < 2*ths->m+2; l3++) \ 1751 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1752 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1753 for (l4 = 0; l4 < 2*ths->m+2; l4++) \ 1757 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1758 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1760 ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \ 1769 while (l_L < lprod) \ 1771 if (t == 0 && (l_all[lj[0]] < my_u0 || l_all[lj[0]] > my_o0)) \ 1777 MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_ ##whichone \ 1778 ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \ 1779 MACRO_count_uo_l_lj_t; \ 1785 #define MACRO_adjoint_nd_B_OMP_BLOCKWISE(whichone) \ 1787 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \ 1789 INT lprodrest = 1; \ 1790 for (k = 1; k < ths->d; k++) \ 1791 lprodrest *= (2*ths->m+2); \ 1792 _Pragma("omp parallel private(k)") \ 1794 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \ 1795 INT *ar_x = ths->index_x; \ 1797 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \ 1798 &min_u_b, &max_u_b, ths->d, ths->n, ths->m); \ 1800 if (min_u_a != -1) \ 1802 k = index_x_binary_search(ar_x, ths->M_total, min_u_a); \ 1804 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \ 1806 while (k < ths->M_total) \ 1808 INT u_prod = ar_x[2*k]; \ 1809 INT j = ar_x[2*k+1]; \ 1811 if (u_prod < min_u_a || u_prod > max_u_a) \ 1814 MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE(whichone) \ 1820 if (min_u_b != -1) \ 1822 INT k = index_x_binary_search(ar_x, ths->M_total, min_u_b); \ 1824 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \ 1826 while (k < ths->M_total) \ 1828 INT u_prod = ar_x[2*k]; \ 1829 INT j = ar_x[2*k+1]; \ 1831 if (u_prod < min_u_b || u_prod > max_u_b) \ 1834 MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE(whichone) \ 1844 #define MACRO_adjoint_nd_B_OMP_COMPUTE(whichone) \ 1846 INT u[ths->d], o[ths->d]; \ 1849 INT ll_plain[ths->d+1]; \ 1850 R phi_prod[ths->d+1]; \ 1852 phi_prod[0] = K(1.0); \ 1855 MACRO_init_uo_l_lj_t; \ 1857 MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_ ## whichone \ 1861 INT l0, l1, l2, l3; \ 1862 for (l0 = 0; l0 < 2*ths->m+2; l0++) \ 1866 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1867 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1868 for (l1 = 0; l1 < 2*ths->m+2; l1++) \ 1872 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1873 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1874 for (l2 = 0; l2 < 2*ths->m+2; l2++) \ 1878 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1879 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1880 for (l3 = 0; l3 < 2*ths->m+2; l3++) \ 1884 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1885 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1887 C *lhs = ths->g + ll_plain[ths->d]; \ 1888 R *lhs_real = (R*)lhs; \ 1889 C val = phi_prod[ths->d] * ths->f[j]; \ 1891 _Pragma("omp atomic") \ 1892 lhs_real[0] += CREAL(val); \ 1894 _Pragma("omp atomic") \ 1895 lhs_real[1] += CIMAG(val); \ 1901 else if (ths->d == 5) \ 1903 INT l0, l1, l2, l3, l4; \ 1904 for (l0 = 0; l0 < 2*ths->m+2; l0++) \ 1908 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1909 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1910 for (l1 = 0; l1 < 2*ths->m+2; l1++) \ 1914 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1915 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1916 for (l2 = 0; l2 < 2*ths->m+2; l2++) \ 1920 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1921 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1922 for (l3 = 0; l3 < 2*ths->m+2; l3++) \ 1926 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1927 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1928 for (l4 = 0; l4 < 2*ths->m+2; l4++) \ 1932 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \ 1933 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \ 1935 C *lhs = ths->g + ll_plain[ths->d]; \ 1936 R *lhs_real = (R*)lhs; \ 1937 C val = phi_prod[ths->d] * ths->f[j]; \ 1939 _Pragma("omp atomic") \ 1940 lhs_real[0] += CREAL(val); \ 1942 _Pragma("omp atomic") \ 1943 lhs_real[1] += CIMAG(val); \ 1951 for (l_L = 0; l_L < lprod; l_L++) \ 1957 MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_ ## whichone \ 1959 lhs = ths->g + ll_plain[ths->d]; \ 1960 lhs_real = (R*)lhs; \ 1961 val = phi_prod[ths->d] * ths->f[j]; \ 1963 _Pragma("omp atomic") \ 1964 lhs_real[0] += CREAL(val); \ 1966 _Pragma("omp atomic") \ 1967 lhs_real[1] += CIMAG(val); \ 1969 MACRO_count_uo_l_lj_t; \ 1974 static inline void B_openmp_T(
X(plan) *ths)
1979 memset(ths->g, 0, (
size_t)(ths->n_total) *
sizeof(C));
1981 for (k = 0, lprod = 1; k < ths->d; k++)
1982 lprod *= (2*ths->m+2);
1986 nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
1987 ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
1993 MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_PRE_PSI);
1995 #pragma omp parallel for default(shared) private(k) 1996 for (k = 0; k < ths->M_total; k++)
1999 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2000 MACRO_adjoint_nd_B_OMP_COMPUTE(with_PRE_PSI);
2008 R fg_exp_l[ths->d][2*ths->m+2];
2009 for(t2 = 0; t2 < ths->d; t2++)
2012 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
2013 R tmpEXP2sq = tmpEXP2*tmpEXP2;
2016 fg_exp_l[t2][0] = K(1.0);
2017 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
2019 tmp3 = tmp2*tmpEXP2;
2021 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
2025 MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_PRE_FG_PSI);
2027 #pragma omp parallel for default(shared) private(k,t,t2) 2028 for (k = 0; k < ths->M_total; k++)
2030 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2031 MACRO_adjoint_nd_B_OMP_COMPUTE(with_PRE_FG_PSI);
2039 R fg_exp_l[ths->d][2*ths->m+2];
2043 for (t2 = 0; t2 < ths->d; t2++)
2046 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
2047 R tmpEXP2sq = tmpEXP2*tmpEXP2;
2050 fg_exp_l[t2][0] = K(1.0);
2051 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
2053 tmp3 = tmp2*tmpEXP2;
2055 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
2059 MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_FG_PSI);
2061 #pragma omp parallel for default(shared) private(k,t,t2) 2062 for (k = 0; k < ths->M_total; k++)
2064 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2065 MACRO_adjoint_nd_B_OMP_COMPUTE(with_FG_PSI);
2074 MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_PRE_LIN_PSI);
2076 #pragma omp parallel for default(shared) private(k) 2077 for (k = 0; k<ths->M_total; k++)
2080 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2081 MACRO_adjoint_nd_B_OMP_COMPUTE(with_PRE_LIN_PSI);
2089 MACRO_adjoint_nd_B_OMP_BLOCKWISE(without_PRE_PSI);
2091 #pragma omp parallel for default(shared) private(k) 2092 for (k = 0; k < ths->M_total; k++)
2095 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2096 MACRO_adjoint_nd_B_OMP_COMPUTE(without_PRE_PSI);
2101 static void B_T(
X(plan) *ths)
2112 static void nfft_1d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
2114 const INT tmp2 = 2*m+2;
2116 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2118 fg_exp_b0 = EXP(K(-1.0)/b);
2119 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2120 fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
2122 for (l = 1; l < tmp2; l++)
2124 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2125 fg_exp_b1 *= fg_exp_b0_sq;
2126 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2131 static void nfft_trafo_1d_compute(C *fj,
const C *g,
const R *psij_const,
2132 const R *xj,
const INT n,
const INT m)
2139 uo2(&u, &o, *xj, n, m);
2143 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
2144 (*fj) += (*psij++) * (*gj++);
2148 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
2149 (*fj) += (*psij++) * (*gj++);
2150 for (l = 0, gj = g; l <= o; l++)
2151 (*fj) += (*psij++) * (*gj++);
2156 static void nfft_adjoint_1d_compute_serial(
const C *fj, C *g,
2157 const R *psij_const,
const R *xj,
const INT n,
const INT m)
2164 uo2(&u,&o,*xj, n, m);
2168 for (l = 0, gj = g+u; l <= 2*m+1; l++)
2169 (*gj++) += (*psij++) * (*fj);
2173 for (l = 0, gj = g+u; l < 2*m+1-o; l++)
2174 (*gj++) += (*psij++) * (*fj);
2175 for (l = 0, gj = g; l <= o; l++)
2176 (*gj++) += (*psij++) * (*fj);
2183 static void nfft_adjoint_1d_compute_omp_atomic(
const C f, C *g,
2184 const R *psij_const,
const R *xj,
const INT n,
const INT m)
2188 INT index_temp[2*m+2];
2190 uo2(&u,&o,*xj, n, m);
2192 for (l=0; l<=2*m+1; l++)
2193 index_temp[l] = (l+u)%n;
2195 for (l = 0, gj = g+u; l <= 2*m+1; l++)
2197 INT i = index_temp[l];
2199 R *lhs_real = (R*)lhs;
2200 C val = psij_const[l] * f;
2202 lhs_real[0] += CREAL(val);
2205 lhs_real[1] += CIMAG(val);
2226 static void nfft_adjoint_1d_compute_omp_blockwise(
const C f, C *g,
2227 const R *psij_const,
const R *xj,
const INT n,
const INT m,
2228 const INT my_u0,
const INT my_o0)
2232 uo2(&ar_u,&ar_o,*xj, n, m);
2236 INT u = MAX(my_u0,ar_u);
2237 INT o = MIN(my_o0,ar_o);
2238 INT offset_psij = u-ar_u;
2240 assert(offset_psij >= 0);
2241 assert(o-u <= 2*m+1);
2242 assert(offset_psij+o-u <= 2*m+1);
2245 for (l = 0; l <= o-u; l++)
2246 g[u+l] += psij_const[offset_psij+l] * f;
2250 INT u = MAX(my_u0,ar_u);
2252 INT offset_psij = u-ar_u;
2254 assert(offset_psij >= 0);
2255 assert(o-u <= 2*m+1);
2256 assert(offset_psij+o-u <= 2*m+1);
2259 for (l = 0; l <= o-u; l++)
2260 g[u+l] += psij_const[offset_psij+l] * f;
2263 o = MIN(my_o0,ar_o);
2264 offset_psij += my_u0-ar_u+n;
2269 assert(o-u <= 2*m+1);
2270 if (offset_psij+o-u > 2*m+1)
2272 fprintf(stderr,
"ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
2274 assert(offset_psij+o-u <= 2*m+1);
2277 for (l = 0; l <= o-u; l++)
2278 g[u+l] += psij_const[offset_psij+l] * f;
2283 static void nfft_trafo_1d_B(
X(plan) *ths)
2285 const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
2286 const C *g = (C*)ths->g;
2292 #pragma omp parallel for default(shared) private(k) 2294 for (k = 0; k < M; k++)
2297 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2299 for (l = 0; l < m2p2; l++)
2300 ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
2309 #pragma omp parallel for default(shared) private(k) 2311 for (k = 0; k < M; k++)
2313 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2314 nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
2325 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2328 #pragma omp parallel for default(shared) private(k) 2330 for (k = 0; k < M; k++)
2332 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2333 const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2334 R fg_psij2 = K(1.0);
2338 psij_const[0] = fg_psij0;
2340 for (l = 1; l < m2p2; l++)
2342 fg_psij2 *= fg_psij1;
2343 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2346 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2359 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2362 #pragma omp parallel for default(shared) private(k) 2364 for (k = 0; k < M; k++)
2366 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2368 R fg_psij0, fg_psij1, fg_psij2;
2371 uo(ths, (INT)j, &u, &o, (INT)0);
2372 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
2373 fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
2376 psij_const[0] = fg_psij0;
2378 for (l = 1; l < m2p2; l++)
2380 fg_psij2 *= fg_psij1;
2381 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2384 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2391 const INT K = ths->K, ip_s = K / (m + 2);
2397 #pragma omp parallel for default(shared) private(k) 2399 for (k = 0; k < M; k++)
2405 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2407 uo(ths, (INT)j, &u, &o, (INT)0);
2409 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2410 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2411 ip_w = ip_y - (R)(ip_u);
2413 for (l = 0; l < m2p2; l++)
2414 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2415 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2417 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2429 #pragma omp parallel for default(shared) private(k) 2431 for (k = 0; k < M; k++)
2435 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2437 uo(ths, (INT)j, &u, &o, (INT)0);
2439 for (l = 0; l < m2p2; l++)
2440 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
2442 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2448 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \ 2450 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \ 2451 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \ 2454 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \ 2456 R psij_const[2 * m + 2]; \ 2458 R fg_psij0 = ths->psi[2 * j]; \ 2459 R fg_psij1 = ths->psi[2 * j + 1]; \ 2460 R fg_psij2 = K(1.0); \ 2462 psij_const[0] = fg_psij0; \ 2463 for (l = 1; l <= 2 * m + 1; l++) \ 2465 fg_psij2 *= fg_psij1; \ 2466 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \ 2469 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \ 2470 ths->x + j, n, m, my_u0, my_o0); \ 2473 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \ 2475 R psij_const[2 * m + 2]; \ 2476 R fg_psij0, fg_psij1, fg_psij2; \ 2479 uo(ths, j, &u, &o, (INT)0); \ 2480 fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/((R)n),0)); \ 2481 fg_psij1 = EXP(K(2.0) * (((R)n) * (ths->x[j]) - (R)u) / ths->b[0]); \ 2482 fg_psij2 = K(1.0); \ 2483 psij_const[0] = fg_psij0; \ 2484 for (l = 1; l <= 2 * m + 1; l++) \ 2486 fg_psij2 *= fg_psij1; \ 2487 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \ 2490 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \ 2491 ths->x + j, n, m, my_u0, my_o0); \ 2494 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \ 2496 R psij_const[2 * m + 2]; \ 2501 uo(ths, j, &u, &o, (INT)0); \ 2503 ip_y = FABS(((R)n) * ths->x[j] - (R)u) * ((R)ip_s); \ 2504 ip_u = LRINT(FLOOR(ip_y)); \ 2505 ip_w = ip_y - ip_u; \ 2506 for (l = 0; l < 2 * m + 2; l++) \ 2508 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \ 2509 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \ 2511 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \ 2512 ths->x + j, n, m, my_u0, my_o0); \ 2515 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \ 2517 R psij_const[2 * m + 2]; \ 2520 uo(ths, j, &u, &o, (INT)0); \ 2522 for (l = 0; l <= 2 * m + 1; l++) \ 2523 psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/((R)n),0)); \ 2525 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \ 2526 ths->x + j, n, m, my_u0, my_o0); \ 2529 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \ 2531 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \ 2533 _Pragma("omp parallel private(k)") \ 2535 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \ 2536 INT *ar_x = ths->index_x; \ 2538 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \ 2539 &min_u_b, &max_u_b, 1, &n, m); \ 2541 if (min_u_a != -1) \ 2543 k = index_x_binary_search(ar_x, M, min_u_a); \ 2545 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \ 2549 INT u_prod = ar_x[2*k]; \ 2550 INT j = ar_x[2*k+1]; \ 2552 if (u_prod < min_u_a || u_prod > max_u_a) \ 2555 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 2561 if (min_u_b != -1) \ 2563 k = index_x_binary_search(ar_x, M, min_u_b); \ 2565 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \ 2569 INT u_prod = ar_x[2*k]; \ 2570 INT j = ar_x[2*k+1]; \ 2572 if (u_prod < min_u_b || u_prod > max_u_b) \ 2575 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 2585 static void nfft_adjoint_1d_B(
X(plan) *ths)
2587 const INT n = ths->n[0], M = ths->M_total, m = ths->m;
2591 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
2595 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2596 (INT)1, ths->n, m, ths->flags, ths->index_x);
2603 MACRO_adjoint_1d_B_OMP_BLOCKWISE(
PRE_PSI)
2607 #pragma omp parallel for default(shared) private(k) 2609 for (k = 0; k < M; k++)
2611 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2613 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2615 nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2624 R fg_exp_l[2 * m + 2];
2626 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2634 #pragma omp parallel for default(shared) private(k) 2636 for (k = 0; k < M; k++)
2638 R psij_const[2 * m + 2];
2639 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2641 R fg_psij0 = ths->psi[2 * j];
2642 R fg_psij1 = ths->psi[2 * j + 1];
2643 R fg_psij2 = K(1.0);
2645 psij_const[0] = fg_psij0;
2646 for (l = 1; l <= 2 * m + 1; l++)
2648 fg_psij2 *= fg_psij1;
2649 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2653 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2655 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2664 R fg_exp_l[2 * m + 2];
2666 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2671 MACRO_adjoint_1d_B_OMP_BLOCKWISE(
FG_PSI)
2675 #pragma omp parallel for default(shared) private(k) 2677 for (k = 0; k < M; k++)
2680 R psij_const[2 * m + 2];
2681 R fg_psij0, fg_psij1, fg_psij2;
2682 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2684 uo(ths, j, &u, &o, (INT)0);
2685 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
2686 fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
2688 psij_const[0] = fg_psij0;
2689 for (l = 1; l <= 2 * m + 1; l++)
2691 fg_psij2 *= fg_psij1;
2692 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2696 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2698 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2707 const INT K = ths->K;
2708 const INT ip_s = K / (m + 2);
2717 #pragma omp parallel for default(shared) private(k) 2719 for (k = 0; k < M; k++)
2724 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2725 R psij_const[2 * m + 2];
2727 uo(ths, j, &u, &o, (INT)0);
2729 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2730 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2731 ip_w = ip_y - (R)(ip_u);
2732 for (l = 0; l < 2 * m + 2; l++)
2734 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2735 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2738 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2740 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2750 MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2754 #pragma omp parallel for default(shared) private(k) 2756 for (k = 0; k < M; k++)
2759 R psij_const[2 * m + 2];
2760 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2762 uo(ths, j, &u, &o, (INT)0);
2764 for (l = 0; l <= 2 * m + 1; l++)
2765 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
2768 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2770 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2775 void X(trafo_1d)(
X(plan) *ths)
2777 if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2779 X(trafo_direct)(ths);
2783 const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
2784 C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2786 ths->g_hat = ths->g1;
2790 C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2791 R *c_phi_inv1, *c_phi_inv2;
2797 #pragma omp parallel for default(shared) private(k) 2798 for (k = 0; k < ths->n_total; k++)
2799 ths->g_hat[k] = 0.0;
2802 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
2807 c_phi_inv1 = ths->c_phi_inv[0];
2808 c_phi_inv2 = &ths->c_phi_inv[0][N2];
2811 #pragma omp parallel for default(shared) private(k) 2813 for (k = 0; k < N2; k++)
2815 g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2816 g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2823 #pragma omp parallel for default(shared) private(k) 2825 for (k = 0; k < N2; k++)
2827 g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
2828 g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2834 FFTW(execute)(ths->my_fftw_plan1);
2838 nfft_trafo_1d_B(ths);
2843 void X(adjoint_1d)(
X(plan) *ths)
2845 if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2847 X(adjoint_direct)(ths);
2852 C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2853 R *c_phi_inv1, *c_phi_inv2;
2861 f_hat1=(C*)ths->f_hat;
2862 f_hat2=(C*)&ths->f_hat[N/2];
2863 g_hat1=(C*)&ths->g_hat[n-N/2];
2864 g_hat2=(C*)ths->g_hat;
2867 nfft_adjoint_1d_B(ths);
2871 FFTW(execute)(ths->my_fftw_plan2);
2878 c_phi_inv1=ths->c_phi_inv[0];
2879 c_phi_inv2=&ths->c_phi_inv[0][N/2];
2882 #pragma omp parallel for default(shared) private(k) 2884 for (k = 0; k < N/2; k++)
2886 f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2887 f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2895 #pragma omp parallel for default(shared) private(k) 2897 for (k = 0; k < N/2; k++)
2899 f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
2900 f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2909 static void nfft_2d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
2912 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2914 fg_exp_b0 = EXP(K(-1.0)/b);
2915 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2918 fg_exp_l[0] = K(1.0);
2919 for(l=1; l <= 2*m+1; l++)
2921 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2922 fg_exp_b1 *= fg_exp_b0_sq;
2923 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2927 static void nfft_trafo_2d_compute(C *fj,
const C *g,
const R *psij_const0,
2928 const R *psij_const1,
const R *xj0,
const R *xj1,
const INT n0,
2929 const INT n1,
const INT m)
2931 INT u0,o0,l0,u1,o1,l1;
2933 const R *psij0,*psij1;
2938 uo2(&u0,&o0,*xj0, n0, m);
2939 uo2(&u1,&o1,*xj1, n1, m);
2945 for(l0=0; l0<=2*m+1; l0++,psij0++)
2949 for(l1=0; l1<=2*m+1; l1++)
2950 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2953 for(l0=0; l0<=2*m+1; l0++,psij0++)
2957 for(l1=0; l1<2*m+1-o1; l1++)
2958 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2960 for(l1=0; l1<=o1; l1++)
2961 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2966 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2970 for(l1=0; l1<=2*m+1; l1++)
2971 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2973 for(l0=0; l0<=o0; l0++,psij0++)
2977 for(l1=0; l1<=2*m+1; l1++)
2978 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2983 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2987 for(l1=0; l1<2*m+1-o1; l1++)
2988 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2990 for(l1=0; l1<=o1; l1++)
2991 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2993 for(l0=0; l0<=o0; l0++,psij0++)
2997 for(l1=0; l1<2*m+1-o1; l1++)
2998 (*fj) += (*psij0) * (*psij1++) * (*gj++);
3000 for(l1=0; l1<=o1; l1++)
3001 (*fj) += (*psij0) * (*psij1++) * (*gj++);
3008 static void nfft_adjoint_2d_compute_omp_atomic(
const C f, C *g,
3009 const R *psij_const0,
const R *psij_const1,
const R *xj0,
3010 const R *xj1,
const INT n0,
const INT n1,
const INT m)
3012 INT u0,o0,l0,u1,o1,l1;
3014 INT index_temp0[2*m+2];
3015 INT index_temp1[2*m+2];
3017 uo2(&u0,&o0,*xj0, n0, m);
3018 uo2(&u1,&o1,*xj1, n1, m);
3020 for (l0=0; l0<=2*m+1; l0++)
3021 index_temp0[l0] = (u0+l0)%n0;
3023 for (l1=0; l1<=2*m+1; l1++)
3024 index_temp1[l1] = (u1+l1)%n1;
3026 for(l0=0; l0<=2*m+1; l0++)
3028 for(l1=0; l1<=2*m+1; l1++)
3030 INT i = index_temp0[l0] * n1 + index_temp1[l1];
3032 R *lhs_real = (R*)lhs;
3033 C val = psij_const0[l0] * psij_const1[l1] * f;
3036 lhs_real[0] += CREAL(val);
3039 lhs_real[1] += CIMAG(val);
3064 static void nfft_adjoint_2d_compute_omp_blockwise(
const C f, C *g,
3065 const R *psij_const0,
const R *psij_const1,
const R *xj0,
3066 const R *xj1,
const INT n0,
const INT n1,
const INT m,
3067 const INT my_u0,
const INT my_o0)
3069 INT ar_u0,ar_o0,l0,u1,o1,l1;
3070 INT index_temp1[2*m+2];
3072 uo2(&ar_u0,&ar_o0,*xj0, n0, m);
3073 uo2(&u1,&o1,*xj1, n1, m);
3075 for (l1 = 0; l1 <= 2*m+1; l1++)
3076 index_temp1[l1] = (u1+l1)%n1;
3080 INT u0 = MAX(my_u0,ar_u0);
3081 INT o0 = MIN(my_o0,ar_o0);
3082 INT offset_psij = u0-ar_u0;
3084 assert(offset_psij >= 0);
3085 assert(o0-u0 <= 2*m+1);
3086 assert(offset_psij+o0-u0 <= 2*m+1);
3089 for (l0 = 0; l0 <= o0-u0; l0++)
3091 INT i0 = (u0+l0) * n1;
3092 const C val0 = psij_const0[offset_psij+l0];
3094 for(l1=0; l1<=2*m+1; l1++)
3095 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
3100 INT u0 = MAX(my_u0,ar_u0);
3102 INT offset_psij = u0-ar_u0;
3104 assert(offset_psij >= 0);
3105 assert(o0-u0 <= 2*m+1);
3106 assert(offset_psij+o0-u0 <= 2*m+1);
3109 for (l0 = 0; l0 <= o0-u0; l0++)
3111 INT i0 = (u0+l0) * n1;
3112 const C val0 = psij_const0[offset_psij+l0];
3114 for(l1=0; l1<=2*m+1; l1++)
3115 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
3119 o0 = MIN(my_o0,ar_o0);
3120 offset_psij += my_u0-ar_u0+n0;
3125 assert(o0-u0 <= 2*m+1);
3126 assert(offset_psij+o0-u0 <= 2*m+1);
3130 for (l0 = 0; l0 <= o0-u0; l0++)
3132 INT i0 = (u0+l0) * n1;
3133 const C val0 = psij_const0[offset_psij+l0];
3135 for(l1=0; l1<=2*m+1; l1++)
3136 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
3143 static void nfft_adjoint_2d_compute_serial(
const C *fj, C *g,
3144 const R *psij_const0,
const R *psij_const1,
const R *xj0,
3145 const R *xj1,
const INT n0,
const INT n1,
const INT m)
3147 INT u0,o0,l0,u1,o1,l1;
3149 const R *psij0,*psij1;
3154 uo2(&u0,&o0,*xj0, n0, m);
3155 uo2(&u1,&o1,*xj1, n1, m);
3159 for(l0=0; l0<=2*m+1; l0++,psij0++)
3163 for(l1=0; l1<=2*m+1; l1++)
3164 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3167 for(l0=0; l0<=2*m+1; l0++,psij0++)
3171 for(l1=0; l1<2*m+1-o1; l1++)
3172 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3174 for(l1=0; l1<=o1; l1++)
3175 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3180 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3184 for(l1=0; l1<=2*m+1; l1++)
3185 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3187 for(l0=0; l0<=o0; l0++,psij0++)
3191 for(l1=0; l1<=2*m+1; l1++)
3192 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3197 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3201 for(l1=0; l1<2*m+1-o1; l1++)
3202 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3204 for(l1=0; l1<=o1; l1++)
3205 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3207 for(l0=0; l0<=o0; l0++,psij0++)
3211 for(l1=0; l1<2*m+1-o1; l1++)
3212 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3214 for(l1=0; l1<=o1; l1++)
3215 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3221 static void nfft_trafo_2d_B(
X(plan) *ths)
3223 const C *g = (C*)ths->g;
3224 const INT n0 = ths->n[0];
3225 const INT n1 = ths->n[1];
3226 const INT M = ths->M_total;
3227 const INT m = ths->m;
3233 const INT lprod = (2*m+2) * (2*m+2);
3235 #pragma omp parallel for default(shared) private(k) 3237 for (k = 0; k < M; k++)
3240 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3242 for (l = 0; l < lprod; l++)
3243 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
3251 #pragma omp parallel for default(shared) private(k) 3253 for (k = 0; k < M; k++)
3255 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3256 nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3264 R fg_exp_l[2*(2*m+2)];
3266 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3267 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3270 #pragma omp parallel for default(shared) private(k) 3272 for (k = 0; k < M; k++)
3274 R psij_const[2*(2*m+2)];
3275 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3277 R fg_psij0 = ths->psi[2*j*2];
3278 R fg_psij1 = ths->psi[2*j*2+1];
3279 R fg_psij2 = K(1.0);
3281 psij_const[0] = fg_psij0;
3282 for (l = 1; l <= 2*m+1; l++)
3284 fg_psij2 *= fg_psij1;
3285 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3288 fg_psij0 = ths->psi[2*(j*2+1)];
3289 fg_psij1 = ths->psi[2*(j*2+1)+1];
3291 psij_const[2*m+2] = fg_psij0;
3292 for (l = 1; l <= 2*m+1; l++)
3294 fg_psij2 *= fg_psij1;
3295 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3298 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3306 R fg_exp_l[2*(2*m+2)];
3308 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3309 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3314 #pragma omp parallel for default(shared) private(k) 3316 for (k = 0; k < M; k++)
3319 R fg_psij0, fg_psij1, fg_psij2;
3320 R psij_const[2*(2*m+2)];
3321 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3323 uo(ths, j, &u, &o, (INT)0);
3324 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
3325 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3327 psij_const[0] = fg_psij0;
3328 for (l = 1; l <= 2*m+1; l++)
3330 fg_psij2 *= fg_psij1;
3331 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3334 uo(ths,j,&u,&o, (INT)1);
3335 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3336 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3338 psij_const[2*m+2] = fg_psij0;
3339 for(l=1; l<=2*m+1; l++)
3341 fg_psij2 *= fg_psij1;
3342 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3345 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3353 const INT K = ths->K, ip_s = K / (m + 2);
3358 #pragma omp parallel for default(shared) private(k) 3360 for (k = 0; k < M; k++)
3365 R psij_const[2*(2*m+2)];
3366 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3368 uo(ths,j,&u,&o,(INT)0);
3369 ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
3370 ip_u = (INT)LRINT(FLOOR(ip_y));
3371 ip_w = ip_y - (R)(ip_u);
3372 for (l = 0; l < 2*m+2; l++)
3373 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3375 uo(ths,j,&u,&o,(INT)1);
3376 ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
3377 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3378 ip_w = ip_y - (R)(ip_u);
3379 for (l = 0; l < 2*m+2; l++)
3380 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3382 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3392 #pragma omp parallel for default(shared) private(k) 3394 for (k = 0; k < M; k++)
3396 R psij_const[2*(2*m+2)];
3398 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3400 uo(ths,j,&u,&o,(INT)0);
3401 for (l = 0; l <= 2*m+1; l++)
3402 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3404 uo(ths,j,&u,&o,(INT)1);
3405 for (l = 0; l <= 2*m+1; l++)
3406 psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
3408 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3412 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \ 3413 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 3414 ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \ 3415 ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0); 3417 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \ 3419 R psij_const[2*(2*m+2)]; \ 3421 R fg_psij0 = ths->psi[2*j*2]; \ 3422 R fg_psij1 = ths->psi[2*j*2+1]; \ 3423 R fg_psij2 = K(1.0); \ 3425 psij_const[0] = fg_psij0; \ 3426 for(l=1; l<=2*m+1; l++) \ 3428 fg_psij2 *= fg_psij1; \ 3429 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \ 3432 fg_psij0 = ths->psi[2*(j*2+1)]; \ 3433 fg_psij1 = ths->psi[2*(j*2+1)+1]; \ 3434 fg_psij2 = K(1.0); \ 3435 psij_const[2*m+2] = fg_psij0; \ 3436 for(l=1; l<=2*m+1; l++) \ 3438 fg_psij2 *= fg_psij1; \ 3439 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \ 3442 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 3443 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \ 3444 n0, n1, m, my_u0, my_o0); \ 3447 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \ 3449 R psij_const[2*(2*m+2)]; \ 3450 R fg_psij0, fg_psij1, fg_psij2; \ 3453 uo(ths,j,&u,&o,(INT)0); \ 3454 fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/((R)n0),0)); \ 3455 fg_psij1 = EXP(K(2.0)*(((R)n0)*(ths->x[2*j]) - (R)u)/ths->b[0]); \ 3456 fg_psij2 = K(1.0); \ 3457 psij_const[0] = fg_psij0; \ 3458 for(l=1; l<=2*m+1; l++) \ 3460 fg_psij2 *= fg_psij1; \ 3461 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \ 3464 uo(ths,j,&u,&o,(INT)1); \ 3465 fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/((R)n1),1)); \ 3466 fg_psij1 = EXP(K(2.0)*(((R)n1)*(ths->x[2*j+1]) - (R)u)/ths->b[1]); \ 3467 fg_psij2 = K(1.0); \ 3468 psij_const[2*m+2] = fg_psij0; \ 3469 for(l=1; l<=2*m+1; l++) \ 3471 fg_psij2 *= fg_psij1; \ 3472 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \ 3475 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 3476 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \ 3477 n0, n1, m, my_u0, my_o0); \ 3480 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \ 3482 R psij_const[2*(2*m+2)]; \ 3487 uo(ths,j,&u,&o,(INT)0); \ 3488 ip_y = FABS(((R)n0)*(ths->x[2*j]) - (R)u)*((R)ip_s); \ 3489 ip_u = LRINT(FLOOR(ip_y)); \ 3491 for(l=0; l < 2*m+2; l++) \ 3492 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 3493 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \ 3495 uo(ths,j,&u,&o,(INT)1); \ 3496 ip_y = FABS(((R)n1)*(ths->x[2*j+1]) - (R)u)*((R)ip_s); \ 3497 ip_u = LRINT(FLOOR(ip_y)); \ 3499 for(l=0; l < 2*m+2; l++) \ 3500 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 3501 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \ 3503 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 3504 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \ 3505 n0, n1, m, my_u0, my_o0); \ 3508 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \ 3510 R psij_const[2*(2*m+2)]; \ 3513 uo(ths,j,&u,&o,(INT)0); \ 3514 for(l=0;l<=2*m+1;l++) \ 3515 psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/((R)n0),0)); \ 3517 uo(ths,j,&u,&o,(INT)1); \ 3518 for(l=0;l<=2*m+1;l++) \ 3519 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/((R)n1),1)); \ 3521 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \ 3522 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \ 3523 n0, n1, m, my_u0, my_o0); \ 3526 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \ 3528 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \ 3530 _Pragma("omp parallel private(k)") \ 3532 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \ 3533 INT *ar_x = ths->index_x; \ 3535 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \ 3536 &min_u_b, &max_u_b, 2, ths->n, m); \ 3538 if (min_u_a != -1) \ 3540 k = index_x_binary_search(ar_x, M, min_u_a); \ 3542 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \ 3546 INT u_prod = ar_x[2*k]; \ 3547 INT j = ar_x[2*k+1]; \ 3549 if (u_prod < min_u_a || u_prod > max_u_a) \ 3552 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 3558 if (min_u_b != -1) \ 3560 INT k = index_x_binary_search(ar_x, M, min_u_b); \ 3562 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \ 3566 INT u_prod = ar_x[2*k]; \ 3567 INT j = ar_x[2*k+1]; \ 3569 if (u_prod < min_u_b || u_prod > max_u_b) \ 3572 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 3583 static void nfft_adjoint_2d_B(
X(plan) *ths)
3585 const INT n0 = ths->n[0];
3586 const INT n1 = ths->n[1];
3587 const INT M = ths->M_total;
3588 const INT m = ths->m;
3592 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
3596 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3597 (INT)2, ths->n, m, ths->flags, ths->index_x);
3604 MACRO_adjoint_2d_B_OMP_BLOCKWISE(
PRE_PSI)
3608 #pragma omp parallel for default(shared) private(k) 3610 for (k = 0; k < M; k++)
3612 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3614 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3616 nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3624 R fg_exp_l[2*(2*m+2)];
3626 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3627 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3635 #pragma omp parallel for default(shared) private(k) 3637 for (k = 0; k < M; k++)
3639 R psij_const[2*(2*m+2)];
3640 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3642 R fg_psij0 = ths->psi[2*j*2];
3643 R fg_psij1 = ths->psi[2*j*2+1];
3644 R fg_psij2 = K(1.0);
3646 psij_const[0] = fg_psij0;
3647 for(l=1; l<=2*m+1; l++)
3649 fg_psij2 *= fg_psij1;
3650 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3653 fg_psij0 = ths->psi[2*(j*2+1)];
3654 fg_psij1 = ths->psi[2*(j*2+1)+1];
3656 psij_const[2*m+2] = fg_psij0;
3657 for(l=1; l<=2*m+1; l++)
3659 fg_psij2 *= fg_psij1;
3660 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3664 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3666 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3675 R fg_exp_l[2*(2*m+2)];
3677 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3678 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3683 MACRO_adjoint_2d_B_OMP_BLOCKWISE(
FG_PSI)
3687 #pragma omp parallel for default(shared) private(k) 3689 for (k = 0; k < M; k++)
3692 R fg_psij0, fg_psij1, fg_psij2;
3693 R psij_const[2*(2*m+2)];
3694 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3696 uo(ths,j,&u,&o,(INT)0);
3697 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
3698 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3700 psij_const[0] = fg_psij0;
3701 for(l=1; l<=2*m+1; l++)
3703 fg_psij2 *= fg_psij1;
3704 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3707 uo(ths,j,&u,&o,(INT)1);
3708 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3709 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3711 psij_const[2*m+2] = fg_psij0;
3712 for(l=1; l<=2*m+1; l++)
3714 fg_psij2 *= fg_psij1;
3715 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3719 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3721 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3730 const INT K = ths->K;
3731 const INT ip_s = K / (m + 2);
3740 #pragma omp parallel for default(shared) private(k) 3742 for (k = 0; k < M; k++)
3747 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3748 R psij_const[2*(2*m+2)];
3750 uo(ths,j,&u,&o,(INT)0);
3751 ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
3752 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3753 ip_w = ip_y - (R)(ip_u);
3754 for(l=0; l < 2*m+2; l++)
3755 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3756 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3758 uo(ths,j,&u,&o,(INT)1);
3759 ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
3760 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3761 ip_w = ip_y - (R)(ip_u);
3762 for(l=0; l < 2*m+2; l++)
3763 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3764 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3767 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3769 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3779 MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3783 #pragma omp parallel for default(shared) private(k) 3785 for (k = 0; k < M; k++)
3788 R psij_const[2*(2*m+2)];
3789 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3791 uo(ths,j,&u,&o,(INT)0);
3792 for(l=0;l<=2*m+1;l++)
3793 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3795 uo(ths,j,&u,&o,(INT)1);
3796 for(l=0;l<=2*m+1;l++)
3797 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
3800 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3802 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3808 void X(trafo_2d)(
X(plan) *ths)
3810 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3812 X(trafo_direct)(ths);
3816 INT k0,k1,n0,n1,N0,N1;
3818 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3819 R ck01, ck02, ck11, ck12;
3820 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3830 f_hat=(C*)ths->f_hat;
3831 g_hat=(C*)ths->g_hat;
3835 #pragma omp parallel for default(shared) private(k0) 3836 for (k0 = 0; k0 < ths->n_total; k0++)
3837 ths->g_hat[k0] = 0.0;
3839 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
3841 if(ths->flags & PRE_PHI_HUT)
3843 c_phi_inv01=ths->c_phi_inv[0];
3844 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3847 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12) 3849 for(k0=0;k0<N0/2;k0++)
3851 ck01=c_phi_inv01[k0];
3852 ck02=c_phi_inv02[k0];
3854 c_phi_inv11=ths->c_phi_inv[1];
3855 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3857 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3858 f_hat11=f_hat + k0*N1;
3859 g_hat21=g_hat + k0*n1+n1-(N1/2);
3860 f_hat21=f_hat + ((N0/2)+k0)*N1;
3861 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3862 f_hat12=f_hat + k0*N1+(N1/2);
3863 g_hat22=g_hat + k0*n1;
3864 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3866 for(k1=0;k1<N1/2;k1++)
3868 ck11=c_phi_inv11[k1];
3869 ck12=c_phi_inv12[k1];
3871 g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3872 g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3873 g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3874 g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3880 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12) 3882 for(k0=0;k0<N0/2;k0++)
3884 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3885 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3886 for(k1=0;k1<N1/2;k1++)
3888 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3889 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3890 g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3891 g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3892 g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3893 g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3900 FFTW(execute)(ths->my_fftw_plan1);
3904 nfft_trafo_2d_B(ths);
3908 void X(adjoint_2d)(
X(plan) *ths)
3910 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3912 X(adjoint_direct)(ths);
3916 INT k0,k1,n0,n1,N0,N1;
3918 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3919 R ck01, ck02, ck11, ck12;
3920 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3930 f_hat=(C*)ths->f_hat;
3931 g_hat=(C*)ths->g_hat;
3934 nfft_adjoint_2d_B(ths);
3938 FFTW(execute)(ths->my_fftw_plan2);
3942 if(ths->flags & PRE_PHI_HUT)
3944 c_phi_inv01=ths->c_phi_inv[0];
3945 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3948 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12) 3950 for(k0=0;k0<N0/2;k0++)
3952 ck01=c_phi_inv01[k0];
3953 ck02=c_phi_inv02[k0];
3955 c_phi_inv11=ths->c_phi_inv[1];
3956 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3958 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3959 f_hat11=f_hat + k0*N1;
3960 g_hat21=g_hat + k0*n1+n1-(N1/2);
3961 f_hat21=f_hat + ((N0/2)+k0)*N1;
3962 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3963 f_hat12=f_hat + k0*N1+(N1/2);
3964 g_hat22=g_hat + k0*n1;
3965 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3967 for(k1=0;k1<N1/2;k1++)
3969 ck11=c_phi_inv11[k1];
3970 ck12=c_phi_inv12[k1];
3972 f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3973 f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3974 f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3975 f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3981 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12) 3983 for(k0=0;k0<N0/2;k0++)
3985 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3986 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3987 for(k1=0;k1<N1/2;k1++)
3989 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3990 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3991 f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3992 f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3993 f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3994 f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
4002 static void nfft_3d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
4005 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
4007 fg_exp_b0 = EXP(-K(1.0) / b);
4008 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
4011 fg_exp_l[0] = K(1.0);
4012 for(l=1; l <= 2*m+1; l++)
4014 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
4015 fg_exp_b1 *= fg_exp_b0_sq;
4016 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
4020 static void nfft_trafo_3d_compute(C *fj,
const C *g,
const R *psij_const0,
4021 const R *psij_const1,
const R *psij_const2,
const R *xj0,
const R *xj1,
4022 const R *xj2,
const INT n0,
const INT n1,
const INT n2,
const INT m)
4024 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4026 const R *psij0, *psij1, *psij2;
4028 psij0 = psij_const0;
4029 psij1 = psij_const1;
4030 psij2 = psij_const2;
4032 uo2(&u0, &o0, *xj0, n0, m);
4033 uo2(&u1, &o1, *xj1, n1, m);
4034 uo2(&u2, &o2, *xj2, n2, m);
4041 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4043 psij1 = psij_const1;
4044 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4046 psij2 = psij_const2;
4047 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4048 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4049 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4054 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4056 psij1 = psij_const1;
4057 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4059 psij2 = psij_const2;
4060 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4061 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4062 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4063 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4064 for (l2 = 0; l2 <= o2; l2++)
4065 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4070 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4072 psij1 = psij_const1;
4073 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4075 psij2 = psij_const2;
4076 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4077 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4078 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4080 for (l1 = 0; l1 <= o1; l1++, psij1++)
4082 psij2 = psij_const2;
4083 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4084 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4085 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4090 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4092 psij1 = psij_const1;
4093 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4095 psij2 = psij_const2;
4096 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4097 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4098 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4099 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4100 for (l2 = 0; l2 <= o2; l2++)
4101 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4103 for (l1 = 0; l1 <= o1; l1++, psij1++)
4105 psij2 = psij_const2;
4106 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4107 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4108 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4109 gj = g + ((u0 + l0) * n1 + l1) * n2;
4110 for (l2 = 0; l2 <= o2; l2++)
4111 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4119 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4121 psij1 = psij_const1;
4122 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4124 psij2 = psij_const2;
4125 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4126 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4127 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4131 for (l0 = 0; l0 <= o0; l0++, psij0++)
4133 psij1 = psij_const1;
4134 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4136 psij2 = psij_const2;
4137 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4138 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4139 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4144 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4146 psij1 = psij_const1;
4147 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4149 psij2 = psij_const2;
4150 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4151 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4152 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4153 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4154 for (l2 = 0; l2 <= o2; l2++)
4155 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4159 for (l0 = 0; l0 <= o0; l0++, psij0++)
4161 psij1 = psij_const1;
4162 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4164 psij2 = psij_const2;
4165 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4166 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4167 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4168 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4169 for (l2 = 0; l2 <= o2; l2++)
4170 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4177 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4179 psij1 = psij_const1;
4180 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4182 psij2 = psij_const2;
4183 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4184 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4185 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4187 for (l1 = 0; l1 <= o1; l1++, psij1++)
4189 psij2 = psij_const2;
4190 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4191 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4192 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4195 for (l0 = 0; l0 <= o0; l0++, psij0++)
4197 psij1 = psij_const1;
4198 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4200 psij2 = psij_const2;
4201 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4202 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4203 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4205 for (l1 = 0; l1 <= o1; l1++, psij1++)
4207 psij2 = psij_const2;
4208 gj = g + (l0 * n1 + l1) * n2 + u2;
4209 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4210 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4215 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4217 psij1 = psij_const1;
4218 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4220 psij2 = psij_const2;
4221 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4222 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4223 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4224 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4225 for (l2 = 0; l2 <= o2; l2++)
4226 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4228 for (l1 = 0; l1 <= o1; l1++, psij1++)
4230 psij2 = psij_const2;
4231 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4232 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4233 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4234 gj = g + ((u0 + l0) * n1 + l1) * n2;
4235 for (l2 = 0; l2 <= o2; l2++)
4236 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4240 for (l0 = 0; l0 <= o0; l0++, psij0++)
4242 psij1 = psij_const1;
4243 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4245 psij2 = psij_const2;
4246 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4247 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4248 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4249 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4250 for (l2 = 0; l2 <= o2; l2++)
4251 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4253 for (l1 = 0; l1 <= o1; l1++, psij1++)
4255 psij2 = psij_const2;
4256 gj = g + (l0 * n1 + l1) * n2 + u2;
4257 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4258 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4259 gj = g + (l0 * n1 + l1) * n2;
4260 for (l2 = 0; l2 <= o2; l2++)
4261 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4289 static void nfft_adjoint_3d_compute_omp_blockwise(
const C f, C *g,
4290 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
4291 const R *xj0,
const R *xj1,
const R *xj2,
4292 const INT n0,
const INT n1,
const INT n2,
const INT m,
4293 const INT my_u0,
const INT my_o0)
4295 INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4297 INT index_temp1[2*m+2];
4298 INT index_temp2[2*m+2];
4300 uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4301 uo2(&u1,&o1,*xj1, n1, m);
4302 uo2(&u2,&o2,*xj2, n2, m);
4304 for (l1=0; l1<=2*m+1; l1++)
4305 index_temp1[l1] = (u1+l1)%n1;
4307 for (l2=0; l2<=2*m+1; l2++)
4308 index_temp2[l2] = (u2+l2)%n2;
4312 INT u0 = MAX(my_u0,ar_u0);
4313 INT o0 = MIN(my_o0,ar_o0);
4314 INT offset_psij = u0-ar_u0;
4316 assert(offset_psij >= 0);
4317 assert(o0-u0 <= 2*m+1);
4318 assert(offset_psij+o0-u0 <= 2*m+1);
4321 for (l0 = 0; l0 <= o0-u0; l0++)
4323 const INT i0 = (u0+l0) * n1;
4324 const C val0 = psij_const0[offset_psij+l0];
4326 for(l1=0; l1<=2*m+1; l1++)
4328 const INT i1 = (i0 + index_temp1[l1]) * n2;
4329 const C val1 = psij_const1[l1];
4331 for(l2=0; l2<=2*m+1; l2++)
4332 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4338 INT u0 = MAX(my_u0,ar_u0);
4340 INT offset_psij = u0-ar_u0;
4342 assert(offset_psij >= 0);
4343 assert(o0-u0 <= 2*m+1);
4344 assert(offset_psij+o0-u0 <= 2*m+1);
4347 for (l0 = 0; l0 <= o0-u0; l0++)
4349 INT i0 = (u0+l0) * n1;
4350 const C val0 = psij_const0[offset_psij+l0];
4352 for(l1=0; l1<=2*m+1; l1++)
4354 const INT i1 = (i0 + index_temp1[l1]) * n2;
4355 const C val1 = psij_const1[l1];
4357 for(l2=0; l2<=2*m+1; l2++)
4358 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4363 o0 = MIN(my_o0,ar_o0);
4364 offset_psij += my_u0-ar_u0+n0;
4369 assert(o0-u0 <= 2*m+1);
4370 assert(offset_psij+o0-u0 <= 2*m+1);
4373 for (l0 = 0; l0 <= o0-u0; l0++)
4375 INT i0 = (u0+l0) * n1;
4376 const C val0 = psij_const0[offset_psij+l0];
4378 for(l1=0; l1<=2*m+1; l1++)
4380 const INT i1 = (i0 + index_temp1[l1]) * n2;
4381 const C val1 = psij_const1[l1];
4383 for(l2=0; l2<=2*m+1; l2++)
4384 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4393 static void nfft_adjoint_3d_compute_omp_atomic(
const C f, C *g,
4394 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
4395 const R *xj0,
const R *xj1,
const R *xj2,
4396 const INT n0,
const INT n1,
const INT n2,
const INT m)
4398 INT u0,o0,l0,u1,o1,l1,u2,o2,l2;
4400 INT index_temp0[2*m+2];
4401 INT index_temp1[2*m+2];
4402 INT index_temp2[2*m+2];
4404 uo2(&u0,&o0,*xj0, n0, m);
4405 uo2(&u1,&o1,*xj1, n1, m);
4406 uo2(&u2,&o2,*xj2, n2, m);
4408 for (l0=0; l0<=2*m+1; l0++)
4409 index_temp0[l0] = (u0+l0)%n0;
4411 for (l1=0; l1<=2*m+1; l1++)
4412 index_temp1[l1] = (u1+l1)%n1;
4414 for (l2=0; l2<=2*m+1; l2++)
4415 index_temp2[l2] = (u2+l2)%n2;
4417 for(l0=0; l0<=2*m+1; l0++)
4419 for(l1=0; l1<=2*m+1; l1++)
4421 for(l2=0; l2<=2*m+1; l2++)
4423 INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4425 R *lhs_real = (R*)lhs;
4426 C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4429 lhs_real[0] += CREAL(val);
4432 lhs_real[1] += CIMAG(val);
4440 static void nfft_adjoint_3d_compute_serial(
const C *fj, C *g,
4441 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
const R *xj0,
4442 const R *xj1,
const R *xj2,
const INT n0,
const INT n1,
const INT n2,
4445 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4447 const R *psij0, *psij1, *psij2;
4449 psij0 = psij_const0;
4450 psij1 = psij_const1;
4451 psij2 = psij_const2;
4453 uo2(&u0, &o0, *xj0, n0, m);
4454 uo2(&u1, &o1, *xj1, n1, m);
4455 uo2(&u2, &o2, *xj2, n2, m);
4460 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4462 psij1 = psij_const1;
4463 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4465 psij2 = psij_const2;
4466 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4467 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4468 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4473 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4475 psij1 = psij_const1;
4476 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4478 psij2 = psij_const2;
4479 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4480 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4481 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4482 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4483 for (l2 = 0; l2 <= o2; l2++)
4484 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4489 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4491 psij1 = psij_const1;
4492 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4494 psij2 = psij_const2;
4495 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4496 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4497 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4499 for (l1 = 0; l1 <= o1; l1++, psij1++)
4501 psij2 = psij_const2;
4502 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4503 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4504 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4509 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4511 psij1 = psij_const1;
4512 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4514 psij2 = psij_const2;
4515 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4516 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4517 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4518 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4519 for (l2 = 0; l2 <= o2; l2++)
4520 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4522 for (l1 = 0; l1 <= o1; l1++, psij1++)
4524 psij2 = psij_const2;
4525 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4526 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4527 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4528 gj = g + ((u0 + l0) * n1 + l1) * n2;
4529 for (l2 = 0; l2 <= o2; l2++)
4530 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4538 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4540 psij1 = psij_const1;
4541 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4543 psij2 = psij_const2;
4544 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4545 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4546 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4550 for (l0 = 0; l0 <= o0; l0++, psij0++)
4552 psij1 = psij_const1;
4553 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4555 psij2 = psij_const2;
4556 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4557 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4558 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4563 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4565 psij1 = psij_const1;
4566 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4568 psij2 = psij_const2;
4569 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4570 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4571 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4572 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4573 for (l2 = 0; l2 <= o2; l2++)
4574 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4578 for (l0 = 0; l0 <= o0; l0++, psij0++)
4580 psij1 = psij_const1;
4581 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4583 psij2 = psij_const2;
4584 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4585 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4586 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4587 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4588 for (l2 = 0; l2 <= o2; l2++)
4589 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4596 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4598 psij1 = psij_const1;
4599 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4601 psij2 = psij_const2;
4602 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4603 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4604 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4606 for (l1 = 0; l1 <= o1; l1++, psij1++)
4608 psij2 = psij_const2;
4609 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4610 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4611 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4614 for (l0 = 0; l0 <= o0; l0++, psij0++)
4616 psij1 = psij_const1;
4617 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4619 psij2 = psij_const2;
4620 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4621 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4622 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4624 for (l1 = 0; l1 <= o1; l1++, psij1++)
4626 psij2 = psij_const2;
4627 gj = g + (l0 * n1 + l1) * n2 + u2;
4628 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4629 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4634 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4636 psij1 = psij_const1;
4637 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4639 psij2 = psij_const2;
4640 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4641 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4642 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4643 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4644 for (l2 = 0; l2 <= o2; l2++)
4645 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4647 for (l1 = 0; l1 <= o1; l1++, psij1++)
4649 psij2 = psij_const2;
4650 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4651 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4652 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4653 gj = g + ((u0 + l0) * n1 + l1) * n2;
4654 for (l2 = 0; l2 <= o2; l2++)
4655 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4659 for (l0 = 0; l0 <= o0; l0++, psij0++)
4661 psij1 = psij_const1;
4662 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4664 psij2 = psij_const2;
4665 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4666 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4667 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4668 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4669 for (l2 = 0; l2 <= o2; l2++)
4670 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4672 for (l1 = 0; l1 <= o1; l1++, psij1++)
4674 psij2 = psij_const2;
4675 gj = g + (l0 * n1 + l1) * n2 + u2;
4676 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4677 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4678 gj = g + (l0 * n1 + l1) * n2;
4679 for (l2 = 0; l2 <= o2; l2++)
4680 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4687 static void nfft_trafo_3d_B(
X(plan) *ths)
4689 const INT n0 = ths->n[0];
4690 const INT n1 = ths->n[1];
4691 const INT n2 = ths->n[2];
4692 const INT M = ths->M_total;
4693 const INT m = ths->m;
4695 const C* g = (C*) ths->g;
4701 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4703 #pragma omp parallel for default(shared) private(k) 4705 for (k = 0; k < M; k++)
4708 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4710 for (l = 0; l < lprod; l++)
4711 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4719 #pragma omp parallel for default(shared) private(k) 4721 for (k = 0; k < M; k++)
4723 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4724 nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4731 R fg_exp_l[3*(2*m+2)];
4733 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4734 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4735 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4738 #pragma omp parallel for default(shared) private(k) 4740 for (k = 0; k < M; k++)
4742 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4744 R psij_const[3*(2*m+2)];
4745 R fg_psij0 = ths->psi[2*j*3];
4746 R fg_psij1 = ths->psi[2*j*3+1];
4747 R fg_psij2 = K(1.0);
4749 psij_const[0] = fg_psij0;
4750 for(l=1; l<=2*m+1; l++)
4752 fg_psij2 *= fg_psij1;
4753 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4756 fg_psij0 = ths->psi[2*(j*3+1)];
4757 fg_psij1 = ths->psi[2*(j*3+1)+1];
4759 psij_const[2*m+2] = fg_psij0;
4760 for(l=1; l<=2*m+1; l++)
4762 fg_psij2 *= fg_psij1;
4763 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4766 fg_psij0 = ths->psi[2*(j*3+2)];
4767 fg_psij1 = ths->psi[2*(j*3+2)+1];
4769 psij_const[2*(2*m+2)] = fg_psij0;
4770 for(l=1; l<=2*m+1; l++)
4772 fg_psij2 *= fg_psij1;
4773 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4776 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4784 R fg_exp_l[3*(2*m+2)];
4786 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4787 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4788 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4793 #pragma omp parallel for default(shared) private(k) 4795 for (k = 0; k < M; k++)
4797 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4799 R psij_const[3*(2*m+2)];
4800 R fg_psij0, fg_psij1, fg_psij2;
4802 uo(ths,j,&u,&o,(INT)0);
4803 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4804 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
4806 psij_const[0] = fg_psij0;
4807 for(l=1; l<=2*m+1; l++)
4809 fg_psij2 *= fg_psij1;
4810 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4813 uo(ths,j,&u,&o,(INT)1);
4814 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4815 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
4817 psij_const[2*m+2] = fg_psij0;
4818 for(l=1; l<=2*m+1; l++)
4820 fg_psij2 *= fg_psij1;
4821 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4824 uo(ths,j,&u,&o,(INT)2);
4825 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
4826 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
4828 psij_const[2*(2*m+2)] = fg_psij0;
4829 for(l=1; l<=2*m+1; l++)
4831 fg_psij2 *= fg_psij1;
4832 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4835 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4843 const INT K = ths->K, ip_s = K / (m + 2);
4848 #pragma omp parallel for default(shared) private(k) 4850 for (k = 0; k < M; k++)
4855 R psij_const[3*(2*m+2)];
4856 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4858 uo(ths,j,&u,&o,(INT)0);
4859 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
4860 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4861 ip_w = ip_y - (R)(ip_u);
4862 for(l=0; l < 2*m+2; l++)
4863 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4864 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4866 uo(ths,j,&u,&o,(INT)1);
4867 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
4868 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4869 ip_w = ip_y - (R)(ip_u);
4870 for(l=0; l < 2*m+2; l++)
4871 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4872 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4874 uo(ths,j,&u,&o,(INT)2);
4875 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
4876 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4877 ip_w = ip_y - (R)(ip_u);
4878 for(l=0; l < 2*m+2; l++)
4879 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4880 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4882 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4892 #pragma omp parallel for default(shared) private(k) 4894 for (k = 0; k < M; k++)
4896 R psij_const[3*(2*m+2)];
4898 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4900 uo(ths,j,&u,&o,(INT)0);
4901 for(l=0;l<=2*m+1;l++)
4902 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
4904 uo(ths,j,&u,&o,(INT)1);
4905 for(l=0;l<=2*m+1;l++)
4906 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
4908 uo(ths,j,&u,&o,(INT)2);
4909 for(l=0;l<=2*m+1;l++)
4910 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
4912 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4916 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \ 4917 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 4918 ths->psi+j*3*(2*m+2), \ 4919 ths->psi+(j*3+1)*(2*m+2), \ 4920 ths->psi+(j*3+2)*(2*m+2), \ 4921 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 4922 n0, n1, n2, m, my_u0, my_o0); 4924 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \ 4927 R psij_const[3*(2*m+2)]; \ 4928 R fg_psij0 = ths->psi[2*j*3]; \ 4929 R fg_psij1 = ths->psi[2*j*3+1]; \ 4930 R fg_psij2 = K(1.0); \ 4932 psij_const[0] = fg_psij0; \ 4933 for(l=1; l<=2*m+1; l++) \ 4935 fg_psij2 *= fg_psij1; \ 4936 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \ 4939 fg_psij0 = ths->psi[2*(j*3+1)]; \ 4940 fg_psij1 = ths->psi[2*(j*3+1)+1]; \ 4941 fg_psij2 = K(1.0); \ 4942 psij_const[2*m+2] = fg_psij0; \ 4943 for(l=1; l<=2*m+1; l++) \ 4945 fg_psij2 *= fg_psij1; \ 4946 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \ 4949 fg_psij0 = ths->psi[2*(j*3+2)]; \ 4950 fg_psij1 = ths->psi[2*(j*3+2)+1]; \ 4951 fg_psij2 = K(1.0); \ 4952 psij_const[2*(2*m+2)] = fg_psij0; \ 4953 for(l=1; l<=2*m+1; l++) \ 4955 fg_psij2 *= fg_psij1; \ 4956 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \ 4959 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 4960 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \ 4961 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 4962 n0, n1, n2, m, my_u0, my_o0); \ 4965 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \ 4968 R psij_const[3*(2*m+2)]; \ 4969 R fg_psij0, fg_psij1, fg_psij2; \ 4971 uo(ths,j,&u,&o,(INT)0); \ 4972 fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/((R)n0),0)); \ 4973 fg_psij1 = EXP(K(2.0)*(((R)n0)*(ths->x[3*j]) - (R)u)/ths->b[0]); \ 4974 fg_psij2 = K(1.0); \ 4975 psij_const[0] = fg_psij0; \ 4976 for(l=1; l<=2*m+1; l++) \ 4978 fg_psij2 *= fg_psij1; \ 4979 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \ 4982 uo(ths,j,&u,&o,(INT)1); \ 4983 fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/((R)n1),1)); \ 4984 fg_psij1 = EXP(K(2.0)*(((R)n1)*(ths->x[3*j+1]) - (R)u)/ths->b[1]); \ 4985 fg_psij2 = K(1.0); \ 4986 psij_const[2*m+2] = fg_psij0; \ 4987 for(l=1; l<=2*m+1; l++) \ 4989 fg_psij2 *= fg_psij1; \ 4990 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \ 4993 uo(ths,j,&u,&o,(INT)2); \ 4994 fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/((R)n2),2)); \ 4995 fg_psij1 = EXP(K(2.0)*(((R)n2)*(ths->x[3*j+2]) - (R)u)/ths->b[2]); \ 4996 fg_psij2 = K(1.0); \ 4997 psij_const[2*(2*m+2)] = fg_psij0; \ 4998 for(l=1; l<=2*m+1; l++) \ 5000 fg_psij2 *= fg_psij1; \ 5001 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \ 5004 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 5005 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \ 5006 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 5007 n0, n1, n2, m, my_u0, my_o0); \ 5010 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \ 5013 R psij_const[3*(2*m+2)]; \ 5017 uo(ths,j,&u,&o,(INT)0); \ 5018 ip_y = FABS(((R)n0)*ths->x[3*j+0] - (R)u)*((R)ip_s); \ 5019 ip_u = LRINT(FLOOR(ip_y)); \ 5021 for(l=0; l < 2*m+2; l++) \ 5022 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 5023 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \ 5025 uo(ths,j,&u,&o,(INT)1); \ 5026 ip_y = FABS(((R)n1)*ths->x[3*j+1] - (R)u)*((R)ip_s); \ 5027 ip_u = LRINT(FLOOR(ip_y)); \ 5029 for(l=0; l < 2*m+2; l++) \ 5030 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 5031 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \ 5033 uo(ths,j,&u,&o,(INT)2); \ 5034 ip_y = FABS(((R)n2)*ths->x[3*j+2] - (R)u)*((R)ip_s); \ 5035 ip_u = LRINT(FLOOR(ip_y)); \ 5037 for(l=0; l < 2*m+2; l++) \ 5038 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \ 5039 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \ 5041 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 5042 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \ 5043 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 5044 n0, n1, n2, m, my_u0, my_o0); \ 5047 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \ 5050 R psij_const[3*(2*m+2)]; \ 5052 uo(ths,j,&u,&o,(INT)0); \ 5053 for(l=0;l<=2*m+1;l++) \ 5054 psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/((R) n0),0)); \ 5056 uo(ths,j,&u,&o,(INT)1); \ 5057 for(l=0;l<=2*m+1;l++) \ 5058 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/((R) n1),1)); \ 5060 uo(ths,j,&u,&o,(INT)2); \ 5061 for(l=0;l<=2*m+1;l++) \ 5062 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/((R) n2),2)); \ 5064 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \ 5065 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \ 5066 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \ 5067 n0, n1, n2, m, my_u0, my_o0); \ 5070 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \ 5072 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \ 5074 _Pragma("omp parallel private(k)") \ 5076 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \ 5077 INT *ar_x = ths->index_x; \ 5079 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \ 5080 &min_u_b, &max_u_b, 3, ths->n, m); \ 5082 if (min_u_a != -1) \ 5084 k = index_x_binary_search(ar_x, M, min_u_a); \ 5086 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \ 5090 INT u_prod = ar_x[2*k]; \ 5091 INT j = ar_x[2*k+1]; \ 5093 if (u_prod < min_u_a || u_prod > max_u_a) \ 5096 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 5102 if (min_u_b != -1) \ 5104 INT k = index_x_binary_search(ar_x, M, min_u_b); \ 5106 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \ 5110 INT u_prod = ar_x[2*k]; \ 5111 INT j = ar_x[2*k+1]; \ 5113 if (u_prod < min_u_b || u_prod > max_u_b) \ 5116 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \ 5126 static void nfft_adjoint_3d_B(
X(plan) *ths)
5129 const INT n0 = ths->n[0];
5130 const INT n1 = ths->n[1];
5131 const INT n2 = ths->n[2];
5132 const INT M = ths->M_total;
5133 const INT m = ths->m;
5137 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
5141 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
5142 (INT)3, ths->n, m, ths->flags, ths->index_x);
5149 MACRO_adjoint_3d_B_OMP_BLOCKWISE(
PRE_PSI)
5153 #pragma omp parallel for default(shared) private(k) 5155 for (k = 0; k < M; k++)
5157 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5159 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5161 nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5169 R fg_exp_l[3*(2*m+2)];
5171 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
5172 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
5173 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
5180 #pragma omp parallel for default(shared) private(k) 5182 for (k = 0; k < M; k++)
5184 R psij_const[3*(2*m+2)];
5185 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5187 R fg_psij0 = ths->psi[2*j*3];
5188 R fg_psij1 = ths->psi[2*j*3+1];
5189 R fg_psij2 = K(1.0);
5191 psij_const[0] = fg_psij0;
5192 for(l=1; l<=2*m+1; l++)
5194 fg_psij2 *= fg_psij1;
5195 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
5198 fg_psij0 = ths->psi[2*(j*3+1)];
5199 fg_psij1 = ths->psi[2*(j*3+1)+1];
5201 psij_const[2*m+2] = fg_psij0;
5202 for(l=1; l<=2*m+1; l++)
5204 fg_psij2 *= fg_psij1;
5205 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5208 fg_psij0 = ths->psi[2*(j*3+2)];
5209 fg_psij1 = ths->psi[2*(j*3+2)+1];
5211 psij_const[2*(2*m+2)] = fg_psij0;
5212 for(l=1; l<=2*m+1; l++)
5214 fg_psij2 *= fg_psij1;
5215 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5219 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5221 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5230 R fg_exp_l[3*(2*m+2)];
5232 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
5233 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
5234 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
5239 MACRO_adjoint_3d_B_OMP_BLOCKWISE(
FG_PSI)
5243 #pragma omp parallel for default(shared) private(k) 5245 for (k = 0; k < M; k++)
5248 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5249 R psij_const[3*(2*m+2)];
5250 R fg_psij0, fg_psij1, fg_psij2;
5252 uo(ths,j,&u,&o,(INT)0);
5253 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
5254 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
5256 psij_const[0] = fg_psij0;
5257 for(l=1; l<=2*m+1; l++)
5259 fg_psij2 *= fg_psij1;
5260 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
5263 uo(ths,j,&u,&o,(INT)1);
5264 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
5265 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
5267 psij_const[2*m+2] = fg_psij0;
5268 for(l=1; l<=2*m+1; l++)
5270 fg_psij2 *= fg_psij1;
5271 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5274 uo(ths,j,&u,&o,(INT)2);
5275 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
5276 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
5278 psij_const[2*(2*m+2)] = fg_psij0;
5279 for(l=1; l<=2*m+1; l++)
5281 fg_psij2 *= fg_psij1;
5282 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5286 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5288 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5297 const INT K = ths->K;
5298 const INT ip_s = K / (m + 2);
5307 #pragma omp parallel for default(shared) private(k) 5309 for (k = 0; k < M; k++)
5314 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5315 R psij_const[3*(2*m+2)];
5317 uo(ths,j,&u,&o,(INT)0);
5318 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
5319 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5320 ip_w = ip_y - (R)(ip_u);
5321 for(l=0; l < 2*m+2; l++)
5322 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5323 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5325 uo(ths,j,&u,&o,(INT)1);
5326 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
5327 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5328 ip_w = ip_y - (R)(ip_u);
5329 for(l=0; l < 2*m+2; l++)
5330 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5331 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5333 uo(ths,j,&u,&o,(INT)2);
5334 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
5335 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5336 ip_w = ip_y - (R)(ip_u);
5337 for(l=0; l < 2*m+2; l++)
5338 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5339 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5342 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5344 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5354 MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5358 #pragma omp parallel for default(shared) private(k) 5360 for (k = 0; k < M; k++)
5363 R psij_const[3*(2*m+2)];
5364 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5366 uo(ths,j,&u,&o,(INT)0);
5367 for(l=0;l<=2*m+1;l++)
5368 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
5370 uo(ths,j,&u,&o,(INT)1);
5371 for(l=0;l<=2*m+1;l++)
5372 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
5374 uo(ths,j,&u,&o,(INT)2);
5375 for(l=0;l<=2*m+1;l++)
5376 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
5379 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5381 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5387 void X(trafo_3d)(
X(plan) *ths)
5389 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5391 X(trafo_direct)(ths);
5395 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5397 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5398 R ck01, ck02, ck11, ck12, ck21, ck22;
5399 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5400 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5412 f_hat=(C*)ths->f_hat;
5413 g_hat=(C*)ths->g_hat;
5417 #pragma omp parallel for default(shared) private(k0) 5418 for (k0 = 0; k0 < ths->n_total; k0++)
5419 ths->g_hat[k0] = 0.0;
5421 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
5424 if(ths->flags & PRE_PHI_HUT)
5426 c_phi_inv01=ths->c_phi_inv[0];
5427 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5430 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22) 5432 for(k0=0;k0<N0/2;k0++)
5434 ck01=c_phi_inv01[k0];
5435 ck02=c_phi_inv02[k0];
5436 c_phi_inv11=ths->c_phi_inv[1];
5437 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5439 for(k1=0;k1<N1/2;k1++)
5441 ck11=c_phi_inv11[k1];
5442 ck12=c_phi_inv12[k1];
5443 c_phi_inv21=ths->c_phi_inv[2];
5444 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5446 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5447 f_hat111=f_hat + (k0*N1+k1)*N2;
5448 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5449 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5450 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5451 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5452 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5453 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5455 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5456 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5457 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5458 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5459 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5460 f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5461 g_hat222=g_hat + (k0*n1+k1)*n2;
5462 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5464 for(k2=0;k2<N2/2;k2++)
5466 ck21=c_phi_inv21[k2];
5467 ck22=c_phi_inv22[k2];
5469 g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5470 g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5471 g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5472 g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5474 g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5475 g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5476 g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5477 g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5484 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22) 5486 for(k0=0;k0<N0/2;k0++)
5488 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5489 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5490 for(k1=0;k1<N1/2;k1++)
5492 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5493 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5495 for(k2=0;k2<N2/2;k2++)
5497 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5498 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5500 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5501 g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5502 g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5503 g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5505 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5506 g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5507 g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5508 g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5516 FFTW(execute)(ths->my_fftw_plan1);
5520 nfft_trafo_3d_B(ths);
5524 void X(adjoint_3d)(
X(plan) *ths)
5526 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5528 X(adjoint_direct)(ths);
5532 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5534 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5535 R ck01, ck02, ck11, ck12, ck21, ck22;
5536 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5537 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5549 f_hat=(C*)ths->f_hat;
5550 g_hat=(C*)ths->g_hat;
5553 nfft_adjoint_3d_B(ths);
5557 FFTW(execute)(ths->my_fftw_plan2);
5561 if(ths->flags & PRE_PHI_HUT)
5563 c_phi_inv01=ths->c_phi_inv[0];
5564 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5567 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22) 5569 for(k0=0;k0<N0/2;k0++)
5571 ck01=c_phi_inv01[k0];
5572 ck02=c_phi_inv02[k0];
5573 c_phi_inv11=ths->c_phi_inv[1];
5574 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5576 for(k1=0;k1<N1/2;k1++)
5578 ck11=c_phi_inv11[k1];
5579 ck12=c_phi_inv12[k1];
5580 c_phi_inv21=ths->c_phi_inv[2];
5581 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5583 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5584 f_hat111=f_hat + (k0*N1+k1)*N2;
5585 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5586 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5587 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5588 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5589 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5590 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5592 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5593 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5594 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5595 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5596 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5597 f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5598 g_hat222=g_hat + (k0*n1+k1)*n2;
5599 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5601 for(k2=0;k2<N2/2;k2++)
5603 ck21=c_phi_inv21[k2];
5604 ck22=c_phi_inv22[k2];
5606 f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5607 f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5608 f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5609 f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5611 f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5612 f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5613 f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5614 f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5621 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22) 5623 for(k0=0;k0<N0/2;k0++)
5625 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5626 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5627 for(k1=0;k1<N1/2;k1++)
5629 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5630 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5632 for(k2=0;k2<N2/2;k2++)
5634 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5635 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5637 f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5638 f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5639 f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5640 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5642 f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5643 f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5644 f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5645 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5655 void X(trafo)(
X(plan) *ths)
5658 for (
int j = 0; j < ths->d; j++)
5660 if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5662 X(trafo_direct)(ths);
5669 case 1:
X(trafo_1d)(ths);
break;
5670 case 2:
X(trafo_2d)(ths);
break;
5671 case 3:
X(trafo_3d)(ths);
break;
5675 ths->g_hat = ths->g1;
5690 FFTW(execute)(ths->my_fftw_plan1);
5703 void X(adjoint)(
X(plan) *ths)
5706 for (
int j = 0; j < ths->d; j++)
5708 if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5710 X(adjoint_direct)(ths);
5717 case 1:
X(adjoint_1d)(ths);
break;
5718 case 2:
X(adjoint_2d)(ths);
break;
5719 case 3:
X(adjoint_3d)(ths);
break;
5738 FFTW(execute)(ths->my_fftw_plan2);
5754 static
void precompute_phi_hut(
X(plan) *ths)
5759 ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) *
sizeof(R*));
5761 for (t = 0; t < ths->d; t++)
5763 ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) *
sizeof(R));
5765 for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
5767 ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
5776 void X(precompute_lin_psi)(
X(plan) *ths)
5782 for (t=0; t<ths->d; t++)
5784 step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
5785 for(j = 0;j <= ths->K; j++)
5787 ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
5792 void X(precompute_fg_psi)(
X(plan) *ths)
5799 for (t=0; t<ths->d; t++)
5803 #pragma omp parallel for default(shared) private(j,u,o) 5805 for (j = 0; j < ths->M_total; j++)
5809 ths->psi[2*(j*ths->d+t)]=
5810 (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
5812 ths->psi[2*(j*ths->d+t)+1]=
5813 EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
5819 void X(precompute_psi)(
X(plan) *ths)
5828 for (t=0; t<ths->d; t++)
5832 #pragma omp parallel for default(shared) private(j,l,lj,u,o) 5834 for (j = 0; j < ths->M_total; j++)
5838 for(l = u, lj = 0; l <= o; l++, lj++)
5839 ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
5840 (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
5847 static void nfft_precompute_full_psi_omp(
X(plan) *ths)
5854 for(t=0,lprod = 1; t<ths->d; t++)
5855 lprod *= 2*ths->m+2;
5858 #pragma omp parallel for default(shared) private(j) 5859 for(j=0; j<ths->M_total; j++)
5864 INT ll_plain[ths->d+1];
5866 INT u[ths->d], o[ths->d];
5868 R phi_prod[ths->d+1];
5874 MACRO_init_uo_l_lj_t;
5876 for(l_L=0; l_L<lprod; l_L++, ix++)
5878 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5880 ths->psi_index_g[ix]=ll_plain[ths->d];
5881 ths->psi[ix]=phi_prod[ths->d];
5883 MACRO_count_uo_l_lj_t;
5886 ths->psi_index_f[j]=lprod;
5891 void X(precompute_full_psi)(
X(plan) *ths)
5896 nfft_precompute_full_psi_omp(ths);
5902 INT ll_plain[ths->d+1];
5904 INT u[ths->d], o[ths->d];
5906 R phi_prod[ths->d+1];
5912 phi_prod[0] = K(1.0);
5915 for (t = 0, lprod = 1; t < ths->d; t++)
5916 lprod *= 2 * ths->m + 2;
5918 for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
5920 MACRO_init_uo_l_lj_t;
5922 for (l_L = 0; l_L < lprod; l_L++, ix++)
5924 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5926 ths->psi_index_g[ix] = ll_plain[ths->d];
5927 ths->psi[ix] = phi_prod[ths->d];
5929 MACRO_count_uo_l_lj_t;
5932 ths->psi_index_f[j] = ix - ix_old;
5938 void X(precompute_one_psi)(
X(plan) *ths)
5941 X(precompute_lin_psi)(ths);
5943 X(precompute_fg_psi)(ths);
5945 X(precompute_psi)(ths);
5947 X(precompute_full_psi)(ths);
5950 static void init_help(
X(plan) *ths)
5955 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5956 ths->flags |= NFFT_SORT_NODES;
5958 ths->N_total = intprod(ths->N, 0, ths->d);
5959 ths->n_total = intprod(ths->n, 0, ths->d);
5961 ths->sigma = (R*) Y(malloc)((size_t)(ths->d) *
sizeof(R));
5963 for(t = 0;t < ths->d; t++)
5964 ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
5969 ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) *
sizeof(R));
5972 ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) *
sizeof(C));
5975 ths->f = (C*)Y(malloc)((size_t)(ths->M_total) *
sizeof(C));
5977 if(ths->flags & PRE_PHI_HUT)
5978 precompute_phi_hut(ths);
5984 ths->K = Y(m2K)(ths->m);
5986 ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) *
sizeof(R));
5990 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) *
sizeof(R));
5993 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) *
sizeof(R));
5997 for (t = 0, lprod = 1; t < ths->d; t++)
5998 lprod *= 2 * ths->m + 2;
6000 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) *
sizeof(R));
6002 ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) *
sizeof(INT));
6003 ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) *
sizeof(INT));
6009 INT nthreads = Y(get_num_threads)();
6012 ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) *
sizeof(C));
6015 ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) *
sizeof(C));
6020 #pragma omp critical (nfft_omp_critical_fftw_plan) 6022 FFTW(plan_with_nthreads)(nthreads);
6025 int *_n = Y(malloc)((size_t)(ths->d) *
sizeof(int));
6027 for (t = 0; t < ths->d; t++)
6028 _n[t] = (
int)(ths->n[t]);
6030 ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
6031 ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
6039 if(ths->flags & NFFT_SORT_NODES)
6040 ths->index_x = (INT*) Y(malloc)(
sizeof(INT) * 2U * (
size_t)(ths->M_total));
6042 ths->index_x = NULL;
6044 ths->mv_trafo = (void (*) (
void* ))
X(trafo);
6045 ths->mv_adjoint = (void (*) (
void* ))
X(adjoint);
6048 void X(init)(
X(plan) *ths,
int d,
int *N,
int M_total)
6054 ths->N = (INT*) Y(malloc)((size_t)(d) *
sizeof(INT));
6056 for (t = 0; t < d; t++)
6057 ths->N[t] = (INT)N[t];
6059 ths->M_total = (INT)M_total;
6061 ths->n = (INT*) Y(malloc)((size_t)(d) *
sizeof(INT));
6063 for (t = 0; t < d; t++)
6064 ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
6066 ths->m = WINDOW_HELP_ESTIMATE_m;
6073 NFFT_OMP_BLOCKWISE_ADJOINT;
6083 ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
6089 void X(init_guru)(
X(plan) *ths,
int d,
int *N,
int M_total,
int *n,
int m,
6090 unsigned flags,
unsigned fftw_flags)
6095 ths->M_total = (INT)M_total;
6096 ths->N = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
6098 for (t = 0; t < d; t++)
6099 ths->N[t] = (INT)N[t];
6101 ths->n = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
6103 for (t = 0; t < d; t++)
6104 ths->n[t] = (INT)n[t];
6109 ths->fftw_flags = fftw_flags;
6115 void X(init_lin)(
X(plan) *ths,
int d,
int *N,
int M_total,
int *n,
int m,
int K,
6116 unsigned flags,
unsigned fftw_flags)
6121 ths->M_total = (INT)M_total;
6122 ths->N = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
6124 for (t = 0; t < d; t++)
6125 ths->N[t] = (INT)N[t];
6127 ths->n = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
6129 for (t = 0; t < d; t++)
6130 ths->n[t] = (INT)n[t];
6135 ths->fftw_flags = fftw_flags;
6141 void X(init_1d)(
X(plan) *ths,
int N1,
int M_total)
6147 X(init)(ths, 1, N, M_total);
6150 void X(init_2d)(
X(plan) *ths,
int N1,
int N2,
int M_total)
6156 X(init)(ths, 2, N, M_total);
6159 void X(init_3d)(
X(plan) *ths,
int N1,
int N2,
int N3,
int M_total)
6166 X(init)(ths, 3, N, M_total);
6169 const char*
X(check)(
X(plan) *ths)
6174 return "Member f not initialized.";
6177 return "Member x not initialized.";
6180 return "Member f_hat not initialized.";
6182 if ((ths->flags &
PRE_LIN_PSI) && ths->K < ths->M_total)
6183 return "Number of nodes too small to use PRE_LIN_PSI.";
6185 for (j = 0; j < ths->M_total * ths->d; j++)
6187 if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
6189 return "ths->x out of range [-0.5,0.5)";
6193 for (j = 0; j < ths->d; j++)
6195 if (ths->sigma[j] <= 1)
6196 return "Oversampling factor too small";
6203 if(ths->N[j]%2 == 1)
6204 return "polynomial degree N has to be even";
6209 void X(finalize)(
X(plan) *ths)
6213 if(ths->flags & NFFT_SORT_NODES)
6214 Y(free)(ths->index_x);
6219 #pragma omp critical (nfft_omp_critical_fftw_plan) 6221 FFTW(destroy_plan)(ths->my_fftw_plan2);
6223 #pragma omp critical (nfft_omp_critical_fftw_plan) 6225 FFTW(destroy_plan)(ths->my_fftw_plan1);
6235 Y(free)(ths->psi_index_g);
6236 Y(free)(ths->psi_index_f);
6249 if(ths->flags & PRE_PHI_HUT)
6251 for (t = 0; t < ths->d; t++)
6252 Y(free)(ths->c_phi_inv[t]);
6253 Y(free)(ths->c_phi_inv);
6260 Y(free)(ths->f_hat);
6265 WINDOW_HELP_FINALIZE;
6267 Y(free)(ths->sigma);
#define TIC(a)
Timing, method works since the inaccurate timer is updated mostly in the measured function...
#define X(name)
Include header for C99 complex datatype.
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.
Header file for the nfft3 library.