12 double xibuf [
NIMAX/2] [4][2];
13 double accpbuf[
NIMAX/2] [5][2];
14 double epjbuf [
NJMAX] [4];
15 double spjbuf [
NJMAX] [10];
16 double rsearchj[
NJMAX];
39 static double get_a_NaN(){
40 union{
long l;
double d; } m;
48 v_zero = _fjsp_set_v2r8(0.0, 0.0);
49 v_one = _fjsp_set_v2r8(1.0, 1.0);
50 v_n20 = _fjsp_set_v2r8(-20.0, -20.0);
51 v_p70 = _fjsp_set_v2r8(70.0, 70.0);
52 v_n84 = _fjsp_set_v2r8(-84.0, -84.0);
53 v_p35 = _fjsp_set_v2r8(35.0, 35.0);
61 this->r_crit2 = _r_crit2;
62 v_r_crit2 = _fjsp_set_v2r8(r_crit2, r_crit2);
65 void set_cutoff(
const double _r_out,
const double _r_in){
68 this->denominator = 1.0 / (r_out - r_in);
69 v_r_out = _fjsp_set_v2r8(r_out, r_out);
70 v_r_out_sq = _fjsp_set_v2r8(r_out*r_out, r_out*r_out);
71 v_r_in = _fjsp_set_v2r8(r_in, r_in);
72 v_r_in_sq = _fjsp_set_v2r8(r_in*r_in, r_in*r_in);
73 v_denominator = _fjsp_set_v2r8(denominator, denominator);
76 void set_epj_one(
const int addr,
const double x,
const double y,
const double z,
const double m,
const double r_search) {
81 rsearchj[addr] = r_search;
84 template <
typename EPJ_t>
85 void set_epj(
const int nj,
const EPJ_t epj[]);
89 const double x,
const double y,
const double z,
const double m,
90 const double qxx,
const double qyy,
const double qzz,
91 const double qxy,
const double qyz,
const double qzx)
93 const double tr = qxx + qyy + qzz;
98 spjbuf[addr][4] = 3.0 * qxx - tr;
99 spjbuf[addr][5] = 3.0 * qyy - tr;
100 spjbuf[addr][6] = 3.0 * qxy;
101 spjbuf[addr][7] = 3.0 * qyz;
102 spjbuf[addr][8] = 3.0 * qzx;
103 spjbuf[addr][9] = -(eps2 * tr);
106 template <
typename SPJ_t>
107 void set_spj(
const int nj,
const SPJ_t spj[]);
109 void set_xi_one(
const int addr,
const double x,
const double y,
const double z,
const double r_search){
110 const int ah = addr / 2;
111 const int al = addr % 2;
112 xibuf[ah][0][al] = x;
113 xibuf[ah][1][al] = y;
114 xibuf[ah][2][al] = z;
115 xibuf[ah][3][al] = r_search;
118 template <
typename EPI_t>
119 void set_xi(
const int ni,
const EPI_t spj[]);
121 template <
typename real_t>
122 void get_accp_one(
const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot){
123 const int ah = addr / 2;
124 const int al = addr % 2;
125 ax = accpbuf[ah][0][al];
126 ay = accpbuf[ah][1][al];
127 az = accpbuf[ah][2][al];
128 pot = accpbuf[ah][3][al];
130 template <
typename real_t>
131 void get_accp_one(
const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot, real_t &nngb){
132 const int ah = addr / 2;
133 const int al = addr % 2;
134 ax = accpbuf[ah][0][al];
135 ay = accpbuf[ah][1][al];
136 az = accpbuf[ah][2][al];
137 pot = accpbuf[ah][3][al];
138 nngb = accpbuf[ah][4][al];
142 template <
typename real_t>
143 void accum_accp_one(
const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot){
144 const int ah = addr / 2;
145 const int al = addr % 2;
146 ax += accpbuf[ah][0][al];
147 ay += accpbuf[ah][1][al];
148 az += accpbuf[ah][2][al];
149 pot += accpbuf[ah][3][al];
152 template <
typename real_t>
153 void accum_accp_one(
const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot, real_t &nngb){
154 const int ah = addr / 2;
155 const int al = addr % 2;
156 ax += accpbuf[ah][0][al];
157 ay += accpbuf[ah][1][al];
158 az += accpbuf[ah][2][al];
159 pot += accpbuf[ah][3][al];
160 nngb += accpbuf[ah][4][al];
165 std::cout<<
"ni= "<<ni<<
" NIMAX= "<<
NIMAX<<
" nj= "<<nj<<
" NJMAX= "<<
NJMAX<<std::endl;
172 kernel_epj_unroll4(ni, nj);
189 std::cout<<
"ni= "<<ni<<
" NIMAX= "<<
NIMAX<<
" nj= "<<nj<<
" NJMAX= "<<
NJMAX<<std::endl;
193 kernel_epj_unroll4_for_p3t_with_linear_cutoff(ni, nj);
198 std::cout<<
"ni= "<<ni<<
" NIMAX= "<<
NIMAX<<
" nj= "<<nj<<
" NJMAX= "<<
NJMAX<<std::endl;
203 kernel_spj_unroll4(ni, nj);
207 void inline_kernel_epj(
219 const v2r8_bcl xj(jp_xy);
220 const v2r8_bch yj(jp_xy);
221 const v2r8_bcl zj(jp_zm);
222 const v2r8_bch mj(jp_zm);
224 const v2r8 dx = xj - xi;
225 const v2r8 dy = yj - yi;
226 const v2r8 dz = zj - zi;
228 const v2r8 r2 = ((eps2 + dx*dx) + dy*dy) + dz*dz;
229 const v2r8 ri1 = r2.rsqrta_x3();
231 const v2r8 ri2 = ri1 * ri1;
232 const v2r8 mri1 = mj * ri1;
233 const v2r8 mri3 = mri1 * ri2;
235 pot = mj.nmsub(ri1, pot);
242 void inline_kernel_epj_for_p3t(
const v2r8 eps2,
254 const v2r8_bcl xj(jp_xy);
255 const v2r8_bch yj(jp_xy);
256 const v2r8_bcl zj(jp_zm);
257 const v2r8_bch mj(jp_zm);
259 const v2r8 dx = xj - xi;
260 const v2r8 dy = yj - yi;
261 const v2r8 dz = zj - zi;
263 const v2r8 r2 = ((eps2 + dx*dx) + dy*dy) + dz*dz;
264 const v2r8_mask mask = r_crit2 < r2;
265 const v2r8 ri1 = r2.rsqrta_x3() & mask;
267 const v2r8 ri2 = ri1 * ri1;
268 const v2r8 mri1 = mj * ri1;
269 const v2r8 mri3 = mri1 * ri2;
271 pot = mj.nmsub(ri1, pot);
275 nngb += _fjsp_andnot1_v2r8( mask.val, v2r8(1.0) );
280 void inline_kernel_epj_for_p3t_with_linear_cutoff(
const v2r8 eps2,
293 const v2r8_bcl xj(jp_xy);
294 const v2r8_bch yj(jp_xy);
295 const v2r8_bcl zj(jp_zm);
296 const v2r8_bch mj(jp_zm);
298 const v2r8 dx = xj - xi;
299 const v2r8 dy = yj - yi;
300 const v2r8 dz = zj - zi;
302 v2r8 r2_real = ((eps2 + dx*dx) + dy*dy) + dz*dz;
303 const v2r8 r2 = _fjsp_max_v2r8(r2_real, v_r_crit2);
304 const v2r8 ri1 = r2.rsqrta_x3();
306 const v2r8 ri2 = ri1 * ri1;
307 const v2r8 mri1 = mj * ri1;
308 const v2r8 mri3 = mri1 * ri2;
310 pot = mj.nmsub(ri1, pot);
315 v2r8 r_crit = _fjsp_max_v2r8(rsi, rsj);
316 const v2r8 r_crit2= r_crit*r_crit;
317 const v2r8_mask mask = r_crit2 < r2;
318 nngb += _fjsp_andnot1_v2r8( mask.val, v2r8(1.0) );
324 void inline_kernel_epj_with_cutoff(
const v2r8 eps2,
336 const v2r8_bcl xj(jp_xy);
337 const v2r8_bch yj(jp_xy);
338 const v2r8_bcl zj(jp_zm);
339 const v2r8_bch mj(jp_zm);
341 const v2r8 dx = xj - xi;
342 const v2r8 dy = yj - yi;
343 const v2r8 dz = zj - zi;
346 v2r8 r2 = ((eps2 + dx*dx) + dy*dy) + dz*dz;
349 r2 = _fjsp_max_v2r8(r2, v_r_in_sq);
350 #ifdef RSQRT_NR_EPJ_X7
351 const v2r8 ri1 = r2.rsqrta_x7();
354 const v2r8 ri1 = r2.rsqrta_x3();
360 const v2r8 r1 = r2 * ri1;
361 const v2r8 x = _fjsp_min_v2r8( _fjsp_max_v2r8( ((r1-v_r_in)*v_denominator), v_zero), v_one);
362 const v2r8 x2 = x * x;
363 const v2r8 x4 = x2 * x2;
364 const v2r8 k = (((v_n20*x + v_p70)*x + v_n84)*x + v_p35)*x4 ;
366 const v2r8 mkri1 = mj * k * ri1;
367 const v2r8 ri2 = ri1 * ri1;
368 const v2r8 mkri3 = mkri1 * ri2;
379 v2r8_mask mask = r2 < r_crit2;
380 nngb += _fjsp_and_v2r8( mask.val, v_one );
454 void kernel_epj_unroll4(
const int ni,
const int nj){
455 for(
int i=0; i<ni; i+=8){
456 const v2r8 veps2 = v2r8(eps2);
458 const v2r8 xi0 = v2r8::load(xibuf[0 + i/2][0]);
459 const v2r8 yi0 = v2r8::load(xibuf[0 + i/2][1]);
460 const v2r8 zi0 = v2r8::load(xibuf[0 + i/2][2]);
461 const v2r8 xi1 = v2r8::load(xibuf[1 + i/2][0]);
462 const v2r8 yi1 = v2r8::load(xibuf[1 + i/2][1]);
463 const v2r8 zi1 = v2r8::load(xibuf[1 + i/2][2]);
464 const v2r8 xi2 = v2r8::load(xibuf[2 + i/2][0]);
465 const v2r8 yi2 = v2r8::load(xibuf[2 + i/2][1]);
466 const v2r8 zi2 = v2r8::load(xibuf[2 + i/2][2]);
467 const v2r8 xi3 = v2r8::load(xibuf[3 + i/2][0]);
468 const v2r8 yi3 = v2r8::load(xibuf[3 + i/2][1]);
469 const v2r8 zi3 = v2r8::load(xibuf[3 + i/2][2]);
488 for(
int j=0; j<nj; j++){
489 const v2r8 jp_xy = v2r8::load(epjbuf[j] + 0);
490 const v2r8 jp_zm = v2r8::load(epjbuf[j] + 2);
491 inline_kernel_epj(veps2, jp_xy, jp_zm, xi0, yi0, zi0, ax0, ay0, az0, po0);
492 inline_kernel_epj(veps2, jp_xy, jp_zm, xi1, yi1, zi1, ax1, ay1, az1, po1);
493 inline_kernel_epj(veps2, jp_xy, jp_zm, xi2, yi2, zi2, ax2, ay2, az2, po2);
494 inline_kernel_epj(veps2, jp_xy, jp_zm, xi3, yi3, zi3, ax3, ay3, az3, po3);
497 ax0.store(accpbuf[0 + i/2][0]);
498 ay0.store(accpbuf[0 + i/2][1]);
499 az0.store(accpbuf[0 + i/2][2]);
500 po0.store(accpbuf[0 + i/2][3]);
501 ax1.store(accpbuf[1 + i/2][0]);
502 ay1.store(accpbuf[1 + i/2][1]);
503 az1.store(accpbuf[1 + i/2][2]);
504 po1.store(accpbuf[1 + i/2][3]);
505 ax2.store(accpbuf[2 + i/2][0]);
506 ay2.store(accpbuf[2 + i/2][1]);
507 az2.store(accpbuf[2 + i/2][2]);
508 po2.store(accpbuf[2 + i/2][3]);
509 ax3.store(accpbuf[3 + i/2][0]);
510 ay3.store(accpbuf[3 + i/2][1]);
511 az3.store(accpbuf[3 + i/2][2]);
512 po3.store(accpbuf[3 + i/2][3]);
589 void kernel_epj_unroll4_for_p3t_with_linear_cutoff(
const int ni,
const int nj){
590 for(
int i=0; i<ni; i+=8){
591 const v2r8 veps2 = v2r8(eps2);
593 const v2r8 xi0 = v2r8::load(xibuf[0 + i/2][0]);
594 const v2r8 yi0 = v2r8::load(xibuf[0 + i/2][1]);
595 const v2r8 zi0 = v2r8::load(xibuf[0 + i/2][2]);
596 v2r8 rs0 = v2r8::load(xibuf[0 + i/2][3]);
597 const v2r8 xi1 = v2r8::load(xibuf[1 + i/2][0]);
598 const v2r8 yi1 = v2r8::load(xibuf[1 + i/2][1]);
599 const v2r8 zi1 = v2r8::load(xibuf[1 + i/2][2]);
600 v2r8 rs1 = v2r8::load(xibuf[1 + i/2][3]);
601 const v2r8 xi2 = v2r8::load(xibuf[2 + i/2][0]);
602 const v2r8 yi2 = v2r8::load(xibuf[2 + i/2][1]);
603 const v2r8 zi2 = v2r8::load(xibuf[2 + i/2][2]);
604 v2r8 rs2 = v2r8::load(xibuf[2 + i/2][3]);
605 const v2r8 xi3 = v2r8::load(xibuf[3 + i/2][0]);
606 const v2r8 yi3 = v2r8::load(xibuf[3 + i/2][1]);
607 const v2r8 zi3 = v2r8::load(xibuf[3 + i/2][2]);
608 v2r8 rs3 = v2r8::load(xibuf[3 + i/2][3]);
631 for(
int j=0; j<nj; j++){
632 const v2r8 jp_xy = v2r8::load(epjbuf[j] + 0);
633 const v2r8 jp_zm = v2r8::load(epjbuf[j] + 2);
634 v2r8 rsjp = __builtin_fj_set_v2r8(rsearchj[j], rsearchj[j]);
635 inline_kernel_epj_for_p3t_with_linear_cutoff(veps2, rsjp, jp_xy, jp_zm, xi0, yi0, zi0, rs0, ax0, ay0, az0, po0, nngb0);
636 inline_kernel_epj_for_p3t_with_linear_cutoff(veps2, rsjp, jp_xy, jp_zm, xi1, yi1, zi1, rs1, ax1, ay1, az1, po1, nngb1);
637 inline_kernel_epj_for_p3t_with_linear_cutoff(veps2, rsjp, jp_xy, jp_zm, xi2, yi2, zi2, rs2, ax2, ay2, az2, po2, nngb2);
638 inline_kernel_epj_for_p3t_with_linear_cutoff(veps2, rsjp, jp_xy, jp_zm, xi3, yi3, zi3, rs3, ax3, ay3, az3, po3, nngb3);
641 ax0.store(accpbuf[0 + i/2][0]);
642 ay0.store(accpbuf[0 + i/2][1]);
643 az0.store(accpbuf[0 + i/2][2]);
644 po0.store(accpbuf[0 + i/2][3]);
645 nngb0.store(accpbuf[0 + i/2][4]);
646 ax1.store(accpbuf[1 + i/2][0]);
647 ay1.store(accpbuf[1 + i/2][1]);
648 az1.store(accpbuf[1 + i/2][2]);
649 po1.store(accpbuf[1 + i/2][3]);
650 nngb1.store(accpbuf[1 + i/2][4]);
651 ax2.store(accpbuf[2 + i/2][0]);
652 ay2.store(accpbuf[2 + i/2][1]);
653 az2.store(accpbuf[2 + i/2][2]);
654 po2.store(accpbuf[2 + i/2][3]);
655 nngb2.store(accpbuf[2 + i/2][4]);
656 ax3.store(accpbuf[3 + i/2][0]);
657 ay3.store(accpbuf[3 + i/2][1]);
658 az3.store(accpbuf[3 + i/2][2]);
659 po3.store(accpbuf[3 + i/2][3]);
660 nngb3.store(accpbuf[3 + i/2][4]);
739 void inline_kernel_spj(
754 const v2r8_bcl xj(jp_xy);
755 const v2r8_bch yj(jp_xy);
756 const v2r8_bcl zj(jp_zm);
758 const v2r8 dx = xj - xi;
759 const v2r8 dy = yj - yi;
760 const v2r8 dz = zj - zi;
762 const v2r8 r2 = ((eps2 + dx*dx) + dy*dy) + dz*dz;
763 const v2r8 ri1 = r2.rsqrta_x3();
764 const v2r8 ri2 = ri1 * ri1;
765 const v2r8 ri3 = ri1 * ri2;
766 const v2r8 ri4 = ri2 * ri2;
767 const v2r8 ri5 = ri2 * ri3;
769 const v2r8_bcl q_xx(jp_xx_yy);
770 const v2r8_bch q_yy(jp_xx_yy);
771 const v2r8 q_zz = jp_xx_yy.hadd();
772 const v2r8_bcl q_xy(jp_xy_yz);
773 const v2r8_bch q_yz(jp_xy_yz);
774 const v2r8_bcl q_zx(jp_zx_tr);
775 const v2r8 q_tr = v2r8::unpckh(jp_zx_tr, jp_zx_tr);
776 const v2r8 mj = v2r8::unpckh(jp_zm, jp_zm);
778 v2r8 qr_x = q_xx * dx;
779 qr_x = q_xy.madd(dy, qr_x);
780 qr_x = q_zx.madd(dz, qr_x);
782 v2r8 qr_y = q_xy * dx;
783 qr_y = q_yy.madd(dy, qr_y);
784 qr_y = q_yz.madd(dz, qr_y);
786 v2r8 qr_z = q_zx * dx;
787 qr_z = q_yz.madd(dy, qr_z);
788 qr_z = qr_z - q_zz * dz;
790 ax = v2r8::nmsub(ri5, qr_x, ax);
791 ay = v2r8::nmsub(ri5, qr_y, ay);
792 az = v2r8::nmsub(ri5, qr_z, az);
794 const v2r8 rqr = ((q_tr + qr_x*dx) + qr_y*dy) + qr_z*dz;
795 const v2r8 rqr_ri4 = rqr * ri4;
797 const v2r8 cc(0.5, 2.5);
798 const v2r8 madd05 = v2r8_bcl(cc).madd(rqr_ri4, mj);
799 const v2r8 madd25 = v2r8_bch(cc).madd(rqr_ri4, mj) * ri3;
803 ax = v2r8::madd(madd25, dx, ax);
804 ay = v2r8::madd(madd25, dy, ay);
805 az = v2r8::madd(madd25, dz, az);
881 void kernel_spj_unroll4(
const int ni,
const int nj){
882 for(
int i=0; i<ni; i+=8){
883 const v2r8 veps2 = v2r8(eps2);
885 const v2r8 xi0 = v2r8::load(xibuf[0 + i/2][0]);
886 const v2r8 yi0 = v2r8::load(xibuf[0 + i/2][1]);
887 const v2r8 zi0 = v2r8::load(xibuf[0 + i/2][2]);
888 const v2r8 xi1 = v2r8::load(xibuf[1 + i/2][0]);
889 const v2r8 yi1 = v2r8::load(xibuf[1 + i/2][1]);
890 const v2r8 zi1 = v2r8::load(xibuf[1 + i/2][2]);
891 const v2r8 xi2 = v2r8::load(xibuf[2 + i/2][0]);
892 const v2r8 yi2 = v2r8::load(xibuf[2 + i/2][1]);
893 const v2r8 zi2 = v2r8::load(xibuf[2 + i/2][2]);
894 const v2r8 xi3 = v2r8::load(xibuf[3 + i/2][0]);
895 const v2r8 yi3 = v2r8::load(xibuf[3 + i/2][1]);
896 const v2r8 zi3 = v2r8::load(xibuf[3 + i/2][2]);
915 for(
int j=0; j<nj; j++){
916 const v2r8 jp_xy = v2r8::load(spjbuf[j] + 0);
917 const v2r8 jp_zm = v2r8::load(spjbuf[j] + 2);
918 const v2r8 jp_xx_yy = v2r8::load(spjbuf[j] + 4);
919 const v2r8 jp_xy_yz = v2r8::load(spjbuf[j] + 6);
920 const v2r8 jp_zx_tr = v2r8::load(spjbuf[j] + 8);
921 inline_kernel_spj(veps2, jp_xy, jp_zm, jp_xx_yy, jp_xy_yz, jp_zx_tr, xi0, yi0, zi0, ax0, ay0, az0, po0);
922 inline_kernel_spj(veps2, jp_xy, jp_zm, jp_xx_yy, jp_xy_yz, jp_zx_tr, xi1, yi1, zi1, ax1, ay1, az1, po1);
923 inline_kernel_spj(veps2, jp_xy, jp_zm, jp_xx_yy, jp_xy_yz, jp_zx_tr, xi2, yi2, zi2, ax2, ay2, az2, po2);
924 inline_kernel_spj(veps2, jp_xy, jp_zm, jp_xx_yy, jp_xy_yz, jp_zx_tr, xi3, yi3, zi3, ax3, ay3, az3, po3);
927 ax0.store(accpbuf[0 + i/2][0]);
928 ay0.store(accpbuf[0 + i/2][1]);
929 az0.store(accpbuf[0 + i/2][2]);
930 po0.store(accpbuf[0 + i/2][3]);
931 ax1.store(accpbuf[1 + i/2][0]);
932 ay1.store(accpbuf[1 + i/2][1]);
933 az1.store(accpbuf[1 + i/2][2]);
934 po1.store(accpbuf[1 + i/2][3]);
935 ax2.store(accpbuf[2 + i/2][0]);
936 ay2.store(accpbuf[2 + i/2][1]);
937 az2.store(accpbuf[2 + i/2][2]);
938 po2.store(accpbuf[2 + i/2][3]);
939 ax3.store(accpbuf[3 + i/2][0]);
940 ay3.store(accpbuf[3 + i/2][1]);
941 az3.store(accpbuf[3 + i/2][2]);
942 po3.store(accpbuf[3 + i/2][3]);