PeTar
N-body code for collisional gravitational systems
phantomquad_for_p3t_x86.hpp
1 // gcc -O2 -march=core-avx2
2 #include <cassert>
3 #include <immintrin.h>
4 #include <iostream> // needed by the std::cout/std::cerr diagnostics below

5 #if defined (__AVX512F__) && defined (__AVX512DQ__)
6 #define USE__AVX512
7 #endif
8 
9 #ifndef __FMA__
10 #define _mm256_fmadd_ps(x,y,z) _mm256_add_ps(z, _mm256_mul_ps(x,y))
11 #define _mm256_fnmadd_ps(x,y,z) _mm256_sub_ps(z, _mm256_mul_ps(x,y))
12 #define _mm512_fmadd_ps(x,y,z) _mm512_add_ps(z, _mm512_mul_ps(x,y))
13 #define _mm512_fnmadd_ps(x,y,z) _mm512_sub_ps(z, _mm512_mul_ps(x,y))
14 #endif
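// Note on the fallback above: without hardware FMA, a*b+c is emulated with a
// separate multiply and add (two roundings instead of one), so results can
// differ from true-FMA builds in the last bit. A scalar sketch of the same
// identity, for illustration only:
//
//   float fmadd_emulated (float x, float y, float z){ return z + x*y; }
//   float fnmadd_emulated(float x, float y, float z){ return z - x*y; }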
15 
16 class PhantomGrapeQuad{
17 public:
18  enum{
19  //NIMAX = 32768,
20  NIMAX = 16384,
21  NJMAX = 131072,
22  //NIMAX = 1024,
23  //NJMAX = 8192,
24  };
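// NIMAX and NJMAX cap the number of i- and j-particles accepted per force
// call; the buffers below are statically sized from them (several MB per
// instance), so callers are expected to tile their particle lists whenever
// ni or nj would exceed these limits.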
25 private:
26 #ifdef USE__AVX512
27  float xibuf [NIMAX/16] [4][16]; // x, y, z, r_search
28  float accpbuf[NIMAX/16] [5][16]; // ax, ay, az, pot, nngb
29 #else
30  float xibuf [NIMAX/8] [4][8]; // x, y, z, r_search
31  float accpbuf[NIMAX/8] [5][8]; // ax, ay, az, pot, nngb
32 #endif
33  float epjbuf [NJMAX] [4]; // x, y, z, m
34  float rsearchj[NJMAX]; // r_search_j
35  float spjbuf [NJMAX] [3][4]; // x, y, z, m | 3qxx-tr, 3qyy-tr, 3qzz-tr, m | 3qxy, 3qyz, 3qzx, -eps2*tr
36 
37  double eps2;
38  static double get_a_NaN(){
39  union{ long l; double d; } m;
40  m.l = -1;
41  return m.d;
42  }
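// get_a_NaN() relies on the IEEE-754 fact that an all-ones 64-bit pattern
// decodes to a quiet NaN, and assumes `long` is 64 bits (true on the LP64
// x86-64 targets this header addresses). The constructor seeds eps2 with it
// so that a forgotten set_eps2() poisons every result instead of failing
// silently. A sketch of the same idea with fixed-width types (hypothetical
// helper, not part of this class):
//
//   static double get_a_NaN_fixed(){       // needs <cstdint> and <cstring>
//       uint64_t bits = ~UINT64_C(0);      // all bits set
//       double d;
//       std::memcpy(&d, &bits, sizeof d);  // well-defined type pun
//       return d;
//   }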
43  double r_crit2;
44 // double r_out;
45 // double r_in;
46 public:
47  PhantomGrapeQuad() : eps2(get_a_NaN()) {
48  /*
49  xibuf = new float**[NIMAX/8];
50  accpbuf = new float**[NIMAX/8];
51  for(int i=0; i<NIMAX/8; i++){
52  xibuf[i] = new float*[3];
53  for(int j=0; j<3; j++){
54  xibuf[i][j] = new float[8];
55  }
56  accpbuf[i] = new float*[5];
57  for(int j=0; j<5; j++){
58  accpbuf[i][j] = new float[8];
59  }
60  }
61  epjbuf = new float*[NJMAX];
62  spjbuf = new float**[NJMAX];
63  for(int i=0; i<NJMAX; i++){
64  epjbuf[i] = new float[4];
65  spjbuf[i] = new float*[3];
66  for(int j=0; j<3; j++){
67  spjbuf[i][j] = new float[4];
68  }
69  }
70  for(int i=0; i<NIMAX/8; i++){
71  for(int j=0; j<3; j++){
72  for(int k=0; k<8; k++){
73  xibuf[i][j][k] = 0.0;
74  }
75  }
76  for(int j=0; j<5; j++){
77  for(int k=0; k<8; k++){
78  accpbuf[i][j][k] = 0.0;
79  }
80  }
81  }
82  for(int i=0; i<NJMAX; i++){
83  for(int j=0; j<4; j++){
84  epjbuf[i][j] = 0.0;
85  }
86  for(int j=0; j<3; j++){
87  for(int k=0; k<4; k++){
88  spjbuf[i][j][k] = 0.0;
89  }
90  }
91  }
92  */
93  } // default NaN
94 
95  //~PhantomGrapeQuad(){
96  /*
97  std::cerr<<"destructor"<<std::endl;
98  for(int i=0; i<NIMAX/8; i++){
99  for(int j=0; j<3; j++){
100  delete [] xibuf[i][j];
101  }
102  for(int j=0; j<5; j++){
103  delete [] accpbuf[i][j];
104  }
105  delete [] xibuf[i];
106  delete [] accpbuf[i];
107  }
108  delete [] xibuf;
109  delete [] accpbuf;
110  for(int i=0; i<NJMAX; i++){
111  delete [] epjbuf[i];
112  for(int j=0; j<3; j++){
113  delete [] spjbuf[i][j];
114  }
115  delete [] spjbuf[i];
116  }
117  delete [] epjbuf;
118  delete [] spjbuf;
119  */
120  //}
121 
122 // void set_cutoff(const double _r_out, const double _r_in){
123 // r_out = _r_out;
124 // r_in = _r_in;
125 // //denominator = 1.0 / (r_out - r_in);
126 // }
127 
128  void set_eps2(const double _eps2){
129  this->eps2 = _eps2;
130  }
131  void set_r_crit2(const double _r_crit2){
132  this->r_crit2 = _r_crit2;
133  //this->r_crit2 = _r_crit2 * 1.01;
134  }
135  void set_epj_one(const int addr, const double x, const double y, const double z,
136  const double m, const double r_search) {
137  epjbuf[addr][0] = x;
138  epjbuf[addr][1] = y;
139  epjbuf[addr][2] = z;
140  epjbuf[addr][3] = m;
141  rsearchj[addr] = r_search;
142  }
143 
144  void set_spj_one(const int addr,
145  const double x, const double y, const double z, const double m,
146  const double qxx, const double qyy, const double qzz,
147  const double qxy, const double qyz, const double qzx){
148  const double tr = qxx + qyy + qzz;
149  spjbuf[addr][0][0] = x;
150  spjbuf[addr][0][1] = y;
151  spjbuf[addr][0][2] = z;
152  spjbuf[addr][0][3] = m;
153 
154  spjbuf[addr][1][0] = 3.0 * qxx - tr;
155  spjbuf[addr][1][1] = 3.0 * qyy - tr;
156  spjbuf[addr][1][2] = 3.0 * qzz - tr;
157  spjbuf[addr][1][3] = m;
158 
159  spjbuf[addr][2][0] = 3.0 * qxy;
160  spjbuf[addr][2][1] = 3.0 * qyz;
161  spjbuf[addr][2][2] = 3.0 * qzx;
162  spjbuf[addr][2][3] = -(eps2 * tr);
163  }
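// set_spj_one() stores the quadrupole in the fmadd-friendly form consumed by
// kernel_spj_nounroll below: with tr = qxx+qyy+qzz it keeps S = 3*Q - tr*I
// (diagonal 3*q_ii - tr, off-diagonals 3*q_ij) plus -eps2*tr in the last
// slot, so that rqr = d^T S d - eps2*tr falls out of three fmadds per axis.
// In the usual second-moment convention, a point source of mass m at offset
// (sx,sy,sz) from the expansion center contributes qxx = m*sx*sx,
// qxy = m*sx*sy, and so on.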
164 
165  void set_xi_one(const int addr, const double x, const double y, const double z, const double r_search){
166 #ifdef USE__AVX512
167  const int ah = addr / 16;
168  const int al = addr % 16;
169 #else
170  const int ah = addr / 8;
171  const int al = addr % 8;
172 #endif
173  xibuf[ah][0][al] = x;
174  xibuf[ah][1][al] = y;
175  xibuf[ah][2][al] = z;
176  xibuf[ah][3][al] = r_search;
177  }
178  template <typename real_t>
179  void accum_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az,
180  real_t &pot){
181 #ifdef USE__AVX512
182  const int ah = addr / 16;
183  const int al = addr % 16;
184 #else
185  const int ah = addr / 8;
186  const int al = addr % 8;
187 #endif
188  ax += accpbuf[ah][0][al];
189  ay += accpbuf[ah][1][al];
190  az += accpbuf[ah][2][al];
191  pot += accpbuf[ah][3][al];
192  }
193  template <typename real_t>
194  void accum_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az,
195  real_t &pot, real_t &nngb){
196 #ifdef USE__AVX512
197  const int ah = addr / 16;
198  const int al = addr % 16;
199 #else
200  const int ah = addr / 8;
201  const int al = addr % 8;
202 #endif
203  ax += accpbuf[ah][0][al];
204  ay += accpbuf[ah][1][al];
205  az += accpbuf[ah][2][al];
206  pot += accpbuf[ah][3][al];
207  nngb += accpbuf[ah][4][al];
208  }
209  template <typename real_t>
210  void accum_accp_one(const int addr, real_t &nngb){
211 #ifdef USE__AVX512
212  const int ah = addr / 16;
213  const int al = addr % 16;
214 #else
215  const int ah = addr / 8;
216  const int al = addr % 8;
217 #endif
218  nngb += accpbuf[ah][4][al];
219  }
220  template <typename real_t>
221  void get_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot,
222  real_t &nngb){
223 #ifdef USE__AVX512
224  const int ah = addr / 16;
225  const int al = addr % 16;
226 #else
227  const int ah = addr / 8;
228  const int al = addr % 8;
229 #endif
230  ax = accpbuf[ah][0][al];
231  ay = accpbuf[ah][1][al];
232  az = accpbuf[ah][2][al];
233  pot = accpbuf[ah][3][al];
234  nngb = accpbuf[ah][4][al];
235  }
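// Minimal usage sketch (illustrative only -- the driver code in PeTar's
// force interfaces differs, and eps, r_out, ni, nj and the input arrays are
// hypothetical):
//
//   PhantomGrapeQuad pg;
//   pg.set_eps2(eps*eps);
//   pg.set_r_crit2(r_out*r_out);
//   for(int i=0; i<ni; i++) pg.set_xi_one (i, x[i],  y[i],  z[i],  rs[i]);
//   for(int j=0; j<nj; j++) pg.set_epj_one(j, xj[j], yj[j], zj[j], m[j], rsj[j]);
//   pg.run_epj_for_p3t_with_linear_cutoff(ni, nj);
//   double ax, ay, az, pot, nngb;
//   pg.get_accp_one(0, ax, ay, az, pot, nngb); // results for i-particle 0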
236 
237  void run_epj_for_p3t_with_linear_cutoff(const int ni, const int nj){
238  if(ni > NIMAX || nj > NJMAX){
239  std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
240 #ifdef USE__AVX512
241  for(PS::S32 i=0; i<(ni-1)/16+1; i++){
242  for(PS::S32 j=0; j<16; j++){
243  for(PS::S32 k=0; k<3; k++){
244  std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
245  }
246  std::cout<<std::endl;
247  }
248  }
249 #else
250  for(PS::S32 i=0; i<(ni-1)/8+1; i++){
251  for(PS::S32 j=0; j<8; j++){
252  for(PS::S32 k=0; k<3; k++){
253  std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
254  }
255  std::cout<<std::endl;
256  }
257  }
258 #endif
259  }
260  assert(ni <= NIMAX);
261  assert(nj <= NJMAX);
262  kernel_epj_nounroll_for_p3t_with_linear_cutoff(ni, nj);
263  }
264 
265  void run_epj_for_neighbor_count(const int ni, const int nj){
266  if(ni > NIMAX || nj > NJMAX){
267  std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
268 #ifdef USE__AVX512
269  for(PS::S32 i=0; i<(ni-1)/16+1; i++){
270  for(PS::S32 j=0; j<16; j++){
271  for(PS::S32 k=0; k<3; k++){
272  std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
273  }
274  std::cout<<std::endl;
275  }
276  }
277 #else
278  for(PS::S32 i=0; i<(ni-1)/8+1; i++){
279  for(PS::S32 j=0; j<8; j++){
280  for(PS::S32 k=0; k<3; k++){
281  std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
282  }
283  std::cout<<std::endl;
284  }
285  }
286 #endif
287  }
288  assert(ni <= NIMAX);
289  assert(nj <= NJMAX);
290  kernel_epj_nounroll_for_neighbor_count(ni, nj);
291  }
292 
293  void run_epj(const int ni, const int nj){
294  if(ni > NIMAX || nj > NJMAX){
295  std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
296 #ifdef USE__AVX512
297  for(PS::S32 i=0; i<(ni-1)/16+1; i++){
298  for(PS::S32 j=0; j<16; j++){
299  for(PS::S32 k=0; k<3; k++){
300  std::cout<<"i,j,k="<<i<<" "<<j<<" "<<k<<std::endl;
301  std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
302  }
303  std::cout<<std::endl;
304  }
305  }
306 #else
307  for(PS::S32 i=0; i<(ni-1)/8+1; i++){
308  for(PS::S32 j=0; j<8; j++){
309  for(PS::S32 k=0; k<3; k++){
310  std::cout<<"i,j,k="<<i<<" "<<j<<" "<<k<<std::endl;
311  std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
312  }
313  std::cout<<std::endl;
314  }
315  }
316 #endif
317  }
318  assert(ni <= NIMAX);
319  assert(nj <= NJMAX);
320 
321  kernel_epj_nounroll(ni, nj);
322  }
323 
324  void run_spj(const int ni, const int nj){
325  if(ni > NIMAX || nj > NJMAX){
326  std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
327  }
328  assert(ni <= NIMAX);
329  assert(nj <= NJMAX);
330  kernel_spj_nounroll(ni, nj);
331  // kernel_spj_unroll2(ni, nj);
332  }
333  /*
334  void run_spj_d(const int ni, const int nj){
335  if(ni > NIMAX || nj > NJMAX){
336  std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
337  }
338  assert(ni <= NIMAX);
339  assert(nj <= NJMAX);
340  kernel_spj_64bit_nounroll(ni, nj);
341  // kernel_spj_unroll2(ni, nj);
342  }
343  */
344 
345 
346 
347 private:
348 #ifdef USE__AVX512
349 // typedef float v4sf __attribute__((vector_size(16)));
350 // typedef float v8sf __attribute__((vector_size(32)));
351 // typedef float v16sf __attribute__((vector_size(64)));
352 // typedef double v8df __attribute__((vector_size(64)));
353  typedef __m128 v4sf;
354  typedef __m256 v8sf;
355  typedef __m512 v16sf;
356  typedef __m512d v8df;
357 
358 
359  __attribute__ ((noinline))
360  void kernel_epj_nounroll_for_neighbor_count(const int ni, const int nj){
361  for(int i=0; i<ni; i+=16){
362  const v16sf xi = *(v16sf *)(xibuf[i/16][0]);
363  const v16sf yi = *(v16sf *)(xibuf[i/16][1]);
364  const v16sf zi = *(v16sf *)(xibuf[i/16][2]);
365  const v16sf rsi= *(v16sf *)(xibuf[i/16][3]);
366 
367  v16sf nngb = _mm512_set1_ps(0.0f);
368 
369 #ifdef AVX_PRELOAD
370  v16sf rsjbuf= _mm512_set1_ps(*rsearchj);
371  v16sf jbuf = _mm512_broadcast_f32x4(*(v4sf *)epjbuf);
372 
373  v16sf xj = _mm512_shuffle_ps(jbuf, jbuf, 0x00);
374  v16sf yj = _mm512_shuffle_ps(jbuf, jbuf, 0x55);
375  v16sf zj = _mm512_shuffle_ps(jbuf, jbuf, 0xaa);
376 
377 
378  v16sf rsj = rsjbuf;
379 #endif
380  for(int j=0; j<nj; j++) {
381 #ifdef AVX_PRELOAD
382  rsjbuf = _mm512_set1_ps(rsearchj[j+1]);
383  jbuf = _mm512_broadcast_f32x4(*(v4sf *)(epjbuf + j+1));
384  //jbuf = _mm512_broadcast_f32x4(*(__m128 *)(&epjbuf[j+1][0]));
385  v16sf dx = _mm512_sub_ps(xi, xj);
386  v16sf dy = _mm512_sub_ps(yi, yj);
387  v16sf dz = _mm512_sub_ps(zi, zj);
388 #else
389  v16sf dx = _mm512_sub_ps(xi, _mm512_set1_ps(epjbuf[j][0]));
390  v16sf dy = _mm512_sub_ps(yi, _mm512_set1_ps(epjbuf[j][1]));
391  v16sf dz = _mm512_sub_ps(zi, _mm512_set1_ps(epjbuf[j][2]));
392 #endif
393  v16sf r2_real = _mm512_mul_ps(dx, dx);
394  r2_real = _mm512_fmadd_ps(dy, dy, r2_real);
395  r2_real = _mm512_fmadd_ps(dz, dz, r2_real);
396 
397 #ifdef AVX_PRELOAD
398  xj = _mm512_shuffle_ps(jbuf, jbuf, 0x00);
399  yj = _mm512_shuffle_ps(jbuf, jbuf, 0x55);
400  zj = _mm512_shuffle_ps(jbuf, jbuf, 0xaa);
401 #endif
402 
403 #ifdef AVX_PRELOAD
404  v16sf vrcrit = _mm512_max_ps(rsi, rsj);
405 #else
406  v16sf vrcrit = _mm512_max_ps(rsi, _mm512_set1_ps(rsearchj[j]));
407 #endif
408  v16sf vrcrit2 = _mm512_mul_ps(vrcrit, vrcrit);
409  nngb = _mm512_add_ps(_mm512_mask_blend_ps(
410  _mm512_cmp_ps_mask(vrcrit2, r2_real, 0x01),
411  _mm512_set1_ps(1.0f),
412  _mm512_set1_ps(0.0f)),
413  nngb);
414 #ifdef AVX_PRELOAD
415  rsj = rsjbuf;
416 #endif
417  }
418  *(v16sf *)(accpbuf[i/16][4]) = nngb;
419  }
420  }
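 // AVX-512 compares return a k-mask rather than a full-width vector, so the
 // neighbour tally above selects per lane with _mm512_mask_blend_ps: mask
 // bits are set where vrcrit2 < r2_real (not a neighbour), picking 0.0f, and
 // clear otherwise, picking 1.0f -- the same one-per-neighbour count as the
 // xor/and idiom in the AVX2 kernels further down.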
421 
422  __attribute__ ((noinline))
423  void kernel_epj_nounroll_for_p3t_with_linear_cutoff(const int ni, const int nj){
424  const v16sf veps2 = _mm512_set1_ps((float)eps2);
425 // const v16sf vr_out = _mm512_set1_ps((float)r_out);
426 // const v16sf vr_out2 = _mm512_mul_ps(vr_out, vr_out);
427  const v16sf vr_out2 = _mm512_set1_ps((float)r_crit2);
428 
429  for(int i=0; i<ni; i+=16){
430  const v16sf xi = *(v16sf *)(xibuf[i/16][0]);
431  const v16sf yi = *(v16sf *)(xibuf[i/16][1]);
432  const v16sf zi = *(v16sf *)(xibuf[i/16][2]);
433  const v16sf rsi= *(v16sf *)(xibuf[i/16][3]);
434 
435  v16sf ax, ay, az, pot, nngb;
436  ax = ay = az = pot = nngb = _mm512_set1_ps(0.0f);
437 #ifdef AVX_PRELOAD
438  v16sf rsjbuf= _mm512_set1_ps(*rsearchj);
439  v16sf jbuf = _mm512_broadcast_f32x4(*(v4sf *)epjbuf);
440  //v16sf jbuf = _mm512_broadcast_f32x4((__m128 *)&epjbuf[0][0]);
441  v16sf xj = _mm512_shuffle_ps(jbuf, jbuf, 0x00);
442  v16sf yj = _mm512_shuffle_ps(jbuf, jbuf, 0x55);
443  v16sf zj = _mm512_shuffle_ps(jbuf, jbuf, 0xaa);
444  v16sf mj = _mm512_shuffle_ps(jbuf, jbuf, 0xff);
445 
446  v16sf rsj = rsjbuf;
447 #endif
448  for(int j=0; j<nj; j++) {
449 #ifdef AVX_PRELOAD
450  rsjbuf = _mm512_set1_ps(rsearchj[j+1]);
451  jbuf = _mm512_broadcast_f32x4(*(v4sf *)(epjbuf + j+1));
452  //jbuf = _mm512_broadcast_f32x4(*(__m128 *)(&epjbuf[j+1][0]));
453  v16sf dx = _mm512_sub_ps(xi, xj);
454  v16sf dy = _mm512_sub_ps(yi, yj);
455  v16sf dz = _mm512_sub_ps(zi, zj);
456 #else
457  v16sf dx = _mm512_sub_ps(xi, _mm512_set1_ps(epjbuf[j][0]));
458  v16sf dy = _mm512_sub_ps(yi, _mm512_set1_ps(epjbuf[j][1]));
459  v16sf dz = _mm512_sub_ps(zi, _mm512_set1_ps(epjbuf[j][2]));
460 #endif
461  v16sf r2_real = _mm512_fmadd_ps(dx, dx, veps2);
462  r2_real = _mm512_fmadd_ps(dy, dy, r2_real);
463  r2_real = _mm512_fmadd_ps(dz, dz, r2_real);
464  v16sf r2 = _mm512_max_ps(r2_real, vr_out2);
465  v16sf ri1 = _mm512_rsqrt14_ps(r2);
466 
467  v16sf ri2 = _mm512_mul_ps(ri1, ri1);
468 #ifdef RSQRT_NR_EPJ_X4
469  // x4
470  v16sf v1 = _mm512_set1_ps(1.0f);
471  v16sf h = _mm512_fnmadd_ps(r2, ri2, v1);
472 
473  v16sf v6p0 = _mm512_set1_ps(6.0f);
474  v16sf v5p0 = _mm512_set1_ps(5.0f);
475  v16sf v8p0 = _mm512_set1_ps(8.0f);
476  v16sf v0p0625 = _mm512_set1_ps((float)1.0/16.0);
477 
478  ri2 = _mm512_fmadd_ps(h, v5p0, v6p0);
479  ri2 = _mm512_fmadd_ps(h, ri2, v8p0);
480  ri2 = _mm512_mul_ps(h, ri2);
481  ri2 = _mm512_fmadd_ps(ri2, v0p0625, v1);
482  ri1 = _mm512_mul_ps(ri2, ri1);
483 
484  //v16sf h = vone - r2*(ri1*ri1);
485  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
486  ri2 = _mm512_mul_ps(ri1, ri1);
487 
488 #elif defined(RSQRT_NR_EPJ_X2)
489  ri2 = _mm512_fnmadd_ps(r2, ri2, _mm512_set1_ps(3.0f));
490  ri2 = _mm512_mul_ps(ri2, _mm512_set1_ps(0.5f));
491  ri1 = _mm512_mul_ps(ri2, ri1);
492  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
493 
494  ri2 = _mm512_mul_ps(ri1, ri1);
495 #endif
496 
497 #ifdef AVX_PRELOAD
498  v16sf mri1 = _mm512_mul_ps(ri1, mj);
499 #else
500  v16sf mri1 = _mm512_mul_ps(ri1, _mm512_set1_ps(epjbuf[j][3]));
501 #endif
502  v16sf mri3 = _mm512_mul_ps(mri1, ri2);
503 #ifdef AVX_PRELOAD
504  xj = _mm512_shuffle_ps(jbuf, jbuf, 0x00);
505  yj = _mm512_shuffle_ps(jbuf, jbuf, 0x55);
506  zj = _mm512_shuffle_ps(jbuf, jbuf, 0xaa);
507  mj = _mm512_shuffle_ps(jbuf, jbuf, 0xff);
508 #endif
509  //pot -= _mm512_and_ps(mri1, _mm512_cmp_ps(r2_real, veps2, 0x04));
510  pot = _mm512_sub_ps(pot, mri1);
511  ax = _mm512_fnmadd_ps(mri3, dx, ax);
512  ay = _mm512_fnmadd_ps(mri3, dy, ay);
513  az = _mm512_fnmadd_ps(mri3, dz, az);
514 
515 #ifdef AVX_PRELOAD
516  v16sf vrcrit = _mm512_max_ps(rsi, rsj);
517 #else
518  v16sf vrcrit = _mm512_max_ps(rsi, _mm512_set1_ps(rsearchj[j]));
519 #endif
520  v16sf vrcrit2 = _mm512_mul_ps(vrcrit, vrcrit);
521  nngb = _mm512_add_ps(_mm512_mask_blend_ps(
522  _mm512_cmp_ps_mask(vrcrit2, r2_real, 0x01),
523  _mm512_set1_ps(1.0f),
524  _mm512_set1_ps(0.0f)),
525  nngb);
526 #ifdef AVX_PRELOAD
527  rsj = rsjbuf;
528 #endif
529  }
530  *(v16sf *)(accpbuf[i/16][0]) = ax;
531  *(v16sf *)(accpbuf[i/16][1]) = ay;
532  *(v16sf *)(accpbuf[i/16][2]) = az;
533  *(v16sf *)(accpbuf[i/16][3]) = pot;
534  *(v16sf *)(accpbuf[i/16][4]) = nngb;
535  /*
536  *(v16sf *)(&accpbuf[i/16][0][0]) = ax;
537  *(v16sf *)(&accpbuf[i/16][1][0]) = ay;
538  *(v16sf *)(&accpbuf[i/16][2][0]) = az;
539  *(v16sf *)(&accpbuf[i/16][3][0]) = pot;
540  *(v16sf *)(&accpbuf[i/16][4][0]) = nngb;
541  */
542  }
543  }
544 #else
545  typedef __m128 v4sf;
546  typedef __m256 v8sf;
547  typedef __m256d v4df;
548 
549  __attribute__ ((noinline))
550  void kernel_epj_nounroll_for_neighbor_count(const int ni, const int nj){
551  const v8sf vone = _mm256_set1_ps(1.0f);
552  const v8sf allbits = _mm256_cmp_ps(vone, vone, 0x00);
553 
554  for(int i=0; i<ni; i+=8){
555  const v8sf xi = *(v8sf *)(xibuf[i/8][0]);
556  const v8sf yi = *(v8sf *)(xibuf[i/8][1]);
557  const v8sf zi = *(v8sf *)(xibuf[i/8][2]);
558  const v8sf rsi = *(v8sf *)(xibuf[i/8][3]);
559 
560  v8sf nngb = _mm256_set1_ps(0.0f);
561 
562  v8sf jbuf = _mm256_broadcast_ps((v4sf *)epjbuf);
563  v8sf rsjbuf= _mm256_broadcast_ss(rsearchj);
564 
565  v8sf xj = _mm256_shuffle_ps(jbuf, jbuf, 0x00);
566  v8sf yj = _mm256_shuffle_ps(jbuf, jbuf, 0x55);
567  v8sf zj = _mm256_shuffle_ps(jbuf, jbuf, 0xaa);
568 
569  v8sf rsj= rsjbuf;
570  for(int j=0; j<nj; j++){
571  jbuf = _mm256_broadcast_ps((v4sf *)(epjbuf + j+1));
572  rsjbuf = _mm256_broadcast_ss(rsearchj+j+1);
573 
574  v8sf dx = _mm256_sub_ps(xi, xj);
575  v8sf dy = _mm256_sub_ps(yi, yj);
576  v8sf dz = _mm256_sub_ps(zi, zj);
577 
578  v8sf r2_real = _mm256_mul_ps(dx, dx);
579  r2_real = _mm256_fmadd_ps(dy, dy, r2_real);
580  r2_real = _mm256_fmadd_ps(dz, dz, r2_real);
581 
582  xj = _mm256_shuffle_ps(jbuf, jbuf, 0x00);
583  yj = _mm256_shuffle_ps(jbuf, jbuf, 0x55);
584  zj = _mm256_shuffle_ps(jbuf, jbuf, 0xaa);
585 
586  v8sf vrcrit = _mm256_max_ps(rsi, rsj);
587  v8sf vrcrit2 = _mm256_mul_ps(vrcrit, vrcrit);
588  v8sf mask = _mm256_cmp_ps(vrcrit2, r2_real, 0x01); // for neighbour search
589  rsj= rsjbuf;
590  nngb = _mm256_add_ps(_mm256_and_ps( vone, _mm256_xor_ps(mask, allbits) ), nngb); // can remove
591  }
592  *(v8sf *)(accpbuf[i/8][4]) = nngb;
593  }
594  }
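 // The mask arithmetic above, spelled out: _mm256_cmp_ps with predicate 0x01
 // (_CMP_LT_OS) yields an all-ones lane where vrcrit2 < r2_real, the xor
 // with allbits inverts that to "r2_real <= vrcrit2", and and-ing with vone
 // turns each surviving lane into 1.0f. Scalar equivalent, per lane:
 //
 //   if(!(vrcrit2 < r2_real)) nngb += 1.0f;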
595 
596  __attribute__ ((noinline))
597  void kernel_epj_nounroll_for_p3t_with_linear_cutoff(const int ni, const int nj){
598  const v8sf vone = _mm256_set1_ps(1.0f);
599  const v8sf veps2 = _mm256_set1_ps((float)eps2);
600  //const v8sf vr_out = _mm256_set1_ps((float)r_out);
601  //const v8sf vr_out2 = _mm256_mul_ps(vr_out, vr_out);
602  const v8sf vr_out2 = _mm256_set1_ps((float)r_crit2);
603  //std::cerr<<"r_crit2 "<<*(float*)&vr_out2<<std::endl;
604  //std::cerr<<"veps2 "<<*(float*)&veps2<<std::endl;
605  const v8sf allbits = _mm256_cmp_ps(vone, vone, 0x00);
606  for(int i=0; i<ni; i+=8){
607  //const v8sf xi = *(v8sf *)(&xibuf[i/8][0][0]);
608  const v8sf xi = *(v8sf *)(xibuf[i/8][0]);
609  /*
610  const v8sf xi = {xibuf[i/8][0][0], xibuf[i/8][0][1], xibuf[i/8][0][2], xibuf[i/8][0][3],
611  xibuf[i/8][0][4], xibuf[i/8][0][5], xibuf[i/8][0][6], xibuf[i/8][0][7]};
612  */
613  //const v8sf yi = *(v8sf *)(&xibuf[i/8][1][0]);
614  const v8sf yi = *(v8sf *)(xibuf[i/8][1]);
615  /*
616  const v8sf yi = {xibuf[i/8][1][0], xibuf[i/8][1][1], xibuf[i/8][1][2], xibuf[i/8][1][3],
617  xibuf[i/8][1][4], xibuf[i/8][1][5], xibuf[i/8][1][6], xibuf[i/8][1][7]};
618  */
619  //const v8sf zi = *(v8sf *)(&xibuf[i/8][2][0]);
620  const v8sf zi = *(v8sf *)(xibuf[i/8][2]);
621  /*
622  const v8sf zi = {xibuf[i/8][2][0], xibuf[i/8][2][1], xibuf[i/8][2][2], xibuf[i/8][2][3],
623  xibuf[i/8][2][4], xibuf[i/8][2][5], xibuf[i/8][2][6], xibuf[i/8][2][7]};
624  */
625  const v8sf rsi = *(v8sf *)(xibuf[i/8][3]);
626 
627  v8sf ax, ay, az, pot, nngb;
628  ax = ay = az = pot = nngb = _mm256_set1_ps(0.0f);
629  v8sf jbuf = _mm256_broadcast_ps((v4sf *)epjbuf);
630  //v8sf jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)epjbuf);
631  v8sf rsjbuf= _mm256_broadcast_ss(rsearchj);
632  //v8sf jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&epjbuf[0][0]);
633  v8sf xj = _mm256_shuffle_ps(jbuf, jbuf, 0x00);
634  v8sf yj = _mm256_shuffle_ps(jbuf, jbuf, 0x55);
635  v8sf zj = _mm256_shuffle_ps(jbuf, jbuf, 0xaa);
636  v8sf mj = _mm256_shuffle_ps(jbuf, jbuf, 0xff);
637  v8sf rsj= rsjbuf;
638  for(int j=0; j<nj; j++){
639  jbuf = _mm256_broadcast_ps((v4sf *)(epjbuf + j+1));
640  rsjbuf = _mm256_broadcast_ss(rsearchj+j+1);
641  //jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)(&epjbuf[j+1][0]));
642  v8sf dx = _mm256_sub_ps(xi, xj);
643  v8sf dy = _mm256_sub_ps(yi, yj);
644  v8sf dz = _mm256_sub_ps(zi, zj);
645 
646  v8sf r2_real = _mm256_fmadd_ps(dx, dx, veps2);
647  r2_real = _mm256_fmadd_ps(dy, dy, r2_real);
648  r2_real = _mm256_fmadd_ps(dz, dz, r2_real);
649  v8sf r2 = _mm256_max_ps(r2_real, vr_out2);
650  v8sf ri1 = _mm256_rsqrt_ps(r2);
651  //std::cerr<<"r2 "<<*(float*)&r2<<std::endl;
652  //std::cerr<<"ri1 "<<*(float*)&ri1<<std::endl;
653 
654  v8sf ri2 = _mm256_mul_ps(ri1, ri1);
655 #ifdef RSQRT_NR_EPJ_X4
656  // x4
657  v8sf v1 = _mm256_set1_ps(1.0f);
658  v8sf h = _mm256_fnmadd_ps(r2, ri2, v1);
659 
660  v8sf v6p0 = _mm256_set1_ps(6.0f);
661  v8sf v5p0 = _mm256_set1_ps(5.0f);
662  v8sf v8p0 = _mm256_set1_ps(8.0f);
663  v8sf v0p0625 = _mm256_set1_ps((float)1.0/16.0);
664 
665  ri2 = _mm256_fmadd_ps(h, v5p0, v6p0);
666  ri2 = _mm256_fmadd_ps(h, ri2, v8p0);
667  ri2 = _mm256_mul_ps(h, ri2);
668  ri2 = _mm256_fmadd_ps(ri2, v0p0625, v1);
669  ri1 = _mm256_mul_ps(ri2, ri1);
670 
671  ri2 = _mm256_mul_ps(ri1, ri1);
672 #elif defined(RSQRT_NR_EPJ_X2)
673  ri2 = _mm256_fnmadd_ps(r2, ri2, _mm256_set1_ps(3.0f));
674  ri2 = _mm256_mul_ps(ri2, _mm256_set1_ps(0.5f));
675  ri1 = _mm256_mul_ps(ri2, ri1);
676  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
677  ri2 = _mm256_mul_ps(ri1, ri1);
678  //v8sf v3p0 = {3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f};
679  //v8sf v0p5 = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
680  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
681 #endif
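 // The RSQRT_NR_* branches above refine the ~12-bit rsqrt estimate by
 // Newton-Raphson: one X2 step is x <- x*(3 - r2*x*x)/2, which roughly
 // doubles the number of correct bits, while the X4 branch applies the
 // higher-order update h = 1 - r2*x*x; x <- x*(1 + h*(8 + h*(6 + 5*h))/16).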
682  //std::cerr<<"ri1 "<<*(float*)&ri1<<std::endl;
683  //std::cerr<<"ri2 "<<*(float*)&ri2<<std::endl;
684 
685  v8sf mri1 = _mm256_mul_ps(mj, ri1);
686  v8sf mri3 = _mm256_mul_ps(mri1, ri2);
687 
688  //std::cerr<<"mj "<<*(float*)&mj<<std::endl;
689  //std::cerr<<"mri1 "<<*(float*)&mri1<<std::endl;
690  //std::cerr<<"mri3 "<<*(float*)&mri3<<std::endl;
691 
692  xj = _mm256_shuffle_ps(jbuf, jbuf, 0x00);
693  yj = _mm256_shuffle_ps(jbuf, jbuf, 0x55);
694  zj = _mm256_shuffle_ps(jbuf, jbuf, 0xaa);
695  mj = _mm256_shuffle_ps(jbuf, jbuf, 0xff);
696  //pot -= _mm256_and_ps(mri1, _mm256_cmp_ps(r2_real, veps2, 0x04));
697 
698  pot = _mm256_sub_ps(pot, mri1);
699  ax = _mm256_fnmadd_ps(mri3, dx, ax);
700  ay = _mm256_fnmadd_ps(mri3, dy, ay);
701  az = _mm256_fnmadd_ps(mri3, dz, az);
702 
703  //std::cerr<<"pot "<<*(float*)&pot<<std::endl;
704  //std::cerr<<"dx "<<*(float*)&dx<<std::endl;
705  //std::cerr<<"ax "<<*(float*)&ax<<std::endl;
706 
707  v8sf vrcrit = _mm256_max_ps(rsi, rsj);
708  v8sf vrcrit2 = _mm256_mul_ps(vrcrit, vrcrit);
709  v8sf mask = _mm256_cmp_ps(vrcrit2, r2_real, 0x01); // for neighbour search
710  rsj= rsjbuf;
711  nngb = _mm256_add_ps(_mm256_and_ps( vone, _mm256_xor_ps(mask, allbits) ), nngb); // can remove
712  }
713  *(v8sf *)(accpbuf[i/8][0]) = ax;
714  *(v8sf *)(accpbuf[i/8][1]) = ay;
715  *(v8sf *)(accpbuf[i/8][2]) = az;
716  *(v8sf *)(accpbuf[i/8][3]) = pot;
717  *(v8sf *)(accpbuf[i/8][4]) = nngb;
718  /*
719  *(v8sf *)(&accpbuf[i/8][0][0]) = ax;
720  *(v8sf *)(&accpbuf[i/8][1][0]) = ay;
721  *(v8sf *)(&accpbuf[i/8][2][0]) = az;
722  *(v8sf *)(&accpbuf[i/8][3][0]) = pot;
723  *(v8sf *)(&accpbuf[i/8][4][0]) = nngb;
724  */
725  }
726  }
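 // Why r2 = max(r2_real, vr_out2) above: in the P3T scheme the tree force is
 // saturated at the changeover radius, i.e. any pair closer than the cutoff
 // is evaluated as if it sat exactly at that radius, and such pairs are
 // flagged through the nngb count so their short-range part can be
 // integrated directly. Scalar sketch of one i-j interaction, for
 // illustration only:
 //
 //   float r2  = std::max(dx*dx + dy*dy + dz*dz + eps2, r_crit2);
 //   float ri  = 1.0f/std::sqrt(r2);
 //   pot -= m*ri;
 //   ax  -= m*ri*ri*ri*dx;   // and likewise for ay, az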
727 
728 #endif
729 
730 #ifdef USE__AVX512
731  __attribute__ ((noinline))
732  void kernel_epj_nounroll(const int ni, const int nj){
733  const v16sf veps2 = _mm512_set1_ps((float)eps2);
734  for(int i=0; i<ni; i+=16){
735  const v16sf xi = *(v16sf *)(xibuf[i/16][0]);
736  const v16sf yi = *(v16sf *)(xibuf[i/16][1]);
737  const v16sf zi = *(v16sf *)(xibuf[i/16][2]);
738 
739  v16sf ax, ay, az, pot;
740  ax = ay = az = pot = _mm512_set1_ps(0.0f);
741 
742 #ifdef AVX_PRELOAD
743  v16sf jbuf = _mm512_broadcast_f32x4(*(v4sf *)epjbuf);
744  v16sf xj = _mm512_shuffle_ps(jbuf, jbuf, 0x00);
745  v16sf yj = _mm512_shuffle_ps(jbuf, jbuf, 0x55);
746  v16sf zj = _mm512_shuffle_ps(jbuf, jbuf, 0xaa);
747  v16sf mj = _mm512_shuffle_ps(jbuf, jbuf, 0xff);
748 #endif
749  for(int j=0; j<nj; j++){
750 #ifdef AVX_PRELOAD
751  jbuf = _mm512_broadcast_f32x4(*(v4sf *)(epjbuf + j+1));
752 
753  v16sf dx = _mm512_sub_ps(xi, xj);
754  v16sf dy = _mm512_sub_ps(yi, yj);
755  v16sf dz = _mm512_sub_ps(zi, zj);
756 #else
757  v16sf dx = _mm512_sub_ps(xi, _mm512_set1_ps(epjbuf[j][0]));
758  v16sf dy = _mm512_sub_ps(yi, _mm512_set1_ps(epjbuf[j][1]));
759  v16sf dz = _mm512_sub_ps(zi, _mm512_set1_ps(epjbuf[j][2]));
760 #endif
761  v16sf r2 = _mm512_fmadd_ps(dx, dx, veps2);
762  r2 = _mm512_fmadd_ps(dy, dy, r2);
763  r2 = _mm512_fmadd_ps(dz, dz, r2);
764  v16sf ri1 = _mm512_rsqrt14_ps(r2);
765 
766  v16sf ri2 = _mm512_mul_ps(ri1, ri1);
767 #ifdef RSQRT_NR_EPJ_X2
768  ri2 = _mm512_fnmadd_ps(r2, ri2, _mm512_set1_ps(3.0f));
769  ri2 = _mm512_mul_ps(ri2, _mm512_set1_ps(0.5f));
770  ri1 = _mm512_mul_ps(ri2, ri1);
771  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
772  ri2 = _mm512_mul_ps(ri1, ri1);
773 #endif
774 
775 #ifdef AVX_PRELOAD
776  v16sf mri1 = _mm512_mul_ps(ri1, mj);
777 #else
778  v16sf mri1 = _mm512_mul_ps(ri1, _mm512_set1_ps(epjbuf[j][3]));
779 #endif
780  v16sf mri3 = _mm512_mul_ps(mri1, ri2);
781 #ifdef AVX_PRELOAD
782  xj = _mm512_shuffle_ps(jbuf, jbuf, 0x00);
783  yj = _mm512_shuffle_ps(jbuf, jbuf, 0x55);
784  zj = _mm512_shuffle_ps(jbuf, jbuf, 0xaa);
785  mj = _mm512_shuffle_ps(jbuf, jbuf, 0xff);
786 
787 #endif
788  pot = _mm512_sub_ps(pot, mri1);
789  ax = _mm512_fnmadd_ps(mri3, dx, ax);
790  ay = _mm512_fnmadd_ps(mri3, dy, ay);
791  az = _mm512_fnmadd_ps(mri3, dz, az);
792 
793  }
794  *(v16sf *)(accpbuf[i/16][0]) = ax;
795  *(v16sf *)(accpbuf[i/16][1]) = ay;
796  *(v16sf *)(accpbuf[i/16][2]) = az;
797  *(v16sf *)(accpbuf[i/16][3]) = pot;
798  }
799  }
800 
801 #else
802  __attribute__ ((noinline))
803  void kernel_epj_nounroll(const int ni, const int nj){
804  const v8sf veps2 = _mm256_set1_ps((float)eps2);
805  for(int i=0; i<ni; i+=8){
806  const v8sf xi = *(v8sf *)(xibuf[i/8][0]);
807  const v8sf yi = *(v8sf *)(xibuf[i/8][1]);
808  const v8sf zi = *(v8sf *)(xibuf[i/8][2]);
809 
810  v8sf ax, ay, az, pot;
811  ax = ay = az = pot = _mm256_set1_ps(0.0f);
812 
813  v8sf jbuf = _mm256_broadcast_ps((v4sf *)epjbuf);
814  v8sf xj = _mm256_shuffle_ps(jbuf, jbuf, 0x00);
815  v8sf yj = _mm256_shuffle_ps(jbuf, jbuf, 0x55);
816  v8sf zj = _mm256_shuffle_ps(jbuf, jbuf, 0xaa);
817  v8sf mj = _mm256_shuffle_ps(jbuf, jbuf, 0xff);
818 
819  for(int j=0; j<nj; j++){
820  jbuf = _mm256_broadcast_ps((v4sf *)(epjbuf + j+1));
821 
822  v8sf dx = _mm256_sub_ps(xi, xj);
823  v8sf dy = _mm256_sub_ps(yi, yj);
824  v8sf dz = _mm256_sub_ps(zi, zj);
825 
826  v8sf r2 = _mm256_fmadd_ps(dx, dx, veps2);
827  r2 = _mm256_fmadd_ps(dy, dy, r2);
828  r2 = _mm256_fmadd_ps(dz, dz, r2);
829  v8sf ri1 = _mm256_rsqrt_ps(r2);
830 
831  v8sf ri2 = _mm256_mul_ps(ri1, ri1);
832 #ifdef RSQRT_NR_EPJ_X2
833  ri2 = _mm256_fnmadd_ps(r2, ri2, _mm256_set1_ps(3.0f));
834  ri2 = _mm256_mul_ps(ri2, _mm256_set1_ps(0.5f));
835  ri1 = _mm256_mul_ps(ri2, ri1);
836  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
837  ri2 = _mm256_mul_ps(ri1, ri1);
838 #endif
839  v8sf mri1 = _mm256_mul_ps(mj, ri1);
840  v8sf mri3 = _mm256_mul_ps(mri1, ri2);
841 
842  xj = _mm256_shuffle_ps(jbuf, jbuf, 0x00);
843  yj = _mm256_shuffle_ps(jbuf, jbuf, 0x55);
844  zj = _mm256_shuffle_ps(jbuf, jbuf, 0xaa);
845  mj = _mm256_shuffle_ps(jbuf, jbuf, 0xff);
846 
847  pot = _mm256_sub_ps(pot, mri1);
848  ax = _mm256_fnmadd_ps(mri3, dx, ax);
849  ay = _mm256_fnmadd_ps(mri3, dy, ay);
850  az = _mm256_fnmadd_ps(mri3, dz, az);
851  }
852  *(v8sf *)(accpbuf[i/8][0]) = ax;
853  *(v8sf *)(accpbuf[i/8][1]) = ay;
854  *(v8sf *)(accpbuf[i/8][2]) = az;
855  *(v8sf *)(accpbuf[i/8][3]) = pot;
856  }
857  }
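 // j-particle streaming pattern used in these kernels: _mm256_broadcast_ps
 // copies one 16-byte record {x, y, z, m} into both 128-bit halves, and the
 // shuffles with 0x00 / 0x55 / 0xaa / 0xff then splat x, y, z, m across all
 // eight lanes. Record j+1 is broadcast before record j is consumed, hiding
 // load latency behind the arithmetic; note that this look-ahead touches one
 // record past the last particle, which stays inside the buffers only while
 // nj < NJMAX.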
858 #endif
859 
860 //#ifndef __FMA__
861 //#undef _mm256_fmadd_ps
862 //#undef _mm256_fnmadd_ps
863 //#undef _mm512_fmadd_ps
864 //#undef _mm512_fnmadd_ps
865 //#endif
866 
867 #ifdef USE__AVX512
868  __attribute__ ((noinline))
869  void kernel_spj_nounroll(const int ni, const int nj){
870  const v16sf veps2 = _mm512_set1_ps((float)eps2);
871  for(int i=0; i<ni; i+=16){
872  const v16sf xi = *(v16sf *)(xibuf[i/16][0]);
873  const v16sf yi = *(v16sf *)(xibuf[i/16][1]);
874  const v16sf zi = *(v16sf *)(xibuf[i/16][2]);
875 
876  v16sf ax, ay, az, pot;
877  ax = ay = az = pot = _mm512_set1_ps(0.0f);
878 
879 #ifdef AVX_PRELOAD
880  v16sf jbuf0 = _mm512_broadcast_f32x4(*(v4sf *)&spjbuf[0][0]);
881  v16sf jbuf1 = _mm512_broadcast_f32x4(*(v4sf *)&spjbuf[0][1]);
882  v16sf jbuf2 = _mm512_broadcast_f32x4(*(v4sf *)&spjbuf[0][2]);
883 //#else
884 // v16sf jbuf0, jbuf1, jbuf2;
885 #endif
886  for(int j=0; j<nj; j++){
887 //#ifndef AVX_PRELOAD
888 // jbuf0 = _mm512_broadcast_f32x4(*(v4sf *)&spjbuf[j+0][0]);
889 //#endif
890 #ifdef AVX_PRELOAD
891  v16sf xj = _mm512_shuffle_ps(jbuf0, jbuf0, 0x00);
892  v16sf yj = _mm512_shuffle_ps(jbuf0, jbuf0, 0x55);
893  v16sf zj = _mm512_shuffle_ps(jbuf0, jbuf0, 0xaa);
894 
895  jbuf0 = _mm512_broadcast_f32x4(*(v4sf *)&spjbuf[j+1][0]);
896 //#ifndef AVX_PRELOAD
897 // jbuf1 = _mm512_broadcast_f32x4(*(v4sf *)&spjbuf[j+0][1]);
898 //#endif
899  v16sf qxx = _mm512_shuffle_ps(jbuf1, jbuf1, 0x00);
900  v16sf qyy = _mm512_shuffle_ps(jbuf1, jbuf1, 0x55);
901  v16sf qzz = _mm512_shuffle_ps(jbuf1, jbuf1, 0xaa);
902  v16sf mj = _mm512_shuffle_ps(jbuf1, jbuf1, 0xff);
903 
904  jbuf1 = _mm512_broadcast_f32x4(*(v4sf *)&spjbuf[j+1][1]);
905 
906 //#ifndef AVX_PRELOAD
907 // jbuf2 = _mm512_broadcast_f32x4(*(v4sf *)&spjbuf[j+0][2]);
908 //#endif
909  v16sf qxy = _mm512_shuffle_ps(jbuf2, jbuf2, 0x00);
910  v16sf qyz = _mm512_shuffle_ps(jbuf2, jbuf2, 0x55);
911  v16sf qzx = _mm512_shuffle_ps(jbuf2, jbuf2, 0xaa);
912  v16sf mtr = _mm512_shuffle_ps(jbuf2, jbuf2, 0xff);
913 
914  jbuf2 = _mm512_broadcast_f32x4(*(v4sf *)&spjbuf[j+1][2]);
915 
916  v16sf dx = _mm512_sub_ps(xi, xj);
917  v16sf dy = _mm512_sub_ps(yi, yj);
918  v16sf dz = _mm512_sub_ps(zi, zj);
919 #else
920  v16sf dx = _mm512_sub_ps(xi, _mm512_set1_ps(spjbuf[j][0][0]));
921  v16sf dy = _mm512_sub_ps(yi, _mm512_set1_ps(spjbuf[j][0][1]));
922  v16sf dz = _mm512_sub_ps(zi, _mm512_set1_ps(spjbuf[j][0][2]));
923 #endif
924 
925  v16sf r2 = _mm512_fmadd_ps(dx, dx, veps2);
926  r2 = _mm512_fmadd_ps(dy, dy, r2);
927  r2 = _mm512_fmadd_ps(dz, dz, r2);
928  v16sf ri1 = _mm512_rsqrt14_ps(r2);
929  v16sf ri2 = _mm512_mul_ps(ri1, ri1);
930 
931 #ifdef RSQRT_NR_SPJ_X4
932  // x4
933  v16sf v1 = _mm512_set1_ps(1.0f);
934  v16sf h = _mm512_fnmadd_ps(r2, ri2, v1);
935 
936  v16sf v6p0 = _mm512_set1_ps(6.0f);
937  v16sf v5p0 = _mm512_set1_ps(5.0f);
938  v16sf v8p0 = _mm512_set1_ps(8.0f);
939  v16sf v0p0625 = _mm512_set1_ps((float)1.0/16.0);
940 
941  ri2 = _mm512_fmadd_ps(h, v5p0, v6p0);
942  ri2 = _mm512_fmadd_ps(h, ri2, v8p0);
943  ri2 = _mm512_mul_ps(h, ri2);
944  ri2 = _mm512_fmadd_ps(ri2, v0p0625, v1);
945  ri1 = _mm512_mul_ps(ri2, ri1);
946 
947  //v16sf h = vone - r2*(ri1*ri1);
948  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
949  ri2 = _mm512_mul_ps(ri1, ri1);
950 #elif defined(RSQRT_NR_SPJ_X2)
951 
952  ri2 = _mm512_fnmadd_ps(r2, ri2, _mm512_set1_ps(3.0f));
953  ri2 = _mm512_mul_ps(ri2, _mm512_set1_ps(0.5f));
954  ri1 = _mm512_mul_ps(ri2, ri1);
955  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
956  ri2 = _mm512_mul_ps(ri1, ri1);
957 #endif
958 
959  v16sf ri3 = _mm512_mul_ps(ri1, ri2);
960  v16sf ri4 = _mm512_mul_ps(ri2, ri2);
961  v16sf ri5 = _mm512_mul_ps(ri2, ri3);
962 
963 #ifdef AVX_PRELOAD
964  //(qxx*dx + qxy*dy) + qzx*dz;
965  v16sf qr_x = _mm512_mul_ps(dx, qxx);
966  qr_x = _mm512_fmadd_ps(dy, qxy, qr_x);
967  qr_x = _mm512_fmadd_ps(dz, qzx, qr_x);
968  //(qyy*dy + qxy*dx) + qyz*dz;
969  v16sf qr_y = _mm512_mul_ps(dy, qyy);
970  qr_y = _mm512_fmadd_ps(dx, qxy, qr_y);
971  qr_y = _mm512_fmadd_ps(dz, qyz, qr_y);
972  //(qzz*dz + qzx*dx) + qyz*dy;
973  v16sf qr_z = _mm512_mul_ps(dz, qzz);
974  qr_z = _mm512_fmadd_ps(dx, qzx, qr_z);
975  qr_z = _mm512_fmadd_ps(dy, qyz, qr_z);
976 
977  //v16sf rqr = ((mtr + qr_x*dx) + qr_y*dy) + qr_z*dz;
978  v16sf rqr = _mm512_fmadd_ps(qr_x, dx, mtr);
979 #else
980  //(qxx*dx + qxy*dy) + qzx*dz;
981  v16sf qr_x = _mm512_mul_ps(dx, _mm512_set1_ps(spjbuf[j][1][0]));
982  qr_x = _mm512_fmadd_ps(dy, _mm512_set1_ps(spjbuf[j][2][0]), qr_x);
983  qr_x = _mm512_fmadd_ps(dz, _mm512_set1_ps(spjbuf[j][2][2]), qr_x);
984  //(qyy*dy + qxy*dx) + qyz*dz;
985  v16sf qr_y = _mm512_mul_ps(dy, _mm512_set1_ps(spjbuf[j][1][1]));
986  qr_y = _mm512_fmadd_ps(dx, _mm512_set1_ps(spjbuf[j][2][0]), qr_y);
987  qr_y = _mm512_fmadd_ps(dz, _mm512_set1_ps(spjbuf[j][2][1]), qr_y);
988  //(qzz*dz + qzx*dx) + qyz*dy;
989  v16sf qr_z = _mm512_mul_ps(dz, _mm512_set1_ps(spjbuf[j][1][2]));
990  qr_z = _mm512_fmadd_ps(dx, _mm512_set1_ps(spjbuf[j][2][2]), qr_z);
991  qr_z = _mm512_fmadd_ps(dy, _mm512_set1_ps(spjbuf[j][2][1]), qr_z);
992 
993  //v16sf rqr = ((mtr + qr_x*dx) + qr_y*dy) + qr_z*dz;
994  v16sf rqr = _mm512_fmadd_ps(qr_x, dx, _mm512_set1_ps(spjbuf[j][2][3]));
995 #endif
996 
997  rqr = _mm512_fmadd_ps(qr_y, dy, rqr);
998  rqr = _mm512_fmadd_ps(qr_z, dz, rqr);
999 
1000  v16sf rqr_ri4 = _mm512_mul_ps(rqr, ri4);
1001 
1002  //v16sf v0p5 = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
1003  // 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
1004  //v16sf v2p5 = {2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f,
1005  //2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f};
1006 #ifndef AVX_PRELOAD
1007  v16sf mj = _mm512_set1_ps(spjbuf[j][1][3]);
1008 #endif
1009  v16sf meff = _mm512_fmadd_ps(rqr_ri4, _mm512_set1_ps(0.5f), mj);
1010  //v16sf meff3 = (mj + v2p5 * rqr_ri4) * ri3;
1011  v16sf meff3 = _mm512_fmadd_ps(rqr_ri4, _mm512_set1_ps(2.5f), mj);
1012  meff3 = _mm512_mul_ps(meff3, ri3);
1013 
1014  pot = _mm512_fnmadd_ps(meff, ri1, pot);
1015 
1016  //ax = (ax - ri5*qr_x) + meff3*dx;
1017  ax = _mm512_fmadd_ps(ri5, qr_x, ax);
1018  ax = _mm512_fnmadd_ps(meff3, dx, ax);
1019  ay = _mm512_fmadd_ps(ri5, qr_y, ay);
1020  ay = _mm512_fnmadd_ps(meff3, dy, ay);
1021  az = _mm512_fmadd_ps(ri5, qr_z, az);
1022  az = _mm512_fnmadd_ps(meff3, dz, az);
1023 
1024  }
1025  *(v16sf *)(accpbuf[i/16][0]) = ax;
1026  *(v16sf *)(accpbuf[i/16][1]) = ay;
1027  *(v16sf *)(accpbuf[i/16][2]) = az;
1028  *(v16sf *)(accpbuf[i/16][3]) = pot;
1029  }
1030  }
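 // Scalar form of the quadrupole interaction evaluated above: with the
 // stored tensor S = 3*Q - tr*I, qr = S*d and rqr = d^T S d - eps2*tr, the
 // kernel accumulates
 //   pot -= (m + 0.5*rqr/r^4) / r
 //   a   += qr/r^5 - (m + 2.5*rqr/r^4) * d / r^3
 // which is the softened quadrupole expansion arranged so that every term
 // maps onto a single fmadd/fnmadd.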
1031 #else
1032  __attribute__ ((noinline))
1033  void kernel_spj_nounroll(const int ni, const int nj){
1034  const v8sf veps2 = _mm256_set1_ps((float)eps2);
1035  for(int i=0; i<ni; i+=8){
1036  const v8sf xi = *(v8sf *)(xibuf[i/8][0]);
1037  const v8sf yi = *(v8sf *)(xibuf[i/8][1]);
1038  const v8sf zi = *(v8sf *)(xibuf[i/8][2]);
1039 
1040  v8sf ax, ay, az, pot;
1041  ax = ay = az = pot = _mm256_set1_ps(0.0f);
1042 
1043 #ifdef AVX_PRELOAD
1044  v8sf jbuf0 = _mm256_broadcast_ps((v4sf *)&spjbuf[0][0]);
1045  v8sf jbuf1 = _mm256_broadcast_ps((v4sf *)&spjbuf[0][1]);
1046  v8sf jbuf2 = _mm256_broadcast_ps((v4sf *)&spjbuf[0][2]);
1047 #else
1048  v8sf jbuf0, jbuf1, jbuf2;
1049 #endif
1050  for(int j=0; j<nj; j++){
1051 #ifndef AVX_PRELOAD
1052  jbuf0 = _mm256_broadcast_ps((v4sf *)&spjbuf[j+0][0]);
1053 #endif
1054  v8sf xj = _mm256_shuffle_ps(jbuf0, jbuf0, 0x00);
1055  v8sf yj = _mm256_shuffle_ps(jbuf0, jbuf0, 0x55);
1056  v8sf zj = _mm256_shuffle_ps(jbuf0, jbuf0, 0xaa);
1057 #ifdef AVX_PRELOAD
1058  jbuf0 = _mm256_broadcast_ps((v4sf *)&spjbuf[j+1][0]);
1059 #endif
1060 
1061 #ifndef AVX_PRELOAD
1062  jbuf1 = _mm256_broadcast_ps((v4sf *)&spjbuf[j+0][1]);
1063 #endif
1064  v8sf qxx = _mm256_shuffle_ps(jbuf1, jbuf1, 0x00);
1065  v8sf qyy = _mm256_shuffle_ps(jbuf1, jbuf1, 0x55);
1066  v8sf qzz = _mm256_shuffle_ps(jbuf1, jbuf1, 0xaa);
1067  v8sf mj = _mm256_shuffle_ps(jbuf1, jbuf1, 0xff);
1068 #ifdef AVX_PRELOAD
1069  jbuf1 = _mm256_broadcast_ps((v4sf *)&spjbuf[j+1][1]);
1070 #endif
1071 
1072 #ifndef AVX_PRELOAD
1073  jbuf2 = _mm256_broadcast_ps((v4sf *)&spjbuf[j+0][2]);
1074 #endif
1075  v8sf qxy = _mm256_shuffle_ps(jbuf2, jbuf2, 0x00);
1076  v8sf qyz = _mm256_shuffle_ps(jbuf2, jbuf2, 0x55);
1077  v8sf qzx = _mm256_shuffle_ps(jbuf2, jbuf2, 0xaa);
1078  v8sf mtr = _mm256_shuffle_ps(jbuf2, jbuf2, 0xff);
1079 #ifdef AVX_PRELOAD
1080  jbuf2 = _mm256_broadcast_ps((v4sf *)&spjbuf[j+1][2]);
1081 #endif
1082 
1083  v8sf dx = _mm256_sub_ps(xi, xj);
1084  v8sf dy = _mm256_sub_ps(yi, yj);
1085  v8sf dz = _mm256_sub_ps(zi, zj);
1086 
1087  v8sf r2 = _mm256_fmadd_ps(dx, dx, veps2);
1088  r2 = _mm256_fmadd_ps(dy, dy, r2);
1089  r2 = _mm256_fmadd_ps(dz, dz, r2);
1090  v8sf ri1 = _mm256_rsqrt_ps(r2);
1091 
1092  v8sf ri2 = _mm256_mul_ps(ri1, ri1);
1093 #ifdef RSQRT_NR_SPJ_X2
1094  ri2 = _mm256_fnmadd_ps(r2, ri2, _mm256_set1_ps(3.0f));
1095  ri2 = _mm256_mul_ps(ri2, _mm256_set1_ps(0.5f));
1096  ri1 = _mm256_mul_ps(ri2, ri1);
1097  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
1098  ri2 = _mm256_mul_ps(ri1, ri1);
1099 #endif
1100  v8sf ri3 = _mm256_mul_ps(ri1, ri2);
1101  v8sf ri4 = _mm256_mul_ps(ri2, ri2);
1102  v8sf ri5 = _mm256_mul_ps(ri2, ri3);
1103 
1104  //(qxx*dx + qxy*dy) + qzx*dz;
1105  v8sf qr_x = _mm256_mul_ps(dx, qxx);
1106  qr_x = _mm256_fmadd_ps(dy, qxy, qr_x);
1107  qr_x = _mm256_fmadd_ps(dz, qzx, qr_x);
1108  //(qyy*dy + qxy*dx) + qyz*dz;
1109  v8sf qr_y = _mm256_mul_ps(dy, qyy);
1110  qr_y = _mm256_fmadd_ps(dx, qxy, qr_y);
1111  qr_y = _mm256_fmadd_ps(dz, qyz, qr_y);
1112  //(qzz*dz + qzx*dx) + qyz*dy;
1113  v8sf qr_z = _mm256_mul_ps(dz, qzz);
1114  qr_z = _mm256_fmadd_ps(dx, qzx, qr_z);
1115  qr_z = _mm256_fmadd_ps(dy, qyz, qr_z);
1116 
1117  v8sf rqr = _mm256_fmadd_ps(qr_x, dx, mtr);
1118  rqr = _mm256_fmadd_ps(qr_y, dy, rqr);
1119  rqr = _mm256_fmadd_ps(qr_z, dz, rqr);
1120 
1121  v8sf rqr_ri4 = _mm256_mul_ps(rqr, ri4);
1122 
1123  v8sf meff = _mm256_fmadd_ps(rqr_ri4, _mm256_set1_ps(0.5f), mj);
1124  v8sf meff3 = _mm256_fmadd_ps(rqr_ri4, _mm256_set1_ps(2.5f), mj);
1125  meff3 = _mm256_mul_ps(meff3, ri3);
1126 
1127  pot = _mm256_fnmadd_ps(meff, ri1, pot);
1128 
1129  ax = _mm256_fmadd_ps(ri5, qr_x, ax);
1130  ax = _mm256_fnmadd_ps(meff3, dx, ax);
1131  ay = _mm256_fmadd_ps(ri5, qr_y, ay);
1132  ay = _mm256_fnmadd_ps(meff3, dy, ay);
1133  az = _mm256_fmadd_ps(ri5, qr_z, az);
1134  az = _mm256_fnmadd_ps(meff3, dz, az);
1135  }
1136  *(v8sf *)(accpbuf[i/8][0]) = ax;
1137  *(v8sf *)(accpbuf[i/8][1]) = ay;
1138  *(v8sf *)(accpbuf[i/8][2]) = az;
1139  *(v8sf *)(accpbuf[i/8][3]) = pot;
1140  }
1141  }
1142 #endif
1143 
1144  /*
1145  __attribute__ ((noinline))
1146  void kernel_spj_64bit_nounroll(const int ni, const int nj){
1147  const v4df veps2 = {eps2, eps2, eps2, eps2};
1148  for(int i=0; i<ni; i+=4){
1149  const int il = i%8;
1150  const v4df xi = *(v4df *)(&xibufd[i/8][0][il]);
1151  const v4df yi = *(v4df *)(&xibufd[i/8][1][il]);
1152  const v4df zi = *(v4df *)(&xibufd[i/8][2][il]);
1153 
1154  v4df ax, ay, az, pot;
1155  ax = ay = az = pot = (v4df){0.0, 0.0, 0.0, 0.0};
1156 
1157  v4df jbuf0 = *((v4df*)spjbufd[0][0]);
1158  v4df jbuf1 = *((v4df*)spjbufd[0][1]);
1159  v4df jbuf2 = *((v4df*)spjbufd[0][2]);
1160 
1161  for(int j=0; j<nj; j++){
1162  v4df xj = _mm256_permute4x64_pd(jbuf0, 0x00);
1163  v4df yj = _mm256_permute4x64_pd(jbuf0, 0x55);
1164  v4df zj = _mm256_permute4x64_pd(jbuf0, 0xaa);
1165  jbuf0 = *((v4df*)spjbufd[j+1][0]);
1166 
1167  v4df qxx = _mm256_permute4x64_pd(jbuf1, 0x00);
1168  v4df qyy = _mm256_permute4x64_pd(jbuf1, 0x55);
1169  v4df qzz = _mm256_permute4x64_pd(jbuf1, 0xaa);
1170  v4df mj = _mm256_permute4x64_pd(jbuf1, 0xff);
1171  jbuf1 = *((v4df*)spjbufd[j+1][1]);
1172 
1173  v4df qxy = _mm256_permute4x64_pd(jbuf2, 0x00);
1174  v4df qyz = _mm256_permute4x64_pd(jbuf2, 0x55);
1175  v4df qzx = _mm256_permute4x64_pd(jbuf2, 0xaa);
1176  v4df mtr = _mm256_permute4x64_pd(jbuf2, 0xff);
1177  jbuf2 = *((v4df*)spjbufd[j+1][2]);
1178 
1179  v4df dx = xj - xi;
1180  v4df dy = yj - yi;
1181  v4df dz = zj - zi;
1182 
1183  v4df r2 = ((veps2 + dx*dx) + dy*dy) + dz*dz;
1184  //v4df ri1 = __builtin_ia32_rsqrtps256(r2);
1185  v4df ri1 = __builtin_ia32_cvtps2pd256( __builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps256(r2)));
1186 
1187 #ifdef RSQRT_NR_SPJ_X2
1188  //x2
1189  v4df v3p0 = {3.0, 3.0, 3.0, 3.0};
1190  ri1 *= (v3p0 - r2*(ri1*ri1));
1191 #elif defined(RSQRT_NR_SPJ_X4)
1192  // x4
1193  v4df v8p0 = {8.0, 8.0, 8.0, 8.0};
1194  v4df v6p0 = {6.0, 6.0, 6.0, 6.0};
1195  v4df v5p0 = {5.0, 5.0, 5.0, 5.0};
1196  v4df v0p0625 = {1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0};
1197  v4df v1p0 = {1.0, 1.0, 1.0, 1.0};
1198  v4df h = v1p0 - r2*(ri1*ri1);
1199  ri1 *= v1p0 + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
1200 #endif
1201  v4df ri2 = ri1 * ri1;
1202  v4df ri3 = ri1 * ri2;
1203  v4df ri4 = ri2 * ri2;
1204  v4df ri5 = ri2 * ri3;
1205 
1206  v4df qr_x = (qxx*dx + qxy*dy) + qzx*dz;
1207  v4df qr_y = (qyy*dy + qxy*dx) + qyz*dz;
1208  v4df qr_z = (qzz*dz + qzx*dx) + qyz*dy;
1209 
1210  v4df rqr = ((mtr + qr_x*dx) + qr_y*dy) + qr_z*dz;
1211  v4df rqr_ri4 = rqr * ri4;
1212 
1213  v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
1214  v4df v2p5 = {2.5, 2.5, 2.5, 2.5};
1215 
1216  v4df meff = mj + v0p5 * rqr_ri4;
1217  v4df meff3 = (mj + v2p5 * rqr_ri4) * ri3;
1218 
1219  pot -= meff * ri1;
1220 
1221  ax = (ax - ri5*qr_x) + meff3*dx;
1222  ay = (ay - ri5*qr_y) + meff3*dy;
1223  az = (az - ri5*qr_z) + meff3*dz;
1224  }
1225 
1226 #ifdef RSQRT_NR_SPJ_X2
1227  //x2
1228  v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
1229  v4df v0p125 = {0.125, 0.125, 0.125, 0.125};
1230  pot *= v0p5;
1231  ax *= v0p125;
1232  ay *= v0p125;
1233  az *= v0p125;
1234 #endif
1235 
1236  *(v4df *)(&accpbufd[i/8][0][il]) = ax;
1237  *(v4df *)(&accpbufd[i/8][1][il]) = ay;
1238  *(v4df *)(&accpbufd[i/8][2][il]) = az;
1239  *(v4df *)(&accpbufd[i/8][3][il]) = pot;
1240  }
1241  }
1242  */
1243 } __attribute__ ((aligned(128)));
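// The aligned(128) attribute above guarantees that the member buffers meet
// the 32-byte (AVX2) and 64-byte (AVX-512) alignment required by the direct
// vector loads/stores (*(v8sf *), *(v16sf *), ...) used in the kernels.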
1244 
1245 
1246 
1247 #if defined(CALC_EP_64bit) || defined(CALC_EP_MIX)
1248 
1249 #define _mm256_fmadd_pd(x,y,z) _mm256_add_pd(z, _mm256_mul_pd(x,y))
1250 #define _mm256_fnmadd_pd(x,y,z) _mm256_sub_pd(z, _mm256_mul_pd(x,y))
1251 #define _mm512_fmadd_pd(x,y,z) _mm512_add_pd(z, _mm512_mul_pd(x,y))
1252 #define _mm512_fnmadd_pd(x,y,z) _mm512_sub_pd(z, _mm512_mul_pd(x,y))
1253 
1254 class PhantomGrapeQuad64Bit{
1255 public:
1256  enum{
1257  NIMAX = 32768,
1258  NJMAX = 131072,
1259  };
1260 private:
1261  double xibuf [NIMAX/8] [4][8]; // x, y, z, r_search
1262  double accpbuf[NIMAX/8] [5][8]; // ax, ay, az, pot, nngb
1263  double epjbuf [NJMAX] [4]; // x, y, z, m
1264  double rsearchj[NJMAX]; // r_search_j
1265  double spjbuf [NJMAX] [3][4]; // x, y, z, m | 3qxx-tr, 3qyy-tr, 3qzz-tr, m | 3qxy, 3qyz, 3qzx, -eps2*tr
1266 
1267  double eps2;
1268  static double get_a_NaN(){
1269  union{ long l; double d; } m;
1270  m.l = -1;
1271  return m.d;
1272  }
1273  double r_crit2;
1274 // double r_out;
1275 // double r_in;
1276 // double denominator; // for cut off
1277 public:
1278  PhantomGrapeQuad64Bit() : eps2(get_a_NaN()) {} // default NaN
1279 
1280  // void set_cutoff(const double _r_out, const double _r_in){
1281  // r_out = _r_out;
1282  // r_in = _r_in;
1283  // denominator = 1.0 / (r_out - r_in);
1284  // }
1285 
1286  void set_eps2(const double _eps2){
1287  this->eps2 = _eps2;
1288  }
1289  void set_r_crit2(const double _r_crit2){
1290  this->r_crit2 = _r_crit2;
1291  }
1292 
1293  void set_epj_one(const int addr, const double x, const double y, const double z,
1294  const double m, const double r_search) {
1295  epjbuf[addr][0] = x;
1296  epjbuf[addr][1] = y;
1297  epjbuf[addr][2] = z;
1298  epjbuf[addr][3] = m;
1299  rsearchj[addr] = r_search;
1300  }
1301 
1302  void set_spj_one(const int addr,
1303  const double x, const double y, const double z, const double m,
1304  const double qxx, const double qyy, const double qzz,
1305  const double qxy, const double qyz, const double qzx)
1306  {
1307  const double tr = qxx + qyy + qzz;
1308  spjbuf[addr][0][0] = x;
1309  spjbuf[addr][0][1] = y;
1310  spjbuf[addr][0][2] = z;
1311  spjbuf[addr][0][3] = m;
1312 
1313  spjbuf[addr][1][0] = 3.0 * qxx - tr;
1314  spjbuf[addr][1][1] = 3.0 * qyy - tr;
1315  spjbuf[addr][1][2] = 3.0 * qzz - tr;
1316  spjbuf[addr][1][3] = m;
1317 
1318  spjbuf[addr][2][0] = 3.0 * qxy;
1319  spjbuf[addr][2][1] = 3.0 * qyz;
1320  spjbuf[addr][2][2] = 3.0 * qzx;
1321  spjbuf[addr][2][3] = -(eps2 * tr);
1322  }
1323 
1324  void set_xi_one(const int addr, const double x, const double y, const double z, const double r_search){
1325  const int ah = addr / 8;
1326  const int al = addr % 8;
1327  xibuf[ah][0][al] = x;
1328  xibuf[ah][1][al] = y;
1329  xibuf[ah][2][al] = z;
1330  xibuf[ah][3][al] = r_search;
1331  }
1332 
1333  template <typename real_t>
1334  void get_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot, real_t &nngb){
1335  const int ah = addr / 8;
1336  const int al = addr % 8;
1337  ax = accpbuf[ah][0][al];
1338  ay = accpbuf[ah][1][al];
1339  az = accpbuf[ah][2][al];
1340  pot = accpbuf[ah][3][al];
1341  nngb = accpbuf[ah][4][al];
1342  }
1343 
1344  template <typename real_t>
1345  void accum_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot){
1346  const int ah = addr / 8;
1347  const int al = addr % 8;
1348  ax += accpbuf[ah][0][al];
1349  ay += accpbuf[ah][1][al];
1350  az += accpbuf[ah][2][al];
1351  pot += accpbuf[ah][3][al];
1352  }
1353 
1354  template <typename real_t>
1355  void accum_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot, real_t &nngb){
1356  const int ah = addr / 8;
1357  const int al = addr % 8;
1358  ax += accpbuf[ah][0][al];
1359  ay += accpbuf[ah][1][al];
1360  az += accpbuf[ah][2][al];
1361  pot += accpbuf[ah][3][al];
1362  nngb += accpbuf[ah][4][al];
1363  }
1364 
1365  template <typename real_t>
1366  void accum_accp_one(const int addr, real_t &nngb){
1367  const int ah = addr / 8;
1368  const int al = addr % 8;
1369  nngb += accpbuf[ah][4][al];
1370  }
1371 
1372  void run_epj_for_neighbor_count(const int ni, const int nj){
1373  if(ni > NIMAX || nj > NJMAX){
1374  std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
1375  for(PS::S32 i=0; i<(ni-1)/8+1; i++){
1376  for(PS::S32 j=0; j<8; j++){
1377  for(PS::S32 k=0; k<3; k++){
1378  std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
1379  }
1380  std::cout<<std::endl;
1381  }
1382  }
1383  }
1384  assert(ni <= NIMAX);
1385  assert(nj <= NJMAX);
1386  kernel_epj_nounroll_for_neighbor_count(ni, nj);
1387  }
1388 
1389  void run_epj_for_p3t_with_linear_cutoff(const int ni, const int nj){
1390  if(ni > NIMAX || nj > NJMAX){
1391  std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
1392  for(PS::S32 i=0; i<(ni-1)/8+1; i++){
1393  for(PS::S32 j=0; j<8; j++){
1394  for(PS::S32 k=0; k<3; k++){
1395  std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
1396  }
1397  std::cout<<std::endl;
1398  }
1399  }
1400  }
1401  assert(ni <= NIMAX);
1402  assert(nj <= NJMAX);
1403  kernel_epj_nounroll_for_p3t_with_linear_cutoff(ni, nj);
1404  }
1405 
1406  void run_epj(const int ni, const int nj){
1407  if(ni > NIMAX || nj > NJMAX){
1408  std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
1409  for(PS::S32 i=0; i<(ni-1)/8+1; i++){
1410  for(PS::S32 j=0; j<8; j++){
1411  for(PS::S32 k=0; k<3; k++){
1412  std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
1413  }
1414  std::cout<<std::endl;
1415  }
1416  }
1417  }
1418  assert(ni <= NIMAX);
1419  assert(nj <= NJMAX);
1420  kernel_epj_nounroll(ni, nj);
1421  }
1422 
1423  void run_spj(const int ni, const int nj){
1424  if(ni > NIMAX || nj > NJMAX){
1425  std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
1426  }
1427  assert(ni <= NIMAX);
1428  assert(nj <= NJMAX);
1429  kernel_spj_nounroll(ni, nj);
1430  // kernel_spj_unroll2(ni, nj);
1431  }
1432 
1433 private:
1434 #ifdef USE__AVX512
1435 // typedef float v8sf __attribute__((vector_size(32)));
1436 // typedef double v4df __attribute__((vector_size(32)));
1437 // typedef float v16sf __attribute__((vector_size(64)));
1438 // typedef double v8df __attribute__((vector_size(64)));
1439  typedef __m256 v8sf;
1440  typedef __m256d v4df;
1441  typedef __m512 v16sf;
1442  typedef __m512d v8df;
1443 
1444  __attribute__ ((noinline))
1445  void kernel_epj_nounroll(const int ni, const int nj){
1446  const v8df veps2 = _mm512_set1_pd(eps2);
1447  for(int i=0; i<ni; i+=8){
1448  const v8df xi = *(v8df *)(&xibuf[i/8][0]);
1449  const v8df yi = *(v8df *)(&xibuf[i/8][1]);
1450  const v8df zi = *(v8df *)(&xibuf[i/8][2]);
1451 
1452  v8df ax, ay, az, pot;
1453  ax = ay = az = pot = _mm512_set1_pd(0.0);
1454 
1455 #ifdef AVX_PRELOAD
1456  v4df jbuf = *((v4df*)epjbuf);
1457  v8df jbuf8= _mm512_broadcast_f64x4(jbuf);
1458  v8df xj = _mm512_permutex_pd(jbuf8, 0x00);
1459  v8df yj = _mm512_permutex_pd(jbuf8, 0x55);
1460  v8df zj = _mm512_permutex_pd(jbuf8, 0xaa);
1461  v8df mj = _mm512_permutex_pd(jbuf8, 0xff);
1462 #endif
1463  for(int j=0; j<nj; j++){
1464 #ifdef AVX_PRELOAD
1465  jbuf = *((v4df*)(epjbuf+j+1));
1466  v8df dx = _mm512_sub_pd(xi, xj);
1467  v8df dy = _mm512_sub_pd(yi, yj);
1468  v8df dz = _mm512_sub_pd(zi, zj);
1469 #else
1470  v8df dx = _mm512_sub_pd(xi, _mm512_set1_pd(epjbuf[j][0]));
1471  v8df dy = _mm512_sub_pd(yi, _mm512_set1_pd(epjbuf[j][1]));
1472  v8df dz = _mm512_sub_pd(zi, _mm512_set1_pd(epjbuf[j][2]));
1473 #endif
1474  v8df r2 = _mm512_fmadd_pd(dx, dx, veps2);
1475  r2 = _mm512_fmadd_pd(dy, dy, r2);
1476  r2 = _mm512_fmadd_pd(dz, dz, r2);
1477  //v8df mask = _mm512_cmp_pd(vrcrit2, r2, 0x01); // vrcrit2 < r2
1478  //v8df mask = _mm512_cmp_pd(veps2, r2, 0x4); // veps2 != r2
1479  //v8df ri1 = _mm512_and_pd( __builtin_ia32_cvtps2pd512( __builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps512(r2))), mask );
1480  v8df ri1 = _mm512_rsqrt14_pd(r2);
1481 
1482  v8df ri2 = _mm512_mul_pd(ri1, ri1);
1483 
1484 #ifdef RSQRT_NR_EPJ_X4
1485  // x4
1486  v8df v1 = _mm512_set1_pd(1.0);
1487  v8df h = _mm512_fnmadd_pd(r2, ri2, v1);
1488 
1489  v8df v6p0 = _mm512_set1_pd(6.0);
1490  v8df v5p0 = _mm512_set1_pd(5.0);
1491  v8df v8p0 = _mm512_set1_pd(8.0);
1492  v8df v0p0625 = _mm512_set1_pd(1.0/16.0);
1493 
1494  ri2 = _mm512_fmadd_pd(h, v5p0, v6p0);
1495  ri2 = _mm512_fmadd_pd(h, ri2, v8p0);
1496  ri2 = _mm512_mul_pd(h, ri2);
1497  ri2 = _mm512_fmadd_pd(ri2, v0p0625, v1);
1498  ri1 = _mm512_mul_pd(ri2, ri1);
1499 
1500  //v16sf h = vone - r2*(ri1*ri1);
1501  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
1502  ri2 = _mm512_mul_pd(ri1, ri1);
1503  // x4
1504  //v8df vone = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
1505  //v8df v8p0 = {8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0};
1506  //v8df v6p0 = {6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0};
1507  //v8df v5p0 = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0};
1508  //v8df v0p0625 = {1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0,
1509  // 1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0};
1510  //v8df h = vone - r2*(ri1*ri1);
1511  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
1512 #elif defined(RSQRT_NR_EPJ_X2)
1513  //x2
1514  ri2 = _mm512_fnmadd_pd(r2, ri2, _mm512_set1_pd(3.0));
1515  ri2 = _mm512_mul_pd(ri2, _mm512_set1_pd(0.5));
1516  ri1 = _mm512_mul_pd(ri2, ri1);
1517  //v8df v0p5 = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
1518  //v8df v3p0 = {3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0};
1519  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
1520  ri2 = _mm512_mul_pd(ri1, ri1);
1521 #endif
1522 
1523 #ifdef AVX_PRELOAD
1524  v8df mri1 = _mm512_mul_pd(ri1, mj);
1525 #else
1526  v8df mri1 = _mm512_mul_pd(ri1, _mm512_set1_pd(epjbuf[j][3]));
1527 #endif
1528  v8df mri3 = _mm512_mul_pd(mri1, ri2);
1529 
1530 #ifdef AVX_PRELOAD
1531  jbuf8=_mm512_broadcast_f64x4(jbuf);
1532  xj = _mm512_permutex_pd(jbuf8, 0x00);
1533  yj = _mm512_permutex_pd(jbuf8, 0x55);
1534  zj = _mm512_permutex_pd(jbuf8, 0xaa);
1535  mj = _mm512_permutex_pd(jbuf8, 0xff);
1536 #endif
1537  pot = _mm512_sub_pd(pot, mri1);
1538  ax = _mm512_fnmadd_pd(mri3, dx, ax);
1539  ay = _mm512_fnmadd_pd(mri3, dy, ay);
1540  az = _mm512_fnmadd_pd(mri3, dz, az);
1541  }
1542  *(v8df *)(&accpbuf[i/8][0]) = ax;
1543  *(v8df *)(&accpbuf[i/8][1]) = ay;
1544  *(v8df *)(&accpbuf[i/8][2]) = az;
1545  *(v8df *)(&accpbuf[i/8][3]) = pot;
1546  }
1547  }
1548 
1549 #else
1550  //typedef float v4sf __attribute__((vector_size(16)));
1551  //typedef float v8sf __attribute__((vector_size(32)));
1552  //typedef double v4df __attribute__((vector_size(32)));
1553  typedef __m128 v4sf;
1554  typedef __m256 v8sf;
1555  typedef __m256d v4df;
1556 
1557  __attribute__ ((noinline))
1558  void kernel_epj_nounroll(const int ni, const int nj){
1559  const v4df veps2 = _mm256_set1_pd(eps2);
1560  for(int i=0; i<ni; i+=4){
1561  const int il = i%8;
1562  const v4df xi = *(v4df *)(&xibuf[i/8][0][il]);
1563  const v4df yi = *(v4df *)(&xibuf[i/8][1][il]);
1564  const v4df zi = *(v4df *)(&xibuf[i/8][2][il]);
1565 
1566  v4df ax, ay, az, pot;
1567  ax = ay = az = pot = _mm256_set1_pd(0.0);
1568 
1569  v4df jbuf = *((v4df*)epjbuf);
1570  v4df xj = _mm256_permute4x64_pd(jbuf, 0x00);
1571  v4df yj = _mm256_permute4x64_pd(jbuf, 0x55);
1572  v4df zj = _mm256_permute4x64_pd(jbuf, 0xaa);
1573  v4df mj = _mm256_permute4x64_pd(jbuf, 0xff);
1574 
1575  for(int j=0; j<nj; j++){
1576  jbuf = *((v4df*)(epjbuf+j+1));
1577  v4df dx = _mm256_sub_pd(xi, xj);
1578  v4df dy = _mm256_sub_pd(yi, yj);
1579  v4df dz = _mm256_sub_pd(zi, zj);
1580 
1581 //#ifdef __FMA__
1582  v4df r2 = _mm256_fmadd_pd(dx, dx, veps2);
1583  r2 = _mm256_fmadd_pd(dy, dy, r2);
1584  r2 = _mm256_fmadd_pd(dz, dz, r2);
1585 //#else
1586 // v4df dx2 = _mm256_mul_pd(dx, dx);
1587 // v4df dy2 = _mm256_mul_pd(dy, dy);
1588 // v4df dz2 = _mm256_mul_pd(dz, dz);
1589 //
1590 // v4df r2 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(veps2, dx2), dy2), dz2);
1591 //#endif
1592  //v4df mask = _mm256_cmp_pd(vrcrit2, r2, 0x01); // vrcrit2 < r2
1593  v4df mask = _mm256_cmp_pd(veps2, r2, 0x4); // veps2 != r2
1594  v4df ri1 = _mm256_and_pd(_mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r2))), mask);
1595  //v4df ri1 = __builtin_ia32_cvtps2pd256(__builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps256(r2)));
1596 
1597  v4df ri2 = _mm256_mul_pd(ri1, ri1);
1598 #ifdef RSQRT_NR_EPJ_X4
1599  // x4
1600  v4df v1 = _mm256_set1_pd(1.0);
1601  v4df h = _mm256_fnmadd_pd(r2, ri2, v1);
1602 
1603  v4df v6p0 = _mm256_set1_pd(6.0);
1604  v4df v5p0 = _mm256_set1_pd(5.0);
1605  v4df v8p0 = _mm256_set1_pd(8.0);
1606  v4df v0p0625 = _mm256_set1_pd(1.0/16.0);
1607 
1608  ri2 = _mm256_fmadd_pd(h, v5p0, v6p0);
1609  ri2 = _mm256_fmadd_pd(h, ri2, v8p0);
1610  ri2 = _mm256_mul_pd(h, ri2);
1611  ri2 = _mm256_fmadd_pd(ri2, v0p0625, v1);
1612  ri1 = _mm256_mul_pd(ri2, ri1);
1613 
1614  //v4df h = vone - r2*(ri1*ri1);
1615  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
1616  ri2 = _mm256_mul_pd(ri1, ri1);
1617  //v4df vone = {1.0, 1.0, 1.0, 1.0};
1618  //v4df v8p0 = {8.0, 8.0, 8.0, 8.0};
1619  //v4df v6p0 = {6.0, 6.0, 6.0, 6.0};
1620  //v4df v5p0 = {5.0, 5.0, 5.0, 5.0};
1621  //v4df v0p0625 = {1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0};
1622  //v4df h = vone - r2*(ri1*ri1);
1623  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
1624 #elif defined(RSQRT_NR_EPJ_X2)
1625  //x2
1626  ri2 = _mm256_fnmadd_pd(r2, ri2, _mm256_set1_pd(3.0));
1627  ri2 = _mm256_mul_pd(ri2, _mm256_set1_pd(0.5));
1628  ri1 = _mm256_mul_pd(ri2, ri1);
1629  //v4df v0p5 = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
1630  //v4df v3p0 = {3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0};
1631  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
1632  ri2 = _mm256_mul_pd(ri1, ri1);
1633 
1634  //v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
1635  //v4df v3p0 = {3.0, 3.0, 3.0, 3.0};
1636  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
1637 #endif
1638  v4df mri1 = _mm256_mul_pd(ri1, mj);
1639  v4df mri3 = _mm256_mul_pd(mri1, ri2);
1640 
1641  xj = _mm256_permute4x64_pd(jbuf, 0x00);
1642  yj = _mm256_permute4x64_pd(jbuf, 0x55);
1643  zj = _mm256_permute4x64_pd(jbuf, 0xaa);
1644  mj = _mm256_permute4x64_pd(jbuf, 0xff);
1645 
1646  pot = _mm256_sub_pd(pot, mri1);
1647  ax = _mm256_fnmadd_pd(mri3, dx, ax);
1648  ay = _mm256_fnmadd_pd(mri3, dy, ay);
1649  az = _mm256_fnmadd_pd(mri3, dz, az);
1650  }
1651  *(v4df *)(&accpbuf[i/8][0][il]) = ax;
1652  *(v4df *)(&accpbuf[i/8][1][il]) = ay;
1653  *(v4df *)(&accpbuf[i/8][2][il]) = az;
1654  *(v4df *)(&accpbuf[i/8][3][il]) = pot;
1655  }
1656  }
1657 #endif
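 // Note on the AVX2 double-precision path above: there is no 256-bit double
 // rsqrt instruction, so r2 is rounded to float, passed through
 // _mm_rsqrt_ps, and widened back, with RSQRT_NR_EPJ_X2/X4 restoring double
 // accuracy. The (veps2 != r2) mask zeroes ri1 exactly when dx=dy=dz=0,
 // suppressing the self-interaction when a particle appears in its own
 // j-list.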
1658 
1659 #ifdef USE__AVX512
1660  __attribute__ ((noinline))
1661  void kernel_epj_nounroll_for_neighbor_count(const int ni, const int nj) {
1662  for(int i=0; i<ni; i+=8){
1663  const v8df xi = *(v8df *)(&xibuf[i/8][0]);
1664  const v8df yi = *(v8df *)(&xibuf[i/8][1]);
1665  const v8df zi = *(v8df *)(&xibuf[i/8][2]);
1666  const v8df rsi= *(v8df *)(&xibuf[i/8][3]);
1667  v8df nngb = _mm512_set1_pd(0.0);
1668 
1669 #ifdef AVX_PRELOAD
1670  v4df jbuf = *((v4df*)epjbuf);
1671  v8df jbuf8= _mm512_broadcast_f64x4(jbuf);
1672  v8df rsjbuf= _mm512_set1_pd(*rsearchj);
1673  v8df xj = _mm512_permutex_pd(jbuf8, 0x00);
1674  v8df yj = _mm512_permutex_pd(jbuf8, 0x55);
1675  v8df zj = _mm512_permutex_pd(jbuf8, 0xaa);
1676  v8df rsj= rsjbuf;
1677 #endif
1678  for(int j=0; j<nj; j++){
1679 #ifdef AVX_PRELOAD
1680  jbuf = *((v4df*)(epjbuf+j+1));
1681  rsjbuf = _mm512_set1_pd(*(rsearchj+j+1));
1682  v8df dx = _mm512_sub_pd(xi, xj);
1683  v8df dy = _mm512_sub_pd(yi, yj);
1684  v8df dz = _mm512_sub_pd(zi, zj);
1685 #else
1686  v8df dx = _mm512_sub_pd(xi, _mm512_set1_pd(epjbuf[j][0]));
1687  v8df dy = _mm512_sub_pd(yi, _mm512_set1_pd(epjbuf[j][1]));
1688  v8df dz = _mm512_sub_pd(zi, _mm512_set1_pd(epjbuf[j][2]));
1689 #endif
1690  v8df r2_real = _mm512_mul_pd(dx, dx);
1691  r2_real = _mm512_fmadd_pd(dy, dy, r2_real);
1692  r2_real = _mm512_fmadd_pd(dz, dz, r2_real);
1693 
1694 #ifdef AVX_PRELOAD
1695  jbuf8= _mm512_broadcast_f64x4(jbuf);
1696  xj = _mm512_permutex_pd(jbuf8, 0x00);
1697  yj = _mm512_permutex_pd(jbuf8, 0x55);
1698  zj = _mm512_permutex_pd(jbuf8, 0xaa);
1699 #endif
1700 
1701 #ifdef AVX_PRELOAD
1702  v8df vrcrit = _mm512_max_pd(rsi, rsj);
1703 #else
1704  v8df vrcrit = _mm512_max_pd(rsi, _mm512_set1_pd(rsearchj[j]));
1705 #endif
1706  v8df vrcrit2 = _mm512_mul_pd(vrcrit, vrcrit);
1707  //v8df mask = _mm256_cmp_pd(vrcrit2, r2_real, 0x01); // for neighbour search
1708  //nngb += _mm256_and_pd( vone, _mm256_xor_pd(mask, allbits) ); // can remove
1709  nngb = _mm512_add_pd(
1710  _mm512_mask_blend_pd(_mm512_cmp_pd_mask(vrcrit2, r2_real, 0x01),
1711  _mm512_set1_pd(1.0),
1712  _mm512_set1_pd(0.0)),
1713  nngb); // accumulate neighbour count
1714 #ifdef AVX_PRELOAD
1715  rsj= rsjbuf;
1716 #endif
1717  }
1718  *(v8df *)(&accpbuf[i/8][4]) = nngb;
1719  }
1720  }
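// Branch-free neighbour counting: the k-mask from
// _mm512_cmp_pd_mask(vrcrit2, r2_real, 0x01) is set where vrcrit2 < r2_real
// (pair outside the search radius), and the blend picks 0.0 there and 1.0
// elsewhere, so each i-lane of nngb ends up holding the number of j with
// r2_real <= max(rs_i, rs_j)^2. Scalar sketch:
//   if (r2_real <= rcrit * rcrit) nngb += 1.0;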
1721 
1722  __attribute__ ((noinline))
1723  void kernel_epj_nounroll_for_p3t_with_linear_cutoff(const int ni, const int nj) {
1724  const v8df veps2 = _mm512_set1_pd(eps2);
1725  //const v8df vr_out = _mm512_set1_pd(r_out);
1726  //const v8df vr_out2 = _mm512_mul_pd(vr_out, vr_out);
1727  const v8df vr_out2 = _mm512_set1_pd(r_crit2);
1728  //const v8df allbits = _mm256_cmp_pd(vone, vone, 0x00);
1729  for(int i=0; i<ni; i+=8){
1730  const v8df xi = *(v8df *)(&xibuf[i/8][0]);
1731  const v8df yi = *(v8df *)(&xibuf[i/8][1]);
1732  const v8df zi = *(v8df *)(&xibuf[i/8][2]);
1733  const v8df rsi= *(v8df *)(&xibuf[i/8][3]);
1734  v8df ax, ay, az, pot, nngb;
1735  ax = ay = az = pot = nngb = _mm512_set1_pd(0.0);
1736 
1737 #ifdef AVX_PRELOAD
1738  v4df jbuf = *((v4df*)epjbuf);
1739  v8df jbuf8= _mm512_broadcast_f64x4(jbuf);
1740  v8df rsjbuf= _mm512_set1_pd(*rsearchj);
1741  v8df xj = _mm512_permutex_pd(jbuf8, 0x00);
1742  v8df yj = _mm512_permutex_pd(jbuf8, 0x55);
1743  v8df zj = _mm512_permutex_pd(jbuf8, 0xaa);
1744  v8df mj = _mm512_permutex_pd(jbuf8, 0xff);
1745  v8df rsj= rsjbuf;
1746 #endif
1747  for(int j=0; j<nj; j++){
1748 #ifdef AVX_PRELOAD
1749  jbuf = *((v4df*)(epjbuf+j+1));
1750  rsjbuf = _mm512_set1_pd(*(rsearchj+j+1));
1751  v8df dx = _mm512_sub_pd(xi, xj);
1752  v8df dy = _mm512_sub_pd(yi, yj);
1753  v8df dz = _mm512_sub_pd(zi, zj);
1754 #else
1755  v8df dx = _mm512_sub_pd(xi, _mm512_set1_pd(epjbuf[j][0]));
1756  v8df dy = _mm512_sub_pd(yi, _mm512_set1_pd(epjbuf[j][1]));
1757  v8df dz = _mm512_sub_pd(zi, _mm512_set1_pd(epjbuf[j][2]));
1758 #endif
1759  v8df r2_real = _mm512_fmadd_pd(dx, dx, veps2);
1760  r2_real = _mm512_fmadd_pd(dy, dy, r2_real);
1761  r2_real = _mm512_fmadd_pd(dz, dz, r2_real);
1762 
1763  v8df r2 = _mm512_max_pd( r2_real, vr_out2);
1764  v8df ri1 = _mm512_rsqrt14_pd(r2);
1765  //v8df ri1 = __builtin_ia32_cvtps2pd256(__builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps256(r2)));
1766  v8df ri2 = _mm512_mul_pd(ri1, ri1);
1767 
1768 #ifdef RSQRT_NR_EPJ_X4
1769  // x4
1770  v8df v1 = _mm512_set1_pd(1.0);
1771  v8df h = _mm512_fnmadd_pd(r2, ri2, v1);
1772 
1773  v8df v6p0 = _mm512_set1_pd(6.0);
1774  v8df v5p0 = _mm512_set1_pd(5.0);
1775  v8df v8p0 = _mm512_set1_pd(8.0);
1776  v8df v0p0625 = _mm512_set1_pd(1.0/16.0);
1777 
1778  ri2 = _mm512_fmadd_pd(h, v5p0, v6p0);
1779  ri2 = _mm512_fmadd_pd(h, ri2, v8p0);
1780  ri2 = _mm512_mul_pd(h, ri2);
1781  ri2 = _mm512_fmadd_pd(ri2, v0p0625, v1);
1782  ri1 = _mm512_mul_pd(ri2, ri1);
1788  //v8df h = vone - r2*(ri1*ri1);
1789  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
1790  ri2 = _mm512_mul_pd(ri1, ri1);
1791 
1792 #elif defined(RSQRT_NR_EPJ_X2)
1793  //x2
1794  ri2 = _mm512_fnmadd_pd(r2, ri2, _mm512_set1_pd(3.0));
1795  ri2 = _mm512_mul_pd(ri2, _mm512_set1_pd(0.5));
1796  ri1 = _mm512_mul_pd(ri2, ri1);
1797  //v8df v0p5 = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
1798  //v8df v3p0 = {3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0};
1799  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
1800  ri2 = _mm512_mul_pd(ri1, ri1);
1801 #endif
1802 
1803 #ifdef AVX_PRELOAD
1804  v8df mri1 = _mm512_mul_pd(ri1, mj);
1805 #else
1806  v8df mri1 = _mm512_mul_pd(ri1, _mm512_set1_pd(epjbuf[j][3]));
1807 #endif
1808  v8df mri3 = _mm512_mul_pd(mri1, ri2);
1809 
1810 #ifdef AVX_PRELOAD
1811  jbuf8= _mm512_broadcast_f64x4(jbuf);
1812  xj = _mm512_permutex_pd(jbuf8, 0x00);
1813  yj = _mm512_permutex_pd(jbuf8, 0x55);
1814  zj = _mm512_permutex_pd(jbuf8, 0xaa);
1815  mj = _mm512_permutex_pd(jbuf8, 0xff);
1816 #endif
1817 
1818  pot = _mm512_sub_pd(pot, mri1);
1819  ax = _mm512_fnmadd_pd(mri3, dx, ax);
1820  ay = _mm512_fnmadd_pd(mri3, dy, ay);
1821  az = _mm512_fnmadd_pd(mri3, dz, az);
1822 
1823 #ifdef AVX_PRELOAD
1824  v8df vrcrit = _mm512_max_pd(rsi, rsj);
1825 #else
1826  v8df vrcrit = _mm512_max_pd(rsi, _mm512_set1_pd(rsearchj[j]));
1827 #endif
1828  v8df vrcrit2 = _mm512_mul_pd(vrcrit, vrcrit);
1829  //v8df mask = _mm256_cmp_pd(vrcrit2, r2_real, 0x01); // for neighbour search
1830  //nngb += _mm256_and_pd( vone, _mm256_xor_pd(mask, allbits) ); // can remove
1831  nngb = _mm512_add_pd(
1832  _mm512_mask_blend_pd(_mm512_cmp_pd_mask(vrcrit2, r2_real, 0x01),
1833  _mm512_set1_pd(1.0),
1834  _mm512_set1_pd(0.0)),
1835  nngb); // accumulate neighbour count
1836 #ifdef AVX_PRELOAD
1837  rsj= rsjbuf;
1838 #endif
1839  }
1840  *(v8df *)(&accpbuf[i/8][0]) = ax;
1841  *(v8df *)(&accpbuf[i/8][1]) = ay;
1842  *(v8df *)(&accpbuf[i/8][2]) = az;
1843  *(v8df *)(&accpbuf[i/8][3]) = pot;
1844  *(v8df *)(&accpbuf[i/8][4]) = nngb;
1845  }
1846  }
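// The "linear cutoff" is the clamp r2 = max(r2_real, r_crit2) above: beyond
// the critical radius the plain softened 1/r force is returned, while inside
// it the inverse distance is evaluated at the clamped value, keeping the
// tree-side contribution finite; the accurate short-range force is then
// recomputed elsewhere (in PeTar, by the Hermite part of the P3T scheme).
// The neighbour count still uses the unclamped r2_real.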
1847 #else
1848  __attribute__ ((noinline))
1849  void kernel_epj_nounroll_for_neighbor_count(const int ni, const int nj){
1850  const v4df vone = _mm256_set1_pd(1.0);
1851  const v4df allbits = _mm256_cmp_pd(vone, vone, 0x00);
1852  for(int i=0; i<ni; i+=4){
1853  const int il = i%8;
1854  const v4df xi = *(v4df *)(&xibuf[i/8][0][il]);
1855  const v4df yi = *(v4df *)(&xibuf[i/8][1][il]);
1856  const v4df zi = *(v4df *)(&xibuf[i/8][2][il]);
1857  const v4df rsi= *(v4df *)(&xibuf[i/8][3][il]);
1858  v4df nngb = _mm256_set1_pd(0.0);
1859  v4df jbuf = *((v4df*)epjbuf);
1860  v4df rsjbuf= _mm256_broadcast_sd(rsearchj);
1861  v4df xj = _mm256_permute4x64_pd(jbuf, 0x00);
1862  v4df yj = _mm256_permute4x64_pd(jbuf, 0x55);
1863  v4df zj = _mm256_permute4x64_pd(jbuf, 0xaa);
1864  v4df rsj= rsjbuf;
1865  for(int j=0; j<nj; j++){
1866  jbuf = *((v4df*)(epjbuf+j+1));
1867  rsjbuf = _mm256_broadcast_sd(rsearchj+j+1);
1868  v4df dx = _mm256_sub_pd(xi, xj);
1869  v4df dy = _mm256_sub_pd(yi, yj);
1870  v4df dz = _mm256_sub_pd(zi, zj);
1871 
1872  v4df r2_real = _mm256_mul_pd(dx, dx);
1873  r2_real = _mm256_fmadd_pd(dy, dy, r2_real);
1874  r2_real = _mm256_fmadd_pd(dz, dz, r2_real);
1875 
1876  xj = _mm256_permute4x64_pd(jbuf, 0x00);
1877  yj = _mm256_permute4x64_pd(jbuf, 0x55);
1878  zj = _mm256_permute4x64_pd(jbuf, 0xaa);
1879 
1880  v4df vrcrit = _mm256_max_pd(rsi, rsj);
1881  v4df vrcrit2 = _mm256_mul_pd(vrcrit, vrcrit);
1882  v4df mask = _mm256_cmp_pd(vrcrit2, r2_real, 0x01); // for neighbour search
1883  rsj= rsjbuf;
1884  nngb = _mm256_add_pd(_mm256_and_pd( vone, _mm256_xor_pd(mask, allbits) ), nngb); // can remove
1885  }
1886  *(v4df *)(&accpbuf[i/8][4][il]) = nngb;
1887  }
1888  }
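// Without AVX-512 k-masks the count falls back to bit tricks: allbits
// (vone compared equal to itself) is all 1-bits, xor(mask, allbits) negates
// the "outside search radius" mask, and and(vone, ~mask) leaves 1.0 exactly
// in the lanes with r2_real <= vrcrit2 before the add into nngb.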
1889 
1890  __attribute__ ((noinline))
1891  void kernel_epj_nounroll_for_p3t_with_linear_cutoff(const int ni, const int nj){
1892  const v4df vone = _mm256_set1_pd(1.0);
1893  const v4df veps2 = _mm256_set1_pd(eps2);
1894  //const v4df vr_out = _mm256_set1_pd(r_out);
1895  //const v4df vr_out2 = _mm256_mul_pd(vr_out, vr_out);
1896  const v4df vr_out2 = _mm256_set1_pd(r_crit2);
1897  const v4df allbits = _mm256_cmp_pd(vone, vone, 0x00);
1898  for(int i=0; i<ni; i+=4){
1899  const int il = i%8;
1900  const v4df xi = *(v4df *)(&xibuf[i/8][0][il]);
1901  const v4df yi = *(v4df *)(&xibuf[i/8][1][il]);
1902  const v4df zi = *(v4df *)(&xibuf[i/8][2][il]);
1903  const v4df rsi= *(v4df *)(&xibuf[i/8][3][il]);
1904  v4df ax, ay, az, pot, nngb;
1905  ax = ay = az = pot = nngb = _mm256_set1_pd(0.0);
1906  v4df jbuf = *((v4df*)epjbuf);
1907  v4df rsjbuf= _mm256_broadcast_sd(rsearchj);
1908  v4df xj = _mm256_permute4x64_pd(jbuf, 0x00);
1909  v4df yj = _mm256_permute4x64_pd(jbuf, 0x55);
1910  v4df zj = _mm256_permute4x64_pd(jbuf, 0xaa);
1911  v4df mj = _mm256_permute4x64_pd(jbuf, 0xff);
1912  v4df rsj= rsjbuf;
1913  for(int j=0; j<nj; j++){
1914  jbuf = *((v4df*)(epjbuf+j+1));
1915  rsjbuf = _mm256_broadcast_sd(rsearchj+j+1);
1916  v4df dx = _mm256_sub_pd(xi, xj);
1917  v4df dy = _mm256_sub_pd(yi, yj);
1918  v4df dz = _mm256_sub_pd(zi, zj);
1919 
1920  v4df r2_real = _mm256_fmadd_pd(dx, dx, veps2);
1921  r2_real = _mm256_fmadd_pd(dy, dy, r2_real);
1922  r2_real = _mm256_fmadd_pd(dz, dz, r2_real);
1923 
1924  v4df r2 = _mm256_max_pd( r2_real, vr_out2);
1925  v4df ri1 = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r2)));
1926  //v4df ri1 = __builtin_ia32_cvtps2pd256(__builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps256(r2)));
1927  v4df ri2 = _mm256_mul_pd(ri1, ri1);
1928 
1929 #ifdef RSQRT_NR_EPJ_X4
1930  // x4
1931  v4df v1 = _mm256_set1_pd(1.0);
1932  v4df h = _mm256_fnmadd_pd(r2, ri2, v1);
1933 
1934  v4df v6p0 = _mm256_set1_pd(6.0);
1935  v4df v5p0 = _mm256_set1_pd(5.0);
1936  v4df v8p0 = _mm256_set1_pd(8.0);
1937  v4df v0p0625 = _mm256_set1_pd(1.0/16.0);
1938 
1939  ri2 = _mm256_fmadd_pd(h, v5p0, v6p0);
1940  ri2 = _mm256_fmadd_pd(h, ri2, v8p0);
1941  ri2 = _mm256_mul_pd(h, ri2);
1942  ri2 = _mm256_fmadd_pd(ri2, v0p0625, v1);
1943  ri1 = _mm256_mul_pd(ri2, ri1);
1944 
1945  //v4df h = vone - r2*(ri1*ri1);
1946  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
1947  ri2 = _mm256_mul_pd(ri1, ri1);
1955 #elif defined(RSQRT_NR_EPJ_X2)
1956  //x2
1957  ri2 = _mm256_fnmadd_pd(r2, ri2, _mm256_set1_pd(3.0));
1958  ri2 = _mm256_mul_pd(ri2, _mm256_set1_pd(0.5));
1959  ri1 = _mm256_mul_pd(ri2, ri1);
1963  ri2 = _mm256_mul_pd(ri1, ri1);
1964 
1965  //v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
1966  //v4df v3p0 = {3.0, 3.0, 3.0, 3.0};
1967  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
1968 #endif
1969  v4df mri1 = _mm256_mul_pd(ri1, mj);
1970  v4df mri3 = _mm256_mul_pd(mri1, ri2);
1971 
1972  xj = _mm256_permute4x64_pd(jbuf, 0x00);
1973  yj = _mm256_permute4x64_pd(jbuf, 0x55);
1974  zj = _mm256_permute4x64_pd(jbuf, 0xaa);
1975  mj = _mm256_permute4x64_pd(jbuf, 0xff);
1976 
1977  pot = _mm256_sub_pd(pot, mri1);
1978  ax = _mm256_fnmadd_pd(mri3, dx, ax);
1979  ay = _mm256_fnmadd_pd(mri3, dy, ay);
1980  az = _mm256_fnmadd_pd(mri3, dz, az);
1981 
1982  v4df vrcrit = _mm256_max_pd(rsi, rsj);
1983  v4df vrcrit2 = _mm256_mul_pd(vrcrit, vrcrit);
1984  v4df mask = _mm256_cmp_pd(vrcrit2, r2_real, 0x01); // for neighbour search
1985  rsj= rsjbuf;
1986  nngb = _mm256_add_pd(_mm256_and_pd( vone, _mm256_xor_pd(mask, allbits) ), nngb); // can remove
1987  }
1988  *(v4df *)(&accpbuf[i/8][0][il]) = ax;
1989  *(v4df *)(&accpbuf[i/8][1][il]) = ay;
1990  *(v4df *)(&accpbuf[i/8][2][il]) = az;
1991  *(v4df *)(&accpbuf[i/8][3][il]) = pot;
1992  *(v4df *)(&accpbuf[i/8][4][il]) = nngb;
1993  }
1994  }
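// RSQRT_NR_EPJ_X2 versus RSQRT_NR_EPJ_X4 is an accuracy/speed knob: from
// the ~12-bit hardware estimate, one Newton-Raphson step (X2) gives about
// single-precision forces, while the X4 update is the choice when these
// double-width kernels should deliver close to double-precision
// accelerations.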
1995 #endif
1996 
1997 #ifdef __FMA__
1998 #undef _mm512_fmadd_pd
1999 #undef _mm512_fnmadd_pd
2000 #undef _mm256_fmadd_pd
2001 #undef _mm256_fnmadd_pd
2002 #endif
2003 
2004 #ifdef USE__AVX512
2005  __attribute__ ((noinline))
2006  void kernel_spj_nounroll(const int ni, const int nj){
2007  const v8df veps2 = _mm512_set1_pd(eps2);
2008  for(int i=0; i<ni; i+=8){
2009  const v8df xi = *(v8df *)(&xibuf[i/8][0]);
2010  const v8df yi = *(v8df *)(&xibuf[i/8][1]);
2011  const v8df zi = *(v8df *)(&xibuf[i/8][2]);
2012 
2013  v8df ax, ay, az, pot;
2014  ax = ay = az = pot = _mm512_set1_pd(0.0);
2015 
2016 #ifdef AVX_PRELOAD
2017  v4df jbuf0 = *((v4df*)spjbuf[0][0]);
2018  v4df jbuf1 = *((v4df*)spjbuf[0][1]);
2019  v4df jbuf2 = *((v4df*)spjbuf[0][2]);
2020 #endif
2021  for(int j=0; j<nj; j++){
2022 #ifdef AVX_PRELOAD
2023  v8df jp = _mm512_broadcast_f64x4(jbuf0);
2024  v8df xj = _mm512_permutex_pd(jp, 0x00);
2025  v8df yj = _mm512_permutex_pd(jp, 0x55);
2026  v8df zj = _mm512_permutex_pd(jp, 0xaa);
2027  jbuf0 = *((v4df*)spjbuf[j+1][0]);
2028 
2029  v8df jq = _mm512_broadcast_f64x4(jbuf1);
2030  v8df qxx = _mm512_permutex_pd(jq, 0x00);
2031  v8df qyy = _mm512_permutex_pd(jq, 0x55);
2032  v8df qzz = _mm512_permutex_pd(jq, 0xaa);
2033  v8df mj = _mm512_permutex_pd(jq, 0xff);
2034  jbuf1 = *((v4df*)spjbuf[j+1][1]);
2035 
2036 
2037  v8df jq2 = _mm512_broadcast_f64x4(jbuf2);
2038  v8df qxy = _mm512_permutex_pd(jq2, 0x00);
2039  v8df qyz = _mm512_permutex_pd(jq2, 0x55);
2040  v8df qzx = _mm512_permutex_pd(jq2, 0xaa);
2041  v8df mtr = _mm512_permutex_pd(jq2, 0xff);
2042  jbuf2 = *((v4df*)spjbuf[j+1][2]);
2043 
2044  v8df dx = _mm512_sub_pd(xi, xj);
2045  v8df dy = _mm512_sub_pd(yi, yj);
2046  v8df dz = _mm512_sub_pd(zi, zj);
2047 #else
2048  v8df dx = _mm512_sub_pd(xi, _mm512_set1_pd(spjbuf[j][0][0]));
2049  v8df dy = _mm512_sub_pd(yi, _mm512_set1_pd(spjbuf[j][0][1]));
2050  v8df dz = _mm512_sub_pd(zi, _mm512_set1_pd(spjbuf[j][0][2]));
2051 #endif
2052  v8df r2 = _mm512_fmadd_pd(dx, dx, veps2);
2053  r2 = _mm512_fmadd_pd(dy, dy, r2);
2054  r2 = _mm512_fmadd_pd(dz, dz, r2);
2055  //v8df ri1 = __builtin_ia32_rsqrtps256(r2);
2056  v8df ri1 = _mm512_rsqrt14_pd(r2);
2057  //v8df ri1 = __builtin_ia32_cvtps2pd512( __builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps512(r2)));
2058  v8df ri2 = _mm512_mul_pd(ri1, ri1);
2059 
2060 #ifdef RSQRT_NR_EPJ_X4
2061  // x4
2062  v8df v1 = _mm512_set1_pd(1.0);
2063  v8df h = _mm512_fnmadd_pd(r2, ri2, v1);
2064 
2065  v8df v6p0 = _mm512_set1_pd(6.0);
2066  v8df v5p0 = _mm512_set1_pd(5.0);
2067  v8df v8p0 = _mm512_set1_pd(8.0);
2068  v8df v0p0625 = _mm512_set1_pd(1.0/16.0);
2069 
2070  ri2 = _mm512_fmadd_pd(h, v5p0, v6p0);
2071  ri2 = _mm512_fmadd_pd(h, ri2, v8p0);
2072  ri2 = _mm512_mul_pd(h, ri2);
2073  ri2 = _mm512_fmadd_pd(ri2, v0p0625, v1);
2074  ri1 = _mm512_mul_pd(ri2, ri1);
2075 
2076  //v8df h = vone - r2*(ri1*ri1);
2077  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
2078  ri2 = _mm512_mul_pd(ri1, ri1);
2088 #elif defined(RSQRT_NR_EPJ_X2)
2089  //x2
2090  ri2 = _mm512_fnmadd_pd(r2, ri2, _mm512_set1_pd(3.0));
2091  ri2 = _mm512_mul_pd(ri2, _mm512_set1_pd(0.5));
2092  ri1 = _mm512_mul_pd(ri2, ri1);
2093  //v8df v0p5 = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
2094  //v8df v3p0 = {3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0};
2095  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
2096  ri2 = _mm512_mul_pd(ri1, ri1);
2097 #endif
2098 
2099  v8df ri3 = _mm512_mul_pd(ri1, ri2);
2100  v8df ri4 = _mm512_mul_pd(ri2, ri2);
2101  v8df ri5 = _mm512_mul_pd(ri2, ri3);
2102 
2103 #ifdef AVX_PRELOAD
2104  //(qxx*dx + qxy*dy) + qzx*dz;
2105  v8df qr_x = _mm512_mul_pd(dx, qxx);
2106  qr_x = _mm512_fmadd_pd(dy, qxy, qr_x);
2107  qr_x = _mm512_fmadd_pd(dz, qzx, qr_x);
2108  //(qyy*dy + qxy*dx) + qyz*dz;
2109  v8df qr_y = _mm512_mul_pd(dy, qyy);
2110  qr_y = _mm512_fmadd_pd(dx, qxy, qr_y);
2111  qr_y = _mm512_fmadd_pd(dz, qyz, qr_y);
2112  //(qzz*dz + qzx*dx) + qyz*dy;
2113  v8df qr_z = _mm512_mul_pd(dz, qzz);
2114  qr_z = _mm512_fmadd_pd(dx, qzx, qr_z);
2115  qr_z = _mm512_fmadd_pd(dy, qyz, qr_z);
2116 
2117  //v8df rqr = ((mtr + qr_x*dx) + qr_y*dy) + qr_z*dz;
2118  v8df rqr = _mm512_fmadd_pd(qr_x, dx, mtr);
2119 #else
2120  //(qxx*dx + qxy*dy) + qzx*dz;
2121  v8df qr_x = _mm512_mul_pd(dx, _mm512_set1_pd(spjbuf[j][1][0]));
2122  qr_x = _mm512_fmadd_pd(dy, _mm512_set1_pd(spjbuf[j][2][0]), qr_x);
2123  qr_x = _mm512_fmadd_pd(dz, _mm512_set1_pd(spjbuf[j][2][2]), qr_x);
2124  //(qyy*dy + qxy*dx) + qyz*dz;
2125  v8df qr_y = _mm512_mul_pd(dy, _mm512_set1_pd(spjbuf[j][1][1]));
2126  qr_y = _mm512_fmadd_pd(dx, _mm512_set1_pd(spjbuf[j][2][0]), qr_y);
2127  qr_y = _mm512_fmadd_pd(dz, _mm512_set1_pd(spjbuf[j][2][1]), qr_y);
2128  //(qzz*dz + qzx*dx) + qyz*dy;
2129  v8df qr_z = _mm512_mul_pd(dz, _mm512_set1_pd(spjbuf[j][1][2]));
2130  qr_z = _mm512_fmadd_pd(dx, _mm512_set1_pd(spjbuf[j][2][2]), qr_z);
2131  qr_z = _mm512_fmadd_pd(dy, _mm512_set1_pd(spjbuf[j][2][1]), qr_z);
2132 
2133  //v8df rqr = ((mtr + qr_x*dx) + qr_y*dy) + qr_z*dz;
2134  v8df rqr = _mm512_fmadd_pd(qr_x, dx, _mm512_set1_pd(spjbuf[j][2][3]));
2135 #endif
2136 
2137  rqr = _mm512_fmadd_pd(qr_y, dy, rqr);
2138  rqr = _mm512_fmadd_pd(qr_z, dz, rqr);
2139 
2140  v8df rqr_ri4 = _mm512_mul_pd(rqr, ri4);
2141 
2142 #ifndef AVX_PRELOAD
2143  v8df mj = _mm512_set1_pd(spjbuf[j][1][3]);
2144 #endif
2145  v8df meff = _mm512_fmadd_pd(rqr_ri4, _mm512_set1_pd(0.5), mj);
2146  //v8df meff3 = (mj + v2p5 * rqr_ri4) * ri3;
2147  v8df meff3 = _mm512_fmadd_pd(rqr_ri4, _mm512_set1_pd(2.5), mj);
2148  meff3 = _mm512_mul_pd(meff3, ri3);
2150 
2151  pot = _mm512_fnmadd_pd(meff, ri1, pot);
2152 
2153  ax = _mm512_fmadd_pd(ri5, qr_x, ax);
2154  ax = _mm512_fnmadd_pd(meff3, dx, ax);
2155  ay = _mm512_fmadd_pd(ri5, qr_y, ay);
2156  ay = _mm512_fnmadd_pd(meff3, dy, ay);
2157  az = _mm512_fmadd_pd(ri5, qr_z, az);
2158  az = _mm512_fnmadd_pd(meff3, dz, az);
2159  //ax = (ax - ri5*qr_x) + meff3*dx;
2160  //ay = (ay - ri5*qr_y) + meff3*dy;
2161  //az = (az - ri5*qr_z) + meff3*dz;
2162  }
2163 
2164 //#ifdef RSQRT_NR_SPJ_X2
2165 // //x2
2166 // v8df v0p5 = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
2167 // v8df v0p125 = {0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125};
2168 // pot *= v0p5;
2169 // ax *= v0p125;
2170 // ay *= v0p125;
2171 // az *= v0p125;
2172 //#endif
2173 
2174  *(v8df *)(&accpbuf[i/8][0]) = ax;
2175  *(v8df *)(&accpbuf[i/8][1]) = ay;
2176  *(v8df *)(&accpbuf[i/8][2]) = az;
2177  *(v8df *)(&accpbuf[i/8][3]) = pot;
2178  }
2179  }
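// Scalar form of the superparticle (monopole + quadrupole) interaction
// evaluated above, with the packed moments produced by set_spj_one
// (diagonal slots hold 3*q_aa - tr(q), off-diagonal slots 3*q_ab, and
// mtr = -eps2*tr(q)):
//   qr    = Q . dr
//   rqr   = mtr + dr . qr
//   meff  = m + 0.5 * rqr / r^4
//   meff3 = (m + 2.5 * rqr / r^4) / r^3
//   pot  -= meff / r
//   acc  += qr / r^5 - meff3 * dr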
2180 #else
2181  __attribute__ ((noinline))
2182  void kernel_spj_nounroll(const int ni, const int nj){
2183  const v4df veps2 = _mm256_set1_pd(eps2);
2184  for(int i=0; i<ni; i+=4){
2185  const int il = i%8;
2186  const v4df xi = *(v4df *)(&xibuf[i/8][0][il]);
2187  const v4df yi = *(v4df *)(&xibuf[i/8][1][il]);
2188  const v4df zi = *(v4df *)(&xibuf[i/8][2][il]);
2189 
2190  v4df ax, ay, az, pot;
2191  ax = ay = az = pot = _mm256_set1_pd(0.0);
2192 
2193  v4df jbuf0 = *((v4df*)spjbuf[0][0]);
2194  v4df jbuf1 = *((v4df*)spjbuf[0][1]);
2195  v4df jbuf2 = *((v4df*)spjbuf[0][2]);
2196 
2197  for(int j=0; j<nj; j++){
2198  v4df xj = _mm256_permute4x64_pd(jbuf0, 0x00);
2199  v4df yj = _mm256_permute4x64_pd(jbuf0, 0x55);
2200  v4df zj = _mm256_permute4x64_pd(jbuf0, 0xaa);
2201  jbuf0 = *((v4df*)spjbuf[j+1][0]);
2202 
2203  v4df qxx = _mm256_permute4x64_pd(jbuf1, 0x00);
2204  v4df qyy = _mm256_permute4x64_pd(jbuf1, 0x55);
2205  v4df qzz = _mm256_permute4x64_pd(jbuf1, 0xaa);
2206  v4df mj = _mm256_permute4x64_pd(jbuf1, 0xff);
2207  jbuf1 = *((v4df*)spjbuf[j+1][1]);
2208 
2209  v4df qxy = _mm256_permute4x64_pd(jbuf2, 0x00);
2210  v4df qyz = _mm256_permute4x64_pd(jbuf2, 0x55);
2211  v4df qzx = _mm256_permute4x64_pd(jbuf2, 0xaa);
2212  v4df mtr = _mm256_permute4x64_pd(jbuf2, 0xff);
2213  jbuf2 = *((v4df*)spjbuf[j+1][2]);
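// Software pipelining: the three rows of superparticle j+1 are loaded here
// so their latency overlaps the arithmetic on particle j below. On the last
// iteration this touches spjbuf[nj], i.e. one record past the live data,
// which stays inside the NJMAX-sized buffer only while nj < NJMAX.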
2214 
2215  v4df dx = _mm256_sub_pd(xi, xj);
2216  v4df dy = _mm256_sub_pd(yi, yj);
2217  v4df dz = _mm256_sub_pd(zi, zj);
2218 
2219  v4df r2 = _mm256_fmadd_pd(dx, dx, veps2);
2220  r2 = _mm256_fmadd_pd(dy, dy, r2);
2221  r2 = _mm256_fmadd_pd(dz, dz, r2);
2222 
2223  v4df ri1 = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r2)));
2224  //v4df ri1 = __builtin_ia32_cvtps2pd256( __builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps256(r2)));
2225 
2226  v4df ri2 = _mm256_mul_pd(ri1, ri1);
2227 
2228 #ifdef RSQRT_NR_EPJ_X4
2229  // x4
2230  v4df v1 = _mm256_set1_pd(1.0);
2231  v4df h = _mm256_fnmadd_pd(r2, ri2, v1);
2232 
2233  v4df v6p0 = _mm256_set1_pd(6.0);
2234  v4df v5p0 = _mm256_set1_pd(5.0);
2235  v4df v8p0 = _mm256_set1_pd(8.0);
2236  v4df v0p0625 = _mm256_set1_pd(1.0/16.0);
2237 
2238  ri2 = _mm256_fmadd_pd(h, v5p0, v6p0);
2239  ri2 = _mm256_fmadd_pd(h, ri2, v8p0);
2240  ri2 = _mm256_mul_pd(h, ri2);
2241  ri2 = _mm256_fmadd_pd(ri2, v0p0625, v1);
2242  ri1 = _mm256_mul_pd(ri2, ri1);
2243 
2244  //v4df h = vone - r2*(ri1*ri1);
2245  //ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
2246  ri2 = _mm256_mul_pd(ri1, ri1);
2254 #elif defined(RSQRT_NR_EPJ_X2)
2255  //x2
2256  ri2 = _mm256_fnmadd_pd(r2, ri2, _mm256_set1_pd(3.0));
2257  ri2 = _mm256_mul_pd(ri2, _mm256_set1_pd(0.5));
2258  ri1 = _mm256_mul_pd(ri2, ri1);
2262  ri2 = _mm256_mul_pd(ri1, ri1);
2263 
2264  //v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
2265  //v4df v3p0 = {3.0, 3.0, 3.0, 3.0};
2266  //ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
2267 #endif
2268 
2269  v4df ri3 = _mm256_mul_pd(ri1, ri2);
2270  v4df ri4 = _mm256_mul_pd(ri2, ri2);
2271  v4df ri5 = _mm256_mul_pd(ri2, ri3);
2272 
2273  //(qxx*dx + qxy*dy) + qzx*dz;
2274  v4df qr_x = _mm256_mul_pd(dx, qxx);
2275  qr_x = _mm256_fmadd_pd(dy, qxy, qr_x);
2276  qr_x = _mm256_fmadd_pd(dz, qzx, qr_x);
2277  //(qyy*dy + qxy*dx) + qyz*dz;
2278  v4df qr_y = _mm256_mul_pd(dy, qyy);
2279  qr_y = _mm256_fmadd_pd(dx, qxy, qr_y);
2280  qr_y = _mm256_fmadd_pd(dz, qyz, qr_y);
2281  //(qzz*dz + qzx*dx) + qyz*dy;
2282  v4df qr_z = _mm256_mul_pd(dz, qzz);
2283  qr_z = _mm256_fmadd_pd(dx, qzx, qr_z);
2284  qr_z = _mm256_fmadd_pd(dy, qyz, qr_z);
2285 
2286  //v4df rqr = ((mtr + qr_x*dx) + qr_y*dy) + qr_z*dz;
2287  v4df rqr = _mm256_fmadd_pd(qr_x, dx, mtr);
2288  rqr = _mm256_fmadd_pd(qr_y, dy, rqr);
2289  rqr = _mm256_fmadd_pd(qr_z, dz, rqr);
2290 
2291  v4df rqr_ri4 = _mm256_mul_pd(rqr, ri4);
2292 
2293  v4df meff = _mm256_fmadd_pd(rqr_ri4, _mm256_set1_pd(0.5), mj);
2294  v4df meff3 = _mm256_fmadd_pd(rqr_ri4, _mm256_set1_pd(2.5), mj);
2295  meff3 = _mm256_mul_pd(meff3, ri3);
2296 
2297  pot = _mm256_fnmadd_pd(meff, ri1, pot);
2298 
2299  ax = _mm256_fmadd_pd(ri5, qr_x, ax);
2300  ax = _mm256_fnmadd_pd(meff3, dx, ax);
2301  ay = _mm256_fmadd_pd(ri5, qr_y, ay);
2302  ay = _mm256_fnmadd_pd(meff3, dy, ay);
2303  az = _mm256_fmadd_pd(ri5, qr_z, az);
2304  az = _mm256_fnmadd_pd(meff3, dz, az);
2305  }
2306 
2307 //#ifdef RSQRT_NR_SPJ_X2
2308 // //x2
2309 // v4df v0p5 = {0.5, 0.5, 0.5, 0.5}; v4df v0p125 = {0.125, 0.125, 0.125, 0.125};
2310 // pot *= v0p5;
2311 // ax *= v0p125;
2312 // ay *= v0p125;
2313 // az *= v0p125;
2314 //#endif
2315 
2316  *(v4df *)(&accpbuf[i/8][0][il]) = ax;
2317  *(v4df *)(&accpbuf[i/8][1][il]) = ay;
2318  *(v4df *)(&accpbuf[i/8][2][il]) = az;
2319  *(v4df *)(&accpbuf[i/8][3][il]) = pot;
2320  }
2321  }
2322 
2323 #endif
2324 
2325 } __attribute__ ((aligned(128)));
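// A sketch of the intended call sequence (the run_*/get_* wrappers are
// assumed to mirror the legacy class kept commented out below, and
// set_xi_one taking r_search is inferred from the 4-row xibuf layout):
//   static PhantomGrapeQuad pg;   // static/heap: the buffers are large
//   pg.set_eps2(eps2);
//   pg.set_r_crit2(r_crit2);
//   for (int j = 0; j < nj; j++)
//       pg.set_epj_one(j, xj[j], yj[j], zj[j], mj[j], rs[j]);
//   for (int i = 0; i < ni; i++)
//       pg.set_xi_one(i, x[i], y[i], z[i], rsi[i]);
//   pg.run_epj_for_p3t_with_linear_cutoff(ni, nj);
//   double ax, ay, az, pot, nngb;
//   for (int i = 0; i < ni; i++)
//       pg.get_accp_one(i, ax, ay, az, pot, nngb);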
2326 
2327 #endif
2328 
2329 
2330 
2331 
2332 
2333 
2334 
2335 
2336 
2337 
2338 
2339 
2340 
2341 
2342 //#if 0
2343 //class PhantomGrapeQuad{
2344 //public:
2345 // enum{
2346 // NIMAX = 32768,
2347 // NJMAX = 131072,
2348 // };
2349 //
2350 //private:
2351 //#if 1
2352 // float xibuf [NIMAX/8] [3][8]; // x, y, z
2353 // double xibufd [NIMAX/8] [3][8]; // x, y, z
2354 // float accpbuf[NIMAX/8] [5][8]; // ax, ay, az, pot, nngb
2355 // double accpbufd[NIMAX/8] [5][8]; // ax, ay, az, pot, nngb
2356 // float epjbuf [NJMAX] [4]; // x, y, z, m
2357 // double epjbufd [NJMAX] [4]; // x, y, z, m
2358 // float spjbuf [NJMAX] [3][4]; // x, y, z, m, | xx, yy, zz, pad, | xy, yz, zx, tr
2359 // double spjbufd [NJMAX] [3][4]; // x, y, z, m, | xx, yy, zz, pad, | xy, yz, zx, tr
2360 //#else
2361 // float *** xibuf;
2362 // float *** accpbuf;
2363 // float ** epjbuf;
2364 // float *** spjbuf;
2365 //#endif
2366 // double eps2;
2367 // static double get_a_NaN(){
2368 // union{ long l; double d; } m;
2369 // m.l = -1;
2370 // return m.d;
2371 // }
2372 // double r_crit2;
2373 // double r_out;
2374 // double r_in;
2375 // double denominator; // for cut off
2376 //public:
2377 //
2378 // PhantomGrapeQuad() : eps2(get_a_NaN()) {} // default NaN
2379 //
2380 // void set_cutoff(const double _r_out, const double _r_in){
2381 // r_out = _r_out;
2382 // r_in = _r_in;
2383 // denominator = 1.0 / (r_out - r_in);
2384 // }
2385 //
2386 // void set_eps2(const double _eps2){
2387 // this->eps2 = _eps2;
2388 // }
2389 // void set_r_crit2(const double _r_crit2){
2390 // this->r_crit2 = _r_crit2;
2391 // //this->r_crit2 = _r_crit2 * 1.01;
2392 // }
2393 // void set_epj_one(const int addr, const double x, const double y, const double z, const double m) {
2394 // epjbuf[addr][0] = x;
2395 // epjbuf[addr][1] = y;
2396 // epjbuf[addr][2] = z;
2397 // epjbuf[addr][3] = m;
2398 // }
2399 //
2400 // void set_epj_one_d(const int addr, const double x, const double y, const double z, const double m){
2401 // epjbufd[addr][0] = x;
2402 // epjbufd[addr][1] = y;
2403 // epjbufd[addr][2] = z;
2404 // epjbufd[addr][3] = m;
2405 // }
2406 //
2407 // // please specialize the following
2408 // template <typename EPJ_t>
2409 // void set_epj(const int nj, const EPJ_t epj[]);
2410 //
2411 // void set_spj_one(
2412 // const int addr,
2413 // const double x, const double y, const double z, const double m,
2414 // const double qxx, const double qyy, const double qzz,
2415 // const double qxy, const double qyz, const double qzx)
2416 // {
2417 // const double tr = qxx + qyy + qzz;
2418 // spjbuf[addr][0][0] = x;
2419 // spjbuf[addr][0][1] = y;
2420 // spjbuf[addr][0][2] = z;
2421 // spjbuf[addr][0][3] = m;
2422 //
2423 // spjbuf[addr][1][0] = 3.0 * qxx - tr;
2424 // spjbuf[addr][1][1] = 3.0 * qyy - tr;
2425 // spjbuf[addr][1][2] = 3.0 * qzz - tr;
2426 // spjbuf[addr][1][3] = m;
2427 //
2428 // spjbuf[addr][2][0] = 3.0 * qxy;
2429 // spjbuf[addr][2][1] = 3.0 * qyz;
2430 // spjbuf[addr][2][2] = 3.0 * qzx;
2431 // spjbuf[addr][2][3] = -(eps2 * tr);
2432 // }
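// The packing above pre-applies the traceless projection used by the SP
// kernels: with tr = qxx + qyy + qzz, the diagonal slots store 3*q_aa - tr,
// the off-diagonal slots 3*q_ab, and the final slot -(eps2 * tr) folds the
// softening correction of the quadrupole term directly into the rqr sum.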
2433 //
2434 // void set_spj_one_d(
2435 // const int addr,
2436 // const double x, const double y, const double z, const double m,
2437 // const double qxx, const double qyy, const double qzz,
2438 // const double qxy, const double qyz, const double qzx)
2439 // {
2440 // const double tr = qxx + qyy + qzz;
2441 // spjbufd[addr][0][0] = x;
2442 // spjbufd[addr][0][1] = y;
2443 // spjbufd[addr][0][2] = z;
2444 // spjbufd[addr][0][3] = m;
2445 //
2446 // spjbufd[addr][1][0] = 3.0 * qxx - tr;
2447 // spjbufd[addr][1][1] = 3.0 * qyy - tr;
2448 // spjbufd[addr][1][2] = 3.0 * qzz - tr;
2449 // spjbufd[addr][1][3] = m;
2450 //
2451 // spjbufd[addr][2][0] = 3.0 * qxy;
2452 // spjbufd[addr][2][1] = 3.0 * qyz;
2453 // spjbufd[addr][2][2] = 3.0 * qzx;
2454 // spjbufd[addr][2][3] = -(eps2 * tr);
2455 // }
2456 //
2457 // // please specialize the following
2458 // template <typename SPJ_t>
2459 // void set_spj(const int nj, const SPJ_t spj[]);
2460 //
2461 // void set_xi_one(const int addr, const double x, const double y, const double z){
2462 // const int ah = addr / 8;
2463 // const int al = addr % 8;
2464 // xibuf[ah][0][al] = x;
2465 // xibuf[ah][1][al] = y;
2466 // xibuf[ah][2][al] = z;
2467 // }
2468 // void set_xi_one_d(const int addr, const double x, const double y, const double z){
2469 // const int ah = addr / 8;
2470 // const int al = addr % 8;
2471 // xibufd[ah][0][al] = x;
2472 // xibufd[ah][1][al] = y;
2473 // xibufd[ah][2][al] = z;
2474 // }
2475 //
2476 // // please specialize the following
2477 // template <typename EPI_t>
2478 // void set_xi(const int ni, const EPI_t spj[]);
2479 //
2480 // template <typename real_t>
2481 // void get_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot){
2482 // const int ah = addr / 8;
2483 // const int al = addr % 8;
2484 // ax = accpbuf[ah][0][al];
2485 // ay = accpbuf[ah][1][al];
2486 // az = accpbuf[ah][2][al];
2487 // pot = accpbuf[ah][3][al];
2488 // }
2489 // template <typename real_t>
2490 // void get_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot, real_t &nngb){
2491 // const int ah = addr / 8;
2492 // const int al = addr % 8;
2493 // ax = accpbuf[ah][0][al];
2494 // ay = accpbuf[ah][1][al];
2495 // az = accpbuf[ah][2][al];
2496 // pot = accpbuf[ah][3][al];
2497 // nngb = accpbuf[ah][4][al];
2498 // }
2499 //
2500 // template <typename real_t>
2501 // void get_accp_one_d(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot, real_t &nngb){
2502 // const int ah = addr / 8;
2503 // const int al = addr % 8;
2504 // ax = accpbufd[ah][0][al];
2505 // ay = accpbufd[ah][1][al];
2506 // az = accpbufd[ah][2][al];
2507 // pot = accpbufd[ah][3][al];
2508 // nngb = accpbufd[ah][4][al];
2509 // }
2510 //
2511 // template <typename real_t>
2512 // void accum_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot){
2513 // const int ah = addr / 8;
2514 // const int al = addr % 8;
2515 // ax += accpbuf[ah][0][al];
2516 // ay += accpbuf[ah][1][al];
2517 // az += accpbuf[ah][2][al];
2518 // pot += accpbuf[ah][3][al];
2519 // }
2520 //
2521 // template <typename real_t>
2522 // void accum_accp_one_d(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot){
2523 // const int ah = addr / 8;
2524 // const int al = addr % 8;
2525 // ax += accpbufd[ah][0][al];
2526 // ay += accpbufd[ah][1][al];
2527 // az += accpbufd[ah][2][al];
2528 // pot += accpbufd[ah][3][al];
2529 // }
2530 //
2531 // template <typename real_t>
2532 // void accum_accp_one(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot, real_t &nngb){
2533 // const int ah = addr / 8;
2534 // const int al = addr % 8;
2535 // ax += accpbuf[ah][0][al];
2536 // ay += accpbuf[ah][1][al];
2537 // az += accpbuf[ah][2][al];
2538 // pot += accpbuf[ah][3][al];
2539 // nngb += accpbuf[ah][4][al];
2540 // }
2541 //
2542 // template <typename real_t>
2543 // void accum_accp_one_d(const int addr, real_t &ax, real_t &ay, real_t &az, real_t &pot, real_t &nngb){
2544 // const int ah = addr / 8;
2545 // const int al = addr % 8;
2546 // ax += accpbufd[ah][0][al];
2547 // ay += accpbufd[ah][1][al];
2548 // az += accpbufd[ah][2][al];
2549 // pot += accpbufd[ah][3][al];
2550 // nngb += accpbufd[ah][4][al];
2551 // }
2552 //
2553 // void run_epj(const int ni, const int nj){
2554 // if(ni > NIMAX || nj > NJMAX){
2555 // std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
2556 // for(PS::S32 i=0; i<(ni-1)/8+1; i++){
2557 // for(PS::S32 j=0; j<8; j++){
2558 // for(PS::S32 k=0; k<3; k++){
2559 // std::cout<<"i,j,k="<<i<<" "<<j<<" "<<k<<std::endl;
2560 // std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
2561 // }
2562 // std::cout<<std::endl;
2563 // }
2564 // }
2565 // }
2566 // assert(ni <= NIMAX);
2567 // assert(nj <= NJMAX);
2568 //
2569 // kernel_epj_nounroll(ni, nj);
2570 // }
2571 //
2572 // void run_epj_d(const int ni, const int nj){
2573 // if(ni > NIMAX || nj > NJMAX){
2574 // std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
2575 // for(PS::S32 i=0; i<(ni-1)/8+1; i++){
2576 // for(PS::S32 j=0; j<8; j++){
2577 // for(PS::S32 k=0; k<3; k++){
2578 // std::cout<<"i,j,k="<<i<<" "<<j<<" "<<k<<std::endl;
2579 // std::cout<<"xibuf[i][k][j]="<<xibuf[i][k][j]<<std::endl;
2580 // }
2581 // std::cout<<std::endl;
2582 // }
2583 // }
2584 // }
2585 // assert(ni <= NIMAX);
2586 // assert(nj <= NJMAX);
2587 // kernel_epj_nounroll_64bit(ni, nj);
2588 // }
2589 //
2590 // //////////
2591 // // include linear cutoff
2592 // void run_epj_for_p3t_with_linear_cutoff(const int ni, const int nj){
2593 // if(ni > NIMAX || nj > NJMAX){
2594 // std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
2595 // for(PS::S32 i=0; i<(ni-1)/8+1; i++){
2596 // for(PS::S32 j=0; j<8; j++){
2597 // for(PS::S32 k=0; k<3; k++){
2598 // std::cout<<"xibufd[i][k][j]="<<xibufd[i][k][j]<<std::endl;
2599 // }
2600 // std::cout<<std::endl;
2601 // }
2602 // }
2603 // }
2604 // assert(ni <= NIMAX);
2605 // assert(nj <= NJMAX);
2606 // kernel_epj_nounroll_for_p3t_with_linear_cutoff(ni, nj);
2607 // }
2608 //
2609 // void run_epj_for_p3t_with_linear_cutoff_d(const int ni, const int nj){
2610 // if(ni > NIMAX || nj > NJMAX){
2611 // std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
2612 // for(PS::S32 i=0; i<(ni-1)/8+1; i++){
2613 // for(PS::S32 j=0; j<8; j++){
2614 // for(PS::S32 k=0; k<3; k++){
2615 // std::cout<<"xibufd[i][k][j]="<<xibufd[i][k][j]<<std::endl;
2616 // }
2617 // std::cout<<std::endl;
2618 // }
2619 // }
2620 // }
2621 // assert(ni <= NIMAX);
2622 // assert(nj <= NJMAX);
2623 // kernel_epj_64bit_nounroll_for_p3t_with_linear_cutoff(ni, nj);
2624 // }
2625 // // include linear cutoff
2626 // //////////
2627 //
2628 // void run_epj_for_p3t(const int ni, const int nj){
2629 // if(ni > NIMAX || nj > NJMAX){
2630 // std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
2631 // for(PS::S32 i=0; i<(ni-1)/8+1; i++){
2632 // for(PS::S32 j=0; j<8; j++){
2633 // for(PS::S32 k=0; k<3; k++){
2634 // std::cout<<"xibufd[i][k][j]="<<xibufd[i][k][j]<<std::endl;
2635 // }
2636 // std::cout<<std::endl;
2637 // }
2638 // }
2639 // }
2640 // assert(ni <= NIMAX);
2641 // assert(nj <= NJMAX);
2642 // kernel_epj_nounroll_for_p3t(ni, nj);
2643 // }
2644 //
2645 // void run_epj_for_p3t_d(const int ni, const int nj){
2646 // if(ni > NIMAX || nj > NJMAX){
2647 // std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
2648 // for(PS::S32 i=0; i<(ni-1)/8+1; i++){
2649 // for(PS::S32 j=0; j<8; j++){
2650 // for(PS::S32 k=0; k<3; k++){
2651 // std::cout<<"xibufd[i][k][j]="<<xibufd[i][k][j]<<std::endl;
2652 // }
2653 // std::cout<<std::endl;
2654 // }
2655 // }
2656 // }
2657 // assert(ni <= NIMAX);
2658 // assert(nj <= NJMAX);
2659 // kernel_epj_64bit_nounroll_for_p3t(ni, nj);
2660 // }
2661 //
2662 // void run_spj(const int ni, const int nj){
2663 // if(ni > NIMAX || nj > NJMAX){
2664 // std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
2665 // }
2666 // assert(ni <= NIMAX);
2667 // assert(nj <= NJMAX);
2668 //
2669 // kernel_spj_nounroll(ni, nj);
2670 // // kernel_spj_unroll2(ni, nj);
2671 // }
2672 //
2673 // void run_spj_d(const int ni, const int nj){
2674 // if(ni > NIMAX || nj > NJMAX){
2675 // std::cout<<"ni= "<<ni<<" NIMAX= "<<NIMAX<<" nj= "<<nj<<" NJMAX= "<<NJMAX<<std::endl;
2676 // }
2677 // assert(ni <= NIMAX);
2678 // assert(nj <= NJMAX);
2679 //
2680 // kernel_spj_64bit_nounroll(ni, nj);
2681 // // kernel_spj_unroll2(ni, nj);
2682 // }
2683 //
2684 //private:
2685 // typedef float v4sf __attribute__((vector_size(16)));
2686 // typedef float v8sf __attribute__((vector_size(32)));
2687 // typedef double v4df __attribute__((vector_size(32)));
2688 //
2689 // __attribute__ ((noinline))
2690 // void kernel_epj_nounroll(const int ni, const int nj){
2691 //
2692 // const v8sf veps2 = {(float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2};
2693 // for(int i=0; i<ni; i+=8){
2694 // const v8sf xi = *(v8sf *)(xibuf[i/8][0]);
2695 // const v8sf yi = *(v8sf *)(xibuf[i/8][1]);
2696 // const v8sf zi = *(v8sf *)(xibuf[i/8][2]);
2697 //
2698 // v8sf ax, ay, az, pot;
2699 // ax = ay = az = pot = (v8sf){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
2700 // v8sf jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)epjbuf);
2701 // v8sf xj = __builtin_ia32_shufps256(jbuf, jbuf, 0x00);
2702 // v8sf yj = __builtin_ia32_shufps256(jbuf, jbuf, 0x55);
2703 // v8sf zj = __builtin_ia32_shufps256(jbuf, jbuf, 0xaa);
2704 // v8sf mj = __builtin_ia32_shufps256(jbuf, jbuf, 0xff);
2705 // for(int j=0; j<nj; j++){
2706 // jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)(epjbuf + j+1));
2707 //
2708 // v8sf dx = xj - xi;
2709 // v8sf dy = yj - yi;
2710 // v8sf dz = zj - zi;
2711 //
2712 // v8sf r2 = ((veps2 + dx*dx) + dy*dy) + dz*dz;
2713 // v8sf ri1 = __builtin_ia32_rsqrtps256(r2);
2714 //#ifdef RSQRT_NR_EPJ_X2
2715 // v8sf v3p0 = {3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f};
2716 // ri1 *= (v3p0 - r2*(ri1*ri1));
2717 //#endif
2718 // v8sf mri1 = mj * ri1;
2719 // v8sf ri2 = ri1 * ri1;
2720 // v8sf mri3 = mri1 * ri2;
2721 //
2722 // xj = __builtin_ia32_shufps256(jbuf, jbuf, 0x00);
2723 // yj = __builtin_ia32_shufps256(jbuf, jbuf, 0x55);
2724 // zj = __builtin_ia32_shufps256(jbuf, jbuf, 0xaa);
2725 // mj = __builtin_ia32_shufps256(jbuf, jbuf, 0xff);
2726 //
2727 // pot -= mri1;
2728 // ax += mri3 * dx;
2729 // ay += mri3 * dy;
2730 // az += mri3 * dz;
2731 // }
2732 //#ifdef RSQRT_NR_EPJ_X2
2733 // v8sf v0p5 = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
2734 // v8sf v0p125 = {0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f};
2735 // pot *= v0p5;
2736 // ax *= v0p125;
2737 // ay *= v0p125;
2738 // az *= v0p125;
2739 //#endif
2740 //
2741 // *(v8sf *)(accpbuf[i/8][0]) = ax;
2742 // *(v8sf *)(accpbuf[i/8][1]) = ay;
2743 // *(v8sf *)(accpbuf[i/8][2]) = az;
2744 // *(v8sf *)(accpbuf[i/8][3]) = pot;
2745 // }
2746 // }
2747 //
2748 // __attribute__ ((noinline))
2749 // void kernel_epj_nounroll_64bit(const int ni, const int nj){
2750 // //const v4df vzero = {0.0, 0.0, 0.0, 0.0};
2751 //
2752 // const v4df veps2 = {eps2, eps2, eps2, eps2};
2753 // for(int i=0; i<ni; i+=4){
2754 // const int il = i%8;
2755 // const v4df xi = *(v4df *)(&xibufd[i/8][0][il]);
2756 // const v4df yi = *(v4df *)(&xibufd[i/8][1][il]);
2757 // const v4df zi = *(v4df *)(&xibufd[i/8][2][il]);
2758 //
2759 // v4df ax, ay, az, pot;
2760 // ax = ay = az = pot = (v4df){0.0, 0.0, 0.0, 0.0};
2761 //
2762 // v4df jbuf = *((v4df*)epjbufd);
2763 // v4df xj = _mm256_permute4x64_pd(jbuf, 0x00);
2764 // v4df yj = _mm256_permute4x64_pd(jbuf, 0x55);
2765 // v4df zj = _mm256_permute4x64_pd(jbuf, 0xaa);
2766 // v4df mj = _mm256_permute4x64_pd(jbuf, 0xff);
2767 //
2768 // for(int j=0; j<nj; j++){
2769 // jbuf = *((v4df*)(epjbufd+j+1));
2770 // v4df dx = xj - xi;
2771 // v4df dy = yj - yi;
2772 // v4df dz = zj - zi;
2773 // v4df r2 = ((veps2 + dx*dx) + dy*dy) + dz*dz;
2774 // //v4df mask = _mm256_cmp_pd(vrcrit2, r2, 0x01); // vrcrit2 < r2
2775 // v4df mask = _mm256_cmp_pd(veps2, r2, 0x4); // veps2 != r2
2776 // v4df ri1 = _mm256_and_pd( __builtin_ia32_cvtps2pd256( __builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps256(r2))), mask );
2777 //#ifdef RSQRT_NR_EPJ_X2
2778 // //x2
2779 // v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
2780 // v4df v3p0 = {3.0, 3.0, 3.0, 3.0};
2781 // ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
2782 //#elif defined(RSQRT_NR_EPJ_X4)
2783 // // x4
2784 // v4df vone = {1.0, 1.0, 1.0, 1.0};
2785 // v4df v8p0 = {8.0, 8.0, 8.0, 8.0};
2786 // v4df v6p0 = {6.0, 6.0, 6.0, 6.0};
2787 // v4df v5p0 = {5.0, 5.0, 5.0, 5.0};
2788 // v4df v0p0625 = {1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0};
2789 // v4df h = vone - r2*(ri1*ri1);
2790 // ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
2791 //#endif
2792 // v4df mri1 = mj * ri1;
2793 // v4df ri2 = ri1 * ri1;
2794 // v4df mri3 = mri1 * ri2;
2795 //
2796 // xj = _mm256_permute4x64_pd(jbuf, 0x00);
2797 // yj = _mm256_permute4x64_pd(jbuf, 0x55);
2798 // zj = _mm256_permute4x64_pd(jbuf, 0xaa);
2799 // mj = _mm256_permute4x64_pd(jbuf, 0xff);
2800 //
2801 // pot -= mri1;
2802 // ax += mri3 * dx;
2803 // ay += mri3 * dy;
2804 // az += mri3 * dz;
2805 // }
2806 // *(v4df *)(&accpbufd[i/8][0][il]) = ax;
2807 // *(v4df *)(&accpbufd[i/8][1][il]) = ay;
2808 // *(v4df *)(&accpbufd[i/8][2][il]) = az;
2809 // *(v4df *)(&accpbufd[i/8][3][il]) = pot;
2810 // }
2811 // }
2812 //
2813 // ///////////////////////
2814 // // with linear cutoff
2815 // __attribute__ ((noinline))
2816 // void kernel_epj_nounroll_for_p3t_with_linear_cutoff(const int ni, const int nj){
2817 // const v8sf vone = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
2818 // const v8sf veps2 = {(float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2};
2819 // const v8sf vr_out = {(float)r_out, (float)r_out, (float)r_out, (float)r_out, (float)r_out, (float)r_out, (float)r_out, (float)r_out};
2820 // const v8sf vr_out2 = vr_out * vr_out;
2821 // const v8sf vrcrit2 = {(float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2};
2822 // const v8sf allbits = _mm256_cmp_ps(vone, vone, 0x00);
2823 // for(int i=0; i<ni; i+=8){
2824 // const v8sf xi = *(v8sf *)(xibuf[i/8][0]);
2825 // const v8sf yi = *(v8sf *)(xibuf[i/8][1]);
2826 // const v8sf zi = *(v8sf *)(xibuf[i/8][2]);
2827 // v8sf ax, ay, az, pot, nngb;
2828 // ax = ay = az = pot = nngb = (v8sf){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
2829 // v8sf jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)epjbuf);
2830 // v8sf xj = __builtin_ia32_shufps256(jbuf, jbuf, 0x00);
2831 // v8sf yj = __builtin_ia32_shufps256(jbuf, jbuf, 0x55);
2832 // v8sf zj = __builtin_ia32_shufps256(jbuf, jbuf, 0xaa);
2833 // v8sf mj = __builtin_ia32_shufps256(jbuf, jbuf, 0xff);
2834 // for(int j=0; j<nj; j++){
2835 // jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)(epjbuf + j+1));
2836 // v8sf dx = xj - xi;
2837 // v8sf dy = yj - yi;
2838 // v8sf dz = zj - zi;
2839 // v8sf r2_real = ((veps2 + dx*dx) + dy*dy) + dz*dz;
2840 // v8sf r2 = _mm256_max_ps(r2_real, vr_out2);
2841 // v8sf ri1 = __builtin_ia32_rsqrtps256(r2);
2842 //#ifdef RSQRT_NR_EPJ_X2
2843 // v8sf v3p0 = {3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f};
2844 // v8sf v0p5 = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
2845 // ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
2846 //#elif defined(RSQRT_NR_EPJ_X4)
2847 // // x4
2848 // v8sf v8p0 = {8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f};
2849 // v8sf v6p0 = {6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f};
2850 // v8sf v5p0 = {5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f};
2851 // v8sf v0p0625 = {(float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0};
2852 // v8sf h = vone - r2*(ri1*ri1);
2853 // ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
2854 //#endif
2855 // v8sf ri2 = ri1*ri1;
2856 // v8sf mri1 = mj*ri1;
2857 // v8sf mri3 = mri1 * ri2;
2858 // xj = __builtin_ia32_shufps256(jbuf, jbuf, 0x00);
2859 // yj = __builtin_ia32_shufps256(jbuf, jbuf, 0x55);
2860 // zj = __builtin_ia32_shufps256(jbuf, jbuf, 0xaa);
2861 // mj = __builtin_ia32_shufps256(jbuf, jbuf, 0xff);
2862 // pot -= mri1;
2863 // ax += mri3 * dx;
2864 // ay += mri3 * dy;
2865 // az += mri3 * dz;
2866 // v8sf mask = _mm256_cmp_ps(vrcrit2, r2_real, 0x01); // for neighbour search
2867 // nngb += _mm256_and_ps( vone, _mm256_xor_ps(mask, allbits) ); // can remove
2868 // }
2869 // *(v8sf *)(accpbuf[i/8][0]) = ax;
2870 // *(v8sf *)(accpbuf[i/8][1]) = ay;
2871 // *(v8sf *)(accpbuf[i/8][2]) = az;
2872 // *(v8sf *)(accpbuf[i/8][3]) = pot;
2873 // *(v8sf *)(accpbuf[i/8][4]) = nngb;
2874 // }
2875 // }
2876 //
2877 // __attribute__ ((noinline))
2878 // void kernel_epj_64bit_nounroll_for_p3t_with_linear_cutoff(const int ni, const int nj){
2879 // const v4df vone = {1.0, 1.0, 1.0, 1.0};
2880 // const v4df veps2 = {eps2, eps2, eps2, eps2};
2881 // const v4df vr_out = {r_out, r_out, r_out, r_out};
2882 // const v4df vr_out2 = vr_out * vr_out;
2883 // const v4df vrcrit2 = {r_crit2, r_crit2, r_crit2, r_crit2};
2884 // const v4df allbits = _mm256_cmp_pd(vone, vone, 0x00);
2885 // for(int i=0; i<ni; i+=4){
2886 // const int il = i%8;
2887 // const v4df xi = *(v4df *)(&xibufd[i/8][0][il]);
2888 // const v4df yi = *(v4df *)(&xibufd[i/8][1][il]);
2889 // const v4df zi = *(v4df *)(&xibufd[i/8][2][il]);
2890 // v4df ax, ay, az, pot, nngb;
2891 // ax = ay = az = pot = nngb = (v4df){0.0, 0.0, 0.0, 0.0};
2892 // v4df jbuf = *((v4df*)epjbufd);
2893 // v4df xj = _mm256_permute4x64_pd(jbuf, 0x00);
2894 // v4df yj = _mm256_permute4x64_pd(jbuf, 0x55);
2895 // v4df zj = _mm256_permute4x64_pd(jbuf, 0xaa);
2896 // v4df mj = _mm256_permute4x64_pd(jbuf, 0xff);
2897 // for(int j=0; j<nj; j++){
2898 // jbuf = *((v4df*)(epjbufd+j+1));
2899 // v4df dx = xj - xi;
2900 // v4df dy = yj - yi;
2901 // v4df dz = zj - zi;
2902 // v4df r2_real = ((veps2 + dx*dx) + dy*dy) + dz*dz;
2903 // v4df r2 = _mm256_max_pd( r2_real, vr_out2);
2904 // v4df ri1 = __builtin_ia32_cvtps2pd256(__builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps256(r2)));
2905 //#ifdef RSQRT_NR_EPJ_X2
2906 // //x2
2907 // v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
2908 // v4df v3p0 = {3.0, 3.0, 3.0, 3.0};
2909 // ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
2910 //#elif defined(RSQRT_NR_EPJ_X4)
2911 // // x4
2912 // v4df v8p0 = {8.0, 8.0, 8.0, 8.0};
2913 // v4df v6p0 = {6.0, 6.0, 6.0, 6.0};
2914 // v4df v5p0 = {5.0, 5.0, 5.0, 5.0};
2915 // v4df v0p0625 = {1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0};
2916 // v4df h = vone - r2*(ri1*ri1);
2917 // ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
2918 //#endif
2919 // v4df ri2 = ri1*ri1;
2920 // v4df mri1 = mj*ri1;
2921 // v4df mri3 = mri1 * ri2;
2922 // xj = _mm256_permute4x64_pd(jbuf, 0x00);
2923 // yj = _mm256_permute4x64_pd(jbuf, 0x55);
2924 // zj = _mm256_permute4x64_pd(jbuf, 0xaa);
2925 // mj = _mm256_permute4x64_pd(jbuf, 0xff);
2926 // pot -= mri1;
2927 // ax += mri3 * dx;
2928 // ay += mri3 * dy;
2929 // az += mri3 * dz;
2930 // v4df mask = _mm256_cmp_pd(vrcrit2, r2_real, 0x01); // for neighbour search
2931 // nngb += _mm256_and_pd( vone, _mm256_xor_pd(mask, allbits) ); // can remove
2932 // }
2933 // *(v4df *)(&accpbufd[i/8][0][il]) = ax;
2934 // *(v4df *)(&accpbufd[i/8][1][il]) = ay;
2935 // *(v4df *)(&accpbufd[i/8][2][il]) = az;
2936 // *(v4df *)(&accpbufd[i/8][3][il]) = pot;
2937 // *(v4df *)(&accpbufd[i/8][4][il]) = nngb;
2938 // }
2939 // }
2940 // // linear cutoff
2941 // //////////////
2942 //
2943 // //////////////
2944 // include cutoff
2945 // __attribute__ ((noinline))
2946 // void kernel_epj_nounroll_for_p3t(const int ni, const int nj){
2947 // const v8sf vzero = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
2948 // const v8sf vone = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
2949 // const v8sf veps2 = {(float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2};
2950 // const v8sf vr_in = {(float)r_in, (float)r_in, (float)r_in, (float)r_in, (float)r_in, (float)r_in, (float)r_in, (float)r_in};
2951 // const v8sf vr_in2 = vr_in * vr_in;
2952 // const v8sf vrcrit2 = {(float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2};
2953 // const v8sf vdenominator = {(float)denominator, (float)denominator, (float)denominator, (float)denominator, (float)denominator, (float)denominator, (float)denominator, (float)denominator};
2954 // const v8sf vn20 = {-20.0f, -20.0f, -20.0f, -20.0f, -20.0f, -20.0f, -20.0f, -20.0f};
2955 // const v8sf v70 = { 70.0f, 70.0f, 70.0f, 70.0f, 70.0f, 70.0f, 70.0f, 70.0f};
2956 // const v8sf vn84 = {-84.0f, -84.0f, -84.0f, -84.0f, -84.0f, -84.0f, -84.0f, -84.0f};
2957 // const v8sf v35 = { 35.0f, 35.0f, 35.0f, 35.0f, 35.0f, 35.0f, 35.0f, 35.0f};
2958 // const v8sf allbits = _mm256_cmp_ps(vone, vone, 0x00);
2959 // for(int i=0; i<ni; i+=8){
2960 // const v8sf xi = *(v8sf *)(xibuf[i/8][0]);
2961 // const v8sf yi = *(v8sf *)(xibuf[i/8][1]);
2962 // const v8sf zi = *(v8sf *)(xibuf[i/8][2]);
2963 // v8sf ax, ay, az, pot, nngb;
2964 // ax = ay = az = pot = nngb = (v8sf){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
2965 // v8sf jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)epjbuf);
2966 // v8sf xj = __builtin_ia32_shufps256(jbuf, jbuf, 0x00);
2967 // v8sf yj = __builtin_ia32_shufps256(jbuf, jbuf, 0x55);
2968 // v8sf zj = __builtin_ia32_shufps256(jbuf, jbuf, 0xaa);
2969 // v8sf mj = __builtin_ia32_shufps256(jbuf, jbuf, 0xff);
2970 // for(int j=0; j<nj; j++){
2971 // jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)(epjbuf + j+1));
2972 // v8sf dx = xj - xi;
2973 // v8sf dy = yj - yi;
2974 // v8sf dz = zj - zi;
2975 // v8sf r2 = ((veps2 + dx*dx) + dy*dy) + dz*dz;
2976 // v8sf mask = _mm256_cmp_ps(vr_in2, r2, 0x01); // 1op, can remove?
2977 // v8sf ri1 = __builtin_ia32_rsqrtps256(r2);
2978 // ri1 = _mm256_and_ps(ri1, mask); // 1op
2979 //#ifdef RSQRT_NR_EPJ_X2
2980 // v8sf v3p0 = {3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f};
2981 // v8sf v0p5 = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
2982 // ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
2983 //#elif defined(RSQRT_NR_EPJ_X4)
2984 // // x4
2985 // v8sf v8p0 = {8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f};
2986 // v8sf v6p0 = {6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f};
2987 // v8sf v5p0 = {5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f};
2988 // v8sf v0p0625 = {(float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0, (float)1.0/16.0};
2989 // v8sf h = vone - r2*(ri1*ri1);
2990 // ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
2991 //#endif
2992 // v8sf r1 = r2*ri1; // 1op
2993 // v8sf x = _mm256_min_ps( _mm256_max_ps( ((r1-vr_in)*vdenominator), vzero), vone); // 4op
2994 // v8sf x2 = x * x; // 1op
2995 // v8sf x4 = x2 * x2; // 1op
2996 // v8sf k = _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps(vn20, x, v70), x, vn84), x, v35)*x4; // 4op
2997 // v8sf mkri1 = mj * k * ri1 ;
2998 // v8sf ri2 = ri1 * ri1;
2999 // v8sf mkri3 = mkri1 * ri2;
3000 // xj = __builtin_ia32_shufps256(jbuf, jbuf, 0x00);
3001 // yj = __builtin_ia32_shufps256(jbuf, jbuf, 0x55);
3002 // zj = __builtin_ia32_shufps256(jbuf, jbuf, 0xaa);
3003 // mj = __builtin_ia32_shufps256(jbuf, jbuf, 0xff);
3004 // pot -= mkri1;
3005 // ax += mkri3 * dx;
3006 // ay += mkri3 * dy;
3007 // az += mkri3 * dz;
3008 // mask = _mm256_cmp_ps(vrcrit2, r2, 0x01); // for neighbour search
3009 // nngb += _mm256_and_ps( vone, _mm256_xor_ps(mask, allbits) ); // can remove
3010 // }
3011 // *(v8sf *)(accpbuf[i/8][0]) = ax;
3012 // *(v8sf *)(accpbuf[i/8][1]) = ay;
3013 // *(v8sf *)(accpbuf[i/8][2]) = az;
3014 // *(v8sf *)(accpbuf[i/8][3]) = pot;
3015 // *(v8sf *)(accpbuf[i/8][4]) = nngb;
3016 // }
3017 // }
3018 //
3019 // __attribute__ ((noinline))
3020 // void kernel_epj_64bit_nounroll_for_p3t(const int ni, const int nj){
3021 // const v4df vzero = {0.0, 0.0, 0.0, 0.0};
3022 // const v4df vone = {1.0, 1.0, 1.0, 1.0};
3023 // const v4df veps2 = {eps2, eps2, eps2, eps2};
3024 // const v4df vr_in = {r_in, r_in, r_in, r_in};
3025 // const v4df vr_in2 = vr_in * vr_in;
3026 // const v4df vrcrit2 = {r_crit2, r_crit2, r_crit2, r_crit2};
3027 // const v4df vdenominator = {denominator, denominator, denominator, denominator};
3028 // const v4df vn20 = {-20.0, -20.0, -20.0, -20.0};
3029 // const v4df v70 = { 70.0, 70.0, 70.0, 70.0};
3030 // const v4df vn84 = {-84.0, -84.0, -84.0, -84.0};
3031 // const v4df v35 = { 35.0, 35.0, 35.0, 35.0};
3032 // const v4df allbits = _mm256_cmp_pd(vone, vone, 0x00);
3033 // for(int i=0; i<ni; i+=4){
3034 // const int il = i%8;
3035 // const v4df xi = *(v4df *)(&xibufd[i/8][0][il]);
3036 // const v4df yi = *(v4df *)(&xibufd[i/8][1][il]);
3037 // const v4df zi = *(v4df *)(&xibufd[i/8][2][il]);
3038 // v4df ax, ay, az, pot, nngb;
3039 // ax = ay = az = pot = nngb = (v4df){0.0, 0.0, 0.0, 0.0};
3040 // v4df jbuf = *((v4df*)epjbufd);
3041 // v4df xj = _mm256_permute4x64_pd(jbuf, 0x00);
3042 // v4df yj = _mm256_permute4x64_pd(jbuf, 0x55);
3043 // v4df zj = _mm256_permute4x64_pd(jbuf, 0xaa);
3044 // v4df mj = _mm256_permute4x64_pd(jbuf, 0xff);
3045 // for(int j=0; j<nj; j++){
3046 // jbuf = *((v4df*)(epjbufd+j+1));
3047 // v4df dx = xj - xi;
3048 // v4df dy = yj - yi;
3049 // v4df dz = zj - zi;
3050 // v4df r2 = ((veps2 + dx*dx) + dy*dy) + dz*dz;
3051 // v4df mask = _mm256_cmp_pd(vr_in2, r2, 0x01);
3052 // v4df ri1 = _mm256_and_pd( __builtin_ia32_cvtps2pd256( __builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps256(r2))), mask );
3054 //#ifdef RSQRT_NR_EPJ_X2
3055 // //x2
3056 // v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
3057 // v4df v3p0 = {3.0, 3.0, 3.0, 3.0};
3058 // ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
3059 //#elif defined(RSQRT_NR_EPJ_X4)
3060 // // x4
3061 // v4df v8p0 = {8.0, 8.0, 8.0, 8.0};
3062 // v4df v6p0 = {6.0, 6.0, 6.0, 6.0};
3063 // v4df v5p0 = {5.0, 5.0, 5.0, 5.0};
3064 // v4df v0p0625 = {1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0};
3065 // v4df h = vone - r2*(ri1*ri1);
3066 // ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
3067 //#endif
3068 // v4df r1 = r2*ri1; // 1op
3069 // v4df x = _mm256_min_pd( _mm256_max_pd( ((r1-vr_in)*vdenominator), vzero), vone); // 4op
3070 // v4df x2 = x * x; // 1op
3071 // v4df x4 = x2 * x2; // 1op
3072 // v4df k = _mm256_fmadd_pd( _mm256_fmadd_pd( _mm256_fmadd_pd(vn20, x, v70), x, vn84), x, v35)*x4; // 4op
3073 // v4df mkri1 = mj * k * ri1 ;
3074 // v4df ri2 = ri1 * ri1;
3075 // v4df mkri3 = mkri1 * ri2;
3076 //
3077 // xj = _mm256_permute4x64_pd(jbuf, 0x00);
3078 // yj = _mm256_permute4x64_pd(jbuf, 0x55);
3079 // zj = _mm256_permute4x64_pd(jbuf, 0xaa);
3080 // mj = _mm256_permute4x64_pd(jbuf, 0xff);
3081 // pot -= mkri1;
3082 // ax += mkri3 * dx;
3083 // ay += mkri3 * dy;
3084 // az += mkri3 * dz;
3085 // mask = _mm256_cmp_pd(vrcrit2, r2, 0x01); // for neighbour search
3086 // nngb += _mm256_and_pd( vone, _mm256_xor_pd(mask, allbits) ); // can remove
3087 // }
3088 // *(v4df *)(&accpbufd[i/8][0][il]) = ax;
3089 // *(v4df *)(&accpbufd[i/8][1][il]) = ay;
3090 // *(v4df *)(&accpbufd[i/8][2][il]) = az;
3091 // *(v4df *)(&accpbufd[i/8][3][il]) = pot;
3092 // *(v4df *)(&accpbufd[i/8][4][il]) = nngb;
3093 // }
3094 // }
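// The factor k in the two cutoff kernels above is the 7th-order changeover
// polynomial
//   K(x) = x^4 * (((-20*x + 70)*x - 84)*x + 35),  x = (r - r_in)/(r_out - r_in)
// with x clamped to [0,1]. K(0) = 0, K(1) = 1, and since
// K'(x) = 140*x^3*(1-x)^3 the first three derivatives vanish at both ends,
// so the long-range (tree) force switches on smoothly across [r_in, r_out].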
3095 //
3096 // /*
3097 // __attribute__ ((noinline))
3098 // void kernel_epj_mix_nounroll_for_p3t(const int ni, const int nj){
3099 // const v8sf vzero = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
3100 // const v8sf vone = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
3101 // const v8sf veps2 = {(float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2};
3102 // const v8sf vr_in = {(float)r_in, (float)r_in, (float)r_in, (float)r_in, (float)r_in, (float)r_in, (float)r_in, (float)r_in};
3103 // const v8sf vr_in2 = vr_in * vr_in;
3104 // const v8sf vrcrit2 = {(float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2, (float)r_crit2};
3105 // const v8sf vdenominator = {(float)denominator, (float)denominator, (float)denominator, (float)denominator, (float)denominator, (float)denominator, (float)denominator, (float)denominator};
3106 // const v8sf vn20 = {-20.0f, -20.0f, -20.0f, -20.0f, -20.0f, -20.0f, -20.0f, -20.0f};
3107 // const v8sf v70 = { 70.0f, 70.0f, 70.0f, 70.0f, 70.0f, 70.0f, 70.0f, 70.0f};
3108 // const v8sf vn84 = {-84.0f, -84.0f, -84.0f, -84.0f, -84.0f, -84.0f, -84.0f, -84.0f};
3109 // const v8sf v35 = { 35.0f, 35.0f, 35.0f, 35.0f, 35.0f, 35.0f, 35.0f, 35.0f};
3110 // const v8sf allbits = _mm256_cmp_ps(vone, vone, 0x00);
3111 // for(int i=0; i<ni; i+=8){
3112 // const v4df xi0 = *(v4df *)(&xibufd[i/8][0][0]);
3113 // const v4df yi0 = *(v4df *)(&xibufd[i/8][1][0]);
3114 // const v4df zi0 = *(v4df *)(&xibufd[i/8][2][0]);
3115 //
3116 // const v4df xi1 = *(v4df *)(&xibufd[i/8][0][4]);
3117 // const v4df yi1 = *(v4df *)(&xibufd[i/8][1][4]);
3118 // const v4df zi1 = *(v4df *)(&xibufd[i/8][2][4]);
3119 //
3120 // //const v8sf xi = *(v8sf *)(xibuf[i/8][0]);
3121 // //const v8sf yi = *(v8sf *)(xibuf[i/8][1]);
3122 // //const v8sf zi = *(v8sf *)(xibuf[i/8][2]);
3123 // //v8sf ax, ay, az, pot, nngb;
3124 // //ax = ay = az = pot = nngb = (v8sf){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
3125 //
3126 // v8sf ax, ay, az, pot;
3127 // ax = ay = az = pot = (v8sf){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
3128 // v8sf nngb;
3129 // nngb = (v8sf){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
3130 //
3131 // //v8sf jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)epjbuf);
3132 // //v8sf xj = __builtin_ia32_shufps256(jbuf, jbuf, 0x00);
3133 // //v8sf yj = __builtin_ia32_shufps256(jbuf, jbuf, 0x55);
3134 // //v8sf zj = __builtin_ia32_shufps256(jbuf, jbuf, 0xaa);
3135 // //v8sf mj = __builtin_ia32_shufps256(jbuf, jbuf, 0xff);
3136 // v4df jbuf = *((v4df*)epjbufd);
3137 // v4df xj = _mm256_permute4x64_pd(jbuf, 0x00);
3138 // v4df yj = _mm256_permute4x64_pd(jbuf, 0x55);
3139 // v4df zj = _mm256_permute4x64_pd(jbuf, 0xaa);
3140 // v4df mj = _mm256_permute4x64_pd(jbuf, 0xff);
3141 // for(int j=0; j<nj; j++){
3142 // //jbuf = __builtin_ia32_vbroadcastf128_ps256((v4sf *)(epjbuf + j+1));
3143 // jbuf = *((v4df*)(epjbufd+j+1));
3144 // //v8sf dx = xj - xi;
3145 // //v8sf dy = yj - yi;
3146 // //v8sf dz = zj - zi;
3147 // v4df dx0 = xj - xi0;
3148 // v4df dy0 = yj - yi0;
3149 // v4df dz0 = zj - zi0;
3150 // v4df dx1 = xj - xi1;
3151 // v4df dy1 = yj - yi1;
3152 // v4df dz1 = zj - zi1;
3153 // v4sf dx0s = _mm256_cvtpd_ps(dx0);
3154 // v4sf dx1s = _mm256_cvtpd_ps(dx1);
3155 // v8sf dx = _mm256_insertf128_ps(_mm256_castps128_ps256(dx0s), dx1s, 1);
3156 // v4sf dy0s = _mm256_cvtpd_ps(dy0);
3157 // v4sf dy1s = _mm256_cvtpd_ps(dy1);
3158 // v8sf dy = _mm256_insertf128_ps(_mm256_castps128_ps256(dy0s), dy1s, 1);
3159 // v4sf dz0s = _mm256_cvtpd_ps(dz0);
3160 // v4sf dz1s = _mm256_cvtpd_ps(dz1);
3161 // v8sf dz = _mm256_insertf128_ps(_mm256_castps128_ps256(dz0s), dz1s, 1);
3162 // v8sf r2 = ((veps2 + dx*dx) + dy*dy) + dz*dz;
3163 // v8sf mask = _mm256_cmp_ps(vr_in2, r2, 0x01); // 1op, can remove?
3164 // v8sf ri1 = __builtin_ia32_rsqrtps256(r2);
3165 // ri1 = _mm256_and_ps(ri1, mask); // 1op
3166 //#ifdef RSQRT_NR_EPJ_X2
3167 // v8sf v3p0 = {3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f};
3168 // v8sf v0p5 = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
3169 // ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
3170 //#elif defined(RSQRT_NR_EPJ_X4)
3171 // // x4
3172 // v8sf v8p0 = {8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f};
3173 // v8sf v6p0 = {6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f};
3174 // v8sf v5p0 = {5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f};
3175 // v8sf v0p0625 = {1.0f/16.0f, 1.0f/16.0f, 1.0f/16.0f, 1.0f/16.0f, 1.0f/16.0f, 1.0f/16.0f, 1.0f/16.0f, 1.0f/16.0f};
3176 // v8sf h = vone - r2*(ri1*ri1);
3177 // ri1 *= vone + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
3178 //#endif
3179 // v8sf r1 = r2*ri1; // 1op
3180 // v8sf x = _mm256_min_ps( _mm256_max_ps( ((r1-vr_in)*vdenominator), vzero), vone); // 4op
3181 // v8sf x2 = x * x; // 1op
3182 // v8sf x4 = x2 * x2; // 1op
3183 // v8sf k = _mm256_fmadd_ps( _mm256_fmadd_ps( _mm256_fmadd_ps(vn20, x, v70), x, vn84), x, v35)*x4; // 4op
3184 // v8sf mkri1 = mj * k * ri1 ;
3185 // v8sf ri2 = ri1 * ri1;
3186 // v8sf mkri3 = mkri1 * ri2;
3187 // xj = _mm256_permute4x64_pd(jbuf, 0x00);
3188 // yj = _mm256_permute4x64_pd(jbuf, 0x55);
3189 // zj = _mm256_permute4x64_pd(jbuf, 0xaa);
3190 // mj = _mm256_permute4x64_pd(jbuf, 0xff);
3191 // //xj = __builtin_ia32_shufps256(jbuf, jbuf, 0x00);
3192 // //yj = __builtin_ia32_shufps256(jbuf, jbuf, 0x55);
3193 // //zj = __builtin_ia32_shufps256(jbuf, jbuf, 0xaa);
3194 // //mj = __builtin_ia32_shufps256(jbuf, jbuf, 0xff);
3195 // pot -= mkri1;
3196 // ax += mkri3 * dx;
3197 // ay += mkri3 * dy;
3198 // az += mkri3 * dz;
3199 // mask = _mm256_cmp_ps(vrcrit2, r2, 0x01); // for neighbour search
3200 // nngb += _mm256_and_ps( vone, _mm256_xor_ps(mask, allbits) ); // can remove
3201 // }
3202 // *(v8sf *)(accpbuf[i/8][0]) = ax;
3203 // *(v8sf *)(accpbuf[i/8][1]) = ay;
3204 // *(v8sf *)(accpbuf[i/8][2]) = az;
3205 // *(v8sf *)(accpbuf[i/8][3]) = pot;
3206 // *(v8sf *)(accpbuf[i/8][4]) = nngb;
3207 // }
3208 // }
3209 // */
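Two recurring devices in the kernels above deserve a reference model. First, the "mix" variant keeps positions and displacements in double precision and only narrows to packed floats for the force evaluation, which preserves accuracy in the position differences, where cancellation is worst. Second, the RSQRT_NR_*_X2/_X4 branches refine the roughly 12-bit hardware rsqrt estimate by Newton-Raphson. A scalar sketch of the refinement (the RSQRT_NR_X2/RSQRT_NR_X4 macros here are stand-ins for the per-kernel macros used in this file):

    #include <xmmintrin.h>
    inline float rsqrt_refined(float r2) {
        // ~12-bit initial estimate from the hardware rsqrtss instruction
        float y = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(r2)));
    #if defined(RSQRT_NR_X2)
        // one Newton-Raphson step: roughly doubles the number of correct bits
        y *= (3.0f - r2 * y * y) * 0.5f;
    #elif defined(RSQRT_NR_X4)
        // third-order step: with h = 1 - r2*y*y this is the truncated series
        // (1 - h)^(-1/2) = 1 + h/2 + 3h^2/8 + 5h^3/16 = 1 + (8h + 6h^2 + 5h^3)/16
        const float h = 1.0f - r2 * y * y;
        y *= 1.0f + h * (8.0f + h * (6.0f + 5.0f * h)) * (1.0f / 16.0f);
    #endif
        return y;
    }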
3210 //
3211 // __attribute__ ((noinline))
3212 // void kernel_spj_nounroll(const int ni, const int nj){
3213 // const v8sf veps2 = {(float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2, (float)eps2};
3214 // for(int i=0; i<ni; i+=8){
3215 // const v8sf xi = *(v8sf *)(xibuf[i/8][0]);
3216 // const v8sf yi = *(v8sf *)(xibuf[i/8][1]);
3217 // const v8sf zi = *(v8sf *)(xibuf[i/8][2]);
3218 //
3219 // v8sf ax, ay, az, pot;
3220 // ax = ay = az = pot = (v8sf){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
3221 //
3222 //#define AVX_PRELOAD
3223 //
3224 //#ifdef AVX_PRELOAD
3225 // v8sf jbuf0 = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&spjbuf[0][0]);
3226 // v8sf jbuf1 = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&spjbuf[0][1]);
3227 // v8sf jbuf2 = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&spjbuf[0][2]);
3228 //#else
3229 // v8sf jbuf0, jbuf1, jbuf2;
3230 //#endif
3231 // for(int j=0; j<nj; j++){
3232 //#ifndef AVX_PRELOAD
3233 // jbuf0 = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&spjbuf[j+0][0]);
3234 //#endif
3235 // v8sf xj = __builtin_ia32_shufps256(jbuf0, jbuf0, 0x00);
3236 // v8sf yj = __builtin_ia32_shufps256(jbuf0, jbuf0, 0x55);
3237 // v8sf zj = __builtin_ia32_shufps256(jbuf0, jbuf0, 0xaa);
3238 //#ifdef AVX_PRELOAD
3239 // jbuf0 = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&spjbuf[j+1][0]);
3240 //#endif
3241 //
3242 //#ifndef AVX_PRELOAD
3243 // jbuf1 = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&spjbuf[j+0][1]);
3244 //#endif
3245 // v8sf qxx = __builtin_ia32_shufps256(jbuf1, jbuf1, 0x00);
3246 // v8sf qyy = __builtin_ia32_shufps256(jbuf1, jbuf1, 0x55);
3247 // v8sf qzz = __builtin_ia32_shufps256(jbuf1, jbuf1, 0xaa);
3248 // v8sf mj = __builtin_ia32_shufps256(jbuf1, jbuf1, 0xff);
3249 //#ifdef AVX_PRELOAD
3250 // jbuf1 = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&spjbuf[j+1][1]);
3251 //#endif
3252 //
3253 //#ifndef AVX_PRELOAD
3254 // jbuf2 = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&spjbuf[j+0][2]);
3255 //#endif
3256 // v8sf qxy = __builtin_ia32_shufps256(jbuf2, jbuf2, 0x00);
3257 // v8sf qyz = __builtin_ia32_shufps256(jbuf2, jbuf2, 0x55);
3258 // v8sf qzx = __builtin_ia32_shufps256(jbuf2, jbuf2, 0xaa);
3259 // v8sf mtr = __builtin_ia32_shufps256(jbuf2, jbuf2, 0xff);
3260 //#ifdef AVX_PRELOAD
3261 // jbuf2 = __builtin_ia32_vbroadcastf128_ps256((v4sf *)&spjbuf[j+1][2]);
3262 //#endif
3263 //
3264 // v8sf dx = xj - xi;
3265 // v8sf dy = yj - yi;
3266 // v8sf dz = zj - zi;
3267 //
3268 // v8sf r2 = ((veps2 + dx*dx) + dy*dy) + dz*dz;
3269 // v8sf ri1 = __builtin_ia32_rsqrtps256(r2);
3270 // v8sf v0p5 = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
3271 //#ifdef RSQRT_NR_SPJ_X2
3272 // v8sf v3p0 = {3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0};
3273 // ri1 *= (v3p0 - r2*(ri1*ri1))*v0p5;
3274 //#endif
3275 // v8sf ri2 = ri1 * ri1;
3276 // v8sf ri3 = ri1 * ri2;
3277 // v8sf ri4 = ri2 * ri2;
3278 // v8sf ri5 = ri2 * ri3;
3279 //
3280 // v8sf qr_x = (qxx*dx + qxy*dy) + qzx*dz;
3281 // v8sf qr_y = (qyy*dy + qxy*dx) + qyz*dz;
3282 // v8sf qr_z = (qzz*dz + qzx*dx) + qyz*dy;
3283 //
3284 // v8sf rqr = ((mtr + qr_x*dx) + qr_y*dy) + qr_z*dz;
3285 // v8sf rqr_ri4 = rqr * ri4;
3286 //
3287 // //v8sf v0p5 = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
3288 // v8sf v2p5 = {2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f};
3289 //
3290 // v8sf meff = mj + v0p5 * rqr_ri4;
3291 // v8sf meff3 = (mj + v2p5 * rqr_ri4) * ri3;
3292 //
3293 // pot -= meff * ri1;
3294 //
3295 // ax = (ax - ri5*qr_x) + meff3*dx;
3296 // ay = (ay - ri5*qr_y) + meff3*dy;
3297 // az = (az - ri5*qr_z) + meff3*dz;
3298 // }
3299 // *(v8sf *)(accpbuf[i/8][0]) = ax;
3300 // *(v8sf *)(accpbuf[i/8][1]) = ay;
3301 // *(v8sf *)(accpbuf[i/8][2]) = az;
3302 // *(v8sf *)(accpbuf[i/8][3]) = pot;
3303 // }
3304 // }
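Reading the arithmetic back off the superparticle kernels (this one and the 64-bit variant below), they evaluate the standard monopole-plus-quadrupole field. With d = x_j - x_i, r^2 = |d|^2 + eps^2, Q the packed symmetric quadrupole (qxx..qzx), and q_tr the value stored in the `mtr` slot:

    \[
      \phi_i \mathrel{-}= \frac{m_j}{r} + \frac{\tilde q}{2r^5}, \qquad
      \mathbf{a}_i \mathrel{+}= \Bigl(m_j + \frac{5\,\tilde q}{2r^4}\Bigr)\frac{\mathbf{d}}{r^3}
                               - \frac{Q\,\mathbf{d}}{r^5}, \qquad
      \tilde q \equiv \mathbf{d}^{\mathsf T} Q\,\mathbf{d} + q_{\mathrm{tr}}.
    \]

In the code, qr_{x,y,z} is Qd, rqr is \tilde q, and meff/meff3 bundle the m_j and \tilde q/r^4 combinations so each acceleration component costs one multiply-add per interaction.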
3305 //
3306 // __attribute__ ((noinline))
3307 // void kernel_spj_64bit_nounroll(const int ni, const int nj){
3308 // const v4df veps2 = {eps2, eps2, eps2, eps2};
3309 // for(int i=0; i<ni; i+=4){
3310 // const int il = i%8;
3311 // const v4df xi = *(v4df *)(&xibufd[i/8][0][il]);
3312 // const v4df yi = *(v4df *)(&xibufd[i/8][1][il]);
3313 // const v4df zi = *(v4df *)(&xibufd[i/8][2][il]);
3314 //
3315 // v4df ax, ay, az, pot;
3316 // ax = ay = az = pot = (v4df){0.0, 0.0, 0.0, 0.0};
3317 //
3318 // v4df jbuf0 = *((v4df*)spjbufd[0][0]);
3319 // v4df jbuf1 = *((v4df*)spjbufd[0][1]);
3320 // v4df jbuf2 = *((v4df*)spjbufd[0][2]);
3321 //
3322 // for(int j=0; j<nj; j++){
3323 // v4df xj = _mm256_permute4x64_pd(jbuf0, 0x00);
3324 // v4df yj = _mm256_permute4x64_pd(jbuf0, 0x55);
3325 // v4df zj = _mm256_permute4x64_pd(jbuf0, 0xaa);
3326 // jbuf0 = *((v4df*)spjbufd[j+1][0]);
3327 //
3328 // v4df qxx = _mm256_permute4x64_pd(jbuf1, 0x00);
3329 // v4df qyy = _mm256_permute4x64_pd(jbuf1, 0x55);
3330 // v4df qzz = _mm256_permute4x64_pd(jbuf1, 0xaa);
3331 // v4df mj = _mm256_permute4x64_pd(jbuf1, 0xff);
3332 // jbuf1 = *((v4df*)spjbufd[j+1][1]);
3333 //
3334 // v4df qxy = _mm256_permute4x64_pd(jbuf2, 0x00);
3335 // v4df qyz = _mm256_permute4x64_pd(jbuf2, 0x55);
3336 // v4df qzx = _mm256_permute4x64_pd(jbuf2, 0xaa);
3337 // v4df mtr = _mm256_permute4x64_pd(jbuf2, 0xff);
3338 // jbuf2 = *((v4df*)spjbufd[j+1][2]);
3339 //
3340 // v4df dx = xj - xi;
3341 // v4df dy = yj - yi;
3342 // v4df dz = zj - zi;
3343 //
3344 // v4df r2 = ((veps2 + dx*dx) + dy*dy) + dz*dz;
3345 // //v4df ri1 = __builtin_ia32_rsqrtps256(r2);
3346 // v4df ri1 = __builtin_ia32_cvtps2pd256( __builtin_ia32_rsqrtps( __builtin_ia32_cvtpd2ps256(r2)));
3347 //
3348 //#ifdef RSQRT_NR_SPJ_X2
3349 // //x2; the usual 0.5 factor is deferred to the rescaling after the j loop
3350 // v4df v3p0 = {3.0, 3.0, 3.0, 3.0};
3351 // ri1 *= (v3p0 - r2*(ri1*ri1));
3352 //#elif defined(RSQRT_NR_SPJ_X4)
3353 // // x4
3354 // v4df v8p0 = {8.0, 8.0, 8.0, 8.0};
3355 // v4df v6p0 = {6.0, 6.0, 6.0, 6.0};
3356 // v4df v5p0 = {5.0, 5.0, 5.0, 5.0};
3357 // v4df v0p0625 = {1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0};
3358 // v4df v1p0 = {1.0, 1.0, 1.0, 1.0};
3359 // v4df h = v1p0 - r2*(ri1*ri1);
3360 // ri1 *= v1p0 + h*(v8p0+h*(v6p0+v5p0*h))*v0p0625;
3361 //#endif
3362 // v4df ri2 = ri1 * ri1;
3363 // v4df ri3 = ri1 * ri2;
3364 // v4df ri4 = ri2 * ri2;
3365 // v4df ri5 = ri2 * ri3;
3366 //
3367 // v4df qr_x = (qxx*dx + qxy*dy) + qzx*dz;
3368 // v4df qr_y = (qyy*dy + qxy*dx) + qyz*dz;
3369 // v4df qr_z = (qzz*dz + qzx*dx) + qyz*dy;
3370 //
3371 // v4df rqr = ((mtr + qr_x*dx) + qr_y*dy) + qr_z*dz;
3372 // v4df rqr_ri4 = rqr * ri4;
3373 //
3374 // v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
3375 // v4df v2p5 = {2.5, 2.5, 2.5, 2.5};
3376 //
3377 // v4df meff = mj + v0p5 * rqr_ri4;
3378 // v4df meff3 = (mj + v2p5 * rqr_ri4) * ri3;
3379 //
3380 // pot -= meff * ri1;
3381 //
3382 // ax = (ax - ri5*qr_x) + meff3*dx;
3383 // ay = (ay - ri5*qr_y) + meff3*dy;
3384 // az = (az - ri5*qr_z) + meff3*dz;
3385 // }
3386 //
3387 //#ifdef RSQRT_NR_SPJ_X2
3388 // //x2: restore the deferred 0.5 (pot scales as ri1) and 0.5^3 = 0.125 (acc scales as ri1^3)
3389 // v4df v0p5 = {0.5, 0.5, 0.5, 0.5};
3390 // v4df v0p125 = {0.125, 0.125, 0.125, 0.125};
3391 // pot *= v0p5;
3392 // ax *= v0p125;
3393 // ay *= v0p125;
3394 // az *= v0p125;
3395 //#endif
3396 //
3397 // *(v4df *)(&accpbufd[i/8][0][il]) = ax;
3398 // *(v4df *)(&accpbufd[i/8][1][il]) = ay;
3399 // *(v4df *)(&accpbufd[i/8][2][il]) = az;
3400 // *(v4df *)(&accpbufd[i/8][3][il]) = pot;
3401 // }
3402 // }
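The rescaling at the end of this 64-bit kernel is the deferred-constant trick: the X2 branch omits the 0.5 of the Newton-Raphson step, so ri1 carries an extra factor of 2 through the whole j loop, and a single pot *= 0.5 plus acc *= 0.125 = 0.5^3 per i-group replaces one multiply per interaction. A minimal monopole-only sketch (1-D, hypothetical function; not from this header):

    #include <xmmintrin.h>
    // Monopole force with the 0.5 of the NR step hoisted out of the j loop.
    void accumulate_deferred(int nj, const float* xj, const float* mj,
                             float xi, float eps2, float& ax, float& pot) {
        ax = pot = 0.0f;
        for (int j = 0; j < nj; j++) {
            const float dx = xj[j] - xi;
            const float r2 = eps2 + dx * dx;
            float y = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(r2)));
            y *= (3.0f - r2 * y * y);      // NR step without *0.5: y ~ 2/sqrt(r2)
            pot -= mj[j] * y;              // accumulates 2x the true potential
            ax  += mj[j] * y * y * y * dx; // accumulates 2^3 = 8x the true value
        }
        pot *= 0.5f;   // undo the factor 2 once per i-particle
        ax  *= 0.125f; // undo 2^3 once per i-particle
    }

Note that this rescaling is exact only for the monopole terms: the rqr*ri4 and ri5*qr quadrupole contributions pick up higher powers of the deferred factor (16 and 32 respectively), which may be one reason this variant is left commented out.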
3403 //
3404 //
3405 //} __attribute__ ((aligned(128)));
3406 //
3407 //
3408 //
3409 //
3410 //
3411 //
3412 //#endif