PeTar
N-body code for collisional gravitational systems
force_fugaku.hpp
// Generated by PIKG and then modified to fit PeTar
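// SVE force kernels for Fugaku (A64FX), written for 512-bit vectors (16 F32 lanes):
// - CalcForceEpEpWithLinearCutoffFugaku: particle-particle force with a
//   linear cutoff and neighbour counting;
// - CalcForceEpSpQuadFugaku / CalcForceEpSpMonoFugaku: particle-superparticle
//   force for quadrupole or monopole tree cells (selected by USE_QUAD).
// Each functor provides two vectorisation layouts (Kernel_I16_J1 and
// Kernel_I1_J16) and can time both on first use to keep the faster one.
// Note: the kernels allocate scratch buffers as variable-length arrays,
// a GCC/Clang extension rather than standard C++.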
#pragma once
#ifdef USE_FUGAKU
#include<cmath>
#include<limits>
#include<chrono>
#include<algorithm> // std::max, std::min
#include<cassert>   // assert in the copy-back loops
#include<iostream>  // timing diagnostics on std::cout / std::cerr
#include <arm_sve.h>
#define PIKG_USE_FDPS_VECTOR
#include"pikg_vector.hpp"
#include"soft_ptcl.hpp"

#ifdef USE_QUAD
#define SPJSoft PS::SPJQuadrupoleInAndOut
#else
#define SPJSoft PS::SPJMonopoleInAndOut
#endif

struct EPI32{
    PIKG::F32vec pos;
    PIKG::F32 rs;
};

struct EPJ32{
    PIKG::F32vec pos;
    PIKG::F32 mass,rs;
};

struct SPJ32{
    PIKG::F32vec pos;
    PIKG::F32 mass;
#ifdef USE_QUAD
    PIKG::F32 qxx, qyy, qzz, qxy, qxz, qyz;
#endif
};

struct FORCE32{
    PIKG::F32vec acc;
    PIKG::F32 pot;
    PIKG::S32 nnb;
};

struct FORCESP32{
    PIKG::F32vec acc;
    PIKG::F32 pot;
};

struct CalcForceEpEpWithLinearCutoffFugaku{
    PIKG::F32 eps2;
    PIKG::F32 rcut2;
    PIKG::F64 G;

    CalcForceEpEpWithLinearCutoffFugaku(){}

    CalcForceEpEpWithLinearCutoffFugaku(PIKG::F64 eps2_, PIKG::F64 rcut2_, PIKG::F64 G_): eps2(eps2_), rcut2(rcut2_), G(G_) {}

    void initialize(PIKG::F64 eps2_, PIKG::F64 rcut2_, PIKG::F64 G_){
        eps2 = eps2_;
        rcut2 = rcut2_;
        G = G_;
    }

    int kernel_id = 0;

    void operator()(const EPISoft* __restrict__ epi, const int ni, const EPJSoft* __restrict__ epj, const int nj, ForceSoft* __restrict__ force, const int kernel_select = 1){

        static_assert(sizeof(EPI32) == 16,"check consistency of EPI member variable definition between PIKG source and original source");
        static_assert(sizeof(EPJ32) == 20,"check consistency of EPJ member variable definition between PIKG source and original source");
        static_assert(sizeof(FORCE32) == 20,"check consistency of FORCE member variable definition between PIKG source and original source");
        if(kernel_select>=0) kernel_id = kernel_select;
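        // kernel_select >= 0 overrides kernel_id; a value of 0 triggers the
        // benchmark below, which times both layouts on a scratch array and
        // stores the winner in kernel_id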
        if(kernel_id == 0){
            std::cout << "ni: " << ni << " nj:" << nj << std::endl;
            ForceSoft* force_tmp = new ForceSoft[ni];
            std::chrono::system_clock::time_point start, end;
            double min_time = std::numeric_limits<double>::max();
            { // test Kernel_I16_J1
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I16_J1(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 1: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time = elapsed;
                    kernel_id = 1;
                }
            }
            { // test Kernel_I1_J16
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I1_J16(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 2: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time = elapsed;
                    kernel_id = 2;
                }
            }
            delete[] force_tmp;
        } // if(kernel_id == 0)
        if(kernel_id == 1) Kernel_I16_J1(epi,ni,epj,nj,force);
        if(kernel_id == 2) Kernel_I1_J16(epi,ni,epj,nj,force);
    } // operator() definition
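
    // usage sketch (hypothetical caller-side names; eps, r_out, G are not
    // defined in this file):
    //   CalcForceEpEpWithLinearCutoffFugaku pp_kernel;
    //   pp_kernel.initialize(eps*eps, r_out*r_out, G);
    //   pp_kernel(epi_array, n_epi, epj_array, n_epj, force_array, 0); // 0: benchmark both layouts once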

    void Kernel_I16_J1(const EPISoft* __restrict__ epi, const PIKG::S32 ni, const EPJSoft* __restrict__ epj, const PIKG::S32 nj, ForceSoft* __restrict__ force){

        //const PIKG::F32 eps2 = EPISoft::eps * EPISoft::eps;
        //const PIKG::F32 rcut2 = EPISoft::r_out*EPISoft::r_out;
        //const PIKG::F64 G = ForceSoft::grav_const;

        PIKG::S32 i;
        PIKG::S32 j;

        PIKG::S32 ni_act;
        PIKG::S32 nj_act;
        PIKG::S32 epi_list[ni];
        EPI32 epi_loc[ni];
        EPJ32 epj_loc[nj];
        FORCE32 force_loc[ni];

        ni_act=0;
        for(i=0; i<ni; ++i){
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z;
                epi_loc[ni_act].rs = epi[i].r_search;

                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                force_loc[ni_act].nnb = 0;
                ni_act++;
            }
        }

        nj_act=0;
        for(j=0; j<nj; ++j) {
            if (epj[j].mass>0) {
                epj_loc[nj_act].pos.x = epj[j].pos.x;
                epj_loc[nj_act].pos.y = epj[j].pos.y;
                epj_loc[nj_act].pos.z = epj[j].pos.z;
                epj_loc[nj_act].mass = epj[j].mass;
                epj_loc[nj_act].rs = epj[j].r_search;
                nj_act++;
            }
        }

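        // only particles with type==1 and neighbours with mass > 0 were
        // packed into the contiguous 32-bit buffers above; partial forces
        // accumulate in force_loc and are scaled by G on copy-back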
        for(i = 0;i < ((ni_act + 16 - 1)/16)*16;i += 16){
            svbool_t pg0 = svwhilelt_b32_s32(i,ni_act);
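            // 16 i-particles per pass; pg0 masks the ragged tail. EPI32 is
            // 16 bytes (4 words), hence the stride-4 gather indices below.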
            svfloat32_t rsi;

            uint32_t index_gather_load0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load0 = svld1_u32(pg0,index_gather_load0);
            rsi = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].rs),vindex_gather_load0);
            svfloat32x3_t xi;

            uint32_t index_gather_load1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load1 = svld1_u32(pg0,index_gather_load1);
            xi.v0 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.x),vindex_gather_load1);
            uint32_t index_gather_load2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load2 = svld1_u32(pg0,index_gather_load2);
            xi.v1 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.y),vindex_gather_load2);
            uint32_t index_gather_load3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load3 = svld1_u32(pg0,index_gather_load3);
            xi.v2 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.z),vindex_gather_load3);
            svfloat32x3_t acc;

            acc.v0 = svdup_n_f32(0.0f);
            acc.v1 = svdup_n_f32(0.0f);
            acc.v2 = svdup_n_f32(0.0f);
            svint32_t nnbi;

            nnbi = svdup_n_s32(0);
            svfloat32_t pot;

            pot = svdup_n_f32(0.0f);
            for(j = 0;j < nj_act;++j){
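                // scalar j loop: broadcast one j-particle to all lanes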
                svfloat32_t mj = svdup_n_f32(epj_loc[j+0].mass);
                svfloat32_t rsj = svdup_n_f32(epj_loc[j+0].rs);
                svfloat32x3_t xj;
                xj.v0 = svdup_n_f32(epj_loc[j+0].pos.x);
                xj.v1 = svdup_n_f32(epj_loc[j+0].pos.y);
                xj.v2 = svdup_n_f32(epj_loc[j+0].pos.z);

                svfloat32x3_t rij;
                svfloat32_t __fkg_tmp2, __fkg_tmp1;
                svfloat32_t r2, r2_eps, r2_cut;
                svfloat32_t r_inv, r2_inv, mr_inv, mr3_inv;
                svint32_t __fkg_tmp0;

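                // linear cutoff: clamp the softened r^2 from below by rcut2,
                // so any pair closer than rcut is evaluated as if at r = rcut;
                // the un-cut short-range part is handled outside this kernel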
                rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
                rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
                rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
                __fkg_tmp2 = svmul_f32_z(pg0,rij.v1,rij.v1);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,__fkg_tmp2);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp1);
                r2_eps = svadd_f32_z(pg0,r2,svdup_n_f32(eps2));
                r2_cut = max(pg0,r2_eps,svdup_n_f32(rcut2));
                r_inv = rsqrt(pg0,r2_cut);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                mr_inv = svmul_f32_z(pg0,mj,r_inv);
                mr3_inv = svmul_f32_z(pg0,r2_inv,mr_inv);
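                // count j as a neighbour where r2 < max(rs_i, rs_j)^2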
                {
                    svbool_t pg2;
                    pg2 = svcmplt_f32(pg0,r2,max(pg0,svmul_f32_z(pg0,rsi,rsi),svmul_f32_z(pg0,rsj,rsj)));

                    __fkg_tmp0 = svadd_s32_z(pg2,nnbi,svdup_n_s32(1));
                    nnbi = svsel_s32(pg2,__fkg_tmp0,nnbi);
                }

                acc.v0 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v0,acc.v0);
                acc.v1 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v1,acc.v1);
                acc.v2 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v2,acc.v2);
                pot = svsub_f32_z(svptrue_b32(),pot,mr_inv);
            } // loop of j

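            // read-modify-write the lane-wise partial sums into force_loc;
            // FORCE32 is 20 bytes (5 words), hence the stride-5 indices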
            {
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load4[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load4 = svld1_u32(svptrue_b32(),index_gather_load4);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_gather_load4);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,acc.v0);
                uint32_t index_scatter_store0[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store0 = svld1_u32(svptrue_b32(),index_scatter_store0);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_scatter_store0,__fkg_tmp_accum);
            }

            {
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load5[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load5 = svld1_u32(svptrue_b32(),index_gather_load5);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_gather_load5);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,acc.v1);
                uint32_t index_scatter_store1[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store1 = svld1_u32(svptrue_b32(),index_scatter_store1);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_scatter_store1,__fkg_tmp_accum);
            }

            {
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load6[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load6 = svld1_u32(svptrue_b32(),index_gather_load6);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_gather_load6);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,acc.v2);
                uint32_t index_scatter_store2[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store2 = svld1_u32(svptrue_b32(),index_scatter_store2);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_scatter_store2,__fkg_tmp_accum);
            }

            {
                svint32_t __fkg_tmp_accum;
                uint32_t index_gather_load7[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load7 = svld1_u32(svptrue_b32(),index_gather_load7);
                __fkg_tmp_accum = svld1_gather_u32index_s32(svptrue_b32(),((int*)&force_loc[i+0].nnb),vindex_gather_load7);
                __fkg_tmp_accum = svadd_s32_z(svptrue_b32(),__fkg_tmp_accum,nnbi);
                uint32_t index_scatter_store3[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store3 = svld1_u32(svptrue_b32(),index_scatter_store3);
                svst1_scatter_u32index_s32(svptrue_b32(),((int*)&force_loc[i+0].nnb),vindex_scatter_store3,__fkg_tmp_accum);
            }

            {
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load8[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load8 = svld1_u32(svptrue_b32(),index_gather_load8);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_gather_load8);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,pot);
                uint32_t index_scatter_store4[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store4 = svld1_u32(svptrue_b32(),index_scatter_store4);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_scatter_store4,__fkg_tmp_accum);
            }

        } // loop of i

        // copy back
        for(i=0; i<ni_act; ++i){
            j = epi_list[i];
            assert(epi[j].type==1);
            force[j].acc.x += G*force_loc[i].acc.x;
            force[j].acc.y += G*force_loc[i].acc.y;
            force[j].acc.z += G*force_loc[i].acc.z;
            force[j].pot += G*force_loc[i].pot;
            force[j].n_ngb = force_loc[i].nnb;
        }

    } // Kernel_I16_J1 definition

    void Kernel_I1_J16(const EPISoft* __restrict__ epi, const PIKG::S32 ni, const EPJSoft* __restrict__ epj, const PIKG::S32 nj, ForceSoft* __restrict__ force){
        PIKG::S32 i;
        PIKG::S32 j;

        PIKG::S32 ni_act;
        PIKG::S32 nj_act;
        PIKG::S32 epi_list[ni];
        EPI32 epi_loc[ni];
        EPJ32 epj_loc[nj];
        FORCE32 force_loc[ni];

        ni_act=0;
        for(i=0; i<ni; ++i){
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z;
                epi_loc[ni_act].rs = epi[i].r_search;

                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                force_loc[ni_act].nnb = 0;
                ni_act++;
            }
        }

        nj_act=0;
        for(j=0; j<nj; ++j) {
            if (epj[j].mass>0) {
                epj_loc[nj_act].pos.x = epj[j].pos.x;
                epj_loc[nj_act].pos.y = epj[j].pos.y;
                epj_loc[nj_act].pos.z = epj[j].pos.z;
                epj_loc[nj_act].mass = epj[j].mass;
                epj_loc[nj_act].rs = epj[j].r_search;
                nj_act++;
            }
        }

        for(i = 0;i < ni_act;++i){
            svfloat32_t rsi = svdup_n_f32(epi_loc[i+0].rs);
            svfloat32x3_t xi;
            xi.v0 = svdup_n_f32(epi_loc[i+0].pos.x);
            xi.v1 = svdup_n_f32(epi_loc[i+0].pos.y);
            xi.v2 = svdup_n_f32(epi_loc[i+0].pos.z);
            svfloat32x3_t acc;
            acc.v0 = svdup_n_f32(0.0f);
            acc.v1 = svdup_n_f32(0.0f);
            acc.v2 = svdup_n_f32(0.0f);
            svint32_t nnbi = svdup_n_s32(0);
            svfloat32_t pot = svdup_n_f32(0.0f);
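            // dual layout: broadcast one i-particle and sweep 16 j-particles
            // per pass (EPJ32 is 20 bytes -> stride-5 gather indices)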
            for(j = 0;j < ((nj_act + 16 - 1)/16)*16;j += 16){
                svbool_t pg0 = svwhilelt_b32_s32(j,nj_act);
                svfloat32_t mj;
                uint32_t index_gather_load9[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load9 = svld1_u32(pg0,index_gather_load9);
                mj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].mass),vindex_gather_load9);
                svfloat32_t rsj;
                uint32_t index_gather_load10[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load10 = svld1_u32(pg0,index_gather_load10);
                rsj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].rs),vindex_gather_load10);
                svfloat32x3_t xj;
                uint32_t index_gather_load11[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load11 = svld1_u32(pg0,index_gather_load11);
                xj.v0 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.x),vindex_gather_load11);
                uint32_t index_gather_load12[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load12 = svld1_u32(pg0,index_gather_load12);
                xj.v1 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.y),vindex_gather_load12);
                uint32_t index_gather_load13[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load13 = svld1_u32(pg0,index_gather_load13);
                xj.v2 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.z),vindex_gather_load13);

                svfloat32x3_t rij;
                svfloat32_t __fkg_tmp2, __fkg_tmp1;
                svfloat32_t r2, r2_eps, r2_cut;
                svfloat32_t r_inv, r2_inv, mr_inv, mr3_inv;
                svint32_t __fkg_tmp0;

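                // same linear-cutoff evaluation as in Kernel_I16_J1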
                rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
                rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
                rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
                __fkg_tmp2 = svmul_f32_z(pg0,rij.v1,rij.v1);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,__fkg_tmp2);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp1);
                r2_eps = svadd_f32_z(pg0,r2,svdup_n_f32(eps2));
                r2_cut = max(pg0,r2_eps,svdup_n_f32(rcut2));
                r_inv = rsqrt(pg0,r2_cut);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                mr_inv = svmul_f32_z(pg0,mj,r_inv);
                mr3_inv = svmul_f32_z(pg0,r2_inv,mr_inv);
                {
                    svbool_t pg2;
                    pg2 = svcmplt_f32(pg0,r2,max(pg0,svmul_f32_z(pg0,rsi,rsi),svmul_f32_z(pg0,rsj,rsj)));

                    __fkg_tmp0 = svadd_s32_z(pg2,nnbi,svdup_n_s32(1));
                    nnbi = svsel_s32(pg2,__fkg_tmp0,nnbi);
                }

                acc.v0 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v0,acc.v0);
                acc.v1 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v1,acc.v1);
                acc.v2 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v2,acc.v2);
                pot = svsub_f32_z(svptrue_b32(),pot,mr_inv);
            } // loop of j

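            // horizontal add of the 16 lanes into the scalar accumulators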
            ((float*)&force_loc[i+0].acc.x)[0] += svaddv_f32(svptrue_b32(),acc.v0);
            ((float*)&force_loc[i+0].acc.y)[0] += svaddv_f32(svptrue_b32(),acc.v1);
            ((float*)&force_loc[i+0].acc.z)[0] += svaddv_f32(svptrue_b32(),acc.v2);
            ((int*)&force_loc[i+0].nnb)[0] += svaddv_s32(svptrue_b32(),nnbi);
            ((float*)&force_loc[i+0].pot)[0] += svaddv_f32(svptrue_b32(),pot);
        } // loop of i

        // copy back
        for(i=0; i<ni_act; ++i){
            j = epi_list[i];
            if (epi[j].type==1) {
                force[j].acc.x += G*force_loc[i].acc.x;
                force[j].acc.y += G*force_loc[i].acc.y;
                force[j].acc.z += G*force_loc[i].acc.z;
                force[j].pot += G*force_loc[i].pot;
                force[j].n_ngb = force_loc[i].nnb;
            }
        }

    } // Kernel_I1_J16 definition

    PIKG::F64 rsqrt(PIKG::F64 op){ return 1.0/std::sqrt(op); }
    PIKG::F64 sqrt(PIKG::F64 op){ return std::sqrt(op); }
    PIKG::F64 inv(PIKG::F64 op){ return 1.0/op; }
    PIKG::F64 max(PIKG::F64 a,PIKG::F64 b){ return std::max(a,b);}
    PIKG::F64 min(PIKG::F64 a,PIKG::F64 b){ return std::min(a,b);}
    PIKG::F32 rsqrt(PIKG::F32 op){ return 1.f/std::sqrt(op); }
    PIKG::F32 sqrt(PIKG::F32 op){ return std::sqrt(op); }
    PIKG::F32 inv(PIKG::F32 op){ return 1.f/op; }
    PIKG::S64 max(PIKG::S64 a,PIKG::S64 b){ return std::max(a,b);}
    PIKG::S64 min(PIKG::S64 a,PIKG::S64 b){ return std::min(a,b);}
    PIKG::S32 max(PIKG::S32 a,PIKG::S32 b){ return std::max(a,b);}
    PIKG::S32 min(PIKG::S32 a,PIKG::S32 b){ return std::min(a,b);}
    PIKG::F64 table(PIKG::F64 tab[],PIKG::S64 i){ return tab[i]; }
    PIKG::F32 table(PIKG::F32 tab[],PIKG::S32 i){ return tab[i]; }
    PIKG::F64 to_float(PIKG::U64 op){return (PIKG::F64)op;}
    PIKG::F32 to_float(PIKG::U32 op){return (PIKG::F32)op;}
    PIKG::F64 to_float(PIKG::S64 op){return (PIKG::F64)op;}
    PIKG::F32 to_float(PIKG::S32 op){return (PIKG::F32)op;}
    //PIKG::S64 to_int(PIKG::F64 op){return (PIKG::S64)op;}
    //PIKG::S32 to_int(PIKG::F32 op){return (PIKG::S32)op;}
    PIKG::U64 to_uint(PIKG::F64 op){return (PIKG::U64)op;}
    PIKG::U32 to_uint(PIKG::F32 op){return (PIKG::U32)op;}
    template<typename T> PIKG::F64 to_f64(const T& op){return (PIKG::F64)op;}
    template<typename T> PIKG::F32 to_f32(const T& op){return (PIKG::F32)op;}
    template<typename T> PIKG::S64 to_s64(const T& op){return (PIKG::S64)op;}
    template<typename T> PIKG::S32 to_s32(const T& op){return (PIKG::S32)op;}
    template<typename T> PIKG::U64 to_u64(const T& op){return (PIKG::U64)op;}
    template<typename T> PIKG::U32 to_u32(const T& op){return (PIKG::U32)op;}
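    // vector rsqrt: one polynomial refinement of the hardware estimate;
    // with h = 1 - op*rinv^2 the update is rinv *= 1 + h*(0.5 + 0.375*h)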
    svfloat32_t rsqrt(svbool_t pg,svfloat32_t op){
        svfloat32_t rinv = svrsqrte_f32(op);
        svfloat32_t h = svmul_f32_z(pg,op,rinv);
        h = svmsb_n_f32_z(pg,h,rinv,1.f);
        svfloat32_t poly = svmad_n_f32_z(pg,h,svdup_f32(0.375f),0.5f);
        poly = svmul_f32_z(pg,poly,h);
        rinv = svmad_f32_z(pg,rinv,poly,rinv);
        return rinv;
    }
    svfloat64_t rsqrt(svbool_t pg,svfloat64_t op){
        svfloat64_t rinv = svrsqrte_f64(op);
        svfloat64_t h = svmul_f64_z(pg,op,rinv);
        h = svmsb_n_f64_z(pg,h,rinv,1.0);
        svfloat64_t poly = svmad_n_f64_z(pg,h,svdup_f64(0.375),0.5);
        poly = svmul_f64_z(pg,poly,h);
        rinv = svmad_f64_z(pg,rinv,poly,rinv);
        return rinv;
    }
    svfloat32x2_t svdup_n_f32x2(PIKG::F32vec2 v){
        svfloat32x2_t ret;
        ret.v0 = svdup_n_f32(v.x);
        ret.v1 = svdup_n_f32(v.y);
        return ret;
    }
    svfloat32x3_t svdup_n_f32x3(PIKG::F32vec v){
        svfloat32x3_t ret;
        ret.v0 = svdup_n_f32(v.x);
        ret.v1 = svdup_n_f32(v.y);
        ret.v2 = svdup_n_f32(v.z);
        return ret;
    }
    svfloat32x4_t svdup_n_f32x4(PIKG::F32vec4 v){
        svfloat32x4_t ret;
        ret.v0 = svdup_n_f32(v.x);
        ret.v1 = svdup_n_f32(v.y);
        ret.v2 = svdup_n_f32(v.z);
        ret.v3 = svdup_n_f32(v.w);
        return ret;
    }
    svfloat64x2_t svdup_n_f64x2(PIKG::F64vec2 v){
        svfloat64x2_t ret;
        ret.v0 = svdup_n_f64(v.x);
        ret.v1 = svdup_n_f64(v.y);
        return ret;
    }
    svfloat64x3_t svdup_n_f64x3(PIKG::F64vec v){
        svfloat64x3_t ret;
        ret.v0 = svdup_n_f64(v.x);
        ret.v1 = svdup_n_f64(v.y);
        ret.v2 = svdup_n_f64(v.z);
        return ret;
    }
    svfloat64x4_t svdup_n_f64x4(PIKG::F64vec4 v){
        svfloat64x4_t ret;
        ret.v0 = svdup_n_f64(v.x);
        ret.v1 = svdup_n_f64(v.y);
        ret.v2 = svdup_n_f64(v.z);
        ret.v3 = svdup_n_f64(v.w);
        return ret;
    }
    svfloat32_t sqrt(svbool_t pg,svfloat32_t op){ return svsqrt_f32_z(pg,op); }
    svfloat64_t sqrt(svbool_t pg,svfloat64_t op){ return svsqrt_f64_z(pg,op); }
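    // vector reciprocal: two Newton iterations x <- x*(2 - op*x) on the
    // hardware estimate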
    svfloat32_t inv(svbool_t pg,svfloat32_t op){
        svfloat32_t x1 = svrecpe_f32(op);
        svfloat32_t x2 = svmsb_n_f32_z(pg,op,x1,2.f);
        x2 = svmul_f32_z(pg,x2,x1);
        svfloat32_t ret = svmsb_n_f32_z(pg,op,x2,2.f);
        ret = svmul_f32_z(pg,ret,x2);
        return ret;
    }
    svfloat64_t inv(svbool_t pg,svfloat64_t op){
        svfloat64_t x1 = svrecpe_f64(op);
        svfloat64_t x2 = svmsb_n_f64_z(pg,op,x1,2.0);
        x2 = svmul_f64_z(pg,x2,x1);
        svfloat64_t ret = svmsb_n_f64_z(pg,op,x2,2.0);
        ret = svmul_f64_z(pg,ret,x2);
        return ret;
    }
    svfloat64_t max(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmax_f64_z(pg,a,b);}
    svfloat64_t min(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmin_f64_z(pg,a,b);}
    svuint64_t max(svbool_t pg,svuint64_t a,svuint64_t b){ return svmax_u64_z(pg,a,b);}
    svuint64_t min(svbool_t pg,svuint64_t a,svuint64_t b){ return svmin_u64_z(pg,a,b);}
    svint64_t max(svbool_t pg,svint64_t a,svint64_t b){ return svmax_s64_z(pg,a,b);}
    svint64_t min(svbool_t pg,svint64_t a,svint64_t b){ return svmin_s64_z(pg,a,b);}
    svfloat32_t max(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmax_f32_z(pg,a,b);}
    svfloat32_t min(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmin_f32_z(pg,a,b);}
    svuint32_t max(svbool_t pg,svuint32_t a,svuint32_t b){ return svmax_u32_z(pg,a,b);}
    svuint32_t min(svbool_t pg,svuint32_t a,svuint32_t b){ return svmin_u32_z(pg,a,b);}
    svint32_t max(svbool_t pg,svint32_t a,svint32_t b){ return svmax_s32_z(pg,a,b);}
    svint32_t min(svbool_t pg,svint32_t a,svint32_t b){ return svmin_s32_z(pg,a,b);}
    svfloat64_t table(svfloat64_t tab,svuint64_t index){ return svtbl_f64(tab,index);}
    svfloat32_t table(svfloat32_t tab,svuint32_t index){ return svtbl_f32(tab,index);}
    svfloat64_t to_float(svbool_t pg, svint64_t op){ return svcvt_f64_s64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg, svint32_t op){ return svcvt_f32_s32_z(pg,op);}
    svfloat64_t to_float(svbool_t pg,svuint64_t op){ return svcvt_f64_u64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svuint32_t op){ return svcvt_f32_u32_z(pg,op);}
    //svint64_t to_int(svbool_t pg,svfloat64_t op){ return svcvt_s64_f64_z(pg,op);}
    //svint32_t to_int(svbool_t pg,svfloat32_t op){ return svcvt_s32_f32_z(pg,op);}
    svuint64_t to_uint(svbool_t pg,svfloat64_t op){ return svcvt_u64_f64_z(pg,op);}
    svuint32_t to_uint(svbool_t pg,svfloat32_t op){ return svcvt_u32_f32_z(pg,op);}
}; // kernel functor definition

#ifdef USE_QUAD

struct CalcForceEpSpQuadFugaku{
    PIKG::F32 eps2;
    PIKG::F64 G;
    CalcForceEpSpQuadFugaku(){}
    CalcForceEpSpQuadFugaku(PIKG::F64 eps2,PIKG::F64 G):eps2(eps2),G(G){}
    void initialize(PIKG::F64 eps2_,PIKG::F64 G_){
        eps2 = eps2_;
        G = G_;
    }
    int kernel_id = 0;
    void operator()(const EPISoft* __restrict__ epi,const int ni,const SPJSoft* __restrict__ epj,const int nj, ForceSoft* __restrict__ force,const int kernel_select = 1){
        static_assert(sizeof(EPI32) == 16,"check consistency of EPI member variable definition between PIKG source and original source");
        static_assert(sizeof(SPJ32) == 40,"check consistency of SPJ member variable definition between PIKG source and original source");
        static_assert(sizeof(FORCESP32) == 16,"check consistency of FORCE member variable definition between PIKG source and original source");
        if(kernel_select>=0) kernel_id = kernel_select;
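        // same auto-tuning scheme as in the EP-EP functor above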
        if(kernel_id == 0){
            std::cout << "ni: " << ni << " nj:" << nj << std::endl;
            ForceSoft* force_tmp = new ForceSoft[ni];
            std::chrono::system_clock::time_point start, end;
            double min_time = std::numeric_limits<double>::max();
            { // test Kernel_I16_J1
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I16_J1(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 1: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time = elapsed;
                    kernel_id = 1;
                }
            }
            { // test Kernel_I1_J16
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I1_J16(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 2: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time = elapsed;
                    kernel_id = 2;
                }
            }
            delete[] force_tmp;
        } // if(kernel_id == 0)
        if(kernel_id == 1) Kernel_I16_J1(epi,ni,epj,nj,force);
        if(kernel_id == 2) Kernel_I1_J16(epi,ni,epj,nj,force);
    } // operator() definition

    void Kernel_I16_J1(const EPISoft* __restrict__ epi,const int ni,const SPJSoft* __restrict__ epj,const int nj, ForceSoft* __restrict__ force){
        PIKG::S32 i;
        PIKG::S32 j;

        PIKG::S32 ni_act;
        PIKG::S32 epi_list[ni];
        EPI32 epi_loc[ni];
        SPJ32 epj_loc[nj];
        FORCESP32 force_loc[ni];

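        // store positions relative to epi[0]: the offsets are much smaller
        // than the absolute coordinates, which keeps the F32 kernel accurate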
        ni_act=0;
        for(i=0; i<ni; ++i){
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x - epi[0].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y - epi[0].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z - epi[0].pos.z;

                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                ni_act++;
            }
        }

        for(j=0; j<nj; ++j) {
            epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
            epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
            epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
            epj_loc[j].mass = epj[j].mass;
            epj_loc[j].qxx = epj[j].quad.xx;
            epj_loc[j].qyy = epj[j].quad.yy;
            epj_loc[j].qzz = epj[j].quad.zz;
            epj_loc[j].qxy = epj[j].quad.xy;
            epj_loc[j].qxz = epj[j].quad.xz;
            epj_loc[j].qyz = epj[j].quad.yz;
        }

        for(i = 0;i < ((ni_act + 16 - 1)/16)*16;i += 16){
            svbool_t pg0 = svwhilelt_b32_s32(i,ni_act);
            svfloat32x3_t xi;

            uint32_t index_gather_load0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load0 = svld1_u32(pg0,index_gather_load0);
            xi.v0 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.x),vindex_gather_load0);
            uint32_t index_gather_load1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load1 = svld1_u32(pg0,index_gather_load1);
            xi.v1 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.y),vindex_gather_load1);
            uint32_t index_gather_load2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load2 = svld1_u32(pg0,index_gather_load2);
            xi.v2 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.z),vindex_gather_load2);
            svfloat32x3_t af;

            af.v0 = svdup_n_f32(0.0f);
            af.v1 = svdup_n_f32(0.0f);
            af.v2 = svdup_n_f32(0.0f);
            svfloat32_t pf;

            pf = svdup_n_f32(0.0f);
            for(j = 0;j < nj;++j){
                svfloat32_t mj = svdup_n_f32(epj_loc[j+0].mass);
                svfloat32_t qj_xx = svdup_n_f32(epj_loc[j+0].qxx);
                svfloat32_t qj_xy = svdup_n_f32(epj_loc[j+0].qxy);
                svfloat32_t qj_yy = svdup_n_f32(epj_loc[j+0].qyy);
                svfloat32_t qj_yz = svdup_n_f32(epj_loc[j+0].qyz);
                svfloat32_t qj_zx = svdup_n_f32(epj_loc[j+0].qxz);
                svfloat32_t qj_zz = svdup_n_f32(epj_loc[j+0].qzz);
                svfloat32x3_t xj;
                xj.v0 = svdup_n_f32(epj_loc[j+0].pos.x);
                xj.v1 = svdup_n_f32(epj_loc[j+0].pos.y);
                xj.v2 = svdup_n_f32(epj_loc[j+0].pos.z);

                svfloat32x3_t rij, qr;
                svfloat32_t r2, r_inv, r2_inv, r3_inv, r4_inv, r5_inv;
                svfloat32_t tmp, tr, qxx, qyy, qzz, qxy, qyz, qxz;
                svfloat32_t mtr, rqr, rqr_r4_inv, meff, meff3;
                svfloat32_t __fkg_tmp0, __fkg_tmp1, __fkg_tmp2, __fkg_tmp3, __fkg_tmp4;
                svfloat32_t __fkg_tmp5, __fkg_tmp6, __fkg_tmp7, __fkg_tmp8, __fkg_tmp9;
                svfloat32_t __fkg_tmp10, __fkg_tmp11, __fkg_tmp12, __fkg_tmp13, __fkg_tmp14;
                svfloat32_t __fkg_tmp15, __fkg_tmp16, __fkg_tmp17;

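                // traceless quadrupole Q~ = 3Q - tr(Q) I acting on rij = xj - xi:
                //   rqr  = rij.Q~.rij - eps2*tr(Q)
                //   pot -= (m + rqr/(2 r^4)) / r
                //   acc += (m + 5 rqr/(2 r^4)) rij / r^3 - (Q~ rij) / r^5
                // (r_inv from the hardware estimate gets one Newton step first)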
                rij.v0 = svsub_f32_z(pg0,xj.v0,xi.v0);
                rij.v1 = svsub_f32_z(pg0,xj.v1,xi.v1);
                rij.v2 = svsub_f32_z(pg0,xj.v2,xi.v2);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,svdup_n_f32(eps2));
                __fkg_tmp0 = svmad_f32_z(pg0,rij.v1,rij.v1,__fkg_tmp1);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp0);
                r_inv = rsqrt(pg0,r2);
                __fkg_tmp2 = svmul_f32_z(pg0,r_inv,r_inv);
                tmp = svmsb_f32_z(pg0,r2,__fkg_tmp2,svdup_n_f32(3.0f));
                __fkg_tmp3 = svmul_f32_z(pg0,tmp,svdup_n_f32(0.5f));
                r_inv = svmul_f32_z(pg0,r_inv,__fkg_tmp3);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                r3_inv = svmul_f32_z(pg0,r2_inv,r_inv);
                r4_inv = svmul_f32_z(pg0,r2_inv,r2_inv);
                r5_inv = svmul_f32_z(pg0,r2_inv,r3_inv);
                __fkg_tmp4 = svadd_f32_z(pg0,qj_xx,qj_yy);
                tr = svadd_f32_z(pg0,__fkg_tmp4,qj_zz);
                qxx = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_xx,tr);
                qyy = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_yy,tr);
                qzz = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_zz,tr);
                qxy = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_xy);
                qyz = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_yz);
                qxz = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_zx);
                __fkg_tmp5 = svmul_f32_z(pg0,svdup_n_f32(eps2),tr);
                mtr = svneg_f32_z(pg0,__fkg_tmp5);
                __fkg_tmp7 = svmul_f32_z(pg0,qxx,rij.v0);
                __fkg_tmp6 = svmad_f32_z(pg0,qxy,rij.v1,__fkg_tmp7);
                qr.v0 = svmad_f32_z(pg0,qxz,rij.v2,__fkg_tmp6);
                __fkg_tmp9 = svmul_f32_z(pg0,qyy,rij.v1);
                __fkg_tmp8 = svmad_f32_z(pg0,qyz,rij.v2,__fkg_tmp9);
                qr.v1 = svmad_f32_z(pg0,qxy,rij.v0,__fkg_tmp8);
                __fkg_tmp11 = svmul_f32_z(pg0,qzz,rij.v2);
                __fkg_tmp10 = svmad_f32_z(pg0,qxz,rij.v0,__fkg_tmp11);
                qr.v2 = svmad_f32_z(pg0,qyz,rij.v1,__fkg_tmp10);
                __fkg_tmp13 = svmad_f32_z(pg0,qr.v0,rij.v0,mtr);
                __fkg_tmp12 = svmad_f32_z(pg0,qr.v1,rij.v1,__fkg_tmp13);
                rqr = svmad_f32_z(pg0,qr.v2,rij.v2,__fkg_tmp12);
                rqr_r4_inv = svmul_f32_z(pg0,rqr,r4_inv);
                meff = svmad_f32_z(pg0,svdup_n_f32(0.5f),rqr_r4_inv,mj);
                __fkg_tmp14 = svmad_f32_z(pg0,svdup_n_f32(2.5f),rqr_r4_inv,mj);
                meff3 = svmul_f32_z(pg0,__fkg_tmp14,r3_inv);
                pf = svmsb_f32_z(pg0,meff,r_inv,pf);
                __fkg_tmp15 = svmsb_f32_z(pg0,r5_inv,qr.v0,af.v0);
                af.v0 = svmad_f32_z(pg0,meff3,rij.v0,__fkg_tmp15);
                __fkg_tmp16 = svmsb_f32_z(pg0,r5_inv,qr.v1,af.v1);
                af.v1 = svmad_f32_z(pg0,meff3,rij.v1,__fkg_tmp16);
                __fkg_tmp17 = svmsb_f32_z(pg0,r5_inv,qr.v2,af.v2);
                af.v2 = svmad_f32_z(pg0,meff3,rij.v2,__fkg_tmp17);
            } // loop of j

            {
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load3 = svld1_u32(svptrue_b32(),index_gather_load3);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_gather_load3);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,af.v0);
                uint32_t index_scatter_store0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store0 = svld1_u32(svptrue_b32(),index_scatter_store0);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_scatter_store0,__fkg_tmp_accum);
            }

            {
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load4[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load4 = svld1_u32(svptrue_b32(),index_gather_load4);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_gather_load4);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,af.v1);
                uint32_t index_scatter_store1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store1 = svld1_u32(svptrue_b32(),index_scatter_store1);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_scatter_store1,__fkg_tmp_accum);
            }

            {
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load5[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load5 = svld1_u32(svptrue_b32(),index_gather_load5);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_gather_load5);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,af.v2);
                uint32_t index_scatter_store2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store2 = svld1_u32(svptrue_b32(),index_scatter_store2);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_scatter_store2,__fkg_tmp_accum);
            }

            {
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load6[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load6 = svld1_u32(svptrue_b32(),index_gather_load6);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_gather_load6);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,pf);
                uint32_t index_scatter_store3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store3 = svld1_u32(svptrue_b32(),index_scatter_store3);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_scatter_store3,__fkg_tmp_accum);
            }

        } // loop of i

        // copy back
        for(i=0; i<ni_act; ++i){
            j = epi_list[i];
            if (epi[j].type==1) {
                force[j].acc.x += G*force_loc[i].acc.x;
                force[j].acc.y += G*force_loc[i].acc.y;
                force[j].acc.z += G*force_loc[i].acc.z;
                force[j].pot += G*force_loc[i].pot;
            }
        }
    } // Kernel_I16_J1 definition

    void Kernel_I1_J16(const EPISoft* __restrict__ epi, const PIKG::S32 ni, const SPJSoft* __restrict__ epj, const PIKG::S32 nj, ForceSoft* __restrict__ force){
        PIKG::S32 i;
        PIKG::S32 j;

        PIKG::S32 ni_act;
        PIKG::S32 epi_list[ni];
        EPI32 epi_loc[ni];
        SPJ32 epj_loc[nj];
        FORCESP32 force_loc[ni];

        ni_act=0;
        for(i=0; i<ni; ++i){
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x - epi[0].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y - epi[0].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z - epi[0].pos.z;

                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                ni_act++;
            }
        }

        for(j=0; j<nj; ++j) {
            epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
            epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
            epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
            epj_loc[j].mass = epj[j].mass;
            epj_loc[j].qxx = epj[j].quad.xx;
            epj_loc[j].qyy = epj[j].quad.yy;
            epj_loc[j].qzz = epj[j].quad.zz;
            epj_loc[j].qxy = epj[j].quad.xy;
            epj_loc[j].qxz = epj[j].quad.xz;
            epj_loc[j].qyz = epj[j].quad.yz;
        }

        for(i = 0;i < ni_act;++i){
            svfloat32x3_t xi;
            xi.v0 = svdup_n_f32(epi_loc[i+0].pos.x);
            xi.v1 = svdup_n_f32(epi_loc[i+0].pos.y);
            xi.v2 = svdup_n_f32(epi_loc[i+0].pos.z);
            svfloat32x3_t af;
            af.v0 = svdup_n_f32(0.0f);
            af.v1 = svdup_n_f32(0.0f);
            af.v2 = svdup_n_f32(0.0f);
            svfloat32_t pf = svdup_n_f32(0.0f);
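            // one i-particle broadcast to all lanes, 16 cells per pass; the
            // quadrupole SPJ32 is 40 bytes (10 words), hence the stride-10
            // gather indices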
            for(j = 0;j < ((nj + 16 - 1)/16)*16;j += 16){
                svbool_t pg0 = svwhilelt_b32_s32(j,nj);
                svfloat32_t mj;
                uint32_t index_gather_load7[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load7 = svld1_u32(pg0,index_gather_load7);
                mj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].mass),vindex_gather_load7);
                svfloat32_t qj_xx;
                uint32_t index_gather_load8[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load8 = svld1_u32(pg0,index_gather_load8);
                qj_xx = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qxx),vindex_gather_load8);
                svfloat32_t qj_xy;
                uint32_t index_gather_load9[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load9 = svld1_u32(pg0,index_gather_load9);
                qj_xy = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qxy),vindex_gather_load9);
                svfloat32_t qj_yy;
                uint32_t index_gather_load10[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load10 = svld1_u32(pg0,index_gather_load10);
                qj_yy = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qyy),vindex_gather_load10);
                svfloat32_t qj_yz;
                uint32_t index_gather_load11[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load11 = svld1_u32(pg0,index_gather_load11);
                qj_yz = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qyz),vindex_gather_load11);
                svfloat32_t qj_zx;
                uint32_t index_gather_load12[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load12 = svld1_u32(pg0,index_gather_load12);
                qj_zx = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qxz),vindex_gather_load12);
                svfloat32_t qj_zz;
                uint32_t index_gather_load13[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load13 = svld1_u32(pg0,index_gather_load13);
                qj_zz = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qzz),vindex_gather_load13);
                svfloat32x3_t xj;
                uint32_t index_gather_load14[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load14 = svld1_u32(pg0,index_gather_load14);
                xj.v0 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.x),vindex_gather_load14);
                uint32_t index_gather_load15[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load15 = svld1_u32(pg0,index_gather_load15);
                xj.v1 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.y),vindex_gather_load15);
                uint32_t index_gather_load16[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load16 = svld1_u32(pg0,index_gather_load16);
                xj.v2 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.z),vindex_gather_load16);

                svfloat32x3_t rij, qr;
                svfloat32_t r2, r_inv, r2_inv, r3_inv, r4_inv, r5_inv;
                svfloat32_t tmp, tr, qxx, qyy, qzz, qxy, qyz, qxz;
                svfloat32_t mtr, rqr, rqr_r4_inv, meff, meff3;
                svfloat32_t __fkg_tmp0, __fkg_tmp1, __fkg_tmp2, __fkg_tmp3, __fkg_tmp4;
                svfloat32_t __fkg_tmp5, __fkg_tmp6, __fkg_tmp7, __fkg_tmp8, __fkg_tmp9;
                svfloat32_t __fkg_tmp10, __fkg_tmp11, __fkg_tmp12, __fkg_tmp13, __fkg_tmp14;
                svfloat32_t __fkg_tmp15, __fkg_tmp16, __fkg_tmp17;

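                // same quadrupole evaluation as in Kernel_I16_J1 above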
                rij.v0 = svsub_f32_z(pg0,xj.v0,xi.v0);
                rij.v1 = svsub_f32_z(pg0,xj.v1,xi.v1);
                rij.v2 = svsub_f32_z(pg0,xj.v2,xi.v2);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,svdup_n_f32(eps2));
                __fkg_tmp0 = svmad_f32_z(pg0,rij.v1,rij.v1,__fkg_tmp1);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp0);
                r_inv = rsqrt(pg0,r2);
                __fkg_tmp2 = svmul_f32_z(pg0,r_inv,r_inv);
                tmp = svmsb_f32_z(pg0,r2,__fkg_tmp2,svdup_n_f32(3.0f));
                __fkg_tmp3 = svmul_f32_z(pg0,tmp,svdup_n_f32(0.5f));
                r_inv = svmul_f32_z(pg0,r_inv,__fkg_tmp3);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                r3_inv = svmul_f32_z(pg0,r2_inv,r_inv);
                r4_inv = svmul_f32_z(pg0,r2_inv,r2_inv);
                r5_inv = svmul_f32_z(pg0,r2_inv,r3_inv);
                __fkg_tmp4 = svadd_f32_z(pg0,qj_xx,qj_yy);
                tr = svadd_f32_z(pg0,__fkg_tmp4,qj_zz);
                qxx = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_xx,tr);
                qyy = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_yy,tr);
                qzz = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_zz,tr);
                qxy = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_xy);
                qyz = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_yz);
                qxz = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_zx);
                __fkg_tmp5 = svmul_f32_z(pg0,svdup_n_f32(eps2),tr);
                mtr = svneg_f32_z(pg0,__fkg_tmp5);
                __fkg_tmp7 = svmul_f32_z(pg0,qxx,rij.v0);
                __fkg_tmp6 = svmad_f32_z(pg0,qxy,rij.v1,__fkg_tmp7);
                qr.v0 = svmad_f32_z(pg0,qxz,rij.v2,__fkg_tmp6);
                __fkg_tmp9 = svmul_f32_z(pg0,qyy,rij.v1);
                __fkg_tmp8 = svmad_f32_z(pg0,qyz,rij.v2,__fkg_tmp9);
                qr.v1 = svmad_f32_z(pg0,qxy,rij.v0,__fkg_tmp8);
                __fkg_tmp11 = svmul_f32_z(pg0,qzz,rij.v2);
                __fkg_tmp10 = svmad_f32_z(pg0,qxz,rij.v0,__fkg_tmp11);
                qr.v2 = svmad_f32_z(pg0,qyz,rij.v1,__fkg_tmp10);
                __fkg_tmp13 = svmad_f32_z(pg0,qr.v0,rij.v0,mtr);
                __fkg_tmp12 = svmad_f32_z(pg0,qr.v1,rij.v1,__fkg_tmp13);
                rqr = svmad_f32_z(pg0,qr.v2,rij.v2,__fkg_tmp12);
                rqr_r4_inv = svmul_f32_z(pg0,rqr,r4_inv);
                meff = svmad_f32_z(pg0,svdup_n_f32(0.5f),rqr_r4_inv,mj);
                __fkg_tmp14 = svmad_f32_z(pg0,svdup_n_f32(2.5f),rqr_r4_inv,mj);
                meff3 = svmul_f32_z(pg0,__fkg_tmp14,r3_inv);
                pf = svmsb_f32_z(pg0,meff,r_inv,pf);
                __fkg_tmp15 = svmsb_f32_z(pg0,r5_inv,qr.v0,af.v0);
                af.v0 = svmad_f32_z(pg0,meff3,rij.v0,__fkg_tmp15);
                __fkg_tmp16 = svmsb_f32_z(pg0,r5_inv,qr.v1,af.v1);
                af.v1 = svmad_f32_z(pg0,meff3,rij.v1,__fkg_tmp16);
                __fkg_tmp17 = svmsb_f32_z(pg0,r5_inv,qr.v2,af.v2);
                af.v2 = svmad_f32_z(pg0,meff3,rij.v2,__fkg_tmp17);
            } // loop of j

            ((float*)&force_loc[i+0].acc.x)[0] += svaddv_f32(svptrue_b32(),af.v0);
            ((float*)&force_loc[i+0].acc.y)[0] += svaddv_f32(svptrue_b32(),af.v1);
            ((float*)&force_loc[i+0].acc.z)[0] += svaddv_f32(svptrue_b32(),af.v2);
            ((float*)&force_loc[i+0].pot)[0] += svaddv_f32(svptrue_b32(),pf);

        } // loop of i
        // copy back
        for(i=0; i<ni_act; ++i){
            j = epi_list[i];
            if (epi[j].type==1) {
                force[j].acc.x += G*force_loc[i].acc.x;
                force[j].acc.y += G*force_loc[i].acc.y;
                force[j].acc.z += G*force_loc[i].acc.z;
                force[j].pot += G*force_loc[i].pot;
            }
        }

    } // Kernel_I1_J16 definition

    PIKG::F64 rsqrt(PIKG::F64 op){ return 1.0/std::sqrt(op); }
    PIKG::F64 sqrt(PIKG::F64 op){ return std::sqrt(op); }
    PIKG::F64 inv(PIKG::F64 op){ return 1.0/op; }
    PIKG::F64 max(PIKG::F64 a,PIKG::F64 b){ return std::max(a,b);}
    PIKG::F64 min(PIKG::F64 a,PIKG::F64 b){ return std::min(a,b);}
    PIKG::F32 rsqrt(PIKG::F32 op){ return 1.f/std::sqrt(op); }
    PIKG::F32 sqrt(PIKG::F32 op){ return std::sqrt(op); }
    PIKG::F32 inv(PIKG::F32 op){ return 1.f/op; }
    PIKG::S64 max(PIKG::S64 a,PIKG::S64 b){ return std::max(a,b);}
    PIKG::S64 min(PIKG::S64 a,PIKG::S64 b){ return std::min(a,b);}
    PIKG::S32 max(PIKG::S32 a,PIKG::S32 b){ return std::max(a,b);}
    PIKG::S32 min(PIKG::S32 a,PIKG::S32 b){ return std::min(a,b);}
    PIKG::F64 table(PIKG::F64 tab[],PIKG::S64 i){ return tab[i]; }
    PIKG::F32 table(PIKG::F32 tab[],PIKG::S32 i){ return tab[i]; }
    PIKG::F64 to_float(PIKG::U64 op){return (PIKG::F64)op;}
    PIKG::F32 to_float(PIKG::U32 op){return (PIKG::F32)op;}
    PIKG::F64 to_float(PIKG::S64 op){return (PIKG::F64)op;}
    PIKG::F32 to_float(PIKG::S32 op){return (PIKG::F32)op;}
    //PIKG::S64 to_int(PIKG::F64 op){return (PIKG::S64)op;}
    //PIKG::S32 to_int(PIKG::F32 op){return (PIKG::S32)op;}
    PIKG::U64 to_uint(PIKG::F64 op){return (PIKG::U64)op;}
    PIKG::U32 to_uint(PIKG::F32 op){return (PIKG::U32)op;}
    template<typename T> PIKG::F64 to_f64(const T& op){return (PIKG::F64)op;}
    template<typename T> PIKG::F32 to_f32(const T& op){return (PIKG::F32)op;}
    template<typename T> PIKG::S64 to_s64(const T& op){return (PIKG::S64)op;}
    template<typename T> PIKG::S32 to_s32(const T& op){return (PIKG::S32)op;}
    template<typename T> PIKG::U64 to_u64(const T& op){return (PIKG::U64)op;}
    template<typename T> PIKG::U32 to_u32(const T& op){return (PIKG::U32)op;}
    svfloat32_t rsqrt(svbool_t pg,svfloat32_t op){
        svfloat32_t rinv = svrsqrte_f32(op);
        svfloat32_t h = svmul_f32_z(pg,op,rinv);
        h = svmsb_n_f32_z(pg,h,rinv,1.f);
        svfloat32_t poly = svmad_n_f32_z(pg,h,svdup_f32(0.375f),0.5f);
        poly = svmul_f32_z(pg,poly,h);
        rinv = svmad_f32_z(pg,rinv,poly,rinv);
        return rinv;
    }
    svfloat64_t rsqrt(svbool_t pg,svfloat64_t op){
        svfloat64_t rinv = svrsqrte_f64(op);
        svfloat64_t h = svmul_f64_z(pg,op,rinv);
        h = svmsb_n_f64_z(pg,h,rinv,1.0);
        svfloat64_t poly = svmad_n_f64_z(pg,h,svdup_f64(0.375),0.5);
        poly = svmul_f64_z(pg,poly,h);
        rinv = svmad_f64_z(pg,rinv,poly,rinv);
        return rinv;
    }
    svfloat32x2_t svdup_n_f32x2(PIKG::F32vec2 v){
        svfloat32x2_t ret;
        ret.v0 = svdup_n_f32(v.x);
        ret.v1 = svdup_n_f32(v.y);
        return ret;
    }
    svfloat32x3_t svdup_n_f32x3(PIKG::F32vec v){
        svfloat32x3_t ret;
        ret.v0 = svdup_n_f32(v.x);
        ret.v1 = svdup_n_f32(v.y);
        ret.v2 = svdup_n_f32(v.z);
        return ret;
    }
    svfloat32x4_t svdup_n_f32x4(PIKG::F32vec4 v){
        svfloat32x4_t ret;
        ret.v0 = svdup_n_f32(v.x);
        ret.v1 = svdup_n_f32(v.y);
        ret.v2 = svdup_n_f32(v.z);
        ret.v3 = svdup_n_f32(v.w);
        return ret;
    }
    svfloat64x2_t svdup_n_f64x2(PIKG::F64vec2 v){
        svfloat64x2_t ret;
        ret.v0 = svdup_n_f64(v.x);
        ret.v1 = svdup_n_f64(v.y);
        return ret;
    }
    svfloat64x3_t svdup_n_f64x3(PIKG::F64vec v){
        svfloat64x3_t ret;
        ret.v0 = svdup_n_f64(v.x);
        ret.v1 = svdup_n_f64(v.y);
        ret.v2 = svdup_n_f64(v.z);
        return ret;
    }
    svfloat64x4_t svdup_n_f64x4(PIKG::F64vec4 v){
        svfloat64x4_t ret;
        ret.v0 = svdup_n_f64(v.x);
        ret.v1 = svdup_n_f64(v.y);
        ret.v2 = svdup_n_f64(v.z);
        ret.v3 = svdup_n_f64(v.w);
        return ret;
    }
    svfloat32_t sqrt(svbool_t pg,svfloat32_t op){ return svsqrt_f32_z(pg,op); }
    svfloat64_t sqrt(svbool_t pg,svfloat64_t op){ return svsqrt_f64_z(pg,op); }
    svfloat32_t inv(svbool_t pg,svfloat32_t op){
        svfloat32_t x1 = svrecpe_f32(op);
        svfloat32_t x2 = svmsb_n_f32_z(pg,op,x1,2.f);
        x2 = svmul_f32_z(pg,x2,x1);
        svfloat32_t ret = svmsb_n_f32_z(pg,op,x2,2.f);
        ret = svmul_f32_z(pg,ret,x2);
        return ret;
    }
    svfloat64_t inv(svbool_t pg,svfloat64_t op){
        svfloat64_t x1 = svrecpe_f64(op);
        svfloat64_t x2 = svmsb_n_f64_z(pg,op,x1,2.0);
        x2 = svmul_f64_z(pg,x2,x1);
        svfloat64_t ret = svmsb_n_f64_z(pg,op,x2,2.0);
        ret = svmul_f64_z(pg,ret,x2);
        return ret;
    }
    svfloat64_t max(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmax_f64_z(pg,a,b);}
    svfloat64_t min(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmin_f64_z(pg,a,b);}
    svuint64_t max(svbool_t pg,svuint64_t a,svuint64_t b){ return svmax_u64_z(pg,a,b);}
    svuint64_t min(svbool_t pg,svuint64_t a,svuint64_t b){ return svmin_u64_z(pg,a,b);}
    svint64_t max(svbool_t pg,svint64_t a,svint64_t b){ return svmax_s64_z(pg,a,b);}
    svint64_t min(svbool_t pg,svint64_t a,svint64_t b){ return svmin_s64_z(pg,a,b);}
    svfloat32_t max(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmax_f32_z(pg,a,b);}
    svfloat32_t min(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmin_f32_z(pg,a,b);}
    svuint32_t max(svbool_t pg,svuint32_t a,svuint32_t b){ return svmax_u32_z(pg,a,b);}
    svuint32_t min(svbool_t pg,svuint32_t a,svuint32_t b){ return svmin_u32_z(pg,a,b);}
    svint32_t max(svbool_t pg,svint32_t a,svint32_t b){ return svmax_s32_z(pg,a,b);}
    svint32_t min(svbool_t pg,svint32_t a,svint32_t b){ return svmin_s32_z(pg,a,b);}
    svfloat64_t table(svfloat64_t tab,svuint64_t index){ return svtbl_f64(tab,index);}
    svfloat32_t table(svfloat32_t tab,svuint32_t index){ return svtbl_f32(tab,index);}
    svfloat64_t to_float(svbool_t pg, svint64_t op){ return svcvt_f64_s64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg, svint32_t op){ return svcvt_f32_s32_z(pg,op);}
    svfloat64_t to_float(svbool_t pg,svuint64_t op){ return svcvt_f64_u64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svuint32_t op){ return svcvt_f32_u32_z(pg,op);}
    //svint64_t to_int(svbool_t pg,svfloat64_t op){ return svcvt_s64_f64_z(pg,op);}
    //svint32_t to_int(svbool_t pg,svfloat32_t op){ return svcvt_s32_f32_z(pg,op);}
    svuint64_t to_uint(svbool_t pg,svfloat64_t op){ return svcvt_u64_f64_z(pg,op);}
    svuint32_t to_uint(svbool_t pg,svfloat32_t op){ return svcvt_u32_f32_z(pg,op);}
}; // kernel functor definition

#else //USE_QUAD

struct CalcForceEpSpMonoFugaku{
    PIKG::F32 eps2;
    PIKG::F64 G;

    CalcForceEpSpMonoFugaku(){}
    CalcForceEpSpMonoFugaku(PIKG::F64 eps2,PIKG::F64 G):eps2(eps2),G(G){}
    void initialize(PIKG::F64 eps2_,PIKG::F64 G_){
        eps2 = eps2_;
        G = G_;
    }

    int kernel_id = 0;

    void operator()(const EPISoft* __restrict__ epi, const int ni, const SPJSoft* __restrict__ epj, const int nj, ForceSoft* __restrict__ force, const int kernel_select = 1){

        static_assert(sizeof(EPI32) == 16,"check consistency of EPI member variable definition between PIKG source and original source");
        static_assert(sizeof(SPJ32) == 16,"check consistency of SPJ member variable definition between PIKG source and original source");
        static_assert(sizeof(FORCESP32) == 16,"check consistency of FORCE member variable definition between PIKG source and original source");
        if(kernel_select>=0) kernel_id = kernel_select;
        if(kernel_id == 0){
            std::cout << "ni: " << ni << " nj:" << nj << std::endl;
            ForceSoft* force_tmp = new ForceSoft[ni];
            std::chrono::system_clock::time_point start, end;
            double min_time = std::numeric_limits<double>::max();
            { // test Kernel_I16_J1
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I16_J1(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 1: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time = elapsed;
                    kernel_id = 1;
                }
            }
            { // test Kernel_I1_J16
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I1_J16(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 2: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time = elapsed;
                    kernel_id = 2;
                }
            }
            delete[] force_tmp;
        } // if(kernel_id == 0)
        if(kernel_id == 1) Kernel_I16_J1(epi,ni,epj,nj,force);
        if(kernel_id == 2) Kernel_I1_J16(epi,ni,epj,nj,force);
    } // operator() definition

    void Kernel_I16_J1(const EPISoft* __restrict__ epi, const PIKG::S32 ni, const SPJSoft* __restrict__ epj, const PIKG::S32 nj, ForceSoft* __restrict__ force){
        PIKG::S32 i;
        PIKG::S32 j;

        PIKG::S32 ni_act;
        PIKG::S32 epi_list[ni];
        EPI32 epi_loc[ni];
        SPJ32 epj_loc[nj];
        FORCESP32 force_loc[ni];

        ni_act=0;
        for(i=0; i<ni; ++i){
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x - epi[0].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y - epi[0].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z - epi[0].pos.z;

                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                ni_act++;
            }
        }

        for(j=0; j<nj; ++j) {
            epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
            epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
            epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
            epj_loc[j].mass = epj[j].mass;
        }

        for(i = 0;i < ((ni_act + 16 - 1)/16)*16;i += 16){
            svbool_t pg0 = svwhilelt_b32_s32(i,ni_act);
            svfloat32x3_t xi;

            uint32_t index_gather_load0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load0 = svld1_u32(pg0,index_gather_load0);
            xi.v0 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.x),vindex_gather_load0);
            uint32_t index_gather_load1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load1 = svld1_u32(pg0,index_gather_load1);
            xi.v1 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.y),vindex_gather_load1);
            uint32_t index_gather_load2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load2 = svld1_u32(pg0,index_gather_load2);
            xi.v2 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.z),vindex_gather_load2);
            svfloat32x3_t acc;

            acc.v0 = svdup_n_f32(0.0f);
            acc.v1 = svdup_n_f32(0.0f);
            acc.v2 = svdup_n_f32(0.0f);
            svfloat32_t pot;

            pot = svdup_n_f32(0.0f);
            for(j = 0;j < nj;++j){
                svfloat32_t mj = svdup_n_f32(epj_loc[j+0].mass);
                svfloat32x3_t xj;
                xj.v0 = svdup_n_f32(epj_loc[j+0].pos.x);
                xj.v1 = svdup_n_f32(epj_loc[j+0].pos.y);
                xj.v2 = svdup_n_f32(epj_loc[j+0].pos.z);

                svfloat32x3_t rij;
                svfloat32_t __fkg_tmp1, __fkg_tmp0;
                svfloat32_t r2, r_inv, r2_inv, mr_inv, mr3_inv;

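                // softened monopole with r^2 = |rij|^2 + eps2 and rij = xi - xj:
                //   pot -= m/r,  acc -= m*rij/r^3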
1446  rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
1447  rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
1448  rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
1449  __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,svdup_n_f32(eps2));
1450  __fkg_tmp0 = svmad_f32_z(pg0,rij.v1,rij.v1,__fkg_tmp1);
1451  r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp0);
1452  r_inv = rsqrt(pg0,r2);
1453  r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
1454  mr_inv = svmul_f32_z(pg0,mj,r_inv);
1455  mr3_inv = svmul_f32_z(pg0,r2_inv,mr_inv);
1456  acc.v0 = svmsb_f32_z(pg0,mr3_inv,rij.v0,acc.v0);
1457  acc.v1 = svmsb_f32_z(pg0,mr3_inv,rij.v1,acc.v1);
1458  acc.v2 = svmsb_f32_z(pg0,mr3_inv,rij.v2,acc.v2);
1459  pot = svsub_f32_z(pg0,pot,mr_inv);
1460  } // loop of j
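 // Per pair: r2 = |xi - xj|^2 + eps2 (Plummer-softened), r_inv comes from
 // the Newton-refined rsqrt helper below, and svmsb accumulates
 // acc -= m_j r^-3 (xi - xj) while pot -= m_j / r; the gravitational
 // constant G is applied once in the copy-back loop.
 // The accumulation blocks below are predicated on pg0 so that the final
 // partial block never reads or writes past the end of force_loc.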
1461 
1462  {
1463  svfloat32_t __fkg_tmp_accum;
1464  uint32_t index_gather_load3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1465  svuint32_t vindex_gather_load3 = svld1_u32(pg0,index_gather_load3);
1466  __fkg_tmp_accum = svld1_gather_u32index_f32(pg0,((float*)&force_loc[i+0].acc.x),vindex_gather_load3);
1467  __fkg_tmp_accum = svadd_f32_z(pg0,__fkg_tmp_accum,acc.v0);
1468  uint32_t index_scatter_store0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1469  svuint32_t vindex_scatter_store0 = svld1_u32(pg0,index_scatter_store0);
1470  svst1_scatter_u32index_f32(pg0,((float*)&force_loc[i+0].acc.x),vindex_scatter_store0,__fkg_tmp_accum);}
1471 
1472  {
1473  svfloat32_t __fkg_tmp_accum;
1474  uint32_t index_gather_load4[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1475  svuint32_t vindex_gather_load4 = svld1_u32(pg0,index_gather_load4);
1476  __fkg_tmp_accum = svld1_gather_u32index_f32(pg0,((float*)&force_loc[i+0].acc.y),vindex_gather_load4);
1477  __fkg_tmp_accum = svadd_f32_z(pg0,__fkg_tmp_accum,acc.v1);
1478  uint32_t index_scatter_store1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1479  svuint32_t vindex_scatter_store1 = svld1_u32(pg0,index_scatter_store1);
1480  svst1_scatter_u32index_f32(pg0,((float*)&force_loc[i+0].acc.y),vindex_scatter_store1,__fkg_tmp_accum);}
1481 
1482  {
1483  svfloat32_t __fkg_tmp_accum;
1484  uint32_t index_gather_load5[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1485  svuint32_t vindex_gather_load5 = svld1_u32(pg0,index_gather_load5);
1486  __fkg_tmp_accum = svld1_gather_u32index_f32(pg0,((float*)&force_loc[i+0].acc.z),vindex_gather_load5);
1487  __fkg_tmp_accum = svadd_f32_z(pg0,__fkg_tmp_accum,acc.v2);
1488  uint32_t index_scatter_store2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1489  svuint32_t vindex_scatter_store2 = svld1_u32(pg0,index_scatter_store2);
1490  svst1_scatter_u32index_f32(pg0,((float*)&force_loc[i+0].acc.z),vindex_scatter_store2,__fkg_tmp_accum);}
1491 
1492  {
1493  svfloat32_t __fkg_tmp_accum;
1494  uint32_t index_gather_load6[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1495  svuint32_t vindex_gather_load6 = svld1_u32(pg0,index_gather_load6);
1496  __fkg_tmp_accum = svld1_gather_u32index_f32(pg0,((float*)&force_loc[i+0].pot),vindex_gather_load6);
1497  __fkg_tmp_accum = svadd_f32_z(pg0,__fkg_tmp_accum,pot);
1498  uint32_t index_scatter_store3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1499  svuint32_t vindex_scatter_store3 = svld1_u32(pg0,index_scatter_store3);
1500  svst1_scatter_u32index_f32(pg0,((float*)&force_loc[i+0].pot),vindex_scatter_store3,__fkg_tmp_accum);}
1501 
1502  } // loop of i
1503 
1504  // copy back
1505  for(i=0; i<ni_act; ++i){
1506  j = epi_list[i];
1507  assert(epi[j].type==1);
1508  force[j].acc.x += G*force_loc[i].acc.x;
1509  force[j].acc.y += G*force_loc[i].acc.y;
1510  force[j].acc.z += G*force_loc[i].acc.z;
1511  force[j].pot += G*force_loc[i].pot;
1512  }
1513  } // Kernel_I16_J1 definition
1514 
1515  void Kernel_I1_J16(const EPISoft* __restrict__ epi, const PIKG::S32 ni, const SPJSoft* __restrict__ epj, const PIKG::S32 nj, ForceSoft* __restrict__ force){
1516  PIKG::S32 i;
1517  PIKG::S32 j;
1518 
1519  PIKG::S32 ni_act;
1520  PIKG::S32 epi_list[ni];
1521  EPI32 epi_loc[ni];
1522  SPJ32 epj_loc[nj];
1523  FORCESP32 force_loc[ni];
1524 
1525  ni_act=0;
1526  for(i=0; i<ni; ++i){
1527  if (epi[i].type==1) {
1528  epi_list[ni_act] = i;
1529  epi_loc[ni_act].pos.x = epi[i].pos.x - epi[0].pos.x;
1530  epi_loc[ni_act].pos.y = epi[i].pos.y - epi[0].pos.y;
1531  epi_loc[ni_act].pos.z = epi[i].pos.z - epi[0].pos.z;
1532 
1533  force_loc[ni_act].acc = 0.f;
1534  force_loc[ni_act].pot = 0.0;
1535  ni_act++;
1536  }
1537  }
1538 
1539  for(j=0; j<nj; ++j) {
1540  epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
1541  epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
1542  epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
1543  epj_loc[j].mass = epj[j].mass;
1544  }
1545 
1546  for(i = 0;i < ni_act;++i){
1547  svfloat32x3_t xi;
1548 
1549  xi.v0 = svdup_n_f32(epi_loc[i+0].pos.x);
1550 
1551  xi.v1 = svdup_n_f32(epi_loc[i+0].pos.y);
1552 
1553  xi.v2 = svdup_n_f32(epi_loc[i+0].pos.z);
1554 
1555  svfloat32x3_t acc;
1556 
1557  acc.v0 = svdup_n_f32(0.0f);
1558  acc.v1 = svdup_n_f32(0.0f);
1559  acc.v2 = svdup_n_f32(0.0f);
1560  svfloat32_t pot;
1561 
1562  pot = svdup_n_f32(0.0f);
1563  for(j = 0;j < ((nj + 16 - 1)/16)*16;j += 16){
1564  svbool_t pg0 = svwhilelt_b32_s32(j,nj);
1565  svfloat32_t mj;
1566 
1567  uint32_t index_gather_load7[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1568  svuint32_t vindex_gather_load7 = svld1_u32(pg0,index_gather_load7);
1569  mj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].mass),vindex_gather_load7);
1570  svfloat32x3_t xj;
1571 
1572  uint32_t index_gather_load8[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1573  svuint32_t vindex_gather_load8 = svld1_u32(pg0,index_gather_load8);
1574  xj.v0 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.x),vindex_gather_load8);
1575  uint32_t index_gather_load9[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1576  svuint32_t vindex_gather_load9 = svld1_u32(pg0,index_gather_load9);
1577  xj.v1 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.y),vindex_gather_load9);
1578  uint32_t index_gather_load10[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1579  svuint32_t vindex_gather_load10 = svld1_u32(pg0,index_gather_load10);
1580  xj.v2 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.z),vindex_gather_load10);
1581  svfloat32x3_t rij;
1582 
1583  svfloat32_t __fkg_tmp1;
1584 
1585  svfloat32_t __fkg_tmp0;
1586 
1587  svfloat32_t r2;
1588 
1589  svfloat32_t r_inv;
1590 
1591  svfloat32_t r2_inv;
1592 
1593  svfloat32_t mr_inv;
1594 
1595  svfloat32_t mr3_inv;
1596 
1597  rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
1598  rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
1599  rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
1600  __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,svdup_n_f32(eps2));
1601  __fkg_tmp0 = svmad_f32_z(pg0,rij.v1,rij.v1,__fkg_tmp1);
1602  r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp0);
1603  r_inv = rsqrt(pg0,r2);
1604  r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
1605  mr_inv = svmul_f32_z(pg0,mj,r_inv);
1606  mr3_inv = svmul_f32_z(pg0,r2_inv,mr_inv);
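 // Merging (_m) forms are needed here: zeroing (_z) forms would wipe the
 // sums already held in the masked-off lanes of acc and pot when the last
 // partial j-block leaves some lanes inactive.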
1607  acc.v0 = svmls_f32_m(pg0,acc.v0,mr3_inv,rij.v0);
1608  acc.v1 = svmls_f32_m(pg0,acc.v1,mr3_inv,rij.v1);
1609  acc.v2 = svmls_f32_m(pg0,acc.v2,mr3_inv,rij.v2);
1610  pot = svsub_f32_m(pg0,pot,mr_inv);
1611  } // loop of j
1612 
1613  ((float*)&force_loc[i+0].acc.x)[0] += svaddv_f32(svptrue_b32(),acc.v0);
1614 
1615  ((float*)&force_loc[i+0].acc.y)[0] += svaddv_f32(svptrue_b32(),acc.v1);
1616 
1617  ((float*)&force_loc[i+0].acc.z)[0] += svaddv_f32(svptrue_b32(),acc.v2);
1618 
1619  ((float*)&force_loc[i+0].pot)[0] += svaddv_f32(svptrue_b32(),pot);
1620 
1621  } // loop of i
1622 
1623  // copy back
1624  for(i=0; i<ni_act; ++i){
1625  j = epi_list[i];
1626  assert(epi[j].type==1);
1627  force[j].acc.x += G*force_loc[i].acc.x;
1628  force[j].acc.y += G*force_loc[i].acc.y;
1629  force[j].acc.z += G*force_loc[i].acc.z;
1630  force[j].pot += G*force_loc[i].pot;
1631  }
1632 
1633  } // Kernel_I1_J16 definition
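 // The two kernels trade vector layouts: Kernel_I16_J1 spreads 16 i-particles
 // across the SVE lanes and broadcasts one j-particle per iteration, while
 // Kernel_I1_J16 broadcasts one i-particle and streams 16 j-particles,
 // reducing each lane-wise partial sum with svaddv afterwards; operator()
 // simply keeps whichever timed faster.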
1634 
1635  PIKG::F64 rsqrt(PIKG::F64 op){ return 1.0/std::sqrt(op); }
1636  PIKG::F64 sqrt(PIKG::F64 op){ return std::sqrt(op); }
1637  PIKG::F64 inv(PIKG::F64 op){ return 1.0/op; }
1638  PIKG::F64 max(PIKG::F64 a,PIKG::F64 b){ return std::max(a,b);}
1639  PIKG::F64 min(PIKG::F64 a,PIKG::F64 b){ return std::min(a,b);}
1640  PIKG::F32 rsqrt(PIKG::F32 op){ return 1.f/std::sqrt(op); }
1641  PIKG::F32 sqrt(PIKG::F32 op){ return std::sqrt(op); }
1642  PIKG::F32 inv(PIKG::F32 op){ return 1.f/op; }
1643  PIKG::S64 max(PIKG::S64 a,PIKG::S64 b){ return std::max(a,b);}
1644  PIKG::S64 min(PIKG::S64 a,PIKG::S64 b){ return std::min(a,b);}
1645  PIKG::S32 max(PIKG::S32 a,PIKG::S32 b){ return std::max(a,b);}
1646  PIKG::S32 min(PIKG::S32 a,PIKG::S32 b){ return std::min(a,b);}
1647  PIKG::F64 table(PIKG::F64 tab[],PIKG::S64 i){ return tab[i]; }
1648  PIKG::F32 table(PIKG::F32 tab[],PIKG::S32 i){ return tab[i]; }
1649  PIKG::F64 to_float(PIKG::U64 op){return (PIKG::F64)op;}
1650  PIKG::F32 to_float(PIKG::U32 op){return (PIKG::F32)op;}
1651  PIKG::F64 to_float(PIKG::S64 op){return (PIKG::F64)op;}
1652  PIKG::F32 to_float(PIKG::S32 op){return (PIKG::F32)op;}
1653  //PIKG::S64 to_int(PIKG::F64 op){return (PIKG::S64)op;}
1654  //PIKG::S32 to_int(PIKG::F32 op){return (PIKG::S32)op;}
1655  PIKG::U64 to_uint(PIKG::F64 op){return (PIKG::U64)op;}
1656  PIKG::U32 to_uint(PIKG::F32 op){return (PIKG::U32)op;}
1657  template<typename T> PIKG::F64 to_f64(const T& op){return (PIKG::F64)op;}
1658  template<typename T> PIKG::F32 to_f32(const T& op){return (PIKG::F32)op;}
1659  template<typename T> PIKG::S64 to_s64(const T& op){return (PIKG::S64)op;}
1660  template<typename T> PIKG::S32 to_s32(const T& op){return (PIKG::S32)op;}
1661  template<typename T> PIKG::U64 to_u64(const T& op){return (PIKG::U64)op;}
1662  template<typename T> PIKG::U32 to_u32(const T& op){return (PIKG::U32)op;}
1663  svfloat32_t rsqrt(svbool_t pg,svfloat32_t op){
1664  svfloat32_t rinv = svrsqrte_f32(op);
1665  svfloat32_t h = svmul_f32_z(pg,op,rinv);
1666  h = svmsb_n_f32_z(pg,h,rinv,1.f);
1667  svfloat32_t poly = svmad_n_f32_z(pg,h,svdup_f32(0.375f),0.5f);
1668  poly = svmul_f32_z(pg,poly,h);
1669  rinv = svmad_f32_z(pg,rinv,poly,rinv);
1670  return rinv;
1671  }
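 // One second-order refinement step of svrsqrte's low-precision estimate y0:
 // with h = 1 - x*y0^2, 1/sqrt(x) = y0*(1-h)^(-1/2) ~ y0*(1 + h/2 + 3h^2/8),
 // which is exactly rinv + rinv*poly as computed above.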
1672  svfloat64_t rsqrt(svbool_t pg,svfloat64_t op){
1673  svfloat64_t rinv = svrsqrte_f64(op);
1674  svfloat64_t h = svmul_f64_z(pg,op,rinv);
1675  h = svmsb_n_f64_z(pg,h,rinv,1.0);
1676  svfloat64_t poly = svmad_n_f64_z(pg,h,svdup_f64(0.375),0.5);
1677  poly = svmul_f64_z(pg,poly,h);
1678  rinv = svmad_f64_z(pg,rinv,poly,rinv);
1679  return rinv;
1680  }
1681  svfloat32x2_t svdup_n_f32x2(PIKG::F32vec2 v){
1682  svfloat32x2_t ret;
1683  ret.v0 = svdup_n_f32(v.x);
1684  ret.v1 = svdup_n_f32(v.y);
1685  return ret;
1686  }
1687  svfloat32x3_t svdup_n_f32x3(PIKG::F32vec v){
1688  svfloat32x3_t ret;
1689  ret.v0 = svdup_n_f32(v.x);
1690  ret.v1 = svdup_n_f32(v.y);
1691  ret.v2 = svdup_n_f32(v.z);
1692  return ret;
1693  }
1694  svfloat32x4_t svdup_n_f32x4(PIKG::F32vec4 v){
1695  svfloat32x4_t ret;
1696  ret.v0 = svdup_n_f32(v.x);
1697  ret.v1 = svdup_n_f32(v.y);
1698  ret.v2 = svdup_n_f32(v.z);
1699  ret.v3 = svdup_n_f32(v.w);
1700  return ret;
1701  }
1702  svfloat64x2_t svdup_n_f64x2(PIKG::F64vec2 v){
1703  svfloat64x2_t ret;
1704  ret.v0 = svdup_n_f64(v.x);
1705  ret.v1 = svdup_n_f64(v.y);
1706  return ret;
1707  }
1708  svfloat64x3_t svdup_n_f64x3(PIKG::F64vec v){
1709  svfloat64x3_t ret;
1710  ret.v0 = svdup_n_f64(v.x);
1711  ret.v1 = svdup_n_f64(v.y);
1712  ret.v2 = svdup_n_f64(v.z);
1713  return ret;
1714  }
1715  svfloat64x4_t svdup_n_f64x4(PIKG::F64vec4 v){
1716  svfloat64x4_t ret;
1717  ret.v0 = svdup_n_f64(v.x);
1718  ret.v1 = svdup_n_f64(v.y);
1719  ret.v2 = svdup_n_f64(v.z);
1720  ret.v3 = svdup_n_f64(v.w);
1721  return ret;
1722  }
1723  svfloat32_t sqrt(svbool_t pg,svfloat32_t op){ return svsqrt_f32_z(pg,op); }
1724  svfloat64_t sqrt(svbool_t pg,svfloat64_t op){ return svsqrt_f64_z(pg,op); }
1725  svfloat32_t inv(svbool_t pg,svfloat32_t op){
1726  svfloat32_t x1 = svrecpe_f32(op);
1727  svfloat32_t x2 = svmsb_n_f32_z(pg,op,x1,2.f);
1728  x2 = svmul_f32_z(pg,x2,x1);
1729  svfloat32_t ret = svmsb_n_f32_z(pg,op,x2,2.f);
1730  ret = svmul_f32_z(pg,ret,x2);
1731  return ret;
1732  }
1733  svfloat64_t inv(svbool_t pg,svfloat64_t op){
1734  svfloat64_t x1 = svrecpe_f64(op);
1735  svfloat64_t x2 = svmsb_n_f64_z(pg,op,x1,2.0);
1736  x2 = svmul_f64_z(pg,x2,x1);
1737  svfloat64_t ret = svmsb_n_f64_z(pg,op,x2,2.0);
1738  ret = svmul_f64_z(pg,ret,x2);
1739  return ret;
1740  }
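 // inv() applies the Newton-Raphson reciprocal iteration x' = x*(2 - a*x)
 // twice to svrecpe's initial estimate; each step roughly doubles the number
 // of correct bits, so the f64 variant is still well short of full double
 // precision.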
1741  svfloat64_t max(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmax_f64_z(pg,a,b);}
1742  svfloat64_t min(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmin_f64_z(pg,a,b);}
1743  svuint64_t max(svbool_t pg,svuint64_t a,svuint64_t b){ return svmax_u64_z(pg,a,b);}
1744  svuint64_t min(svbool_t pg,svuint64_t a,svuint64_t b){ return svmin_u64_z(pg,a,b);}
1745  svint64_t max(svbool_t pg,svint64_t a,svint64_t b){ return svmax_s64_z(pg,a,b);}
1746  svint64_t min(svbool_t pg,svint64_t a,svint64_t b){ return svmin_s64_z(pg,a,b);}
1747  svfloat32_t max(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmax_f32_z(pg,a,b);}
1748  svfloat32_t min(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmin_f32_z(pg,a,b);}
1749  svuint32_t max(svbool_t pg,svuint32_t a,svuint32_t b){ return svmax_u32_z(pg,a,b);}
1750  svuint32_t min(svbool_t pg,svuint32_t a,svuint32_t b){ return svmin_u32_z(pg,a,b);}
1751  svint32_t max(svbool_t pg,svint32_t a,svint32_t b){ return svmax_s32_z(pg,a,b);}
1752  svint32_t min(svbool_t pg,svint32_t a,svint32_t b){ return svmin_s32_z(pg,a,b);}
1753  svfloat64_t table(svfloat64_t tab,svuint64_t index){ return svtbl_f64(tab,index);}
1754  svfloat32_t table(svfloat32_t tab,svuint32_t index){ return svtbl_f32(tab,index);}
1755  svfloat64_t to_float(svbool_t pg, svint64_t op){ return svcvt_f64_s64_z(pg,op);}
1756  svfloat32_t to_float(svbool_t pg, svint32_t op){ return svcvt_f32_s32_z(pg,op);}
1757  svfloat64_t to_float(svbool_t pg,svuint64_t op){ return svcvt_f64_u64_z(pg,op);}
1758  svfloat32_t to_float(svbool_t pg,svuint32_t op){ return svcvt_f32_u32_z(pg,op);}
1759  //svint64_t to_int(svbool_t pg,svfloat64_t op){ return svcvt_s64_f64_z(pg,op);}
1760  //svint32_t to_int(svbool_t pg,svfloat32_t op){ return svcvt_s32_f32_z(pg,op);}
1761  svuint64_t to_uint(svbool_t pg,svfloat64_t op){ return svcvt_u64_f64_z(pg,op);}
1762  svuint32_t to_uint(svbool_t pg,svfloat32_t op){ return svcvt_u32_f32_z(pg,op);}
1763 };// kernel functor definition
1764 
1765 #endif //USE_QUAD
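// Illustrative usage (a sketch, not code from this file; object names are
// assumed): PeTar hands these functors to an FDPS tree, along the lines of
//   tree_soft.calcForceAllAndWriteBack(
//       CalcForceEpEpWithLinearCutoffFugaku(eps2, rcut2, G),
//       /* matching EP-SP functor */,
//       system_soft, dinfo);
// where tree_soft, system_soft and dinfo are the usual FDPS tree, particle
// system and domain-information objects.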
1766 
1767 struct SearchNeighborEpEpFugaku{
1768  SearchNeighborEpEpFugaku(){}
1769  void initialize(){
1770  }
1771  int kernel_id = 0;
1772  void operator()(const EPISoft* __restrict__ epi,const int ni,const EPJSoft* __restrict__ epj,const int nj,ForceSoft* __restrict__ force,const int kernel_select = 1){
1773  static_assert(sizeof(EPI32) == 16,"check consistency of EPI member variable definition between PIKG source and original source");
1774  if(kernel_select>=0) kernel_id = kernel_select;
1775  if(kernel_id == 0){
1776  std::cout << "ni: " << ni << " nj:" << nj << std::endl;
1777  ForceSoft* force_tmp = new ForceSoft[ni];
1778  std::chrono::system_clock::time_point start, end;
1779  double min_time = std::numeric_limits<double>::max();
1780  { // test Kernel_I16_J1
1781  for(int i=0;i<ni;i++) force_tmp[i] = force[i];
1782  start = std::chrono::system_clock::now();
1783  Kernel_I16_J1(epi,ni,epj,nj,force_tmp);
1784  end = std::chrono::system_clock::now();
1785  double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
1786  std::cerr << "kernel 1: " << elapsed << " ns" << std::endl;
1787  if(min_time > elapsed){
1788  min_time = elapsed;
1789  kernel_id = 1;
1790  }
1791  }
1792  { // test Kernel_I1_J16
1793  for(int i=0;i<ni;i++) force_tmp[i] = force[i];
1794  start = std::chrono::system_clock::now();
1795  Kernel_I1_J16(epi,ni,epj,nj,force_tmp);
1796  end = std::chrono::system_clock::now();
1797  double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
1798  std::cerr << "kernel 2: " << elapsed << " ns" << std::endl;
1799  if(min_time > elapsed){
1800  min_time = elapsed;
1801  kernel_id = 2;
1802  }
1803  }
1804  delete[] force_tmp;
1805  } // if(kernel_id == 0)
1806  if(kernel_id == 1) Kernel_I16_J1(epi,ni,epj,nj,force);
1807  if(kernel_id == 2) Kernel_I1_J16(epi,ni,epj,nj,force);
1808  } // operator() definition
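 // Same one-shot autotuning scheme as the force functor above: time both
 // layouts once, then keep the faster kernel_id.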
1809 
1810  void Kernel_I16_J1(const EPISoft* __restrict__ epi,const PIKG::S32 ni,const EPJSoft* __restrict__ epj,const PIKG::S32 nj, ForceSoft* __restrict__ force){
1811  PIKG::S32 i;
1812  PIKG::S32 j;
1813 
1814  EPI32 epi_loc[ni];
1815  EPI32 epj_loc[nj];
1816  PIKG::S32 nnb[ni];
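 // EPI32 is reused for epj_loc: the neighbour search needs only pos and
 // r_search on the j side, so the 16-byte layout (and hence the stride-4
 // gathers) is identical for both sides.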
1817 
1818  for(i=0; i<ni; ++i){
1819  epi_loc[i].pos.x = epi[i].pos.x - epi[0].pos.x;
1820  epi_loc[i].pos.y = epi[i].pos.y - epi[0].pos.y;
1821  epi_loc[i].pos.z = epi[i].pos.z - epi[0].pos.z;
1822  epi_loc[i].rs = epi[i].r_search;
1823  nnb[i] = 0;
1824  }
1825 
1826  for(j=0; j<nj; ++j) {
1827  epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
1828  epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
1829  epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
1830  epj_loc[j].rs = epj[j].r_search;
1831  }
1832 
1833  for(i = 0;i < ((ni + 16 - 1)/16)*16;i += 16){
1834  svbool_t pg0 = svwhilelt_b32_s32(i,ni);
1835  svfloat32_t rsi;
1836 
1837  uint32_t index_gather_load0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1838  svuint32_t vindex_gather_load0 = svld1_u32(pg0,index_gather_load0);
1839  rsi = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].rs),vindex_gather_load0);
1840  svfloat32x3_t xi;
1841 
1842  uint32_t index_gather_load1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1843  svuint32_t vindex_gather_load1 = svld1_u32(pg0,index_gather_load1);
1844  xi.v0 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.x),vindex_gather_load1);
1845  uint32_t index_gather_load2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1846  svuint32_t vindex_gather_load2 = svld1_u32(pg0,index_gather_load2);
1847  xi.v1 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.y),vindex_gather_load2);
1848  uint32_t index_gather_load3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1849  svuint32_t vindex_gather_load3 = svld1_u32(pg0,index_gather_load3);
1850  xi.v2 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.z),vindex_gather_load3);
1851  svint32_t nnbi;
1852 
1853  nnbi = svdup_n_s32(0);
1854  for(j = 0;j < nj;++j){
1855  svfloat32_t rsj;
1856 
1857  rsj = svdup_n_f32(epj_loc[j+0].rs);
1858 
1859  svfloat32x3_t xj;
1860 
1861  xj.v0 = svdup_n_f32(epj_loc[j+0].pos.x);
1862 
1863  xj.v1 = svdup_n_f32(epj_loc[j+0].pos.y);
1864 
1865  xj.v2 = svdup_n_f32(epj_loc[j+0].pos.z);
1866 
1867  svfloat32x3_t rij;
1868 
1869  svfloat32_t __fkg_tmp2;
1870 
1871  svfloat32_t __fkg_tmp1;
1872 
1873  svfloat32_t r2;
1874 
1875  svint32_t __fkg_tmp0;
1876 
1877  rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
1878  rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
1879  rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
1880  __fkg_tmp2 = svmul_f32_z(pg0,rij.v1,rij.v1);
1881  __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,__fkg_tmp2);
1882  r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp1);
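 // A neighbour is counted when r2 < max(rs_i, rs_j)^2: svcmplt builds the
 // lane mask and svsel merges the incremented count back only in those lanes.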
1883  {
1884  svbool_t pg2;
1885  pg2 = svcmplt_f32(pg0,r2,max(pg0,svmul_f32_z(pg0,rsi,rsi),svmul_f32_z(pg0,rsj,rsj)));
1886 
1887  __fkg_tmp0 = svadd_s32_z(pg2,nnbi,svdup_n_s32(1));
1888  nnbi = svsel_s32(pg2,__fkg_tmp0,nnbi);
1889  }
1890 
1891  } // loop of j
1892 
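 // nnb is a packed int array, so unit-stride indices are used here;
 // predicating on pg0 keeps the last partial block inside the array.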
1893  {
1894  svint32_t __fkg_tmp_accum;
1895  uint32_t index_gather_load4[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
1896  svuint32_t vindex_gather_load4 = svld1_u32(pg0,index_gather_load4);
1897  __fkg_tmp_accum = svld1_gather_u32index_s32(pg0,((int*)&nnb[i+0]),vindex_gather_load4);
1898  __fkg_tmp_accum = svadd_s32_z(pg0,__fkg_tmp_accum,nnbi);
1899  uint32_t index_scatter_store0[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
1900  svuint32_t vindex_scatter_store0 = svld1_u32(pg0,index_scatter_store0);
1901  svst1_scatter_u32index_s32(pg0,((int*)&nnb[i+0]),vindex_scatter_store0,__fkg_tmp_accum);}
1902 
1903  } // loop of i
1904 
1905  for(i=0; i<ni; ++i){
1906  force[i].n_ngb = nnb[i];
1907  }
1908 
1909  } // Kernel_I16_J1 definition
1910  void Kernel_I1_J16(const EPISoft* __restrict__ epi,const PIKG::S32 ni,const EPJSoft* __restrict__ epj,const PIKG::S32 nj, ForceSoft* __restrict__ force){
1911  PIKG::S32 i;
1912  PIKG::S32 j;
1913 
1914  EPI32 epi_loc[ni];
1915  EPI32 epj_loc[nj];
1916  PIKG::S32 nnb[ni];
1917 
1918  for(i=0; i<ni; ++i){
1919  epi_loc[i].pos.x = epi[i].pos.x - epi[0].pos.x;
1920  epi_loc[i].pos.y = epi[i].pos.y - epi[0].pos.y;
1921  epi_loc[i].pos.z = epi[i].pos.z - epi[0].pos.z;
1922  epi_loc[i].rs = epi[i].r_search;
1923  nnb[i] = 0;
1924  }
1925 
1926  for(j=0; j<nj; ++j) {
1927  epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
1928  epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
1929  epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
1930  epj_loc[j].rs = epj[j].r_search;
1931  }
1932 
1933  for(i = 0;i < ni;++i){
1934  svfloat32_t rsi;
1935 
1936  rsi = svdup_n_f32(epi_loc[i+0].rs);
1937 
1938  svfloat32x3_t xi;
1939 
1940  xi.v0 = svdup_n_f32(epi_loc[i+0].pos.x);
1941 
1942  xi.v1 = svdup_n_f32(epi_loc[i+0].pos.y);
1943 
1944  xi.v2 = svdup_n_f32(epi_loc[i+0].pos.z);
1945 
1946  svint32_t nnbi;
1947 
1948  nnbi = svdup_n_s32(0);
1949  for(j = 0;j < ((nj + 16 - 1)/16)*16;j += 16){
1950  svbool_t pg0 = svwhilelt_b32_s32(j,nj);
1951  svfloat32_t rsj;
1952 
1953  uint32_t index_gather_load5[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1954  svuint32_t vindex_gather_load5 = svld1_u32(pg0,index_gather_load5);
1955  rsj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].rs),vindex_gather_load5);
1956  svfloat32x3_t xj;
1957 
1958  uint32_t index_gather_load6[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1959  svuint32_t vindex_gather_load6 = svld1_u32(pg0,index_gather_load6);
1960  xj.v0 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.x),vindex_gather_load6);
1961  uint32_t index_gather_load7[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1962  svuint32_t vindex_gather_load7 = svld1_u32(pg0,index_gather_load7);
1963  xj.v1 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.y),vindex_gather_load7);
1964  uint32_t index_gather_load8[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
1965  svuint32_t vindex_gather_load8 = svld1_u32(pg0,index_gather_load8);
1966  xj.v2 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.z),vindex_gather_load8);
1967  svfloat32x3_t rij;
1968 
1969  svfloat32_t __fkg_tmp2;
1970 
1971  svfloat32_t __fkg_tmp1;
1972 
1973  svfloat32_t r2;
1974 
1975  svint32_t __fkg_tmp0;
1976 
1977  rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
1978  rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
1979  rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
1980  __fkg_tmp2 = svmul_f32_z(pg0,rij.v1,rij.v1);
1981  __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,__fkg_tmp2);
1982  r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp1);
1983  {
1984  svbool_t pg2;
1985  pg2 = svcmplt_f32(pg0,r2,max(pg0,svmul_f32_z(pg0,rsi,rsi),svmul_f32_z(pg0,rsj,rsj)));
1986 
1987  __fkg_tmp0 = svadd_s32_z(pg2,nnbi,svdup_n_s32(1));
1988  nnbi = svsel_s32(pg2,__fkg_tmp0,nnbi);
1989  }
1990 
1991  } // loop of j
1992 
1993  ((int*)&nnb[i+0])[0] += svaddv_s32(svptrue_b32(),nnbi);
1994 
1995  } // loop of i
1996 
1997  for(i=0; i<ni; ++i){
1998  force[i].n_ngb = nnb[i];
1999  }
2000 
2001  } // Kernel_I1_J16 definition
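 // The helper suite below duplicates the one in the force functor: PIKG
 // emits a private copy of these math and conversion wrappers for every
 // generated kernel functor instead of sharing a single set.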
2002  PIKG::F64 rsqrt(PIKG::F64 op){ return 1.0/std::sqrt(op); }
2003  PIKG::F64 sqrt(PIKG::F64 op){ return std::sqrt(op); }
2004  PIKG::F64 inv(PIKG::F64 op){ return 1.0/op; }
2005  PIKG::F64 max(PIKG::F64 a,PIKG::F64 b){ return std::max(a,b);}
2006  PIKG::F64 min(PIKG::F64 a,PIKG::F64 b){ return std::min(a,b);}
2007  PIKG::F32 rsqrt(PIKG::F32 op){ return 1.f/std::sqrt(op); }
2008  PIKG::F32 sqrt(PIKG::F32 op){ return std::sqrt(op); }
2009  PIKG::F32 inv(PIKG::F32 op){ return 1.f/op; }
2010  PIKG::S64 max(PIKG::S64 a,PIKG::S64 b){ return std::max(a,b);}
2011  PIKG::S64 min(PIKG::S64 a,PIKG::S64 b){ return std::min(a,b);}
2012  PIKG::S32 max(PIKG::S32 a,PIKG::S32 b){ return std::max(a,b);}
2013  PIKG::S32 min(PIKG::S32 a,PIKG::S32 b){ return std::min(a,b);}
2014  PIKG::F64 table(PIKG::F64 tab[],PIKG::S64 i){ return tab[i]; }
2015  PIKG::F32 table(PIKG::F32 tab[],PIKG::S32 i){ return tab[i]; }
2016  PIKG::F64 to_float(PIKG::U64 op){return (PIKG::F64)op;}
2017  PIKG::F32 to_float(PIKG::U32 op){return (PIKG::F32)op;}
2018  PIKG::F64 to_float(PIKG::S64 op){return (PIKG::F64)op;}
2019  PIKG::F32 to_float(PIKG::S32 op){return (PIKG::F32)op;}
2020  //PIKG::S64 to_int(PIKG::F64 op){return (PIKG::S64)op;}
2021  //PIKG::S32 to_int(PIKG::F32 op){return (PIKG::S32)op;}
2022  PIKG::U64 to_uint(PIKG::F64 op){return (PIKG::U64)op;}
2023  PIKG::U32 to_uint(PIKG::F32 op){return (PIKG::U32)op;}
2024  template<typename T> PIKG::F64 to_f64(const T& op){return (PIKG::F64)op;}
2025  template<typename T> PIKG::F32 to_f32(const T& op){return (PIKG::F32)op;}
2026  template<typename T> PIKG::S64 to_s64(const T& op){return (PIKG::S64)op;}
2027  template<typename T> PIKG::S32 to_s32(const T& op){return (PIKG::S32)op;}
2028  template<typename T> PIKG::U64 to_u64(const T& op){return (PIKG::U64)op;}
2029  template<typename T> PIKG::U32 to_u32(const T& op){return (PIKG::U32)op;}
2030  svfloat32_t rsqrt(svbool_t pg,svfloat32_t op){
2031  svfloat32_t rinv = svrsqrte_f32(op);
2032  svfloat32_t h = svmul_f32_z(pg,op,rinv);
2033  h = svmsb_n_f32_z(pg,h,rinv,1.f);
2034  svfloat32_t poly = svmad_n_f32_z(pg,h,svdup_f32(0.375f),0.5f);
2035  poly = svmul_f32_z(pg,poly,h);
2036  rinv = svmad_f32_z(pg,rinv,poly,rinv);
2037  return rinv;
2038  }
2039  svfloat64_t rsqrt(svbool_t pg,svfloat64_t op){
2040  svfloat64_t rinv = svrsqrte_f64(op);
2041  svfloat64_t h = svmul_f64_z(pg,op,rinv);
2042  h = svmsb_n_f64_z(pg,h,rinv,1.0);
2043  svfloat64_t poly = svmad_n_f64_z(pg,h,svdup_f64(0.375),0.5);
2044  poly = svmul_f64_z(pg,poly,h);
2045  rinv = svmad_f64_z(pg,rinv,poly,rinv);
2046  return rinv;
2047  }
2048  svfloat32x2_t svdup_n_f32x2(PIKG::F32vec2 v){
2049  svfloat32x2_t ret;
2050  ret.v0 = svdup_n_f32(v.x);
2051  ret.v1 = svdup_n_f32(v.y);
2052  return ret;
2053  }
2054  svfloat32x3_t svdup_n_f32x3(PIKG::F32vec v){
2055  svfloat32x3_t ret;
2056  ret.v0 = svdup_n_f32(v.x);
2057  ret.v1 = svdup_n_f32(v.y);
2058  ret.v2 = svdup_n_f32(v.z);
2059  return ret;
2060  }
2061  svfloat32x4_t svdup_n_f32x4(PIKG::F32vec4 v){
2062  svfloat32x4_t ret;
2063  ret.v0 = svdup_n_f32(v.x);
2064  ret.v1 = svdup_n_f32(v.y);
2065  ret.v2 = svdup_n_f32(v.z);
2066  ret.v3 = svdup_n_f32(v.w);
2067  return ret;
2068  }
2069  svfloat64x2_t svdup_n_f64x2(PIKG::F64vec2 v){
2070  svfloat64x2_t ret;
2071  ret.v0 = svdup_n_f64(v.x);
2072  ret.v1 = svdup_n_f64(v.y);
2073  return ret;
2074  }
2075  svfloat64x3_t svdup_n_f64x3(PIKG::F64vec v){
2076  svfloat64x3_t ret;
2077  ret.v0 = svdup_n_f64(v.x);
2078  ret.v1 = svdup_n_f64(v.y);
2079  ret.v2 = svdup_n_f64(v.z);
2080  return ret;
2081  }
2082  svfloat64x4_t svdup_n_f64x4(PIKG::F64vec4 v){
2083  svfloat64x4_t ret;
2084  ret.v0 = svdup_n_f64(v.x);
2085  ret.v1 = svdup_n_f64(v.y);
2086  ret.v2 = svdup_n_f64(v.z);
2087  ret.v3 = svdup_n_f64(v.w);
2088  return ret;
2089  }
2090  svfloat32_t sqrt(svbool_t pg,svfloat32_t op){ return svsqrt_f32_z(pg,op); }
2091  svfloat64_t sqrt(svbool_t pg,svfloat64_t op){ return svsqrt_f64_z(pg,op); }
2092  svfloat32_t inv(svbool_t pg,svfloat32_t op){
2093  svfloat32_t x1 = svrecpe_f32(op);
2094  svfloat32_t x2 = svmsb_n_f32_z(pg,op,x1,2.f);
2095  x2 = svmul_f32_z(pg,x2,x1);
2096  svfloat32_t ret = svmsb_n_f32_z(pg,op,x2,2.f);
2097  ret = svmul_f32_z(pg,ret,x2);
2098  return ret;
2099  }
2100  svfloat64_t inv(svbool_t pg,svfloat64_t op){
2101  svfloat64_t x1 = svrecpe_f64(op);
2102  svfloat64_t x2 = svmsb_n_f64_z(pg,op,x1,2.0);
2103  x2 = svmul_f64_z(pg,x2,x1);
2104  svfloat64_t ret = svmsb_n_f64_z(pg,op,x2,2.0);
2105  ret = svmul_f64_z(pg,ret,x2);
2106  return ret;
2107  }
2108  svfloat64_t max(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmax_f64_z(pg,a,b);}
2109  svfloat64_t min(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmin_f64_z(pg,a,b);}
2110  svuint64_t max(svbool_t pg,svuint64_t a,svuint64_t b){ return svmax_u64_z(pg,a,b);}
2111  svuint64_t min(svbool_t pg,svuint64_t a,svuint64_t b){ return svmin_u64_z(pg,a,b);}
2112  svint64_t max(svbool_t pg,svint64_t a,svint64_t b){ return svmax_s64_z(pg,a,b);}
2113  svint64_t min(svbool_t pg,svint64_t a,svint64_t b){ return svmin_s64_z(pg,a,b);}
2114  svfloat32_t max(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmax_f32_z(pg,a,b);}
2115  svfloat32_t min(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmin_f32_z(pg,a,b);}
2116  svuint32_t max(svbool_t pg,svuint32_t a,svuint32_t b){ return svmax_u32_z(pg,a,b);}
2117  svuint32_t min(svbool_t pg,svuint32_t a,svuint32_t b){ return svmin_u32_z(pg,a,b);}
2118  svint32_t max(svbool_t pg,svint32_t a,svint32_t b){ return svmax_s32_z(pg,a,b);}
2119  svint32_t min(svbool_t pg,svint32_t a,svint32_t b){ return svmin_s32_z(pg,a,b);}
2120  svfloat64_t table(svfloat64_t tab,svuint64_t index){ return svtbl_f64(tab,index);}
2121  svfloat32_t table(svfloat32_t tab,svuint32_t index){ return svtbl_f32(tab,index);}
2122  svfloat64_t to_float(svbool_t pg, svint64_t op){ return svcvt_f64_s64_z(pg,op);}
2123  svfloat32_t to_float(svbool_t pg, svint32_t op){ return svcvt_f32_s32_z(pg,op);}
2124  svfloat64_t to_float(svbool_t pg,svuint64_t op){ return svcvt_f64_u64_z(pg,op);}
2125  svfloat32_t to_float(svbool_t pg,svuint32_t op){ return svcvt_f32_u32_z(pg,op);}
2126  //svint64_t to_int(svbool_t pg,svfloat64_t op){ return svcvt_s64_f64_z(pg,op);}
2127  //svint32_t to_int(svbool_t pg,svfloat32_t op){ return svcvt_s32_f32_z(pg,op);}
2128  svuint64_t to_uint(svbool_t pg,svfloat64_t op){ return svcvt_u64_f64_z(pg,op);}
2129  svuint32_t to_uint(svbool_t pg,svfloat32_t op){ return svcvt_u32_f32_z(pg,op);}
2130 };// kernel functor definition
2131 
2132 #endif //USE_FUGAKU
2133 