#define PIKG_USE_FDPS_VECTOR
// The two SPJSoft definitions below come from an #ifdef/#else pair in the
// original source; the guard name USE_QUAD is an assumption.
#ifdef USE_QUAD
#define SPJSoft PS::SPJQuadrupoleInAndOut
#else
#define SPJSoft PS::SPJMonopoleInAndOut
#endif
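// PIKG-generated interaction kernels (FDPS ForceSoft/EPISoft particles)
// specialised for Fugaku's A64FX processor via ARM SVE ACLE intrinsics. The
// generated structure (gather/scatter index tables, __fkg_tmp* temporaries,
// per-struct helper overloads) is kept as emitted; elisions in this excerpt
// are marked with "// ...".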
struct CalcForceEpEpWithLinearCutoffFugaku{
    CalcForceEpEpWithLinearCutoffFugaku(){}
    void operator()(const EPISoft* __restrict__ epi, const int ni,
                    const EPJSoft* __restrict__ epj, const int nj,
                    ForceSoft* __restrict__ force, const int kernel_select = 1){
        static_assert(sizeof(EPI32) == 16,
                      "check consistency of EPI member variable definition between PIKG source and original source");
        static_assert(sizeof(EPJ32) == 20,
                      "check consistency of EPJ member variable definition between PIKG source and original source");
        static_assert(sizeof(FORCE32) == 20,
                      "check consistency of FORCE member variable definition between PIKG source and original source");
        if(kernel_select>=0) kernel_id = kernel_select;
        if(kernel_id == 0){ // (reconstructed guard: time both kernels once)
            std::cout << "ni: " << ni << " nj: " << nj << std::endl;
            ForceSoft force_tmp[ni]; // (reconstructed)
            std::chrono::system_clock::time_point start, end;
            double min_time = std::numeric_limits<double>::max(); // (reconstructed)
            {
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I16_J1(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 1: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time  = elapsed; // (reconstructed)
                    kernel_id = 1;       // (reconstructed)
                }
            }
            {
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I1_J16(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 2: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time  = elapsed;
                    kernel_id = 2;
                }
            }
        }
        if(kernel_id == 1) Kernel_I16_J1(epi,ni,epj,nj,force);
        if(kernel_id == 2) Kernel_I1_J16(epi,ni,epj,nj,force);
    }
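    // Two layouts of the same pair interaction are generated: Kernel_I16_J1
    // packs 16 i-particles into each SVE register and broadcasts one
    // j-particle per inner iteration, while Kernel_I1_J16 broadcasts one
    // i-particle and streams 16 j-particles per register, finishing with a
    // horizontal svaddv reduction. Which layout is faster depends on the
    // typical ni/nj, hence the one-shot timing above when kernel_id is not
    // forced through kernel_select.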
    // Kernel_I16_J1: signature and local buffers are elided in the excerpt.
        FORCE32 force_loc[ni];
        // ...
        for(i=0; i<ni; ++i){ // (loop bookkeeping reconstructed)
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z;
                epi_loc[ni_act].rs    = epi[i].r_search;
                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                force_loc[ni_act].nnb = 0;
                ni_act++;
            }
        }
        for(j=0; j<nj; ++j) {
            // ... (j-particle selection elided)
            epj_loc[nj_act].pos.x = epj[j].pos.x;
            epj_loc[nj_act].pos.y = epj[j].pos.y;
            epj_loc[nj_act].pos.z = epj[j].pos.z;
            epj_loc[nj_act].mass  = epj[j].mass;
            epj_loc[nj_act].rs    = epj[j].r_search;
            nj_act++;
        }
        for(i = 0;i < ((ni_act + 16 - 1)/16)*16;i += 16){
            svbool_t pg0 = svwhilelt_b32_s32(i,ni_act);
            // gather stride of 4 words = sizeof(EPI32)/sizeof(float)
            uint32_t index_gather_load0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load0 = svld1_u32(pg0,index_gather_load0);
            rsi = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].rs),vindex_gather_load0);
            uint32_t index_gather_load1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load1 = svld1_u32(pg0,index_gather_load1);
            xi.v0 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.x),vindex_gather_load1);
            uint32_t index_gather_load2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load2 = svld1_u32(pg0,index_gather_load2);
            xi.v1 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.y),vindex_gather_load2);
            uint32_t index_gather_load3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load3 = svld1_u32(pg0,index_gather_load3);
            xi.v2 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.z),vindex_gather_load3);
            acc.v0 = svdup_n_f32(0.0f);
            acc.v1 = svdup_n_f32(0.0f);
            acc.v2 = svdup_n_f32(0.0f);
            nnbi = svdup_n_s32(0);
            pot = svdup_n_f32(0.0f);
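            // Inner loop: broadcast one j-particle per iteration and update
            // all 16 i-lanes at once.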
            for(j = 0;j < ((nj_act + 1 - 1)/1)*1;++j){
                mj  = svdup_n_f32(epj_loc[j+0].mass);
                rsj = svdup_n_f32(epj_loc[j+0].rs);
                xj.v0 = svdup_n_f32(epj_loc[j+0].pos.x);
                xj.v1 = svdup_n_f32(epj_loc[j+0].pos.y);
                xj.v2 = svdup_n_f32(epj_loc[j+0].pos.z);
                svfloat32_t __fkg_tmp2;
                svfloat32_t __fkg_tmp1;
                svint32_t __fkg_tmp0;
                rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
                rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
                rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
                __fkg_tmp2 = svmul_f32_z(pg0,rij.v1,rij.v1);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,__fkg_tmp2);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp1);
                r2_eps = svadd_f32_z(pg0,r2,svdup_n_f32(eps2));
                r2_cut = max(pg0,r2_eps,svdup_n_f32(rcut2));
                r_inv = rsqrt(pg0,r2_cut);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                mr_inv = svmul_f32_z(pg0,mj,r_inv);
                mr3_inv = svmul_f32_z(pg0,r2_inv,mr_inv);
                pg2 = svcmplt_f32(pg0,r2,max(pg0,svmul_f32_z(pg0,rsi,rsi),svmul_f32_z(pg0,rsj,rsj)));
                __fkg_tmp0 = svadd_s32_z(pg2,nnbi,svdup_n_s32(1));
                nnbi = svsel_s32(pg2,__fkg_tmp0,nnbi);
                acc.v0 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v0,acc.v0);
                acc.v1 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v1,acc.v1);
                acc.v2 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v2,acc.v2);
                pot = svsub_f32_z(svptrue_b32(),pot,mr_inv);
            }
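            // In the loop above: r2 = |xi - xj|^2 softened to r2 + eps2, then
            // r2_cut = max(r2 + eps2, rcut2) realises the linear cutoff (1/r
            // saturates at 1/rcut inside the cutoff radius). Lanes with
            // r2 < max(rs_i^2, rs_j^2) bump the neighbour counter under
            // predicate pg2; the force update is acc -= m*rij/r^3 and
            // pot -= m/r.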
            { // accumulate acc.x into force_loc (stride 5 words = sizeof(FORCE32)/4)
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load4[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load4 = svld1_u32(svptrue_b32(),index_gather_load4);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_gather_load4);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,acc.v0);
                uint32_t index_scatter_store0[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store0 = svld1_u32(svptrue_b32(),index_scatter_store0);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_scatter_store0,__fkg_tmp_accum);
            }
            { // acc.y
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load5[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load5 = svld1_u32(svptrue_b32(),index_gather_load5);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_gather_load5);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,acc.v1);
                uint32_t index_scatter_store1[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store1 = svld1_u32(svptrue_b32(),index_scatter_store1);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_scatter_store1,__fkg_tmp_accum);
            }
            { // acc.z
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load6[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load6 = svld1_u32(svptrue_b32(),index_gather_load6);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_gather_load6);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,acc.v2);
                uint32_t index_scatter_store2[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store2 = svld1_u32(svptrue_b32(),index_scatter_store2);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_scatter_store2,__fkg_tmp_accum);
            }
            { // nnb (integer lanes)
                svint32_t __fkg_tmp_accum;
                uint32_t index_gather_load7[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load7 = svld1_u32(svptrue_b32(),index_gather_load7);
                __fkg_tmp_accum = svld1_gather_u32index_s32(svptrue_b32(),((int*)&force_loc[i+0].nnb),vindex_gather_load7);
                __fkg_tmp_accum = svadd_s32_z(svptrue_b32(),__fkg_tmp_accum,nnbi);
                uint32_t index_scatter_store3[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store3 = svld1_u32(svptrue_b32(),index_scatter_store3);
                svst1_scatter_u32index_s32(svptrue_b32(),((int*)&force_loc[i+0].nnb),vindex_scatter_store3,__fkg_tmp_accum);
            }
            { // pot
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load8[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load8 = svld1_u32(svptrue_b32(),index_gather_load8);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_gather_load8);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,pot);
                uint32_t index_scatter_store4[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_scatter_store4 = svld1_u32(svptrue_b32(),index_scatter_store4);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_scatter_store4,__fkg_tmp_accum);
            }
        }
        for(i=0; i<ni_act; ++i){
            j = epi_list[i]; // (reconstructed: scatter back to the original index)
            assert(epi[j].type==1);
            force[j].acc.x += G*force_loc[i].acc.x;
            force[j].acc.y += G*force_loc[i].acc.y;
            force[j].acc.z += G*force_loc[i].acc.z;
            force[j].pot   += G*force_loc[i].pot;
            force[j].n_ngb  = force_loc[i].nnb;
        }
    }
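    // The gravitational constant G is applied once per particle here at
    // write-back, so the SVE inner loops run in G = 1 units; n_ngb is
    // assigned rather than accumulated.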
    // Kernel_I1_J16: signature and local buffers are elided in the excerpt.
        FORCE32 force_loc[ni];
        // ...
        for(i=0; i<ni; ++i){ // (loop bookkeeping reconstructed)
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z;
                epi_loc[ni_act].rs    = epi[i].r_search;
                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                force_loc[ni_act].nnb = 0;
                ni_act++;
            }
        }
        for(j=0; j<nj; ++j) {
            // ... (j-particle selection elided)
            epj_loc[nj_act].pos.x = epj[j].pos.x;
            epj_loc[nj_act].pos.y = epj[j].pos.y;
            epj_loc[nj_act].pos.z = epj[j].pos.z;
            epj_loc[nj_act].mass  = epj[j].mass;
            epj_loc[nj_act].rs    = epj[j].r_search;
            nj_act++;
        }
        for(i = 0;i < ((ni_act + 1 - 1)/1)*1;++i){
            rsi   = svdup_n_f32(epi_loc[i+0].rs);
            xi.v0 = svdup_n_f32(epi_loc[i+0].pos.x);
            xi.v1 = svdup_n_f32(epi_loc[i+0].pos.y);
            xi.v2 = svdup_n_f32(epi_loc[i+0].pos.z);
            acc.v0 = svdup_n_f32(0.0f);
            acc.v1 = svdup_n_f32(0.0f);
            acc.v2 = svdup_n_f32(0.0f);
            nnbi = svdup_n_s32(0);
            pot = svdup_n_f32(0.0f);
            for(j = 0;j < ((nj_act + 16 - 1)/16)*16;j += 16){
                svbool_t pg0 = svwhilelt_b32_s32(j,nj_act);
                // gather stride of 5 words = sizeof(EPJ32)/sizeof(float)
                uint32_t index_gather_load9[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load9 = svld1_u32(pg0,index_gather_load9);
                mj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].mass),vindex_gather_load9);
                uint32_t index_gather_load10[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load10 = svld1_u32(pg0,index_gather_load10);
                rsj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].rs),vindex_gather_load10);
                uint32_t index_gather_load11[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load11 = svld1_u32(pg0,index_gather_load11);
                xj.v0 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.x),vindex_gather_load11);
                uint32_t index_gather_load12[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load12 = svld1_u32(pg0,index_gather_load12);
                xj.v1 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.y),vindex_gather_load12);
                uint32_t index_gather_load13[16] = {0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75};
                svuint32_t vindex_gather_load13 = svld1_u32(pg0,index_gather_load13);
                xj.v2 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.z),vindex_gather_load13);
                svfloat32_t __fkg_tmp2;
                svfloat32_t __fkg_tmp1;
                svint32_t __fkg_tmp0;
                rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
                rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
                rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
                __fkg_tmp2 = svmul_f32_z(pg0,rij.v1,rij.v1);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,__fkg_tmp2);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp1);
                r2_eps = svadd_f32_z(pg0,r2,svdup_n_f32(eps2));
                r2_cut = max(pg0,r2_eps,svdup_n_f32(rcut2));
                r_inv = rsqrt(pg0,r2_cut);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                mr_inv = svmul_f32_z(pg0,mj,r_inv);
                mr3_inv = svmul_f32_z(pg0,r2_inv,mr_inv);
                pg2 = svcmplt_f32(pg0,r2,max(pg0,svmul_f32_z(pg0,rsi,rsi),svmul_f32_z(pg0,rsj,rsj)));
                __fkg_tmp0 = svadd_s32_z(pg2,nnbi,svdup_n_s32(1));
                nnbi = svsel_s32(pg2,__fkg_tmp0,nnbi);
                acc.v0 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v0,acc.v0);
                acc.v1 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v1,acc.v1);
                acc.v2 = svmsb_f32_z(svptrue_b32(),mr3_inv,rij.v2,acc.v2);
                pot = svsub_f32_z(svptrue_b32(),pot,mr_inv);
            }
            ((float*)&force_loc[i+0].acc.x)[0] += svaddv_f32(svptrue_b32(),acc.v0);
            ((float*)&force_loc[i+0].acc.y)[0] += svaddv_f32(svptrue_b32(),acc.v1);
            ((float*)&force_loc[i+0].acc.z)[0] += svaddv_f32(svptrue_b32(),acc.v2);
            ((int*)&force_loc[i+0].nnb)[0]     += svaddv_s32(svptrue_b32(),nnbi);
            ((float*)&force_loc[i+0].pot)[0]   += svaddv_f32(svptrue_b32(),pot);
        }
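        // svaddv performs the horizontal sum over the 16 j-lanes, collapsing
        // each accumulator register into the single scalar owed to particle i.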
        for(i=0; i<ni_act; ++i){
            j = epi_list[i]; // (reconstructed)
            if (epi[j].type==1) {
                force[j].acc.x += G*force_loc[i].acc.x;
                force[j].acc.y += G*force_loc[i].acc.y;
                force[j].acc.z += G*force_loc[i].acc.z;
                force[j].pot   += G*force_loc[i].pot;
                force[j].n_ngb  = force_loc[i].nnb;
            }
        }
    }
    svfloat32_t rsqrt(svbool_t pg,svfloat32_t op){
        svfloat32_t rinv = svrsqrte_f32(op);
        svfloat32_t h = svmul_f32_z(pg,op,rinv);
        h = svmsb_n_f32_z(pg,h,rinv,1.f);
        svfloat32_t poly = svmad_n_f32_z(pg,h,svdup_f32(0.375f),0.5f);
        poly = svmul_f32_z(pg,poly,h);
        rinv = svmad_f32_z(pg,rinv,poly,rinv);
        return rinv; // (reconstructed)
    }
    svfloat64_t rsqrt(svbool_t pg,svfloat64_t op){
        svfloat64_t rinv = svrsqrte_f64(op);
        svfloat64_t h = svmul_f64_z(pg,op,rinv);
        h = svmsb_n_f64_z(pg,h,rinv,1.0);
        svfloat64_t poly = svmad_n_f64_z(pg,h,svdup_f64(0.375),0.5);
        poly = svmul_f64_z(pg,poly,h);
        rinv = svmad_f64_z(pg,rinv,poly,rinv);
        return rinv; // (reconstructed)
    }
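    // rsqrt refines the hardware estimate once: with h = 1 - op*rinv^2 the
    // update rinv *= 1 + h/2 + 3*h^2/8 is the second-order correction for
    // 1/sqrt(op).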
    // (vector-broadcast helpers; the enclosing signatures and return
    //  statements are elided in the excerpt)
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    // ...
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    ret.v2 = svdup_n_f32(v.z);
    // ...
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    ret.v2 = svdup_n_f32(v.z);
    ret.v3 = svdup_n_f32(v.w);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    ret.v2 = svdup_n_f64(v.z);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    ret.v2 = svdup_n_f64(v.z);
    ret.v3 = svdup_n_f64(v.w);
    // ...
    svfloat32_t sqrt(svbool_t pg,svfloat32_t op){ return svsqrt_f32_z(pg,op); }
    svfloat64_t sqrt(svbool_t pg,svfloat64_t op){ return svsqrt_f64_z(pg,op); }
    svfloat32_t inv(svbool_t pg,svfloat32_t op){
        svfloat32_t x1 = svrecpe_f32(op);
        svfloat32_t x2 = svmsb_n_f32_z(pg,op,x1,2.f);
        x2 = svmul_f32_z(pg,x2,x1);
        svfloat32_t ret = svmsb_n_f32_z(pg,op,x2,2.f);
        ret = svmul_f32_z(pg,ret,x2);
        return ret; // (reconstructed)
    }
    svfloat64_t inv(svbool_t pg,svfloat64_t op){
        svfloat64_t x1 = svrecpe_f64(op);
        svfloat64_t x2 = svmsb_n_f64_z(pg,op,x1,2.0);
        x2 = svmul_f64_z(pg,x2,x1);
        svfloat64_t ret = svmsb_n_f64_z(pg,op,x2,2.0);
        ret = svmul_f64_z(pg,ret,x2);
        return ret; // (reconstructed)
    }
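    // inv applies two Newton-Raphson steps, x <- x*(2 - op*x), to the
    // svrecpe estimate; each step roughly doubles the number of correct bits.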
    svfloat64_t max(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmax_f64_z(pg,a,b);}
    svfloat64_t min(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmin_f64_z(pg,a,b);}
    svuint64_t  max(svbool_t pg,svuint64_t a,svuint64_t b){ return svmax_u64_z(pg,a,b);}
    svuint64_t  min(svbool_t pg,svuint64_t a,svuint64_t b){ return svmin_u64_z(pg,a,b);}
    svint64_t   max(svbool_t pg,svint64_t a,svint64_t b){ return svmax_s64_z(pg,a,b);}
    svint64_t   min(svbool_t pg,svint64_t a,svint64_t b){ return svmin_s64_z(pg,a,b);}
    svfloat32_t max(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmax_f32_z(pg,a,b);}
    svfloat32_t min(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmin_f32_z(pg,a,b);}
    svuint32_t  max(svbool_t pg,svuint32_t a,svuint32_t b){ return svmax_u32_z(pg,a,b);}
    svuint32_t  min(svbool_t pg,svuint32_t a,svuint32_t b){ return svmin_u32_z(pg,a,b);}
    svint32_t   max(svbool_t pg,svint32_t a,svint32_t b){ return svmax_s32_z(pg,a,b);}
    svint32_t   min(svbool_t pg,svint32_t a,svint32_t b){ return svmin_s32_z(pg,a,b);}
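    // The predicated max/min overloads above, and the table/to_float/to_uint
    // wrappers below, give the PIKG code generator one uniform vocabulary over
    // the type-suffixed ACLE intrinsics.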
    svfloat64_t table(svfloat64_t tab,svuint64_t index){ return svtbl_f64(tab,index);}
    svfloat32_t table(svfloat32_t tab,svuint32_t index){ return svtbl_f32(tab,index);}
    svfloat64_t to_float(svbool_t pg,svint64_t op){ return svcvt_f64_s64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svint32_t op){ return svcvt_f32_s32_z(pg,op);}
    svfloat64_t to_float(svbool_t pg,svuint64_t op){ return svcvt_f64_u64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svuint32_t op){ return svcvt_f32_u32_z(pg,op);}
    svuint64_t to_uint(svbool_t pg,svfloat64_t op){ return svcvt_u64_f64_z(pg,op);}
    svuint32_t to_uint(svbool_t pg,svfloat32_t op){ return svcvt_u32_f32_z(pg,op);}
    // ...
};
struct CalcForceEpSpQuadFugaku{
    CalcForceEpSpQuadFugaku(){}
    void operator()(const EPISoft* __restrict__ epi, const int ni,
                    const SPJSoft* __restrict__ epj, const int nj,
                    ForceSoft* __restrict__ force, const int kernel_select = 1){
        static_assert(sizeof(EPI32) == 16,
                      "check consistency of EPI member variable definition between PIKG source and original source");
        static_assert(sizeof(SPJ32) == 40,
                      "check consistency of EPJ member variable definition between PIKG source and original source");
        static_assert(sizeof(FORCESP32) == 16,
                      "check consistency of FORCE member variable definition between PIKG source and original source");
        if(kernel_select>=0) kernel_id = kernel_select;
        if(kernel_id == 0){ // (reconstructed guard: time both kernels once)
            std::cout << "ni: " << ni << " nj: " << nj << std::endl;
            ForceSoft force_tmp[ni];
            std::chrono::system_clock::time_point start, end;
            double min_time = std::numeric_limits<double>::max();
            {
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I16_J1(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 1: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time  = elapsed;
                    kernel_id = 1;
                }
            }
            {
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I1_J16(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 2: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time  = elapsed;
                    kernel_id = 2;
                }
            }
        }
        if(kernel_id == 1) Kernel_I16_J1(epi,ni,epj,nj,force);
        if(kernel_id == 2) Kernel_I1_J16(epi,ni,epj,nj,force);
    }
    void Kernel_I16_J1(const EPISoft* __restrict__ epi, const int ni,
                       const SPJSoft* __restrict__ epj, const int nj,
                       ForceSoft* __restrict__ force, const int kernel_select = 1){
        FORCESP32 force_loc[ni];
        // ...
        for(i=0; i<ni; ++i){ // (loop bookkeeping reconstructed)
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                // positions are shifted into a frame centred on epi[0] to keep
                // the single-precision coordinates small
                epi_loc[ni_act].pos.x = epi[i].pos.x - epi[0].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y - epi[0].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z - epi[0].pos.z;
                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                ni_act++;
            }
        }
        for(j=0; j<nj; ++j) {
            epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
            epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
            epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
            epj_loc[j].mass  = epj[j].mass;
            epj_loc[j].qxx = epj[j].quad.xx;
            epj_loc[j].qyy = epj[j].quad.yy;
            epj_loc[j].qzz = epj[j].quad.zz;
            epj_loc[j].qxy = epj[j].quad.xy;
            epj_loc[j].qxz = epj[j].quad.xz;
            epj_loc[j].qyz = epj[j].quad.yz;
        }
        for(i = 0;i < ((ni_act + 16 - 1)/16)*16;i += 16){
            svbool_t pg0 = svwhilelt_b32_s32(i,ni_act);
            uint32_t index_gather_load0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load0 = svld1_u32(pg0,index_gather_load0);
            xi.v0 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.x),vindex_gather_load0);
            uint32_t index_gather_load1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load1 = svld1_u32(pg0,index_gather_load1);
            xi.v1 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.y),vindex_gather_load1);
            uint32_t index_gather_load2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load2 = svld1_u32(pg0,index_gather_load2);
            xi.v2 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.z),vindex_gather_load2);
            af.v0 = svdup_n_f32(0.0f);
            af.v1 = svdup_n_f32(0.0f);
            af.v2 = svdup_n_f32(0.0f);
            pf = svdup_n_f32(0.0f);
            for(j = 0;j < ((nj + 1 - 1)/1)*1;++j){
                mj    = svdup_n_f32(epj_loc[j+0].mass);
                qj_xx = svdup_n_f32(epj_loc[j+0].qxx);
                qj_xy = svdup_n_f32(epj_loc[j+0].qxy);
                qj_yy = svdup_n_f32(epj_loc[j+0].qyy);
                qj_yz = svdup_n_f32(epj_loc[j+0].qyz);
                qj_zx = svdup_n_f32(epj_loc[j+0].qxz);
                qj_zz = svdup_n_f32(epj_loc[j+0].qzz);
                xj.v0 = svdup_n_f32(epj_loc[j+0].pos.x);
                xj.v1 = svdup_n_f32(epj_loc[j+0].pos.y);
                xj.v2 = svdup_n_f32(epj_loc[j+0].pos.z);
                svfloat32_t __fkg_tmp0, __fkg_tmp1, __fkg_tmp2, __fkg_tmp3,
                            __fkg_tmp4, __fkg_tmp5, __fkg_tmp6, __fkg_tmp7,
                            __fkg_tmp8, __fkg_tmp9, __fkg_tmp10, __fkg_tmp11,
                            __fkg_tmp12, __fkg_tmp13, __fkg_tmp14, __fkg_tmp15,
                            __fkg_tmp16, __fkg_tmp17;
                svfloat32_t rqr_r4_inv;
                // ... (declarations of r2, r_inv, tmp, r2_inv..r5_inv, tr, the
                //      q* and qr terms, mtr, rqr, meff and meff3 are elided)
                rij.v0 = svsub_f32_z(pg0,xj.v0,xi.v0);
                rij.v1 = svsub_f32_z(pg0,xj.v1,xi.v1);
                rij.v2 = svsub_f32_z(pg0,xj.v2,xi.v2);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,svdup_n_f32(eps2));
                __fkg_tmp0 = svmad_f32_z(pg0,rij.v1,rij.v1,__fkg_tmp1);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp0);
                r_inv = rsqrt(pg0,r2);
                __fkg_tmp2 = svmul_f32_z(pg0,r_inv,r_inv);
                tmp = svmsb_f32_z(pg0,r2,__fkg_tmp2,svdup_n_f32(3.0f));
                __fkg_tmp3 = svmul_f32_z(pg0,tmp,svdup_n_f32(0.5f));
                r_inv = svmul_f32_z(pg0,r_inv,__fkg_tmp3);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                r3_inv = svmul_f32_z(pg0,r2_inv,r_inv);
                r4_inv = svmul_f32_z(pg0,r2_inv,r2_inv);
                r5_inv = svmul_f32_z(pg0,r2_inv,r3_inv);
                __fkg_tmp4 = svadd_f32_z(pg0,qj_xx,qj_yy);
                tr = svadd_f32_z(pg0,__fkg_tmp4,qj_zz);
                qxx = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_xx,tr);
                qyy = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_yy,tr);
                qzz = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_zz,tr);
                qxy = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_xy);
                qyz = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_yz);
                qxz = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_zx);
                __fkg_tmp5 = svmul_f32_z(pg0,svdup_n_f32(eps2),tr);
                mtr = svneg_f32_z(pg0,__fkg_tmp5);
                __fkg_tmp7 = svmul_f32_z(pg0,qxx,rij.v0);
                __fkg_tmp6 = svmad_f32_z(pg0,qxy,rij.v1,__fkg_tmp7);
                qr.v0 = svmad_f32_z(pg0,qxz,rij.v2,__fkg_tmp6);
                __fkg_tmp9 = svmul_f32_z(pg0,qyy,rij.v1);
                __fkg_tmp8 = svmad_f32_z(pg0,qyz,rij.v2,__fkg_tmp9);
                qr.v1 = svmad_f32_z(pg0,qxy,rij.v0,__fkg_tmp8);
                __fkg_tmp11 = svmul_f32_z(pg0,qzz,rij.v2);
                __fkg_tmp10 = svmad_f32_z(pg0,qxz,rij.v0,__fkg_tmp11);
                qr.v2 = svmad_f32_z(pg0,qyz,rij.v1,__fkg_tmp10);
                __fkg_tmp13 = svmad_f32_z(pg0,qr.v0,rij.v0,mtr);
                __fkg_tmp12 = svmad_f32_z(pg0,qr.v1,rij.v1,__fkg_tmp13);
                rqr = svmad_f32_z(pg0,qr.v2,rij.v2,__fkg_tmp12);
                rqr_r4_inv = svmul_f32_z(pg0,rqr,r4_inv);
                meff = svmad_f32_z(pg0,svdup_n_f32(0.5f),rqr_r4_inv,mj);
                __fkg_tmp14 = svmad_f32_z(pg0,svdup_n_f32(2.5f),rqr_r4_inv,mj);
                meff3 = svmul_f32_z(pg0,__fkg_tmp14,r3_inv);
                pf = svmsb_f32_z(pg0,meff,r_inv,pf);
                __fkg_tmp15 = svmsb_f32_z(pg0,r5_inv,qr.v0,af.v0);
                af.v0 = svmad_f32_z(pg0,meff3,rij.v0,__fkg_tmp15);
                __fkg_tmp16 = svmsb_f32_z(pg0,r5_inv,qr.v1,af.v1);
                af.v1 = svmad_f32_z(pg0,meff3,rij.v1,__fkg_tmp16);
                __fkg_tmp17 = svmsb_f32_z(pg0,r5_inv,qr.v2,af.v2);
                af.v2 = svmad_f32_z(pg0,meff3,rij.v2,__fkg_tmp17);
            }
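            // Quadrupole evaluation above, in outline: q* is the combination
            // 3*Q - tr(Q)*I, qr = q*rij, and rqr = rij^T*q*rij - eps2*tr(Q).
            // With meff = m + rqr/(2 r^4) and meff3 = (m + 5*rqr/(2 r^4))/r^3
            // the update is pf -= meff/r and af += meff3*rij - qr/r^5 (rij
            // points from i to j here). The tmp = (3 - r2*r_inv^2)/2 factor
            // adds one extra Newton step to the rsqrt estimate.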
            { // accumulate acc.x into force_loc (stride 4 words = sizeof(FORCESP32)/4)
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load3 = svld1_u32(svptrue_b32(),index_gather_load3);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_gather_load3);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,af.v0);
                uint32_t index_scatter_store0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store0 = svld1_u32(svptrue_b32(),index_scatter_store0);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_scatter_store0,__fkg_tmp_accum);
            }
            { // acc.y
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load4[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load4 = svld1_u32(svptrue_b32(),index_gather_load4);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_gather_load4);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,af.v1);
                uint32_t index_scatter_store1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store1 = svld1_u32(svptrue_b32(),index_scatter_store1);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_scatter_store1,__fkg_tmp_accum);
            }
            { // acc.z
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load5[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load5 = svld1_u32(svptrue_b32(),index_gather_load5);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_gather_load5);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,af.v2);
                uint32_t index_scatter_store2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store2 = svld1_u32(svptrue_b32(),index_scatter_store2);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_scatter_store2,__fkg_tmp_accum);
            }
            { // pot
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load6[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load6 = svld1_u32(svptrue_b32(),index_gather_load6);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_gather_load6);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,pf);
                uint32_t index_scatter_store3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store3 = svld1_u32(svptrue_b32(),index_scatter_store3);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_scatter_store3,__fkg_tmp_accum);
            }
        }
        for(i=0; i<ni_act; ++i){
            j = epi_list[i]; // (reconstructed)
            if (epi[j].type==1) {
                force[j].acc.x += G*force_loc[i].acc.x;
                force[j].acc.y += G*force_loc[i].acc.y;
                force[j].acc.z += G*force_loc[i].acc.z;
                force[j].pot   += G*force_loc[i].pot;
            }
        }
    }
    // Kernel_I1_J16: signature and local buffers are elided in the excerpt.
        FORCESP32 force_loc[ni];
        // ...
        for(i=0; i<ni; ++i){ // (loop bookkeeping reconstructed)
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x - epi[0].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y - epi[0].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z - epi[0].pos.z;
                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                ni_act++;
            }
        }
        for(j=0; j<nj; ++j) {
            epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
            epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
            epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
            epj_loc[j].mass  = epj[j].mass;
            epj_loc[j].qxx = epj[j].quad.xx;
            epj_loc[j].qyy = epj[j].quad.yy;
            epj_loc[j].qzz = epj[j].quad.zz;
            epj_loc[j].qxy = epj[j].quad.xy;
            epj_loc[j].qxz = epj[j].quad.xz;
            epj_loc[j].qyz = epj[j].quad.yz;
        }
        for(i = 0;i < ((ni_act + 1 - 1)/1)*1;++i){
            xi.v0 = svdup_n_f32(epi_loc[i+0].pos.x);
            xi.v1 = svdup_n_f32(epi_loc[i+0].pos.y);
            xi.v2 = svdup_n_f32(epi_loc[i+0].pos.z);
            af.v0 = svdup_n_f32(0.0f);
            af.v1 = svdup_n_f32(0.0f);
            af.v2 = svdup_n_f32(0.0f);
            pf = svdup_n_f32(0.0f);
            for(j = 0;j < ((nj + 16 - 1)/16)*16;j += 16){
                svbool_t pg0 = svwhilelt_b32_s32(j,nj);
                // gather stride of 10 words = sizeof(SPJ32)/sizeof(float)
                uint32_t index_gather_load7[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load7 = svld1_u32(pg0,index_gather_load7);
                mj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].mass),vindex_gather_load7);
                uint32_t index_gather_load8[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load8 = svld1_u32(pg0,index_gather_load8);
                qj_xx = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qxx),vindex_gather_load8);
                uint32_t index_gather_load9[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load9 = svld1_u32(pg0,index_gather_load9);
                qj_xy = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qxy),vindex_gather_load9);
                uint32_t index_gather_load10[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load10 = svld1_u32(pg0,index_gather_load10);
                qj_yy = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qyy),vindex_gather_load10);
                uint32_t index_gather_load11[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load11 = svld1_u32(pg0,index_gather_load11);
                qj_yz = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qyz),vindex_gather_load11);
                uint32_t index_gather_load12[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load12 = svld1_u32(pg0,index_gather_load12);
                qj_zx = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qxz),vindex_gather_load12);
                uint32_t index_gather_load13[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load13 = svld1_u32(pg0,index_gather_load13);
                qj_zz = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].qzz),vindex_gather_load13);
                uint32_t index_gather_load14[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load14 = svld1_u32(pg0,index_gather_load14);
                xj.v0 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.x),vindex_gather_load14);
                uint32_t index_gather_load15[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load15 = svld1_u32(pg0,index_gather_load15);
                xj.v1 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.y),vindex_gather_load15);
                uint32_t index_gather_load16[16] = {0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150};
                svuint32_t vindex_gather_load16 = svld1_u32(pg0,index_gather_load16);
                xj.v2 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.z),vindex_gather_load16);
                svfloat32_t __fkg_tmp0, __fkg_tmp1, __fkg_tmp2, __fkg_tmp3,
                            __fkg_tmp4, __fkg_tmp5, __fkg_tmp6, __fkg_tmp7,
                            __fkg_tmp8, __fkg_tmp9, __fkg_tmp10, __fkg_tmp11,
                            __fkg_tmp12, __fkg_tmp13, __fkg_tmp14, __fkg_tmp15,
                            __fkg_tmp16, __fkg_tmp17;
                svfloat32_t rqr_r4_inv;
                // ... (remaining register declarations elided)
                // same quadrupole evaluation as in Kernel_I16_J1
                rij.v0 = svsub_f32_z(pg0,xj.v0,xi.v0);
                rij.v1 = svsub_f32_z(pg0,xj.v1,xi.v1);
                rij.v2 = svsub_f32_z(pg0,xj.v2,xi.v2);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,svdup_n_f32(eps2));
                __fkg_tmp0 = svmad_f32_z(pg0,rij.v1,rij.v1,__fkg_tmp1);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp0);
                r_inv = rsqrt(pg0,r2);
                __fkg_tmp2 = svmul_f32_z(pg0,r_inv,r_inv);
                tmp = svmsb_f32_z(pg0,r2,__fkg_tmp2,svdup_n_f32(3.0f));
                __fkg_tmp3 = svmul_f32_z(pg0,tmp,svdup_n_f32(0.5f));
                r_inv = svmul_f32_z(pg0,r_inv,__fkg_tmp3);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                r3_inv = svmul_f32_z(pg0,r2_inv,r_inv);
                r4_inv = svmul_f32_z(pg0,r2_inv,r2_inv);
                r5_inv = svmul_f32_z(pg0,r2_inv,r3_inv);
                __fkg_tmp4 = svadd_f32_z(pg0,qj_xx,qj_yy);
                tr = svadd_f32_z(pg0,__fkg_tmp4,qj_zz);
                qxx = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_xx,tr);
                qyy = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_yy,tr);
                qzz = svnmsb_f32_z(pg0,svdup_n_f32(3.0f),qj_zz,tr);
                qxy = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_xy);
                qyz = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_yz);
                qxz = svmul_f32_z(pg0,svdup_n_f32(3.0f),qj_zx);
                __fkg_tmp5 = svmul_f32_z(pg0,svdup_n_f32(eps2),tr);
                mtr = svneg_f32_z(pg0,__fkg_tmp5);
                __fkg_tmp7 = svmul_f32_z(pg0,qxx,rij.v0);
                __fkg_tmp6 = svmad_f32_z(pg0,qxy,rij.v1,__fkg_tmp7);
                qr.v0 = svmad_f32_z(pg0,qxz,rij.v2,__fkg_tmp6);
                __fkg_tmp9 = svmul_f32_z(pg0,qyy,rij.v1);
                __fkg_tmp8 = svmad_f32_z(pg0,qyz,rij.v2,__fkg_tmp9);
                qr.v1 = svmad_f32_z(pg0,qxy,rij.v0,__fkg_tmp8);
                __fkg_tmp11 = svmul_f32_z(pg0,qzz,rij.v2);
                __fkg_tmp10 = svmad_f32_z(pg0,qxz,rij.v0,__fkg_tmp11);
                qr.v2 = svmad_f32_z(pg0,qyz,rij.v1,__fkg_tmp10);
                __fkg_tmp13 = svmad_f32_z(pg0,qr.v0,rij.v0,mtr);
                __fkg_tmp12 = svmad_f32_z(pg0,qr.v1,rij.v1,__fkg_tmp13);
                rqr = svmad_f32_z(pg0,qr.v2,rij.v2,__fkg_tmp12);
                rqr_r4_inv = svmul_f32_z(pg0,rqr,r4_inv);
                meff = svmad_f32_z(pg0,svdup_n_f32(0.5f),rqr_r4_inv,mj);
                __fkg_tmp14 = svmad_f32_z(pg0,svdup_n_f32(2.5f),rqr_r4_inv,mj);
                meff3 = svmul_f32_z(pg0,__fkg_tmp14,r3_inv);
                pf = svmsb_f32_z(pg0,meff,r_inv,pf);
                __fkg_tmp15 = svmsb_f32_z(pg0,r5_inv,qr.v0,af.v0);
                af.v0 = svmad_f32_z(pg0,meff3,rij.v0,__fkg_tmp15);
                __fkg_tmp16 = svmsb_f32_z(pg0,r5_inv,qr.v1,af.v1);
                af.v1 = svmad_f32_z(pg0,meff3,rij.v1,__fkg_tmp16);
                __fkg_tmp17 = svmsb_f32_z(pg0,r5_inv,qr.v2,af.v2);
                af.v2 = svmad_f32_z(pg0,meff3,rij.v2,__fkg_tmp17);
            }
            ((float*)&force_loc[i+0].acc.x)[0] += svaddv_f32(svptrue_b32(),af.v0);
            ((float*)&force_loc[i+0].acc.y)[0] += svaddv_f32(svptrue_b32(),af.v1);
            ((float*)&force_loc[i+0].acc.z)[0] += svaddv_f32(svptrue_b32(),af.v2);
            ((float*)&force_loc[i+0].pot)[0]   += svaddv_f32(svptrue_b32(),pf);
        }
        for(i=0; i<ni_act; ++i){
            j = epi_list[i]; // (reconstructed)
            if (epi[j].type==1) {
                force[j].acc.x += G*force_loc[i].acc.x;
                force[j].acc.y += G*force_loc[i].acc.y;
                force[j].acc.z += G*force_loc[i].acc.z;
                force[j].pot   += G*force_loc[i].pot;
            }
        }
    }
    svfloat32_t rsqrt(svbool_t pg,svfloat32_t op){
        svfloat32_t rinv = svrsqrte_f32(op);
        svfloat32_t h = svmul_f32_z(pg,op,rinv);
        h = svmsb_n_f32_z(pg,h,rinv,1.f);
        svfloat32_t poly = svmad_n_f32_z(pg,h,svdup_f32(0.375f),0.5f);
        poly = svmul_f32_z(pg,poly,h);
        rinv = svmad_f32_z(pg,rinv,poly,rinv);
        return rinv; // (reconstructed)
    }
    svfloat64_t rsqrt(svbool_t pg,svfloat64_t op){
        svfloat64_t rinv = svrsqrte_f64(op);
        svfloat64_t h = svmul_f64_z(pg,op,rinv);
        h = svmsb_n_f64_z(pg,h,rinv,1.0);
        svfloat64_t poly = svmad_n_f64_z(pg,h,svdup_f64(0.375),0.5);
        poly = svmul_f64_z(pg,poly,h);
        rinv = svmad_f64_z(pg,rinv,poly,rinv);
        return rinv; // (reconstructed)
    }
    // (vector-broadcast helpers; the enclosing signatures and return
    //  statements are elided in the excerpt)
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    // ...
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    ret.v2 = svdup_n_f32(v.z);
    // ...
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    ret.v2 = svdup_n_f32(v.z);
    ret.v3 = svdup_n_f32(v.w);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    ret.v2 = svdup_n_f64(v.z);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    ret.v2 = svdup_n_f64(v.z);
    ret.v3 = svdup_n_f64(v.w);
    // ...
    svfloat32_t sqrt(svbool_t pg,svfloat32_t op){ return svsqrt_f32_z(pg,op); }
    svfloat64_t sqrt(svbool_t pg,svfloat64_t op){ return svsqrt_f64_z(pg,op); }
    svfloat32_t inv(svbool_t pg,svfloat32_t op){
        svfloat32_t x1 = svrecpe_f32(op);
        svfloat32_t x2 = svmsb_n_f32_z(pg,op,x1,2.f);
        x2 = svmul_f32_z(pg,x2,x1);
        svfloat32_t ret = svmsb_n_f32_z(pg,op,x2,2.f);
        ret = svmul_f32_z(pg,ret,x2);
        return ret; // (reconstructed)
    }
    svfloat64_t inv(svbool_t pg,svfloat64_t op){
        svfloat64_t x1 = svrecpe_f64(op);
        svfloat64_t x2 = svmsb_n_f64_z(pg,op,x1,2.0);
        x2 = svmul_f64_z(pg,x2,x1);
        svfloat64_t ret = svmsb_n_f64_z(pg,op,x2,2.0);
        ret = svmul_f64_z(pg,ret,x2);
        return ret; // (reconstructed)
    }
    svfloat64_t max(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmax_f64_z(pg,a,b);}
    svfloat64_t min(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmin_f64_z(pg,a,b);}
    svuint64_t  max(svbool_t pg,svuint64_t a,svuint64_t b){ return svmax_u64_z(pg,a,b);}
    svuint64_t  min(svbool_t pg,svuint64_t a,svuint64_t b){ return svmin_u64_z(pg,a,b);}
    svint64_t   max(svbool_t pg,svint64_t a,svint64_t b){ return svmax_s64_z(pg,a,b);}
    svint64_t   min(svbool_t pg,svint64_t a,svint64_t b){ return svmin_s64_z(pg,a,b);}
    svfloat32_t max(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmax_f32_z(pg,a,b);}
    svfloat32_t min(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmin_f32_z(pg,a,b);}
    svuint32_t  max(svbool_t pg,svuint32_t a,svuint32_t b){ return svmax_u32_z(pg,a,b);}
    svuint32_t  min(svbool_t pg,svuint32_t a,svuint32_t b){ return svmin_u32_z(pg,a,b);}
    svint32_t   max(svbool_t pg,svint32_t a,svint32_t b){ return svmax_s32_z(pg,a,b);}
    svint32_t   min(svbool_t pg,svint32_t a,svint32_t b){ return svmin_s32_z(pg,a,b);}
    svfloat64_t table(svfloat64_t tab,svuint64_t index){ return svtbl_f64(tab,index);}
    svfloat32_t table(svfloat32_t tab,svuint32_t index){ return svtbl_f32(tab,index);}
    svfloat64_t to_float(svbool_t pg,svint64_t op){ return svcvt_f64_s64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svint32_t op){ return svcvt_f32_s32_z(pg,op);}
    svfloat64_t to_float(svbool_t pg,svuint64_t op){ return svcvt_f64_u64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svuint32_t op){ return svcvt_f32_u32_z(pg,op);}
    svuint64_t to_uint(svbool_t pg,svfloat64_t op){ return svcvt_u64_f64_z(pg,op);}
    svuint32_t to_uint(svbool_t pg,svfloat32_t op){ return svcvt_u32_f32_z(pg,op);}
    // ...
};
struct CalcForceEpSpMonoFugaku{
    CalcForceEpSpMonoFugaku(){}
    void operator()(const EPISoft* __restrict__ epi, const int ni,
                    const SPJSoft* __restrict__ epj, const int nj,
                    ForceSoft* __restrict__ force, const int kernel_select = 1){
        static_assert(sizeof(EPI32) == 16,
                      "check consistency of EPI member variable definition between PIKG source and original source");
        static_assert(sizeof(SPJ32) == 16,
                      "check consistency of EPJ member variable definition between PIKG source and original source");
        static_assert(sizeof(FORCESP32) == 16,
                      "check consistency of FORCE member variable definition between PIKG source and original source");
        if(kernel_select>=0) kernel_id = kernel_select;
        if(kernel_id == 0){ // (reconstructed guard: time both kernels once)
            std::cout << "ni: " << ni << " nj: " << nj << std::endl;
            ForceSoft force_tmp[ni];
            std::chrono::system_clock::time_point start, end;
            double min_time = std::numeric_limits<double>::max();
            {
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I16_J1(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 1: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time  = elapsed;
                    kernel_id = 1;
                }
            }
            {
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I1_J16(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 2: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time  = elapsed;
                    kernel_id = 2;
                }
            }
        }
        if(kernel_id == 1) Kernel_I16_J1(epi,ni,epj,nj,force);
        if(kernel_id == 2) Kernel_I1_J16(epi,ni,epj,nj,force);
    }
    // Kernel_I16_J1: signature and local buffers are elided in the excerpt.
        FORCESP32 force_loc[ni];
        // ...
        for(i=0; i<ni; ++i){
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x - epi[0].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y - epi[0].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z - epi[0].pos.z;
                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                ni_act++; // (reconstructed)
            }
        }
        for(j=0; j<nj; ++j) {
            epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
            epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
            epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
            epj_loc[j].mass  = epj[j].mass;
        }
        for(i = 0;i < ((ni_act + 16 - 1)/16)*16;i += 16){
            svbool_t pg0 = svwhilelt_b32_s32(i,ni_act);
            uint32_t index_gather_load0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load0 = svld1_u32(pg0,index_gather_load0);
            xi.v0 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.x),vindex_gather_load0);
            uint32_t index_gather_load1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load1 = svld1_u32(pg0,index_gather_load1);
            xi.v1 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.y),vindex_gather_load1);
            uint32_t index_gather_load2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load2 = svld1_u32(pg0,index_gather_load2);
            xi.v2 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.z),vindex_gather_load2);
            acc.v0 = svdup_n_f32(0.0f);
            acc.v1 = svdup_n_f32(0.0f);
            acc.v2 = svdup_n_f32(0.0f);
            pot = svdup_n_f32(0.0f);
            for(j = 0;j < ((nj + 1 - 1)/1)*1;++j){
                mj = svdup_n_f32(epj_loc[j+0].mass);
                xj.v0 = svdup_n_f32(epj_loc[j+0].pos.x);
                xj.v1 = svdup_n_f32(epj_loc[j+0].pos.y);
                xj.v2 = svdup_n_f32(epj_loc[j+0].pos.z);
                svfloat32_t __fkg_tmp1;
                svfloat32_t __fkg_tmp0;
                svfloat32_t mr3_inv;
                // ... (remaining register declarations elided)
                rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
                rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
                rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,svdup_n_f32(eps2));
                __fkg_tmp0 = svmad_f32_z(pg0,rij.v1,rij.v1,__fkg_tmp1);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp0);
                r_inv = rsqrt(pg0,r2);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                mr_inv = svmul_f32_z(pg0,mj,r_inv);
                mr3_inv = svmul_f32_z(pg0,r2_inv,mr_inv);
                acc.v0 = svmsb_f32_z(pg0,mr3_inv,rij.v0,acc.v0);
                acc.v1 = svmsb_f32_z(pg0,mr3_inv,rij.v1,acc.v1);
                acc.v2 = svmsb_f32_z(pg0,mr3_inv,rij.v2,acc.v2);
                pot = svsub_f32_z(pg0,pot,mr_inv);
            }
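            // The loop above is the same pair evaluation as the EP-EP kernel
            // but with no linear cutoff and no neighbour count: plain softened
            // monopole, acc -= m*rij/r^3 and pot -= m/r.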
            { // accumulate acc.x into force_loc (stride 4 words = sizeof(FORCESP32)/4)
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load3 = svld1_u32(svptrue_b32(),index_gather_load3);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_gather_load3);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,acc.v0);
                uint32_t index_scatter_store0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store0 = svld1_u32(svptrue_b32(),index_scatter_store0);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.x),vindex_scatter_store0,__fkg_tmp_accum);
            }
            { // acc.y
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load4[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load4 = svld1_u32(svptrue_b32(),index_gather_load4);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_gather_load4);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,acc.v1);
                uint32_t index_scatter_store1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store1 = svld1_u32(svptrue_b32(),index_scatter_store1);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.y),vindex_scatter_store1,__fkg_tmp_accum);
            }
            { // acc.z
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load5[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load5 = svld1_u32(svptrue_b32(),index_gather_load5);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_gather_load5);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,acc.v2);
                uint32_t index_scatter_store2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store2 = svld1_u32(svptrue_b32(),index_scatter_store2);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].acc.z),vindex_scatter_store2,__fkg_tmp_accum);
            }
            { // pot
                svfloat32_t __fkg_tmp_accum;
                uint32_t index_gather_load6[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load6 = svld1_u32(svptrue_b32(),index_gather_load6);
                __fkg_tmp_accum = svld1_gather_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_gather_load6);
                __fkg_tmp_accum = svadd_f32_z(svptrue_b32(),__fkg_tmp_accum,pot);
                uint32_t index_scatter_store3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_scatter_store3 = svld1_u32(svptrue_b32(),index_scatter_store3);
                svst1_scatter_u32index_f32(svptrue_b32(),((float*)&force_loc[i+0].pot),vindex_scatter_store3,__fkg_tmp_accum);
            }
        }
        for(i=0; i<ni_act; ++i){
            j = epi_list[i]; // (reconstructed)
            assert(epi[j].type==1);
            force[j].acc.x += G*force_loc[i].acc.x;
            force[j].acc.y += G*force_loc[i].acc.y;
            force[j].acc.z += G*force_loc[i].acc.z;
            force[j].pot   += G*force_loc[i].pot;
        }
    }
    // Kernel_I1_J16: signature and local buffers are elided in the excerpt.
        FORCESP32 force_loc[ni];
        // ...
        for(i=0; i<ni; ++i){
            if (epi[i].type==1) {
                epi_list[ni_act] = i;
                epi_loc[ni_act].pos.x = epi[i].pos.x - epi[0].pos.x;
                epi_loc[ni_act].pos.y = epi[i].pos.y - epi[0].pos.y;
                epi_loc[ni_act].pos.z = epi[i].pos.z - epi[0].pos.z;
                force_loc[ni_act].acc = 0.f;
                force_loc[ni_act].pot = 0.0;
                ni_act++; // (reconstructed)
            }
        }
        for(j=0; j<nj; ++j) {
            epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
            epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
            epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
            epj_loc[j].mass  = epj[j].mass;
        }
        for(i = 0;i < ((ni_act + 1 - 1)/1)*1;++i){
            xi.v0 = svdup_n_f32(epi_loc[i+0].pos.x);
            xi.v1 = svdup_n_f32(epi_loc[i+0].pos.y);
            xi.v2 = svdup_n_f32(epi_loc[i+0].pos.z);
            acc.v0 = svdup_n_f32(0.0f);
            acc.v1 = svdup_n_f32(0.0f);
            acc.v2 = svdup_n_f32(0.0f);
            pot = svdup_n_f32(0.0f);
            for(j = 0;j < ((nj + 16 - 1)/16)*16;j += 16){
                svbool_t pg0 = svwhilelt_b32_s32(j,nj);
                // gather stride of 4 words = sizeof(SPJ32)/sizeof(float)
                uint32_t index_gather_load7[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load7 = svld1_u32(pg0,index_gather_load7);
                mj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].mass),vindex_gather_load7);
                uint32_t index_gather_load8[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load8 = svld1_u32(pg0,index_gather_load8);
                xj.v0 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.x),vindex_gather_load8);
                uint32_t index_gather_load9[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load9 = svld1_u32(pg0,index_gather_load9);
                xj.v1 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.y),vindex_gather_load9);
                uint32_t index_gather_load10[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load10 = svld1_u32(pg0,index_gather_load10);
                xj.v2 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.z),vindex_gather_load10);
                svfloat32_t __fkg_tmp1;
                svfloat32_t __fkg_tmp0;
                svfloat32_t mr3_inv;
                // ... (remaining register declarations elided)
                rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
                rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
                rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,svdup_n_f32(eps2));
                __fkg_tmp0 = svmad_f32_z(pg0,rij.v1,rij.v1,__fkg_tmp1);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp0);
                r_inv = rsqrt(pg0,r2);
                r2_inv = svmul_f32_z(pg0,r_inv,r_inv);
                mr_inv = svmul_f32_z(pg0,mj,r_inv);
                mr3_inv = svmul_f32_z(pg0,r2_inv,mr_inv);
                acc.v0 = svmsb_f32_z(pg0,mr3_inv,rij.v0,acc.v0);
                acc.v1 = svmsb_f32_z(pg0,mr3_inv,rij.v1,acc.v1);
                acc.v2 = svmsb_f32_z(pg0,mr3_inv,rij.v2,acc.v2);
                pot = svsub_f32_z(pg0,pot,mr_inv);
            }
            ((float*)&force_loc[i+0].acc.x)[0] += svaddv_f32(svptrue_b32(),acc.v0);
            ((float*)&force_loc[i+0].acc.y)[0] += svaddv_f32(svptrue_b32(),acc.v1);
            ((float*)&force_loc[i+0].acc.z)[0] += svaddv_f32(svptrue_b32(),acc.v2);
            ((float*)&force_loc[i+0].pot)[0]   += svaddv_f32(svptrue_b32(),pot);
        }
        for(i=0; i<ni_act; ++i){
            j = epi_list[i]; // (reconstructed)
            assert(epi[j].type==1);
            force[j].acc.x += G*force_loc[i].acc.x;
            force[j].acc.y += G*force_loc[i].acc.y;
            force[j].acc.z += G*force_loc[i].acc.z;
            force[j].pot   += G*force_loc[i].pot;
        }
    }
    svfloat32_t rsqrt(svbool_t pg,svfloat32_t op){
        svfloat32_t rinv = svrsqrte_f32(op);
        svfloat32_t h = svmul_f32_z(pg,op,rinv);
        h = svmsb_n_f32_z(pg,h,rinv,1.f);
        svfloat32_t poly = svmad_n_f32_z(pg,h,svdup_f32(0.375f),0.5f);
        poly = svmul_f32_z(pg,poly,h);
        rinv = svmad_f32_z(pg,rinv,poly,rinv);
        return rinv; // (reconstructed)
    }
    svfloat64_t rsqrt(svbool_t pg,svfloat64_t op){
        svfloat64_t rinv = svrsqrte_f64(op);
        svfloat64_t h = svmul_f64_z(pg,op,rinv);
        h = svmsb_n_f64_z(pg,h,rinv,1.0);
        svfloat64_t poly = svmad_n_f64_z(pg,h,svdup_f64(0.375),0.5);
        poly = svmul_f64_z(pg,poly,h);
        rinv = svmad_f64_z(pg,rinv,poly,rinv);
        return rinv; // (reconstructed)
    }
    // (vector-broadcast helpers; the enclosing signatures and return
    //  statements are elided in the excerpt)
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    // ...
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    ret.v2 = svdup_n_f32(v.z);
    // ...
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    ret.v2 = svdup_n_f32(v.z);
    ret.v3 = svdup_n_f32(v.w);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    ret.v2 = svdup_n_f64(v.z);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    ret.v2 = svdup_n_f64(v.z);
    ret.v3 = svdup_n_f64(v.w);
    // ...
    svfloat32_t sqrt(svbool_t pg,svfloat32_t op){ return svsqrt_f32_z(pg,op); }
    svfloat64_t sqrt(svbool_t pg,svfloat64_t op){ return svsqrt_f64_z(pg,op); }
    svfloat32_t inv(svbool_t pg,svfloat32_t op){
        svfloat32_t x1 = svrecpe_f32(op);
        svfloat32_t x2 = svmsb_n_f32_z(pg,op,x1,2.f);
        x2 = svmul_f32_z(pg,x2,x1);
        svfloat32_t ret = svmsb_n_f32_z(pg,op,x2,2.f);
        ret = svmul_f32_z(pg,ret,x2);
        return ret; // (reconstructed)
    }
    svfloat64_t inv(svbool_t pg,svfloat64_t op){
        svfloat64_t x1 = svrecpe_f64(op);
        svfloat64_t x2 = svmsb_n_f64_z(pg,op,x1,2.0);
        x2 = svmul_f64_z(pg,x2,x1);
        svfloat64_t ret = svmsb_n_f64_z(pg,op,x2,2.0);
        ret = svmul_f64_z(pg,ret,x2);
        return ret; // (reconstructed)
    }
    svfloat64_t max(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmax_f64_z(pg,a,b);}
    svfloat64_t min(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmin_f64_z(pg,a,b);}
    svuint64_t  max(svbool_t pg,svuint64_t a,svuint64_t b){ return svmax_u64_z(pg,a,b);}
    svuint64_t  min(svbool_t pg,svuint64_t a,svuint64_t b){ return svmin_u64_z(pg,a,b);}
    svint64_t   max(svbool_t pg,svint64_t a,svint64_t b){ return svmax_s64_z(pg,a,b);}
    svint64_t   min(svbool_t pg,svint64_t a,svint64_t b){ return svmin_s64_z(pg,a,b);}
    svfloat32_t max(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmax_f32_z(pg,a,b);}
    svfloat32_t min(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmin_f32_z(pg,a,b);}
    svuint32_t  max(svbool_t pg,svuint32_t a,svuint32_t b){ return svmax_u32_z(pg,a,b);}
    svuint32_t  min(svbool_t pg,svuint32_t a,svuint32_t b){ return svmin_u32_z(pg,a,b);}
    svint32_t   max(svbool_t pg,svint32_t a,svint32_t b){ return svmax_s32_z(pg,a,b);}
    svint32_t   min(svbool_t pg,svint32_t a,svint32_t b){ return svmin_s32_z(pg,a,b);}
    svfloat64_t table(svfloat64_t tab,svuint64_t index){ return svtbl_f64(tab,index);}
    svfloat32_t table(svfloat32_t tab,svuint32_t index){ return svtbl_f32(tab,index);}
    svfloat64_t to_float(svbool_t pg,svint64_t op){ return svcvt_f64_s64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svint32_t op){ return svcvt_f32_s32_z(pg,op);}
    svfloat64_t to_float(svbool_t pg,svuint64_t op){ return svcvt_f64_u64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svuint32_t op){ return svcvt_f32_u32_z(pg,op);}
    svuint64_t to_uint(svbool_t pg,svfloat64_t op){ return svcvt_u64_f64_z(pg,op);}
    svuint32_t to_uint(svbool_t pg,svfloat32_t op){ return svcvt_u32_f32_z(pg,op);}
    // ...
};
struct SearchNeighborEpEpFugaku{
    SearchNeighborEpEpFugaku(){}
    void operator()(const EPISoft* __restrict__ epi, const int ni,
                    const EPJSoft* __restrict__ epj, const int nj,
                    ForceSoft* __restrict__ force, const int kernel_select = 1){
        static_assert(sizeof(EPI32) == 16,
                      "check consistency of EPI member variable definition between PIKG source and original source");
        if(kernel_select>=0) kernel_id = kernel_select;
        if(kernel_id == 0){ // (reconstructed guard: time both kernels once)
            std::cout << "ni: " << ni << " nj: " << nj << std::endl;
            ForceSoft force_tmp[ni];
            std::chrono::system_clock::time_point start, end;
            double min_time = std::numeric_limits<double>::max();
            {
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I16_J1(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 1: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time  = elapsed;
                    kernel_id = 1;
                }
            }
            {
                for(int i=0;i<ni;i++) force_tmp[i] = force[i];
                start = std::chrono::system_clock::now();
                Kernel_I1_J16(epi,ni,epj,nj,force_tmp);
                end = std::chrono::system_clock::now();
                double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count();
                std::cerr << "kernel 2: " << elapsed << " ns" << std::endl;
                if(min_time > elapsed){
                    min_time  = elapsed;
                    kernel_id = 2;
                }
            }
        }
        if(kernel_id == 1) Kernel_I16_J1(epi,ni,epj,nj,force);
        if(kernel_id == 2) Kernel_I1_J16(epi,ni,epj,nj,force);
    }
    // Kernel_I16_J1: signature and local buffers are elided in the excerpt.
        for(i=0; i<ni; ++i){
            epi_loc[i].pos.x = epi[i].pos.x - epi[0].pos.x;
            epi_loc[i].pos.y = epi[i].pos.y - epi[0].pos.y;
            epi_loc[i].pos.z = epi[i].pos.z - epi[0].pos.z;
            epi_loc[i].rs    = epi[i].r_search;
        }
        for(j=0; j<nj; ++j) {
            epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
            epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
            epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
            epj_loc[j].rs    = epj[j].r_search;
        }
        for(i = 0;i < ((ni + 16 - 1)/16)*16;i += 16){
            svbool_t pg0 = svwhilelt_b32_s32(i,ni);
            uint32_t index_gather_load0[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load0 = svld1_u32(pg0,index_gather_load0);
            rsi = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].rs),vindex_gather_load0);
            uint32_t index_gather_load1[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load1 = svld1_u32(pg0,index_gather_load1);
            xi.v0 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.x),vindex_gather_load1);
            uint32_t index_gather_load2[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load2 = svld1_u32(pg0,index_gather_load2);
            xi.v1 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.y),vindex_gather_load2);
            uint32_t index_gather_load3[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
            svuint32_t vindex_gather_load3 = svld1_u32(pg0,index_gather_load3);
            xi.v2 = svld1_gather_u32index_f32(pg0,((float*)&epi_loc[i+0].pos.z),vindex_gather_load3);
            nnbi = svdup_n_s32(0);
            for(j = 0;j < ((nj + 1 - 1)/1)*1;++j){
                rsj = svdup_n_f32(epj_loc[j+0].rs);
                xj.v0 = svdup_n_f32(epj_loc[j+0].pos.x);
                xj.v1 = svdup_n_f32(epj_loc[j+0].pos.y);
                xj.v2 = svdup_n_f32(epj_loc[j+0].pos.z);
                svfloat32_t __fkg_tmp2;
                svfloat32_t __fkg_tmp1;
                svint32_t __fkg_tmp0;
                rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
                rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
                rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
                __fkg_tmp2 = svmul_f32_z(pg0,rij.v1,rij.v1);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,__fkg_tmp2);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp1);
                pg2 = svcmplt_f32(pg0,r2,max(pg0,svmul_f32_z(pg0,rsi,rsi),svmul_f32_z(pg0,rsj,rsj)));
                __fkg_tmp0 = svadd_s32_z(pg2,nnbi,svdup_n_s32(1));
                nnbi = svsel_s32(pg2,__fkg_tmp0,nnbi);
            }
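            // In the loop above, pg2 flags lanes with r^2 < max(rs_i^2,
            // rs_j^2); the masked add/select pair increments the per-lane
            // neighbour counter only where the predicate holds.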
            { // accumulate into the plain int buffer nnb[] (unit stride)
                svint32_t __fkg_tmp_accum;
                uint32_t index_gather_load4[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
                svuint32_t vindex_gather_load4 = svld1_u32(svptrue_b32(),index_gather_load4);
                __fkg_tmp_accum = svld1_gather_u32index_s32(svptrue_b32(),((int*)&nnb[i+0]),vindex_gather_load4);
                __fkg_tmp_accum = svadd_s32_z(svptrue_b32(),__fkg_tmp_accum,nnbi);
                uint32_t index_scatter_store0[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
                svuint32_t vindex_scatter_store0 = svld1_u32(svptrue_b32(),index_scatter_store0);
                svst1_scatter_u32index_s32(svptrue_b32(),((int*)&nnb[i+0]),vindex_scatter_store0,__fkg_tmp_accum);
            }
        }
        for(i=0; i<ni; ++i){
            force[i].n_ngb = nnb[i];
        }
    }
    // Kernel_I1_J16: signature and local buffers are elided in the excerpt.
        for(i=0; i<ni; ++i){
            epi_loc[i].pos.x = epi[i].pos.x - epi[0].pos.x;
            epi_loc[i].pos.y = epi[i].pos.y - epi[0].pos.y;
            epi_loc[i].pos.z = epi[i].pos.z - epi[0].pos.z;
            epi_loc[i].rs    = epi[i].r_search;
        }
        for(j=0; j<nj; ++j) {
            epj_loc[j].pos.x = epj[j].pos.x - epi[0].pos.x;
            epj_loc[j].pos.y = epj[j].pos.y - epi[0].pos.y;
            epj_loc[j].pos.z = epj[j].pos.z - epi[0].pos.z;
            epj_loc[j].rs    = epj[j].r_search;
        }
        for(i = 0;i < ((ni + 1 - 1)/1)*1;++i){
            rsi   = svdup_n_f32(epi_loc[i+0].rs);
            xi.v0 = svdup_n_f32(epi_loc[i+0].pos.x);
            xi.v1 = svdup_n_f32(epi_loc[i+0].pos.y);
            xi.v2 = svdup_n_f32(epi_loc[i+0].pos.z);
            nnbi = svdup_n_s32(0);
            for(j = 0;j < ((nj + 16 - 1)/16)*16;j += 16){
                svbool_t pg0 = svwhilelt_b32_s32(j,nj);
                uint32_t index_gather_load5[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load5 = svld1_u32(pg0,index_gather_load5);
                rsj = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].rs),vindex_gather_load5);
                uint32_t index_gather_load6[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load6 = svld1_u32(pg0,index_gather_load6);
                xj.v0 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.x),vindex_gather_load6);
                uint32_t index_gather_load7[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load7 = svld1_u32(pg0,index_gather_load7);
                xj.v1 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.y),vindex_gather_load7);
                uint32_t index_gather_load8[16] = {0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60};
                svuint32_t vindex_gather_load8 = svld1_u32(pg0,index_gather_load8);
                xj.v2 = svld1_gather_u32index_f32(pg0,((float*)&epj_loc[j+0].pos.z),vindex_gather_load8);
                svfloat32_t __fkg_tmp2;
                svfloat32_t __fkg_tmp1;
                svint32_t __fkg_tmp0;
                rij.v0 = svsub_f32_z(pg0,xi.v0,xj.v0);
                rij.v1 = svsub_f32_z(pg0,xi.v1,xj.v1);
                rij.v2 = svsub_f32_z(pg0,xi.v2,xj.v2);
                __fkg_tmp2 = svmul_f32_z(pg0,rij.v1,rij.v1);
                __fkg_tmp1 = svmad_f32_z(pg0,rij.v0,rij.v0,__fkg_tmp2);
                r2 = svmad_f32_z(pg0,rij.v2,rij.v2,__fkg_tmp1);
                pg2 = svcmplt_f32(pg0,r2,max(pg0,svmul_f32_z(pg0,rsi,rsi),svmul_f32_z(pg0,rsj,rsj)));
                __fkg_tmp0 = svadd_s32_z(pg2,nnbi,svdup_n_s32(1));
                nnbi = svsel_s32(pg2,__fkg_tmp0,nnbi);
            }
            ((int*)&nnb[i+0])[0] += svaddv_s32(svptrue_b32(),nnbi);
        }
        for(i=0; i<ni; ++i){
            force[i].n_ngb = nnb[i];
        }
    }
    svfloat32_t rsqrt(svbool_t pg,svfloat32_t op){
        svfloat32_t rinv = svrsqrte_f32(op);
        svfloat32_t h = svmul_f32_z(pg,op,rinv);
        h = svmsb_n_f32_z(pg,h,rinv,1.f);
        svfloat32_t poly = svmad_n_f32_z(pg,h,svdup_f32(0.375f),0.5f);
        poly = svmul_f32_z(pg,poly,h);
        rinv = svmad_f32_z(pg,rinv,poly,rinv);
        return rinv; // (reconstructed)
    }
    svfloat64_t rsqrt(svbool_t pg,svfloat64_t op){
        svfloat64_t rinv = svrsqrte_f64(op);
        svfloat64_t h = svmul_f64_z(pg,op,rinv);
        h = svmsb_n_f64_z(pg,h,rinv,1.0);
        svfloat64_t poly = svmad_n_f64_z(pg,h,svdup_f64(0.375),0.5);
        poly = svmul_f64_z(pg,poly,h);
        rinv = svmad_f64_z(pg,rinv,poly,rinv);
        return rinv; // (reconstructed)
    }
    // (vector-broadcast helpers; the enclosing signatures and return
    //  statements are elided in the excerpt)
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    // ...
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    ret.v2 = svdup_n_f32(v.z);
    // ...
    ret.v0 = svdup_n_f32(v.x);
    ret.v1 = svdup_n_f32(v.y);
    ret.v2 = svdup_n_f32(v.z);
    ret.v3 = svdup_n_f32(v.w);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    ret.v2 = svdup_n_f64(v.z);
    // ...
    ret.v0 = svdup_n_f64(v.x);
    ret.v1 = svdup_n_f64(v.y);
    ret.v2 = svdup_n_f64(v.z);
    ret.v3 = svdup_n_f64(v.w);
    // ...
    svfloat32_t sqrt(svbool_t pg,svfloat32_t op){ return svsqrt_f32_z(pg,op); }
    svfloat64_t sqrt(svbool_t pg,svfloat64_t op){ return svsqrt_f64_z(pg,op); }
    svfloat32_t inv(svbool_t pg,svfloat32_t op){
        svfloat32_t x1 = svrecpe_f32(op);
        svfloat32_t x2 = svmsb_n_f32_z(pg,op,x1,2.f);
        x2 = svmul_f32_z(pg,x2,x1);
        svfloat32_t ret = svmsb_n_f32_z(pg,op,x2,2.f);
        ret = svmul_f32_z(pg,ret,x2);
        return ret; // (reconstructed)
    }
    svfloat64_t inv(svbool_t pg,svfloat64_t op){
        svfloat64_t x1 = svrecpe_f64(op);
        svfloat64_t x2 = svmsb_n_f64_z(pg,op,x1,2.0);
        x2 = svmul_f64_z(pg,x2,x1);
        svfloat64_t ret = svmsb_n_f64_z(pg,op,x2,2.0);
        ret = svmul_f64_z(pg,ret,x2);
        return ret; // (reconstructed)
    }
    svfloat64_t max(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmax_f64_z(pg,a,b);}
    svfloat64_t min(svbool_t pg,svfloat64_t a,svfloat64_t b){ return svmin_f64_z(pg,a,b);}
    svuint64_t  max(svbool_t pg,svuint64_t a,svuint64_t b){ return svmax_u64_z(pg,a,b);}
    svuint64_t  min(svbool_t pg,svuint64_t a,svuint64_t b){ return svmin_u64_z(pg,a,b);}
    svint64_t   max(svbool_t pg,svint64_t a,svint64_t b){ return svmax_s64_z(pg,a,b);}
    svint64_t   min(svbool_t pg,svint64_t a,svint64_t b){ return svmin_s64_z(pg,a,b);}
    svfloat32_t max(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmax_f32_z(pg,a,b);}
    svfloat32_t min(svbool_t pg,svfloat32_t a,svfloat32_t b){ return svmin_f32_z(pg,a,b);}
    svuint32_t  max(svbool_t pg,svuint32_t a,svuint32_t b){ return svmax_u32_z(pg,a,b);}
    svuint32_t  min(svbool_t pg,svuint32_t a,svuint32_t b){ return svmin_u32_z(pg,a,b);}
    svint32_t   max(svbool_t pg,svint32_t a,svint32_t b){ return svmax_s32_z(pg,a,b);}
    svint32_t   min(svbool_t pg,svint32_t a,svint32_t b){ return svmin_s32_z(pg,a,b);}
    svfloat64_t table(svfloat64_t tab,svuint64_t index){ return svtbl_f64(tab,index);}
    svfloat32_t table(svfloat32_t tab,svuint32_t index){ return svtbl_f32(tab,index);}
    svfloat64_t to_float(svbool_t pg,svint64_t op){ return svcvt_f64_s64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svint32_t op){ return svcvt_f32_s32_z(pg,op);}
    svfloat64_t to_float(svbool_t pg,svuint64_t op){ return svcvt_f64_u64_z(pg,op);}
    svfloat32_t to_float(svbool_t pg,svuint32_t op){ return svcvt_f32_u32_z(pg,op);}
    svuint64_t to_uint(svbool_t pg,svfloat64_t op){ return svcvt_u64_f64_z(pg,op);}
    svuint32_t to_uint(svbool_t pg,svfloat32_t op){ return svcvt_u32_f32_z(pg,op);}
    // ...
};