24 if(r2 < r_search*r_search){
25 #ifdef SAVE_NEIGHBOR_ID_IN_FORCE_KERNEL
26 force[i].id_ngb[n_ngb_i & 0x3] = ep_j[j].
id;
31 force[i].
n_ngb = n_ngb_i;
60 const PS::F64 r2_eps = r2 + eps2;
62 if(r2 < r_search*r_search){
65 const PS::F64 r2_tmp = (r2_eps > r_out2) ? r2_eps : r_out2;
66 const PS::F64 r_inv = 1.0/sqrt(r2_tmp);
68 const PS::F64 m_r3 = m_r * r_inv * r_inv;
77 force[i].
pot += G*poti;
78 #ifdef NAN_CHECK_DEBUG
79 assert(!std::isnan(ai[0]));
80 assert(!std::isnan(ai[1]));
81 assert(!std::isnan(ai[2]));
82 assert(!std::isnan(poti));
84 force[i].
n_ngb = n_ngb_i;
90 struct CalcCorrectEpEpWithLinearCutoffNoSimd{
91 void operator () (
const EPISoft * ep_i,
106 const PS::F64 r2 = dr * dr + eps2;
108 const PS::F64 r2_tmp = (r2 > r_out2) ? r2 : r_out2;
109 const PS::F64 r_inv = 1.0/sqrt(r2_tmp);
110 const PS::F64 r2_inv = r_inv*r_inv;
112 const PS::F64 m_r3 = m_r * r2_inv;
114 const PS::F64 alpha = 3.0 * drda * r2_inv;
115 acorr -= m_r3 * (da - alpha * dr);
118 force[i].acorr += 2.0 * acorr;
140 PS::F64 r3_inv = rij * rij + eps2;
141 PS::F64 r_inv = 1.0/sqrt(r3_inv);
142 r3_inv = r_inv * r_inv;
143 r_inv *= sp_j[j].getCharge();
148 force[i].
acc += G*ai;
149 force[i].
pot += G*poti;
150 #ifdef NAN_CHECK_DEBUG
151 assert(!std::isnan(ai[0]));
152 assert(!std::isnan(ai[1]));
153 assert(!std::isnan(ai[2]));
154 assert(!std::isnan(poti));
170 for(
PS::S32 ip=0; ip<n_ip; ip++){
174 for(
PS::S32 jp=0; jp<n_jp; jp++){
179 PS::F64mat qj = sp_j[jp].quad;
181 PS::F64vec qr( (qj.xx*rij.x + qj.xy*rij.y + qj.xz*rij.z),
182 (qj.yy*rij.y + qj.yz*rij.z + qj.xy*rij.x),
183 (qj.zz*rij.z + qj.xz*rij.x + qj.yz*rij.y) );
186 PS::F64 r2_inv = r_inv * r_inv;
187 PS::F64 r3_inv = r2_inv * r_inv;
188 PS::F64 r5_inv = r2_inv * r3_inv * 1.5;
190 PS::F64 qrr_r7 = r2_inv * qrr_r5;
191 PS::F64 A = mj*r3_inv - tr*r5_inv + 5*qrr_r7;
194 poti -= mj*r_inv - 0.5*tr*r3_inv + qrr_r5;
196 force[ip].
acc += G*ai;
197 force[ip].
pot += G*poti;
202 template<
class Tpi,
class Tpj>
218 PS::F64 r3_inv = rij * rij + eps2;
219 PS::F64 r_inv = 1.0/sqrt(r3_inv);
220 r3_inv = r_inv * r_inv;
221 r_inv *= ep_j[j].mass;
226 force[i].
acc += G*ai;
227 force[i].
pot += G*poti;
228 #ifdef NAN_CHECK_DEBUG
229 assert(!std::isnan(ai[0]));
230 assert(!std::isnan(ai[1]));
231 assert(!std::isnan(ai[2]));
232 assert(!std::isnan(poti));
239 struct SearchNeighborEpEpSimd{
240 void operator () (
const EPISoft * ep_i,
248 #if defined(CALC_EP_64bit) || defined(CALC_EP_MIX)
249 static thread_local PhantomGrapeQuad64Bit pg;
254 if(n_ip > pg.NIMAX || n_jp > pg.NJMAX){
255 std::cout<<
"ni= "<<n_ip<<
" NIMAX= "<<pg.
NIMAX<<
" nj= "<<n_jp<<
" NJMAX= "<<pg.NJMAX<<std::endl;
257 assert(n_ip<=pg.NIMAX);
258 assert(n_jp<=pg.NJMAX);
261 pg.set_xi_one(i, pos_i.x, pos_i.y, pos_i.z, ep_i[i].r_search);
264 for(
PS::S32 loop=0; loop<loop_max; loop++){
267 const PS::S32 it =ih + n_jp_tmp;
269 for(
PS::S32 i=ih; i<it; i++, i_tmp++){
272 pg.set_epj_one(i_tmp, pos_j.x, pos_j.y, pos_j.z, m_j, ep_j[i].r_search);
275 pg.run_epj_for_neighbor_count(n_ip, n_jp_tmp);
278 pg.accum_accp_one(i, n_ngb);
285 template <
class Tpi,
class Tpj>
286 struct CalcForcePPSimd{
287 void operator () (
const Tpi * ep_i,
293 PS::S32 ep_j_list[n_jp], n_jp_local=0;
294 for (
PS::S32 i=0; i<n_jp; i++){
295 if(ep_j[i].mass>0) ep_j_list[n_jp_local++] = i;
300 #if defined(CALC_EP_64bit) || defined(CALC_EP_MIX)
301 static thread_local PhantomGrapeQuad64Bit pg;
306 if(n_ip > pg.NIMAX || n_jp > pg.NJMAX){
307 std::cout<<
"ni= "<<n_ip<<
" NIMAX= "<<pg.
NIMAX<<
" nj= "<<n_jp<<
" NJMAX= "<<pg.NJMAX<<std::endl;
309 assert(n_ip<=pg.NIMAX);
310 assert(n_jp<=pg.NJMAX);
315 pg.set_xi_one(i, pos_i.x, pos_i.y, pos_i.z, 0.0);
318 for(
PS::S32 loop=0; loop<loop_max; loop++){
321 const PS::S32 it =ih + n_jp_tmp;
323 for(
PS::S32 i=ih; i<it; i++, i_tmp++){
324 const PS::S32 ij = ep_j_list[i];
325 const PS::F64 m_j = ep_j[ij].mass;
327 pg.set_epj_one(i_tmp, pos_j.x, pos_j.y, pos_j.z, m_j, 0.0);
330 pg.run_epj_for_p3t_with_linear_cutoff(n_ip, n_jp_tmp);
335 pg.accum_accp_one(i, a[0], a[1], a[2], p, n_ngb);
336 force[i].
acc[0] += G*a[0];
337 force[i].
acc[1] += G*a[1];
338 force[i].
acc[2] += G*a[2];
350 struct CalcForceEpEpWithLinearCutoffSimd{
351 void operator () (
const EPISoft * ep_i,
358 PS::S32 ep_j_list[n_jp], n_jp_local=0;
359 PS::S32 ep_i_list[n_ip], n_ip_local=0;
360 for (
PS::S32 i=0; i<n_jp; i++){
361 if(ep_j[i].mass>0) ep_j_list[n_jp_local++] = i;
368 #if defined(CALC_EP_64bit) || defined(CALC_EP_MIX)
369 static thread_local PhantomGrapeQuad64Bit pg;
374 if(n_ip > pg.NIMAX || n_jp > pg.NJMAX){
375 std::cout<<
"ni= "<<n_ip<<
" NIMAX= "<<pg.
NIMAX<<
" nj= "<<n_jp<<
" NJMAX= "<<pg.NJMAX<<std::endl;
377 assert(n_ip<=pg.NIMAX);
378 assert(n_jp<=pg.NJMAX);
383 if (ep_i[i].type==1) {
384 ep_i_list[n_ip_local] = i;
386 pg.set_xi_one(n_ip_local, pos_i.x, pos_i.y, pos_i.z, ep_i[i].r_search);
392 for(
PS::S32 loop=0; loop<loop_max; loop++){
395 const PS::S32 it =ih + n_jp_tmp;
397 for(
PS::S32 i=ih; i<it; i++, i_tmp++){
398 const PS::S32 ij = ep_j_list[i];
401 pg.set_epj_one(i_tmp, pos_j.x, pos_j.y, pos_j.z, m_j, ep_j[ij].r_search);
404 pg.run_epj_for_p3t_with_linear_cutoff(n_ip, n_jp_tmp);
405 for(
PS::S32 k=0; k<n_ip_local; k++){
410 pg.accum_accp_one(k, a[0], a[1], a[2], p, n_ngb);
411 #ifdef NAN_CHECK_DEBUG
412 assert(!std::isnan(a[0]));
413 assert(!std::isnan(a[1]));
414 assert(!std::isnan(a[2]));
415 assert(!std::isnan(p));
417 force[i].
acc[0] += G*a[0];
418 force[i].
acc[1] += G*a[1];
419 force[i].
acc[2] += G*a[2];
427 struct CalcForceEpSpMonoSimd{
429 void operator () (
const EPISoft * ep_i,
437 PS::S32 ep_i_list[n_ip], n_ip_local=0;
441 #if defined(CALC_EP_64bit)
442 static thread_local PhantomGrapeQuad64Bit pg;
447 assert(n_ip<=pg.NIMAX);
448 assert(n_jp<=pg.NJMAX);
452 if (ep_i[i].type==1) {
453 ep_i_list[n_ip_local] = i;
455 pg.set_xi_one(n_ip_local, pos_i.x, pos_i.y, pos_i.z, 0.0);
460 for(
PS::S32 loop=0; loop<loop_max; loop++){
463 const PS::S32 it = ih + n_jp_tmp;
465 for(
PS::S32 i=ih; i<it; i++, i_tmp++){
466 const PS::F64 m_j = sp_j[i].getCharge();
468 pg.set_epj_one(i_tmp, pos_j.x, pos_j.y, pos_j.z, m_j, 0.0);
470 pg.run_epj(n_ip, n_jp_tmp);
471 for(
PS::S32 k=0; k<n_ip_local; k++){
475 pg.accum_accp_one(k, a[0], a[1], a[2], p);
476 force[i].
acc[0] += G*a[0];
477 force[i].
acc[1] += G*a[1];
478 force[i].
acc[2] += G*a[2];
480 #ifdef NAN_CHECK_DEBUG
481 assert(!std::isnan(a[0]));
482 assert(!std::isnan(a[1]));
483 assert(!std::isnan(a[2]));
484 assert(!std::isnan(p));
491 struct CalcForceEpSpQuadSimd{
493 void operator () (
const EPISoft * ep_i,
500 PS::S32 ep_i_list[n_ip], n_ip_local=0;
504 #if defined(CALC_EP_64bit)
505 static thread_local PhantomGrapeQuad64Bit pg;
510 assert(n_ip<=pg.NIMAX);
511 assert(n_jp<=pg.NJMAX);
515 if (ep_i[i].type==1) {
516 ep_i_list[n_ip_local] = i;
518 pg.set_xi_one(n_ip_local, pos_i.x, pos_i.y, pos_i.z, 0.0);
523 for(
PS::S32 loop=0; loop<loop_max; loop++){
526 const PS::S32 it = ih + n_jp_tmp;
528 for(
PS::S32 i=ih; i<it; i++, i_tmp++){
529 const PS::F64 m_j = sp_j[i].getCharge();
531 const PS::F64mat q = sp_j[i].quad;
532 pg.set_spj_one(i, pos_j.x, pos_j.y, pos_j.z, m_j,
533 q.xx, q.yy, q.zz, q.xy, q.yz, q.xz);
535 pg.run_spj(n_ip, n_jp_tmp);
536 for(
PS::S32 k=0; k<n_ip_local; k++){
540 pg.accum_accp_one(k, a[0], a[1], a[2], p);
541 #ifdef NAN_CHECK_DEBUG
542 assert(!std::isnan(a[0]));
543 assert(!std::isnan(a[1]));
544 assert(!std::isnan(a[2]));
545 assert(!std::isnan(p));
547 force[i].
acc[0] += G*a[0];
548 force[i].
acc[1] += G*a[1];
549 force[i].
acc[2] += G*a[2];