// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Stan Moore (SNL), Christian Trott (SNL)
------------------------------------------------------------------------- */

#include "pair_eam_kokkos.h"

#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "force.h"
#include "kokkos.h"
#include "memory_kokkos.h"
#include "neigh_list_kokkos.h"
#include "neigh_request.h"
#include "neighbor.h"
#include "pair_kokkos.h"

#include <cmath>
using namespace LAMMPS_NS;

#ifdef KOKKOS_ENABLE_HIP
// Row capacity of the per-team scratch copy of a spline table used by the
// HIP team kernels in this file. A table with more rows than this is not
// cached and is read from the device view directly.
static constexpr int MAX_CACHE_ROWS = 500;
#endif
/* ---------------------------------------------------------------------- */

template<class DeviceType>
PairEAMKokkos<DeviceType>::PairEAMKokkos(LAMMPS *lmp) : PairEAM(lmp)
{
  // Kokkos bookkeeping: mark the style as kokkosable, record the execution
  // space that corresponds to DeviceType and keep a Kokkos view of the atom class
  kokkosable = 1;
  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
  atomKK = (AtomKokkos *) atom;

  // per-atom data that compute() syncs before running and flags as modified
  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;

  // switch off the respa_enable and single_enable feature flags for this style
  single_enable = 0;
  respa_enable = 0;
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
PairEAMKokkos<DeviceType>::~PairEAMKokkos()
{
  // compute() raises copymode while it hands *this to the Kokkos kernels;
  // an object destroyed in that state must not release the per-atom arrays
  if (!copymode) {
    memoryKK->destroy_kokkos(k_eatom,eatom);
    memoryKK->destroy_kokkos(k_vatom,vatom);
  }
}

/* ---------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   evaluate the EAM potential for the current neighbor list:
     1. zero rho
     2. accumulate densities (kernel A) and evaluate fp (kernel B) for
        HALF/HALFTHREAD lists, or do both in one pass (kernel AB) for FULL
     3. forward-communicate fp
     4. compute pair forces and tally energy/virial (kernel C)
   eflag_in/vflag_in select which energy/virial quantities are tallied
------------------------------------------------------------------------- */

template<class DeviceType>
void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
{
  eflag = eflag_in;
  vflag = vflag_in;

  // with a FULL neighbor list, set the flag that disables the f-dot-r virial path
  if (neighflag == FULL) no_virial_fdotr_compute = 1;

  ev_init(eflag,vflag,0);

  // reallocate per-atom arrays if necessary

  if (eflag_atom) {
    memoryKK->destroy_kokkos(k_eatom,eatom);
    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
    d_eatom = k_eatom.view<DeviceType>();
  }
  if (vflag_atom) {
    memoryKK->destroy_kokkos(k_vatom,vatom);
    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
    d_vatom = k_vatom.view<DeviceType>();
  }

  // bring the atom data this style reads into the execution space and
  // flag what will be written (only forces when no energy/virial is tallied)
  atomKK->sync(execution_space,datamask_read);
  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
  else atomKK->modified(execution_space,F_MASK);

  // grow rho and fp arrays if necessary
  // need to be atom->nmax in length

  if (atom->nmax > nmax) {
    nmax = atom->nmax;
    k_rho = DAT::tdual_kkfloat_1d("pair:rho",nmax);
    k_fp = DAT::tdual_kkfloat_1d("pair:fp",nmax);
    d_rho = k_rho.template view<DeviceType>();
    d_fp = k_fp.template view<DeviceType>();
    h_rho = k_rho.view_host();
    h_fp = k_fp.view_host();
  }

  // cache views and counts used by the kernels

  x = atomKK->k_x.view<DeviceType>();
  f = atomKK->k_f.view<DeviceType>();
  type = atomKK->k_type.view<DeviceType>();
  nlocal = atom->nlocal;
  nall = atom->nlocal + atom->nghost;
  newton_pair = force->newton_pair;

  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
  d_numneigh = k_list->d_numneigh;
  d_neighbors = k_list->d_neighbors;
  d_ilist = k_list->d_ilist;
  inum = list->inum;

  // make sure this is set appropriately
  cutforcesq_kk = static_cast<KK_FLOAT>(cutforcesq);

  // wrap the output arrays in scatter views: duplicated when need_dup is set,
  // non-duplicated otherwise; the kernels pick the matching one
  need_dup = lmp->kokkos->need_dup<DeviceType>();
  if (need_dup) {
    dup_rho   = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_rho);
    dup_f     = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f);
    dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom);
    dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom);
  } else {
    ndup_rho   = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_rho);
    ndup_f     = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f);
    ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom);
    ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom);
  }

  rhomax_kk = static_cast<KK_FLOAT>(rhomax);

  // *this is passed to the kernels below as the functor; while copymode
  // is set the destructor returns without freeing the per-atom arrays
  copymode = 1;

  // zero out density
  // (kernel A also adds to rho[j] of any neighbor when newton_pair is on,
  //  so all nall entries are cleared in that case)

  if (newton_pair)
    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMInitialize>(0,nall),*this);
  else
    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMInitialize>(0,nlocal),*this);

  // loop over neighbors of my atoms

  EV_FLOAT ev;

  // compute kernel A

  if (neighflag == HALF || neighflag == HALFTHREAD) {

    if (neighflag == HALF) {
      if (newton_pair) {
        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALF,1>>(0,inum),*this);
      } else {
        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALF,0>>(0,inum),*this);
      }
    } else if (neighflag == HALFTHREAD) {
      if (newton_pair) {
        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALFTHREAD,1>>(0,inum),*this);
      } else {
        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALFTHREAD,0>>(0,inum),*this);
      }
    }

    // fold the duplicated density copies back into d_rho
    if (need_dup)
      Kokkos::Experimental::contribute(d_rho, dup_rho);

    // communicate and sum densities (on the host)
    // mark the device copy as modified first; the host pack/unpack
    // callbacks of this class sync k_rho to the host and flag it modified there

    if (newton_pair) {
      k_rho.template modify<DeviceType>();
      comm->reverse_comm(this);
      k_rho.template sync<DeviceType>();
    }

    // compute kernel B

    if (eflag)
      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelB<1>>(0,inum),*this,ev);
    else
      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelB<0>>(0,inum),*this);

  } else if (neighflag == FULL) {

    // compute kernel AB

    if (eflag)
      Kokkos::parallel_reduce(
             Kokkos::RangePolicy<DeviceType,TagPairEAMKernelAB<1>>(0,inum),
             *this,ev);
    else
      Kokkos::parallel_for(
            policyInstance<TagPairEAMKernelAB<0>>::get(inum),
            *this);
  }

  // add the energy reduced by kernel B/AB and reset the accumulator
  // so that the kernel C reduction below is not counted twice
  if (eflag) {
    eng_vdwl += static_cast<double>(ev.evdwl);
    ev.evdwl = 0;
  }

  // communicate derivative of embedding function

  k_fp.template modify<DeviceType>();
  comm->forward_comm(this);
  k_fp.template sync<DeviceType>();

  // compute kernel C
  // evflag set: reducing launch with EVFLAG=1; otherwise plain parallel_for with EVFLAG=0

  if (evflag) {
    if (neighflag == HALF) {
      if (newton_pair) {
        Kokkos::parallel_reduce(
              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<HALF,1,1>>(0,inum),
              *this,ev);
      } else {
        Kokkos::parallel_reduce(
              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<HALF,0,1>>(0,inum),
              *this,ev);
      }
    } else if (neighflag == HALFTHREAD) {
      if (newton_pair) {
        Kokkos::parallel_reduce(
              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<HALFTHREAD,1,1>>(0,inum),
              *this,ev);
      } else {
        Kokkos::parallel_reduce(
              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<HALFTHREAD,0,1>>(0,inum),
              *this,ev);
      }
    } else if (neighflag == FULL) {
      if (newton_pair) {
        Kokkos::parallel_reduce(
              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<FULL,1,1>>(0,inum),
              *this,ev);
      } else {
        Kokkos::parallel_reduce(
              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<FULL,0,1>>(0,inum),
              *this,ev);
      }
    }
  } else {
    if (neighflag == HALF) {
      if (newton_pair) {
        Kokkos::parallel_for(
              policyInstance<TagPairEAMKernelC<HALF,1,0>>::get(inum),
              *this);
      } else {
        Kokkos::parallel_for(
              policyInstance<TagPairEAMKernelC<HALF,0,0>>::get(inum),
              *this);
      }
    } else if (neighflag == HALFTHREAD) {
      if (newton_pair) {
        Kokkos::parallel_for(
              policyInstance<TagPairEAMKernelC<HALFTHREAD,1,0>>::get(inum),
              *this);
      } else {
        Kokkos::parallel_for(
              policyInstance<TagPairEAMKernelC<HALFTHREAD,0,0>>::get(inum),
              *this);
      }
    } else if (neighflag == FULL) {
      if (newton_pair) {
        Kokkos::parallel_for(
              policyInstance<TagPairEAMKernelC<FULL,1,0>>::get(inum),
              *this);
      } else {
        Kokkos::parallel_for(
              policyInstance<TagPairEAMKernelC<FULL,0,0>>::get(inum),
              *this);
      }
    }
  }

  // fold the duplicated force copies back into f
  if (need_dup)
    Kokkos::Experimental::contribute(f, dup_f);

  // accumulate the global energy and virial reduced by kernel C
  if (eflag_global) eng_vdwl += static_cast<double>(ev.evdwl);
  if (vflag_global) {
    virial[0] += static_cast<double>(ev.v[0]);
    virial[1] += static_cast<double>(ev.v[1]);
    virial[2] += static_cast<double>(ev.v[2]);
    virial[3] += static_cast<double>(ev.v[3]);
    virial[4] += static_cast<double>(ev.v[4]);
    virial[5] += static_cast<double>(ev.v[5]);
  }

  if (vflag_fdotr) pair_virial_fdotr_compute(this);

  // per-atom tallies: merge duplicates, then make the host copy current

  if (eflag_atom) {
    if (need_dup)
      Kokkos::Experimental::contribute(d_eatom, dup_eatom);
    k_eatom.template modify<DeviceType>();
    k_eatom.sync_host();
  }

  if (vflag_atom) {
    if (need_dup)
      Kokkos::Experimental::contribute(d_vatom, dup_vatom);
    k_vatom.template modify<DeviceType>();
    k_vatom.sync_host();
  }

  copymode = 0;

  // free duplicated memory
  if (need_dup) {
    dup_rho   = {};
    dup_f     = {};
    dup_eatom = {};
    dup_vatom = {};
  }
}

/* ----------------------------------------------------------------------
   init specific to this pair style
------------------------------------------------------------------------- */

template<class DeviceType>
void PairEAMKokkos<DeviceType>::init_style()
{
  // the base class converts the read-in file(s) to arrays and splines them
  PairEAM::init_style();

  // use the neighbor list style selected for the KOKKOS package
  neighflag = lmp->kokkos->neighflag;

  // where this instantiation runs, decided at compile time:
  // "host only" means DeviceType is the host type and differs from the device type
  constexpr bool on_device = std::is_same_v<DeviceType,LMPDeviceType>;
  constexpr bool host_only = std::is_same_v<DeviceType,LMPHostType> && !on_device;

  // adjust this style's neighbor list request accordingly
  auto neigh_req = neighbor->find_request(this);
  neigh_req->set_kokkos_host(host_only);
  neigh_req->set_kokkos_device(on_device);
  if (neighflag == FULL) neigh_req->enable_full();
}

/* ----------------------------------------------------------------------
   convert read-in funcfl potential(s) to standard array format
   interpolate all file values to a single grid and cutoff
------------------------------------------------------------------------- */

template<class DeviceType>
void PairEAMKokkos<DeviceType>::file2array()
{
  // let the base class build type2frho / type2rhor / type2z2r first
  PairEAM::file2array();

  const int ntypes = atom->ntypes;

  // dual views sized ntypes+1 because atom types are indexed from 1
  auto k_type2frho = DAT::tdual_int_1d("pair:type2frho",ntypes+1);
  auto k_type2rhor = DAT::tdual_int_2d_dl("pair:type2rhor",ntypes+1,ntypes+1);
  auto k_type2z2r = DAT::tdual_int_2d_dl("pair:type2z2r",ntypes+1,ntypes+1);

  auto h_type2frho = k_type2frho.view_host();
  auto h_type2rhor = k_type2rhor.view_host();
  auto h_type2z2r = k_type2z2r.view_host();

  // mirror the base-class lookup tables into the host views
  for (int it = 1; it <= ntypes; ++it) {
    h_type2frho[it] = type2frho[it];
    for (int jt = 1; jt <= ntypes; ++jt) {
      h_type2rhor(it,jt) = type2rhor[it][jt];
      h_type2z2r(it,jt) = type2z2r[it][jt];
    }
  }

  // push each table to DeviceType and keep only the device-side view
  k_type2frho.modify_host();
  k_type2frho.template sync<DeviceType>();
  d_type2frho = k_type2frho.template view<DeviceType>();

  k_type2rhor.modify_host();
  k_type2rhor.template sync<DeviceType>();
  d_type2rhor = k_type2rhor.template view<DeviceType>();

  k_type2z2r.modify_host();
  k_type2z2r.template sync<DeviceType>();
  d_type2z2r = k_type2z2r.template view<DeviceType>();
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
void PairEAMKokkos<DeviceType>::array2spline()
{
  // inverse grid spacings, kept in double and in KK_FLOAT for the kernels
  rdr = 1.0/dr;
  rdr_kk = static_cast<KK_FLOAT>(rdr);
  rdrho = 1.0/drho;
  rdrho_kk = static_cast<KK_FLOAT>(rdrho);

  // frho tables: nfrho rows tabulated on the rho grid (indices 1..nrho)
  {
    tdual_kkfloat_2d_n7 k_spline("pair:frho",nfrho,nrho+1);
    t_hostkkfloat_2d_n7 h_spline = k_spline.view_host();
    for (int itab = 0; itab < nfrho; ++itab)
      interpolate(nrho,drho,frho[itab],h_spline,itab);
    k_spline.modify_host();
    k_spline.template sync<DeviceType>();
    d_frho_spline = k_spline.template view<DeviceType>();
  }

  // rhor tables: nrhor rows tabulated on the r grid (indices 1..nr)
  {
    tdual_kkfloat_2d_n7 k_spline("pair:rhor",nrhor,nr+1);
    t_hostkkfloat_2d_n7 h_spline = k_spline.view_host();
    for (int itab = 0; itab < nrhor; ++itab)
      interpolate(nr,dr,rhor[itab],h_spline,itab);
    k_spline.modify_host();
    k_spline.template sync<DeviceType>();
    d_rhor_spline = k_spline.template view<DeviceType>();
  }

  // z2r tables: nz2r rows tabulated on the r grid (indices 1..nr)
  {
    tdual_kkfloat_2d_n7 k_spline("pair:z2r",nz2r,nr+1);
    t_hostkkfloat_2d_n7 h_spline = k_spline.view_host();
    for (int itab = 0; itab < nz2r; ++itab)
      interpolate(nr,dr,z2r[itab],h_spline,itab);
    k_spline.modify_host();
    k_spline.template sync<DeviceType>();
    d_z2r_spline = k_spline.template view<DeviceType>();
  }
}

/* ---------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   fill row i of h_spline from the n tabulated values f[1..n] (spacing delta)
   per grid point m the seven columns hold:
     6      = tabulated value
     5,4,3  = linear, quadratic, cubic coefficient of the cubic in p
     2,1,0  = coefficients of its derivative, already divided by delta
------------------------------------------------------------------------- */

template<class DeviceType>
void PairEAMKokkos<DeviceType>::interpolate(int n, double delta, double *f, t_hostkkfloat_2d_n7 h_spline, int i)
{
  const KK_FLOAT half = static_cast<KK_FLOAT>(0.5);
  const KK_FLOAT two = static_cast<KK_FLOAT>(2.0);
  const KK_FLOAT three = static_cast<KK_FLOAT>(3.0);
  const KK_FLOAT eight = static_cast<KK_FLOAT>(8.0);
  const KK_FLOAT twelve = static_cast<KK_FLOAT>(12.0);

  // column 6: copy of the tabulated values
  for (int k = 1; k <= n; ++k)
    h_spline(i,k,6) = static_cast<KK_FLOAT>(f[k]);

  // column 5 near the ends: one-sided differences at 1 and n,
  // centered differences at 2 and n-1
  h_spline(i,1,5) = h_spline(i,2,6) - h_spline(i,1,6);
  h_spline(i,2,5) = half * (h_spline(i,3,6)-h_spline(i,1,6));
  h_spline(i,n-1,5) = half * (h_spline(i,n,6)-h_spline(i,n-2,6));
  h_spline(i,n,5) = h_spline(i,n,6) - h_spline(i,n-1,6);

  // column 5 in the interior: five-point difference
  for (int k = 3; k <= n-2; ++k) {
    h_spline(i,k,5) = ((h_spline(i,k-2,6)-h_spline(i,k+2,6)) +
                       eight*(h_spline(i,k+1,6)-h_spline(i,k-1,6))) / twelve;
  }

  // columns 4 and 3 from the values and slopes at both ends of each interval
  for (int k = 1; k <= n-1; ++k) {
    h_spline(i,k,4) = three*(h_spline(i,k+1,6)-h_spline(i,k,6)) -
      two*h_spline(i,k,5) - h_spline(i,k+1,5);
    h_spline(i,k,3) = h_spline(i,k,5) + h_spline(i,k+1,5) -
      two*(h_spline(i,k+1,6)-h_spline(i,k,6));
  }

  // no interval follows the last grid point
  h_spline(i,n,4) = 0;
  h_spline(i,n,3) = 0;

  // columns 2,1,0: derivative coefficients, scaled by 1/delta
  const KK_FLOAT inv_delta = static_cast<KK_FLOAT>(1.0 / delta);
  for (int k = 1; k <= n; ++k) {
    h_spline(i,k,2) = h_spline(i,k,5)*inv_delta;
    h_spline(i,k,1) = two*h_spline(i,k,4)*inv_delta;
    h_spline(i,k,0) = three*h_spline(i,k,3)*inv_delta;
  }
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
int PairEAMKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdual_int_1d k_sendlist,
                                                        DAT::tdual_double_1d &buf,
                                                        int /*pbc_flag*/, int * /*pbc*/)
{
  // stash the send list and buffer views where the pack functor can reach them
  v_buf = buf.view<DeviceType>();
  d_sendlist = k_sendlist.view<DeviceType>();

  // one buffer entry per send-list entry
  using PackPolicy = Kokkos::RangePolicy<DeviceType, TagPairEAMPackForwardComm>;
  Kokkos::parallel_for(PackPolicy(0,n),*this);

  return n;
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMPackForwardComm, const int &i) const {
  // copy fp of the atom named by send-list entry i into buffer slot i
  const int src = d_sendlist(i);
  v_buf[i] = static_cast<double>(d_fp[src]);
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
void PairEAMKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int first_in, DAT::tdual_double_1d &buf)
{
  // stash the buffer view and the destination offset for the unpack functor
  v_buf = buf.view<DeviceType>();
  first = first_in;

  // one fp entry per buffer entry
  using UnpackPolicy = Kokkos::RangePolicy<DeviceType, TagPairEAMUnpackForwardComm>;
  Kokkos::parallel_for(UnpackPolicy(0,n),*this);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMUnpackForwardComm, const int &i) const {
  // buffer slot i lands in fp at offset `first`
  const int dest = i + first;
  d_fp[dest] = static_cast<KK_FLOAT>(v_buf[i]);
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
int PairEAMKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf,
                                                 int /*pbc_flag*/, int * /*pbc*/)
{
  // host path: make sure the host copy of fp is current before reading it
  k_fp.sync_host();

  for (int k = 0; k < n; ++k) {
    const int src = list[k];
    buf[k] = static_cast<double>(h_fp[src]);
  }

  return n;
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
void PairEAMKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
{
  // host path: refresh the host copy, overwrite n entries starting at `first`,
  // then flag the host copy as modified
  k_fp.sync_host();

  int dest = first;
  for (int k = 0; k < n; ++k, ++dest)
    h_fp[dest] = static_cast<KK_FLOAT>(buf[k]);

  k_fp.modify_host();
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
int PairEAMKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
{
  // host path: make sure the host copy of rho is current before reading it
  k_rho.sync_host();

  // pack rho[first .. first+n-1] contiguously and report how many values were written
  int npacked = 0;
  const int last = first + n;
  for (int iatom = first; iatom < last; ++iatom) {
    buf[npacked] = static_cast<double>(h_rho[iatom]);
    ++npacked;
  }
  return npacked;
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
void PairEAMKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
{
  // host path: refresh the host copy, add each received density onto the
  // atom named by the list, then flag the host copy as modified
  k_rho.sync_host();

  for (int k = 0; k < n; ++k) {
    const int dest = list[k];
    h_rho[dest] += static_cast<KK_FLOAT>(buf[k]);
  }

  k_rho.modify_host();
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMInitialize, const int &i) const {
  // clear the density of atom i before the accumulation kernels run
  d_rho[i] = static_cast<KK_FLOAT>(0);
}

/* ---------------------------------------------------------------------- */

// Kernel A: density accumulation, templated on the neighbor list type and
// the Newton setting. compute() launches it for HALF and HALFTHREAD lists.
// ii is an index into d_ilist. Each pair within the cutoff adds to the
// density of atom i and, when NEWTON_PAIR is set or j is a local atom,
// also to the density of atom j.
template<class DeviceType>
template<int NEIGHFLAG, int NEWTON_PAIR>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelA<NEIGHFLAG,NEWTON_PAIR>, const int &ii) const {

  // rho = density at each atom
  // loop over neighbors of my atoms

  // The rho array is duplicated for OpenMP, atomic for GPU, and neither for Serial

  auto v_rho = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_rho),decltype(ndup_rho)>::get(dup_rho,ndup_rho);
  auto a_rho = v_rho.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();

  const int i = d_ilist[ii];
  const KK_FLOAT xtmp = x(i,0);
  const KK_FLOAT ytmp = x(i,1);
  const KK_FLOAT ztmp = x(i,2);
  const int itype = type(i);

  const int jnum = d_numneigh[i];

  // density of atom i is summed locally and written once after the loop
  KK_ACC_FLOAT rhotmp = 0.0;

  for (int jj = 0; jj < jnum; jj++) {
    int j = d_neighbors(i,jj);
    j &= NEIGHMASK;
    const KK_FLOAT delx = xtmp - x(j,0);
    const KK_FLOAT dely = ytmp - x(j,1);
    const KK_FLOAT delz = ztmp - x(j,2);
    const int jtype = type(j);
    const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz;

    if (rsq < cutforcesq_kk) {
      // table lookup: m = grid index (capped at nr-1), p = fractional offset (capped at 1)
      KK_FLOAT p = sqrt(rsq)*rdr_kk + static_cast<KK_FLOAT>(1.0);
      int m = static_cast<int> (p);
      m = MIN(m,nr-1);
      p -= static_cast<KK_FLOAT>(m);
      p = MIN(p,static_cast<KK_FLOAT>(1.0));
      // contribution to atom i uses the (jtype,itype) table; cubic in p from columns 3..6
      const int d_type2rhor_ji = d_type2rhor(jtype,itype);
      rhotmp += static_cast<KK_ACC_FLOAT>(((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p +
                  d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6));
      // contribution to atom j uses the (itype,jtype) table
      if (NEWTON_PAIR || j < nlocal) {
        const int d_type2rhor_ij = d_type2rhor(itype,jtype);
        a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p +
                      d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
      }
    }

  }
  a_rho[i] += static_cast<KK_FLOAT>(rhotmp);
}

/* ---------------------------------------------------------------------- */

// Kernel B: evaluates fp for atom d_ilist[ii] from its accumulated density.
// compute() launches it after kernel A for HALF and HALFTHREAD lists.
// With EFLAG set it also evaluates phi and adds it to ev.evdwl and/or
// d_eatom, depending on eflag_global / eflag_atom.
template<class DeviceType>
template<int EFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelB<EFLAG>, const int &ii, EV_FLOAT& ev) const {

  // fp = derivative of embedding energy at each atom
  // phi = embedding energy at each atom
  // if rho > rhomax (e.g. due to close approach of two atoms),
  //   will exceed table, so add linear term to conserve energy

  const int i = d_ilist[ii];
  const int itype = type(i);

  // table lookup: m = grid index clamped to [1,nrho-1], p = fractional offset (capped at 1)
  KK_FLOAT p = d_rho[i]*rdrho_kk + static_cast<KK_FLOAT>(1.0);
  int m = static_cast<int> (p);
  m = MAX(1,MIN(m,nrho-1));
  p -= static_cast<KK_FLOAT>(m);
  p = MIN(p,static_cast<KK_FLOAT>(1.0));
  const int d_type2frho_i = d_type2frho[itype];
  // derivative coefficients live in columns 0..2, value coefficients in columns 3..6
  d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2);
  if (EFLAG) {
    KK_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p +
                    d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6);
    if (d_rho[i] > rhomax_kk) phi += d_fp[i] * (d_rho[i]-rhomax_kk);
    if (eflag_global) ev.evdwl += static_cast<KK_ACC_FLOAT>(phi);
    if (eflag_atom) d_eatom[i] += static_cast<KK_ACC_FLOAT>(phi);
  }
}

template<class DeviceType>
template<int EFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelB<EFLAG>, const int &ii) const {
  // non-reducing entry point: run the reducing overload with a throwaway accumulator
  EV_FLOAT ev_discard;
  this->template operator()<EFLAG>(TagPairEAMKernelB<EFLAG>{}, ii, ev_discard);
}

/* ---------------------------------------------------------------------- */

// Kernel AB: fused density accumulation and fp evaluation for atom
// d_ilist[ii]. compute() launches it for FULL neighbor lists. Only the
// density of atom i is written, directly to d_rho[i], never that of a
// neighbor. With EFLAG set, phi is also evaluated and tallied.
template<class DeviceType>
template<int EFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>, const int &ii, EV_FLOAT& ev) const {

  // rho = density at each atom
  // loop over neighbors of my atoms

  const int i = d_ilist[ii];
  const KK_FLOAT xtmp = x(i,0);
  const KK_FLOAT ytmp = x(i,1);
  const KK_FLOAT ztmp = x(i,2);
  const int itype = type(i);

  const int jnum = d_numneigh[i];

  KK_ACC_FLOAT rhotmp = 0;

  for (int jj = 0; jj < jnum; jj++) {
    int j = d_neighbors(i,jj);
    j &= NEIGHMASK;

    const KK_FLOAT delx = xtmp - x(j,0);
    const KK_FLOAT dely = ytmp - x(j,1);
    const KK_FLOAT delz = ztmp - x(j,2);
    const int jtype = type(j);
    const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz;

    if (rsq < cutforcesq_kk) {
      // table lookup: m = grid index (capped at nr-1), p = fractional offset (capped at 1)
      KK_FLOAT p = sqrt(rsq)*rdr_kk + static_cast<KK_FLOAT>(1.0);
      int m = static_cast<int> (p);
      m = MIN(m,nr-1);
      p -= static_cast<KK_FLOAT>(m);
      p = MIN(p,static_cast<KK_FLOAT>(1.0));
      const int d_type2rhor_ji = d_type2rhor(jtype,itype);
      rhotmp += static_cast<KK_ACC_FLOAT>(((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p +
                  d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6));
    }

  }
  d_rho[i] += static_cast<KK_FLOAT>(rhotmp);

  // fp = derivative of embedding energy at each atom
  // phi = embedding energy at each atom
  // if rho > rhomax (e.g. due to close approach of two atoms),
  //   will exceed table, so add linear term to conserve energy

  // table lookup: m = grid index clamped to [1,nrho-1], p = fractional offset (capped at 1)
  KK_FLOAT p = d_rho[i]*rdrho_kk + static_cast<KK_FLOAT>(1.0);
  int m = static_cast<int> (p);
  m = MAX(1,MIN(m,nrho-1));
  p -= static_cast<KK_FLOAT>(m);
  p = MIN(p,static_cast<KK_FLOAT>(1.0));
  const int d_type2frho_i = d_type2frho[itype];
  d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2);
  if (EFLAG) {
    KK_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p +
                    d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6);
    if (d_rho[i] > rhomax_kk) phi += d_fp[i] * (d_rho[i]-rhomax_kk);
    if (eflag_global) ev.evdwl += static_cast<KK_ACC_FLOAT>(phi);
    if (eflag_atom) d_eatom[i] += static_cast<KK_ACC_FLOAT>(phi);
  }

}

template<class DeviceType>
template<int EFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>, const int &ii) const {
  // non-reducing entry point: run the reducing overload with a throwaway accumulator
  EV_FLOAT ev_discard;
  this->template operator()<EFLAG>(TagPairEAMKernelAB<EFLAG>{}, ii, ev_discard);
}

/* ---------------------------------------------------------------------- */

// Kernel C: pair forces for atom d_ilist[ii], templated on the neighbor
// list type, the Newton setting and whether energy/virial is tallied.
// For HALF and HALFTHREAD lists the opposite force is also applied to
// atom j (when NEWTON_PAIR is set or j is local). For FULL lists only
// atom i is updated and the pair energy is weighted by 0.5.
template<class DeviceType>
template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {

  // The f array is duplicated for OpenMP, atomic for GPU, and neither for Serial

  auto v_f = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
  auto a_f = v_f.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();

  const int i = d_ilist[ii];
  const KK_FLOAT xtmp = x(i,0);
  const KK_FLOAT ytmp = x(i,1);
  const KK_FLOAT ztmp = x(i,2);
  const int itype = type(i);

  const int jnum = d_numneigh[i];

  // force on atom i is summed locally and written once after the loop
  KK_ACC_FLOAT fxtmp = 0;
  KK_ACC_FLOAT fytmp = 0;
  KK_ACC_FLOAT fztmp = 0;

  for (int jj = 0; jj < jnum; jj++) {
    int j = d_neighbors(i,jj);
    j &= NEIGHMASK;
    const KK_FLOAT delx = xtmp - x(j,0);
    const KK_FLOAT dely = ytmp - x(j,1);
    const KK_FLOAT delz = ztmp - x(j,2);
    const int jtype = type(j);
    const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz;

    if (rsq < cutforcesq_kk) {
      const KK_FLOAT r = sqrt(rsq);
      // table lookup: m = grid index (capped at nr-1), p = fractional offset (capped at 1)
      KK_FLOAT p = r*rdr_kk + static_cast<KK_FLOAT>(1.0);
      int m = static_cast<int> (p);
      m = MIN(m,nr-1);
      p -= static_cast<KK_FLOAT>(m);
      p = MIN(p,static_cast<KK_FLOAT>(1.0));

      // rhoip = derivative of (density at atom j due to atom i)
      // rhojp = derivative of (density at atom i due to atom j)
      // phi = pair potential energy
      // phip = phi'
      // z2 = phi * r
      // z2p = (phi * r)' = (phi' r) + phi
      // psip needs both fp[i] and fp[j] terms since r_ij appears in two
      //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
      //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip

      const int d_type2rhor_ij = d_type2rhor(itype,jtype);
      const KK_FLOAT rhoip = (d_rhor_spline(d_type2rhor_ij,m,0)*p + d_rhor_spline(d_type2rhor_ij,m,1))*p +
                             d_rhor_spline(d_type2rhor_ij,m,2);
      const int d_type2rhor_ji = d_type2rhor(jtype,itype);
      const KK_FLOAT rhojp = (d_rhor_spline(d_type2rhor_ji,m,0)*p + d_rhor_spline(d_type2rhor_ji,m,1))*p +
                             d_rhor_spline(d_type2rhor_ji,m,2);
      const int d_type2z2r_ij = d_type2z2r(itype,jtype);

      // only columns 3..6 of the z2r table are loaded; the derivative is rebuilt from them below
      const auto z2r_spline_3 = d_z2r_spline(d_type2z2r_ij,m,3);
      const auto z2r_spline_4 = d_z2r_spline(d_type2z2r_ij,m,4);
      const auto z2r_spline_5 = d_z2r_spline(d_type2z2r_ij,m,5);
      const auto z2r_spline_6 = d_z2r_spline(d_type2z2r_ij,m,6);

      const KK_FLOAT z2p = (static_cast<KK_FLOAT>(3.0)*rdr_kk*z2r_spline_3*p + static_cast<KK_FLOAT>(2.0)*rdr_kk*z2r_spline_4)*p +
                           rdr_kk*z2r_spline_5; // the rdr and the factors of 3.0 and 2.0 come out of the interpolate function
      const KK_FLOAT z2 = ((z2r_spline_3*p + z2r_spline_4)*p +
                           z2r_spline_5)*p + z2r_spline_6;

      const KK_FLOAT recip = static_cast<KK_FLOAT>(1.0)/r;
      const KK_FLOAT phi = z2*recip;
      const KK_FLOAT phip = z2p*recip - phi*recip;
      const KK_FLOAT psip = d_fp[i]*rhojp + d_fp[j]*rhoip + phip;
      const KK_FLOAT fpair = -psip*recip;

      fxtmp += static_cast<KK_ACC_FLOAT>(delx*fpair);
      fytmp += static_cast<KK_ACC_FLOAT>(dely*fpair);
      fztmp += static_cast<KK_ACC_FLOAT>(delz*fpair);

      // half lists visit each pair once, so apply the opposite force to j here
      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
        a_f(j,0) -= static_cast<KK_ACC_FLOAT>(delx*fpair);
        a_f(j,1) -= static_cast<KK_ACC_FLOAT>(dely*fpair);
        a_f(j,2) -= static_cast<KK_ACC_FLOAT>(delz*fpair);
      }

      if (EVFLAG) {
        // full weight when the pair was applied to both atoms above, half weight otherwise
        if (eflag) {
          ev.evdwl += static_cast<KK_ACC_FLOAT>((((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<nlocal)))?static_cast<KK_FLOAT>(1.0):static_cast<KK_FLOAT>(0.5))*phi);
        }

        if (vflag_either || eflag_atom) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,phi,fpair,delx,dely,delz);
      }

    }
  }

  a_f(i,0) += fxtmp;
  a_f(i,1) += fytmp;
  a_f(i,2) += fztmp;
}

template<class DeviceType>
template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {
  // non-reducing entry point: run the reducing overload with a throwaway accumulator
  EV_FLOAT ev_discard;
  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>{}, ii, ev_discard);
}

/* ---------------------------------------------------------------------- */
#ifdef KOKKOS_ENABLE_HIP
// Team-policy variant of kernel AB (compiled only with KOKKOS_ENABLE_HIP).
// Each team member handles one d_ilist entry:
//   ii = league_rank*team_size + team_rank.
// If table 0 of d_rhor_spline has at most MAX_CACHE_ROWS rows, the team
// first copies it into a team-scratch view and reads from that copy
// whenever the (jtype,itype) table index is 0. The remaining work matches
// the range-policy kernel AB.
template<class DeviceType>
template<int EFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>,
                                     const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member,
                                     EV_FLOAT& ev) const {
  int ii = team_member.league_rank()*team_member.team_size() + team_member.team_rank();
  // rho = density at each atom
  // loop over neighbors of my atoms
  const int m_max = d_rhor_spline.extent_int(1);
  const int j_max = t_kkfloat_2d_n7::static_extent(2);
  const int d_rhor_spline_cached = (m_max > MAX_CACHE_ROWS) ? 0 : 1;
  // unmanaged view over level-0 team scratch, MAX_CACHE_ROWS rows
  Kokkos::View<KK_FLOAT*[t_kkfloat_2d_n7::static_extent(2)], typename DeviceType::scratch_memory_space,
               Kokkos::MemoryTraits<Kokkos::Unmanaged>> A(team_member.team_scratch(0), MAX_CACHE_ROWS);

  // cooperative fill: members stride over the flattened (row,column) index;
  // this runs for every team member, including those with ii >= inum
  if (d_rhor_spline_cached) {
    for(int i = team_member.team_rank(); i < m_max*j_max; i+= team_member.team_size()) {
      int j = i%j_max;
      int m = i/j_max;
      A(m,j) = d_rhor_spline(0,m,j);
    }
    team_member.team_barrier();
  }
  if (ii < inum) {
    const int i = d_ilist[ii];
    const KK_FLOAT xtmp = x(i,0);
    const KK_FLOAT ytmp = x(i,1);
    const KK_FLOAT ztmp = x(i,2);
    const int itype = type(i);

    const int jnum = d_numneigh[i];

    KK_ACC_FLOAT rhotmp = 0;

    for (int jj = 0; jj < jnum; jj++) {
      int j = d_neighbors(i,jj);
      j &= NEIGHMASK;

      const KK_FLOAT delx = xtmp - x(j,0);
      const KK_FLOAT dely = ytmp - x(j,1);
      const KK_FLOAT delz = ztmp - x(j,2);
      const int jtype = type(j);
      const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz;

      if (rsq < cutforcesq_kk) {
        // table lookup: m = grid index (capped at nr-1), p = fractional offset (capped at 1)
        KK_FLOAT p = sqrt(rsq)*rdr_kk + static_cast<KK_FLOAT>(1.0);
        int m = static_cast<int> (p);
        m = MIN(m,nr-1);
        p -= m;
        p = MIN(p,static_cast<KK_FLOAT>(1.0));
        const int d_type2rhor_ji = d_type2rhor(jtype,itype);
        // table 0 is served from the scratch copy when it was cached
        if (d_type2rhor_ji == 0 && d_rhor_spline_cached == 1) {
          rhotmp += static_cast<KK_ACC_FLOAT>(((A(m,3)*p + A(m,4))*p +
                       A(m,5))*p + A(m,6));
        } else
          rhotmp += static_cast<KK_ACC_FLOAT>(((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p +
                      d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6));
      }

    }
    d_rho[i] += static_cast<KK_FLOAT>(rhotmp);

    // fp = derivative of embedding energy at each atom
    // phi = embedding energy at each atom
    // if rho > rhomax (e.g. due to close approach of two atoms),
    //   will exceed table, so add linear term to conserve energy

    KK_FLOAT p = d_rho[i]*rdrho_kk + static_cast<KK_FLOAT>(1.0);
    int m = static_cast<int> (p);
    m = MAX(1,MIN(m,nrho-1));
    p -= static_cast<KK_FLOAT>(m);
    p = MIN(p,static_cast<KK_FLOAT>(1.0));
    const int d_type2frho_i = d_type2frho[itype];
    d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2);
    if (EFLAG) {
      KK_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p +
                      d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6);
      if (d_rho[i] > rhomax_kk) phi += d_fp[i] * (d_rho[i]-rhomax_kk);
      if (eflag_global) ev.evdwl += static_cast<KK_ACC_FLOAT>(phi);
      if (eflag_atom) d_eatom[i] += static_cast<KK_ACC_FLOAT>(phi);
    }
  }
}
#endif
// Overload of the TagPairEAMKernelAB team kernel without an EV_FLOAT argument.
// It forwards to the EV_FLOAT-taking overload using a function-local
// accumulator; whatever is accumulated there is dropped on return.
template<class DeviceType>
template<int EFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>,
                                           const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member) const {
  using kernel_tag = TagPairEAMKernelAB<EFLAG>;

  EV_FLOAT ev_local;
  this->template operator()<EFLAG>(kernel_tag(), team_member, ev_local);
}

/* ---------------------------------------------------------------------- */
#ifdef KOKKOS_ENABLE_HIP
// Force kernel (TagPairEAMKernelC), team-policy variant that accumulates into `ev`.
// Only compiled when KOKKOS_ENABLE_HIP is defined. One body serves the HALF,
// HALFTHREAD and FULL neighbor list styles through the NEIGHFLAG template parameter.
//
// Template parameters:
//   NEIGHFLAG   - neighbor list style; for HALF/HALFTHREAD the reaction force is
//                 also applied to atom j
//   NEWTON_PAIR - if nonzero, the reaction force/energy is applied to j even when
//                 j >= nlocal
//   EVFLAG      - if zero, the energy/virial tally section is skipped
// Arguments:
//   team_member - Kokkos team handle; league rank, team size and team rank
//                 together select the entry of d_ilist handled by this thread
//   ev          - accumulator receiving the pair energy (evdwl) and, through
//                 ev_tally(), the virial contributions
//
// Each thread handles at most one atom i = d_ilist[ii], loops over its neighbors
// within the force cutoff, evaluates the spline tables, accumulates the force on i
// locally and adds it to the force accessor at the end.
template<class DeviceType>
template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>,
                                           const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member,
                                           EV_FLOAT& ev) const {

  // flat index of this thread across the whole league
  int ii = team_member.league_rank()*team_member.team_size() + team_member.team_rank();

  // The f array is duplicated for OpenMP, atomic for GPU, and neither for Serial

  auto v_f = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
  auto a_f = v_f.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();

  // m_max = number of rows per z2r spline table, j_max = coefficients per row.
  // Table 0 is cached in team scratch only if it fits in MAX_CACHE_ROWS rows.
  const int m_max = d_z2r_spline.extent_int(1);
  const int j_max = t_kkfloat_2d_n7::static_extent(2);
  const int d_z2r_spline_cached = (m_max > MAX_CACHE_ROWS) ? 0 : 1;
  // unmanaged view on level-0 team scratch, MAX_CACHE_ROWS rows by j_max columns
  Kokkos::View<KK_FLOAT*[t_kkfloat_2d_n7::static_extent(2)], typename DeviceType::scratch_memory_space,
               Kokkos::MemoryTraits<Kokkos::Unmanaged>> A(team_member.team_scratch(0), MAX_CACHE_ROWS);

  if (d_z2r_spline_cached) {
    // threads of the team copy table 0 cooperatively, striding by the team size;
    // this happens before the ii < inum test so that threads without an atom
    // still take part in the copy and in the barrier
    for(int i = team_member.team_rank(); i < m_max*j_max; i+= team_member.team_size()) {
      int j = i%j_max;
      int m = i/j_max;
      A(m,j) = d_z2r_spline(0,m,j);
    }
    // the cache must be complete before any thread reads from it
    team_member.team_barrier();
  }
  if (ii < inum) {
    const int i = d_ilist[ii];
    const KK_FLOAT xtmp = x(i,0);
    const KK_FLOAT ytmp = x(i,1);
    const KK_FLOAT ztmp = x(i,2);
    const int itype = type(i);

    const int jnum = d_numneigh[i];

    // force on atom i is summed locally and written once after the neighbor loop
    KK_ACC_FLOAT fxtmp = 0;
    KK_ACC_FLOAT fytmp = 0;
    KK_ACC_FLOAT fztmp = 0;

    for (int jj = 0; jj < jnum; jj++) {
      int j = d_neighbors(i,jj);
      // keep only the bits selected by NEIGHMASK as the atom index
      j &= NEIGHMASK;
      const KK_FLOAT delx = xtmp - x(j,0);
      const KK_FLOAT dely = ytmp - x(j,1);
      const KK_FLOAT delz = ztmp - x(j,2);
      const int jtype = type(j);
      const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz;

      if (rsq < cutforcesq_kk) {
        // m = table row (capped at nr-1), p = fractional position within it (capped at 1)
        const KK_FLOAT r = sqrt(rsq);
        KK_FLOAT p = r*rdr_kk + static_cast<KK_FLOAT>(1.0);
        int m = static_cast<int> (p);
        m = MIN(m,nr-1);
        p -= static_cast<KK_FLOAT>(m);
        p = MIN(p,static_cast<KK_FLOAT>(1.0));

        // rhoip = derivative of (density at atom j due to atom i)
        // rhojp = derivative of (density at atom i due to atom j)
        // phi = pair potential energy
        // phip = phi'
        // z2 = phi * r
        // z2p = (phi * r)' = (phi' r) + phi
        // psip needs both fp[i] and fp[j] terms since r_ij appears in two
        //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
        //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip

        const int d_type2rhor_ij = d_type2rhor(itype,jtype);
        const KK_FLOAT rhoip = (d_rhor_spline(d_type2rhor_ij,m,0)*p + d_rhor_spline(d_type2rhor_ij,m,1))*p +
                             d_rhor_spline(d_type2rhor_ij,m,2);
        const int d_type2rhor_ji = d_type2rhor(jtype,itype);
        const KK_FLOAT rhojp = (d_rhor_spline(d_type2rhor_ji,m,0)*p + d_rhor_spline(d_type2rhor_ji,m,1))*p +
                               d_rhor_spline(d_type2rhor_ji,m,2);
        const int d_type2z2r_ij = d_type2z2r(itype,jtype);

        // the scratch copy holds table 0 only; every other table is read from d_z2r_spline
        const auto have_cache = (d_z2r_spline_cached == 1) && (0 == d_type2z2r_ij);
        const auto z2r_spline_3 = (have_cache) ? A(m,3) : d_z2r_spline(d_type2z2r_ij,m,3);
        const auto z2r_spline_4 = (have_cache) ? A(m,4) : d_z2r_spline(d_type2z2r_ij,m,4);
        const auto z2r_spline_5 = (have_cache) ? A(m,5) : d_z2r_spline(d_type2z2r_ij,m,5);
        const auto z2r_spline_6 = (have_cache) ? A(m,6) : d_z2r_spline(d_type2z2r_ij,m,6);

        // z2p is computed from the value coefficients (3..5) instead of reading
        // separate derivative coefficients
        const KK_FLOAT z2p = (static_cast<KK_FLOAT>(3.0)*rdr_kk*z2r_spline_3*p + static_cast<KK_FLOAT>(2.0)*rdr_kk*z2r_spline_4)*p +
                             rdr_kk*z2r_spline_5; // the rdr and the factors of 3.0 and 2.0 come out of the interpolate function
        const KK_FLOAT z2 = ((z2r_spline_3*p + z2r_spline_4)*p +
                             z2r_spline_5)*p + z2r_spline_6;

        const KK_FLOAT recip = static_cast<KK_FLOAT>(1.0)/r;
        const KK_FLOAT phi = z2*recip;
        const KK_FLOAT phip = z2p*recip - phi*recip;
        const KK_FLOAT psip = d_fp[i]*rhojp + d_fp[j]*rhoip + phip;
        const KK_FLOAT fpair = -psip*recip;

        fxtmp += static_cast<KK_ACC_FLOAT>(delx*fpair);
        fytmp += static_cast<KK_ACC_FLOAT>(dely*fpair);
        fztmp += static_cast<KK_ACC_FLOAT>(delz*fpair);

        // with a half list, apply the reaction force to j when newton_pair is on or j is local
        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
          a_f(j,0) -= static_cast<KK_ACC_FLOAT>(delx*fpair);
          a_f(j,1) -= static_cast<KK_ACC_FLOAT>(dely*fpair);
          a_f(j,2) -= static_cast<KK_ACC_FLOAT>(delz*fpair);
        }

        if (EVFLAG) {
          if (eflag) {
            // full phi when the reaction on j was applied above, otherwise half of it
            ev.evdwl += static_cast<KK_ACC_FLOAT>((((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<nlocal)))?static_cast<KK_FLOAT>(1.0):static_cast<KK_FLOAT>(0.5))*phi);
          }

          // per-atom energy and global/per-atom virial are handled by ev_tally()
          if (vflag_either || eflag_atom) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,phi,fpair,delx,dely,delz);
        }

      }
    }

    a_f(i,0) += fxtmp;
    a_f(i,1) += fytmp;
    a_f(i,2) += fztmp;
  }
}
#endif
// Overload of the TagPairEAMKernelC team kernel without an EV_FLOAT argument.
// It forwards to the EV_FLOAT-taking overload using a function-local
// accumulator; whatever is accumulated there is dropped on return.
template<class DeviceType>
template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>,
                const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member) const {
  using kernel_tag = TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>;

  EV_FLOAT ev_local;
  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(kernel_tag(), team_member, ev_local);
}

/* ---------------------------------------------------------------------- */

// Tally the per-atom energy and the global / per-atom virial of one i-j pair.
//
//   ev            - accumulator; only ev.v[0..5] is modified here
//   i, j          - indices of the two atoms of the pair
//   epair         - pair energy; half of it is credited to each tallied atom
//   fpair         - force factor multiplying the separation components
//   delx/dely/delz - components of the i-j separation
//
// With a FULL neighbor list only atom i receives a (half) contribution.
// Otherwise atom i and atom j each receive half, subject to NEWTON_PAIR
// being nonzero or the respective atom index being below nlocal.
template<class DeviceType>
template<int NEIGHFLAG, int NEWTON_PAIR>
KOKKOS_INLINE_FUNCTION
void PairEAMKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &j,
      const KK_FLOAT &epair, const KK_FLOAT &fpair, const KK_FLOAT &delx,
                const KK_FLOAT &dely, const KK_FLOAT &delz) const
{
  // The eatom and vatom arrays are duplicated for OpenMP, atomic for GPU, and neither for Serial

  auto v_eatom = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom);
  auto a_eatom = v_eatom.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();

  auto v_vatom = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom);
  auto a_vatom = v_vatom.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();

  // which atoms of the pair receive a contribution
  const bool tally_i = (NEIGHFLAG == FULL) || NEWTON_PAIR || (i < nlocal);
  const bool tally_j = (NEIGHFLAG != FULL) && (NEWTON_PAIR || (j < nlocal));

  const KK_FLOAT half = static_cast<KK_FLOAT>(0.5);

  // per-atom energy
  if (eflag && eflag_atom) {
    const KK_ACC_FLOAT half_epair = static_cast<KK_ACC_FLOAT>(half * epair);
    if (tally_i) a_eatom[i] += half_epair;
    if (tally_j) a_eatom[j] += half_epair;
  }

  if (!vflag_either) return;

  // half of the pair virial, component order: xx, yy, zz, xy, xz, yz
  const KK_ACC_FLOAT half_virial[6] = {
    static_cast<KK_ACC_FLOAT>(half*delx*delx*fpair),
    static_cast<KK_ACC_FLOAT>(half*dely*dely*fpair),
    static_cast<KK_ACC_FLOAT>(half*delz*delz*fpair),
    static_cast<KK_ACC_FLOAT>(half*delx*dely*fpair),
    static_cast<KK_ACC_FLOAT>(half*delx*delz*fpair),
    static_cast<KK_ACC_FLOAT>(half*dely*delz*fpair)
  };

  // global virial: one half per tallied atom
  if (vflag_global) {
    if (tally_i) {
      for (int k = 0; k < 6; k++) ev.v[k] += half_virial[k];
    }
    if (tally_j) {
      for (int k = 0; k < 6; k++) ev.v[k] += half_virial[k];
    }
  }

  // per-atom virial
  if (vflag_atom) {
    if (tally_i) {
      for (int k = 0; k < 6; k++) a_vatom(i,k) += half_virial[k];
    }
    if (tally_j) {
      for (int k = 0; k < 6; k++) a_vatom(j,k) += half_virial[k];
    }
  }
}

/* ---------------------------------------------------------------------- */

// Generic launch-policy factory: builds the execution policy used to run
// the kernel identified by TAG over inum entries.
template<typename DeviceType>
template<class TAG>
struct PairEAMKokkos<DeviceType>::policyInstance {

  // Returns a range policy over [0, inum) on DeviceType, tagged with TAG.
  static auto get(int inum) {
    using policy_type = Kokkos::RangePolicy<DeviceType, TAG>;
    return policy_type(0, inum);
  }
};

#ifdef KOKKOS_ENABLE_HIP
// HIP specialization of the launch-policy factory: a team policy with
// 1024 threads per team and per-team level-0 scratch for the spline cache.
template<>
template<class TAG>
struct PairEAMKokkos<Kokkos::Experimental::HIP>::policyInstance {

  // Returns a team policy with enough teams of 1024 threads to cover inum
  // entries, requesting MAX_CACHE_ROWS*7*sizeof(double) bytes of scratch per team.
  static auto get(int inum) {
    static_assert(t_kkfloat_2d_n7::static_extent(2) == 7,
                  "Breaking assumption of spline dim for KernelAB and KernelC scratch caching");

    using policy_type = Kokkos::TeamPolicy<Kokkos::Experimental::HIP,TAG>;

    constexpr int team_size = 1024;
    const int league_size = (inum + (team_size - 1)) / team_size;  // ceil(inum / team_size)

    // NOTE(review): the scratch view in the kernel is declared with KK_FLOAT
    // elements; sizeof(double) is enough as long as sizeof(KK_FLOAT) <= sizeof(double) - confirm.
    const auto scratch_bytes = MAX_CACHE_ROWS*7*sizeof(double);

    return policy_type(league_size, team_size)
               .set_scratch_size(0, Kokkos::PerTeam(scratch_bytes));
  }
};
#endif

/* ---------------------------------------------------------------------- */

// Explicit instantiations of the pair style: always for LMPDeviceType, and
// additionally for LMPHostType when LMP_KOKKOS_GPU is defined.
// NOTE(review): presumably the two types only differ in GPU builds, which is
// why the host instantiation is guarded - confirm against kokkos_type.h.
namespace LAMMPS_NS {
template class PairEAMKokkos<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class PairEAMKokkos<LMPHostType>;
#endif
}
