37#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
38#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
44#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
45#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
48template <
typename index_t>
64 ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
65 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
66 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
81template <
typename Evaluator,
typename CoeffReturnType,
typename OutAccessor,
typename Op,
typename Index,
84 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
96 OutAccessor out_accessor_, OutAccessor temp_accessor_,
98 const bool inclusive_)
107 template <scan_step sst = stp,
typename Input>
108 typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type
EIGEN_DEVICE_FUNC
111 return inpt.coeff(global_id);
114 template <scan_step sst = stp,
typename Input>
115 typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type
EIGEN_DEVICE_FUNC
118 return inpt[global_id];
121 template <scan_step sst = stp,
typename InclusiveOp>
127 template <scan_step sst = stp,
typename InclusiveOp>
134 auto scratch_ptr =
scratch.get_pointer().get();
137 Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
148 CoeffReturnType inclusive_scan;
156 const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
157 Index next_elements = 0;
159 for (
int i = 0; i < ScanParameters<Index>::ScanPerThread;
i++) {
160 Index global_id = global_offset + next_elements;
175 for (
int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex +=
PacketSize) {
176 Index private_offset = 1;
181 for (
Index l = 0; l < d; l++) {
182 Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
183 Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
191 scratch_ptr[2 * local_id + (packetIndex /
PacketSize) + scratch_offset] =
197 private_offset >>= 1;
199 for (
Index l = 0; l < d; l++) {
200 Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
201 Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
205 private_scan[ai] = private_scan[bi];
213 for (
Index d = scratch_stride >> 1; d > 0; d >>= 1) {
215 itemID.barrier(cl::sycl::access::fence_space::local_space);
217 Index ai =
offset * (2 * local_id + 1) - 1 + scratch_offset;
218 Index bi =
offset * (2 * local_id + 2) - 1 + scratch_offset;
227 itemID.barrier(cl::sycl::access::fence_space::local_space);
235 tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
238 scratch_ptr[scratch_stride - 1 + scratch_offset] =
accumulator.initialize();
241 for (
Index d = 1; d < scratch_stride; d *= 2) {
244 itemID.barrier(cl::sycl::access::fence_space::local_space);
246 Index ai =
offset * (2 * local_id + 1) - 1 + scratch_offset;
247 Index bi =
offset * (2 * local_id + 2) - 1 + scratch_offset;
251 scratch_ptr[ai] = scratch_ptr[bi];
256 itemID.barrier(cl::sycl::access::fence_space::local_space);
259 for (
int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex +=
PacketSize) {
262 CoeffReturnType accum = private_scan[packetIndex +
i];
264 private_scan[packetIndex +
i] =
accumulator.finalize(accum);
270 private_scan[0] =
accumulator.finalize(inclusive_scan);
276 for (
Index i = 0; i < ScanParameters<Index>::ScanPerThread;
i++) {
277 Index global_id = global_offset + next_elements;
282 out_ptr[global_id] = private_scan[private_id];
290template <
typename CoeffReturnType,
typename InAccessor,
typename OutAccessor,
typename Op,
typename Index>
292 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
300 OutAccessor out_accessor_,
313 Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
329 const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
332 CoeffReturnType adjust_val = in_ptr[in_id];
334 Index next_elements = 0;
336 for (
Index i = 0; i < ScanParameters<Index>::ScanPerThread;
i++) {
337 Index global_id = global_offset + next_elements;
341 CoeffReturnType accum = adjust_val;
351template <
typename Index>
370 const Eigen::SyclDevice &
dev;
372 const Index &non_scan_size_,
const Index &scan_stride_,
const Index &non_scan_stride_,
373 const Eigen::SyclDevice &dev_)
383 Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
395#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
413template <
typename EvaluatorPo
interType,
typename CoeffReturnType,
typename Reducer,
typename Index>
416 Reducer &accumulator,
const Index total_size,
417 const Index scan_size,
const Index panel_size,
418 const Index non_scan_size,
const Index scan_stride,
419 const Index non_scan_stride,
const Eigen::SyclDevice &dev) {
421 ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
425 dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
426 scan_info.max_elements_per_block,
427 scan_info.get_scan_parameter(), accumulator);
431template <
typename CoeffReturnType, scan_step stp>
433 template <
typename Input,
typename EvaluatorPo
interType,
typename Reducer,
typename Index>
436 const Index non_scan_size,
const Index scan_stride,
437 const Index non_scan_stride,
const bool inclusive,
438 const Eigen::SyclDevice &dev) {
440 ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
441 const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
443 CoeffReturnType *temp_pointer =
444 static_cast<CoeffReturnType *
>(dev.allocate_temp(temp_pointer_size *
sizeof(CoeffReturnType)));
445 EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
448 dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
449 in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
450 scan_info.get_scan_parameter(), accumulator, inclusive);
452 if (scan_info.block_size > 1) {
454 tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
455 non_scan_size,
Index(1), scan_info.block_size,
false, dev);
458 tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
459 non_scan_stride, dev);
461 dev.deallocate_temp(temp_pointer);
468template <
typename Self,
typename Reducer,
bool vectorize>
479 auto accumulator =
self.accumulator();
480 auto inclusive = !
self.exclusive();
481 auto consume_dim =
self.consume_dim();
482 auto dev =
self.device();
484 auto dims =
self.inner().dimensions();
486 Index non_scan_size = 1;
487 Index panel_size = 1;
488 if (
static_cast<int>(Self::Layout) ==
static_cast<int>(
ColMajor)) {
489 for (
int i = 0;
i < consume_dim;
i++) {
490 non_scan_size *= dims[
i];
492 for (
int i = consume_dim + 1;
i < Self::NumDims;
i++) {
493 panel_size *= dims[
i];
496 for (
int i = Self::NumDims - 1;
i > consume_dim;
i--) {
497 non_scan_size *= dims[
i];
499 for (
int i = consume_dim - 1;
i >= 0;
i--) {
500 panel_size *= dims[
i];
503 const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
504 auto eval_impl =
self.inner();
506 eval_impl,
data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
int i
Definition BiCGSTAB_step_by_step.cpp:9
#define EIGEN_UNROLL_LOOP
Definition Macros.h:1461
#define EIGEN_CONSTEXPR
Definition Macros.h:787
#define EIGEN_DEVICE_FUNC
Definition Macros.h:976
#define EIGEN_STRONG_INLINE
Definition Macros.h:917
int data[]
Definition Map_placement_new.cpp:1
#define EIGEN_SYCL_MAX_GLOBAL_RANGE
Definition TensorScanSycl.h:45
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Definition gnuplot_common_settings.hh:64
@ ColMajor
Definition Constants.h:319
scan_step
Definition TensorScanSycl.h:80
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes< Indices... > &)
Definition TensorDimensions.h:140
Namespace containing all symbols from the Eigen library.
Definition bench_norm.cpp:85
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
Definition BandTriangularSolver.h:13
Definition TensorScanSycl.h:414
static EIGEN_STRONG_INLINE void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, const Index total_size, const Index scan_size, const Index panel_size, const Index non_scan_size, const Index scan_stride, const Index non_scan_stride, const Eigen::SyclDevice &dev)
Definition TensorScanSycl.h:415
Definition TensorScanSycl.h:291
OutAccessor out_accessor
Definition TensorScanSycl.h:296
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition TensorScanSycl.h:308
const ScanParameters< Index > scanParameters
Definition TensorScanSycl.h:297
InAccessor in_accessor
Definition TensorScanSycl.h:295
static EIGEN_CONSTEXPR int PacketSize
Definition TensorScanSycl.h:294
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition TensorScanSycl.h:293
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, OutAccessor out_accessor_, const ScanParameters< Index > scanParameters_, Op accumulator_)
Definition TensorScanSycl.h:299
Op accumulator
Definition TensorScanSycl.h:298
Definition TensorScanSycl.h:352
Index max_elements_per_block
Definition TensorScanSycl.h:360
const Index & total_size
Definition TensorScanSycl.h:353
const Index & non_scan_stride
Definition TensorScanSycl.h:358
Index group_threads
Definition TensorScanSycl.h:363
const Index & non_scan_size
Definition TensorScanSycl.h:356
const Index & scan_stride
Definition TensorScanSycl.h:357
const Eigen::SyclDevice & dev
Definition TensorScanSycl.h:370
Index local_range
Definition TensorScanSycl.h:369
EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, const Eigen::SyclDevice &dev_)
Definition TensorScanSycl.h:371
Index elements_per_block
Definition TensorScanSycl.h:366
Index block_threads
Definition TensorScanSycl.h:364
Index block_size
Definition TensorScanSycl.h:361
Index panel_threads
Definition TensorScanSycl.h:362
Index elements_per_group
Definition TensorScanSycl.h:365
Index loop_range
Definition TensorScanSycl.h:367
cl::sycl::nd_range< 1 > get_thread_range()
Definition TensorScanSycl.h:408
const Index & panel_size
Definition TensorScanSycl.h:355
const Index & scan_size
Definition TensorScanSycl.h:354
ScanParameters< Index > get_scan_parameter()
Definition TensorScanSycl.h:404
Index global_range
Definition TensorScanSycl.h:368
Definition TensorScanSycl.h:83
Op accumulator
Definition TensorScanSycl.h:93
typename::Eigen::internal::enable_if< sst==scan_step::first >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(InclusiveOp inclusive_op)
Definition TensorScanSycl.h:123
static EIGEN_CONSTEXPR int PacketSize
Definition TensorScanSycl.h:86
OutAccessor temp_accessor
Definition TensorScanSycl.h:91
Evaluator dev_eval
Definition TensorScanSycl.h:89
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition TensorScanSycl.h:85
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, OutAccessor out_accessor_, OutAccessor temp_accessor_, const ScanParameters< Index > scanParameters_, Op accumulator_, const bool inclusive_)
Definition TensorScanSycl.h:95
LocalAccessor scratch
Definition TensorScanSycl.h:88
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition TensorScanSycl.h:131
typename::Eigen::internal::enable_if< sst!=scan_step::first >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(InclusiveOp)
Definition TensorScanSycl.h:129
typename::Eigen::internal::enable_if< sst==scan_step::first, CoeffReturnType >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(const Input &inpt, Index global_id)
Definition TensorScanSycl.h:110
const bool inclusive
Definition TensorScanSycl.h:94
OutAccessor out_accessor
Definition TensorScanSycl.h:90
const ScanParameters< Index > scanParameters
Definition TensorScanSycl.h:92
typename::Eigen::internal::enable_if< sst!=scan_step::first, CoeffReturnType >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(const Input &inpt, Index global_id)
Definition TensorScanSycl.h:117
Definition TensorScanSycl.h:432
static EIGEN_STRONG_INLINE void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, const Index total_size, const Index scan_size, const Index panel_size, const Index non_scan_size, const Index scan_stride, const Index non_scan_stride, const bool inclusive, const Eigen::SyclDevice &dev)
Definition TensorScanSycl.h:434
Definition TensorScanSycl.h:49
const index_t non_scan_stride
Definition TensorScanSycl.h:55
const index_t scan_stride
Definition TensorScanSycl.h:56
const index_t total_size
Definition TensorScanSycl.h:52
const index_t non_scan_size
Definition TensorScanSycl.h:53
const index_t block_threads
Definition TensorScanSycl.h:59
static EIGEN_CONSTEXPR index_t ScanPerThread
Definition TensorScanSycl.h:51
const index_t group_threads
Definition TensorScanSycl.h:58
const index_t panel_threads
Definition TensorScanSycl.h:57
ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
Definition TensorScanSycl.h:64
const index_t elements_per_group
Definition TensorScanSycl.h:60
const index_t elements_per_block
Definition TensorScanSycl.h:61
const index_t scan_size
Definition TensorScanSycl.h:54
const index_t loop_range
Definition TensorScanSycl.h:62
Self::EvaluatorPointerType EvaluatorPointerType
Definition TensorScanSycl.h:473
Self::Index Index
Definition TensorScanSycl.h:470
Self::CoeffReturnType CoeffReturnType
Definition TensorScanSycl.h:471
Self::Storage Storage
Definition TensorScanSycl.h:472
void operator()(Self &self, EvaluatorPointerType data)
Definition TensorScanSycl.h:474
Definition TensorScan.h:191
Definition ForwardDeclarations.h:17