TR-mbed 1.0
Loading...
Searching...
No Matches
TensorScanSycl.h
Go to the documentation of this file.
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Mehdi Goli Codeplay Software Ltd.
5// Ralph Potter Codeplay Software Ltd.
6// Luke Iwanski Codeplay Software Ltd.
7// Contact: <eigen@codeplay.com>
8//
9// This Source Code Form is subject to the terms of the Mozilla
10// Public License v. 2.0. If a copy of the MPL was not distributed
11// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
12
13/*****************************************************************
14 * TensorScanSycl.h
15 *
16 * \brief:
17 * Tensor Scan Sycl implement the extend version of
18 * "Efficient parallel scan algorithms for GPUs." .for Tensor operations.
19 * The algorithm requires up to 3 stage (consequently 3 kernels) depending on
20 * the size of the tensor. In the first kernel (ScanKernelFunctor), each
21 * threads within the work-group individually reduces the allocated elements per
22 * thread in order to reduces the total number of blocks. In the next step all
23 * thread within the work-group will reduce the associated blocks into the
24 * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary
25 * buffer is given as an input and all the threads within a work-group scan and
26 * reduces the boundaries between the blocks (generated from the previous
27 * kernel). and write the data on the temporary buffer. If the second kernel is
28 * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will
29 * adjust the final result into the output buffer.
30 * The original algorithm for the parallel prefix sum can be found here:
31 *
32 * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
33 * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
34 *1, no. 1 (2008): 1-17.
35 *****************************************************************/
36
37#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
38#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
39
40namespace Eigen {
41namespace TensorSycl {
42namespace internal {
43
44#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
45#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
46#endif
47
48template <typename index_t>
50 // must be power of 2
51 static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
52 const index_t total_size;
53 const index_t non_scan_size;
54 const index_t scan_size;
55 const index_t non_scan_stride;
56 const index_t scan_stride;
57 const index_t panel_threads;
58 const index_t group_threads;
59 const index_t block_threads;
60 const index_t elements_per_group;
61 const index_t elements_per_block;
62 const index_t loop_range;
63
64 ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
65 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
66 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
67 : total_size(total_size_),
68 non_scan_size(non_scan_size_),
69 scan_size(scan_size_),
70 non_scan_stride(non_scan_stride_),
71 scan_stride(scan_stride_),
72 panel_threads(panel_threads_),
73 group_threads(group_threads_),
74 block_threads(block_threads_),
75 elements_per_group(elements_per_group_),
76 elements_per_block(elements_per_block_),
77 loop_range(loop_range_) {}
78};
79
80enum class scan_step { first, second };
81template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
82 scan_step stp>
84 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
87
89 Evaluator dev_eval;
90 OutAccessor out_accessor;
91 OutAccessor temp_accessor;
94 const bool inclusive;
96 OutAccessor out_accessor_, OutAccessor temp_accessor_,
97 const ScanParameters<Index> scanParameters_, Op accumulator_,
98 const bool inclusive_)
99 : scratch(scratch_),
100 dev_eval(dev_eval_),
101 out_accessor(out_accessor_),
102 temp_accessor(temp_accessor_),
103 scanParameters(scanParameters_),
104 accumulator(accumulator_),
105 inclusive(inclusive_) {}
106
107 template <scan_step sst = stp, typename Input>
108 typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
110 read(const Input &inpt, Index global_id) {
111 return inpt.coeff(global_id);
112 }
113
114 template <scan_step sst = stp, typename Input>
115 typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
117 read(const Input &inpt, Index global_id) {
118 return inpt[global_id];
119 }
120
121 template <scan_step sst = stp, typename InclusiveOp>
122 typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
123 first_step_inclusive_Operation(InclusiveOp inclusive_op) {
124 inclusive_op();
125 }
126
127 template <scan_step sst = stp, typename InclusiveOp>
128 typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
130
131 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
132 auto out_ptr = out_accessor.get_pointer();
133 auto tmp_ptr = temp_accessor.get_pointer();
134 auto scratch_ptr = scratch.get_pointer().get();
135
136 for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
137 Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
138 Index tmp = data_offset % scanParameters.panel_threads;
139 const Index panel_id = data_offset / scanParameters.panel_threads;
140 const Index group_id = tmp / scanParameters.group_threads;
141 tmp = tmp % scanParameters.group_threads;
142 const Index block_id = tmp / scanParameters.block_threads;
143 const Index local_id = tmp % scanParameters.block_threads;
144 // we put one element per packet in scratch_mem
145 const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
146 const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
147 CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
148 CoeffReturnType inclusive_scan;
149 // the actual panel size is scan_size * non_scan_size.
150 // elements_per_panel is roundup to power of 2 for binary tree
151 const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
152 const Index group_offset = group_id * scanParameters.non_scan_stride;
153 // This will be effective when the size is bigger than elements_per_block
154 const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
155 const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
156 const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
157 Index next_elements = 0;
159 for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
160 Index global_id = global_offset + next_elements;
161 private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
163 (global_id < scanParameters.total_size))
164 ? read(dev_eval, global_id)
165 : accumulator.initialize();
166 next_elements += scanParameters.scan_stride;
167 }
169 if (inclusive) {
170 inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
171 }
172 });
173 // This for loop must be 2
175 for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
176 Index private_offset = 1;
177 // build sum in place up the tree
179 for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
181 for (Index l = 0; l < d; l++) {
182 Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
183 Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
184 CoeffReturnType accum = accumulator.initialize();
185 accumulator.reduce(private_scan[ai], &accum);
186 accumulator.reduce(private_scan[bi], &accum);
187 private_scan[bi] = accumulator.finalize(accum);
188 }
189 private_offset *= 2;
190 }
191 scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
192 private_scan[PacketSize - 1 + packetIndex];
193 private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
194 // traverse down tree & build scan
196 for (Index d = 1; d < PacketSize; d *= 2) {
197 private_offset >>= 1;
199 for (Index l = 0; l < d; l++) {
200 Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
201 Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
202 CoeffReturnType accum = accumulator.initialize();
203 accumulator.reduce(private_scan[ai], &accum);
204 accumulator.reduce(private_scan[bi], &accum);
205 private_scan[ai] = private_scan[bi];
206 private_scan[bi] = accumulator.finalize(accum);
207 }
208 }
209 }
210
211 Index offset = 1;
212 // build sum in place up the tree
213 for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
214 // Synchronise
215 itemID.barrier(cl::sycl::access::fence_space::local_space);
216 if (local_id < d) {
217 Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
218 Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
219 CoeffReturnType accum = accumulator.initialize();
220 accumulator.reduce(scratch_ptr[ai], &accum);
221 accumulator.reduce(scratch_ptr[bi], &accum);
222 scratch_ptr[bi] = accumulator.finalize(accum);
223 }
224 offset *= 2;
225 }
226 // Synchronise
227 itemID.barrier(cl::sycl::access::fence_space::local_space);
228 // next step optimisation
229 if (local_id == 0) {
234 block_id;
235 tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
236 }
237 // clear the last element
238 scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
239 }
240 // traverse down tree & build scan
241 for (Index d = 1; d < scratch_stride; d *= 2) {
242 offset >>= 1;
243 // Synchronise
244 itemID.barrier(cl::sycl::access::fence_space::local_space);
245 if (local_id < d) {
246 Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
247 Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
248 CoeffReturnType accum = accumulator.initialize();
249 accumulator.reduce(scratch_ptr[ai], &accum);
250 accumulator.reduce(scratch_ptr[bi], &accum);
251 scratch_ptr[ai] = scratch_ptr[bi];
252 scratch_ptr[bi] = accumulator.finalize(accum);
253 }
254 }
255 // Synchronise
256 itemID.barrier(cl::sycl::access::fence_space::local_space);
257 // This for loop must be 2
259 for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
261 for (Index i = 0; i < PacketSize; i++) {
262 CoeffReturnType accum = private_scan[packetIndex + i];
263 accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
264 private_scan[packetIndex + i] = accumulator.finalize(accum);
265 }
266 }
268 if (inclusive) {
269 accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
270 private_scan[0] = accumulator.finalize(inclusive_scan);
271 }
272 });
273 next_elements = 0;
274 // right the first set of private param
276 for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
277 Index global_id = global_offset + next_elements;
278 if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
280 (global_id < scanParameters.total_size)) {
281 Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
282 out_ptr[global_id] = private_scan[private_id];
283 }
284 next_elements += scanParameters.scan_stride;
285 }
286 } // end for loop
287 }
288};
289
290template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
292 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
295 InAccessor in_accessor;
296 OutAccessor out_accessor;
300 OutAccessor out_accessor_,
301 const ScanParameters<Index> scanParameters_,
302 Op accumulator_)
303 : in_accessor(in_accessor_),
304 out_accessor(out_accessor_),
305 scanParameters(scanParameters_),
306 accumulator(accumulator_) {}
307
308 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
309 auto in_ptr = in_accessor.get_pointer();
310 auto out_ptr = out_accessor.get_pointer();
311
312 for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
313 Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
314 Index tmp = data_offset % scanParameters.panel_threads;
315 const Index panel_id = data_offset / scanParameters.panel_threads;
316 const Index group_id = tmp / scanParameters.group_threads;
317 tmp = tmp % scanParameters.group_threads;
318 const Index block_id = tmp / scanParameters.block_threads;
319 const Index local_id = tmp % scanParameters.block_threads;
320
321 // the actual panel size is scan_size * non_scan_size.
322 // elements_per_panel is roundup to power of 2 for binary tree
323 const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
324 const Index group_offset = group_id * scanParameters.non_scan_stride;
325 // This will be effective when the size is bigger than elements_per_block
326 const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
327 const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
328
329 const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
331 const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
332 CoeffReturnType adjust_val = in_ptr[in_id];
333
334 Index next_elements = 0;
336 for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
337 Index global_id = global_offset + next_elements;
338 if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
340 (global_id < scanParameters.total_size)) {
341 CoeffReturnType accum = adjust_val;
342 accumulator.reduce(out_ptr[global_id], &accum);
343 out_ptr[global_id] = accumulator.finalize(accum);
344 }
345 next_elements += scanParameters.scan_stride;
346 }
347 }
348 }
349};
350
351template <typename Index>
352struct ScanInfo {
359
370 const Eigen::SyclDevice &dev;
371 EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
372 const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
373 const Eigen::SyclDevice &dev_)
374 : total_size(total_size_),
375 scan_size(scan_size_),
376 panel_size(panel_size_),
377 non_scan_size(non_scan_size_),
378 scan_stride(scan_stride_),
379 non_scan_stride(non_scan_stride_),
380 dev(dev_) {
381 // must be power of 2
382 local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
383 Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
384
386
388 dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
389 const Index elements_per_panel = elements_per_group * non_scan_size;
395#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
396 const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
397#else
398 const Index max_threads = panel_threads * panel_size;
399#endif
400 global_range = roundUp(max_threads, local_range);
402 std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
403 }
408 inline cl::sycl::nd_range<1> get_thread_range() {
409 return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
410 }
411};
412
413template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
415 EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
416 Reducer &accumulator, const Index total_size,
417 const Index scan_size, const Index panel_size,
418 const Index non_scan_size, const Index scan_stride,
419 const Index non_scan_stride, const Eigen::SyclDevice &dev) {
420 auto scan_info =
421 ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
422
424 AdjustFuctor;
425 dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
426 scan_info.max_elements_per_block,
427 scan_info.get_scan_parameter(), accumulator);
428 }
429};
430
431template <typename CoeffReturnType, scan_step stp>
433 template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
434 EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
435 const Index total_size, const Index scan_size, const Index panel_size,
436 const Index non_scan_size, const Index scan_stride,
437 const Index non_scan_stride, const bool inclusive,
438 const Eigen::SyclDevice &dev) {
439 auto scan_info =
440 ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
441 const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
442 const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
443 CoeffReturnType *temp_pointer =
444 static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
445 EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
446
448 dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
449 in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
450 scan_info.get_scan_parameter(), accumulator, inclusive);
451
452 if (scan_info.block_size > 1) {
454 tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
455 non_scan_size, Index(1), scan_info.block_size, false, dev);
456
458 tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
459 non_scan_stride, dev);
460 }
461 dev.deallocate_temp(temp_pointer);
462 }
463};
464
465} // namespace internal
466} // namespace TensorSycl
467namespace internal {
468template <typename Self, typename Reducer, bool vectorize>
469struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
470 typedef typename Self::Index Index;
471 typedef typename Self::CoeffReturnType CoeffReturnType;
472 typedef typename Self::Storage Storage;
473 typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
475 const Index total_size = internal::array_prod(self.dimensions());
476 const Index scan_size = self.size();
477 const Index scan_stride = self.stride();
478 // this is the scan op (can be sum or ...)
479 auto accumulator = self.accumulator();
480 auto inclusive = !self.exclusive();
481 auto consume_dim = self.consume_dim();
482 auto dev = self.device();
483
484 auto dims = self.inner().dimensions();
485
486 Index non_scan_size = 1;
487 Index panel_size = 1;
488 if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
489 for (int i = 0; i < consume_dim; i++) {
490 non_scan_size *= dims[i];
491 }
492 for (int i = consume_dim + 1; i < Self::NumDims; i++) {
493 panel_size *= dims[i];
494 }
495 } else {
496 for (int i = Self::NumDims - 1; i > consume_dim; i--) {
497 non_scan_size *= dims[i];
498 }
499 for (int i = consume_dim - 1; i >= 0; i--) {
500 panel_size *= dims[i];
501 }
502 }
503 const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
504 auto eval_impl = self.inner();
506 eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
507 inclusive, dev);
508 }
509};
510} // namespace internal
511} // namespace Eigen
512
513#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
int i
Definition BiCGSTAB_step_by_step.cpp:9
#define EIGEN_UNROLL_LOOP
Definition Macros.h:1461
#define EIGEN_CONSTEXPR
Definition Macros.h:787
#define EIGEN_DEVICE_FUNC
Definition Macros.h:976
#define EIGEN_STRONG_INLINE
Definition Macros.h:917
int data[]
Definition Map_placement_new.cpp:1
#define EIGEN_SYCL_MAX_GLOBAL_RANGE
Definition TensorScanSycl.h:45
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Definition gnuplot_common_settings.hh:64
@ ColMajor
Definition Constants.h:319
scan_step
Definition TensorScanSycl.h:80
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes< Indices... > &)
Definition TensorDimensions.h:140
Namespace containing all symbols from the Eigen library.
Definition bench_norm.cpp:85
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
Definition BandTriangularSolver.h:13
static EIGEN_STRONG_INLINE void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, const Index total_size, const Index scan_size, const Index panel_size, const Index non_scan_size, const Index scan_stride, const Index non_scan_stride, const Eigen::SyclDevice &dev)
Definition TensorScanSycl.h:415
OutAccessor out_accessor
Definition TensorScanSycl.h:296
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition TensorScanSycl.h:308
const ScanParameters< Index > scanParameters
Definition TensorScanSycl.h:297
InAccessor in_accessor
Definition TensorScanSycl.h:295
static EIGEN_CONSTEXPR int PacketSize
Definition TensorScanSycl.h:294
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition TensorScanSycl.h:293
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, OutAccessor out_accessor_, const ScanParameters< Index > scanParameters_, Op accumulator_)
Definition TensorScanSycl.h:299
Definition TensorScanSycl.h:352
Index max_elements_per_block
Definition TensorScanSycl.h:360
const Index & total_size
Definition TensorScanSycl.h:353
const Index & non_scan_stride
Definition TensorScanSycl.h:358
Index group_threads
Definition TensorScanSycl.h:363
const Index & non_scan_size
Definition TensorScanSycl.h:356
const Index & scan_stride
Definition TensorScanSycl.h:357
const Eigen::SyclDevice & dev
Definition TensorScanSycl.h:370
Index local_range
Definition TensorScanSycl.h:369
EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, const Eigen::SyclDevice &dev_)
Definition TensorScanSycl.h:371
Index elements_per_block
Definition TensorScanSycl.h:366
Index block_threads
Definition TensorScanSycl.h:364
Index block_size
Definition TensorScanSycl.h:361
Index panel_threads
Definition TensorScanSycl.h:362
Index elements_per_group
Definition TensorScanSycl.h:365
Index loop_range
Definition TensorScanSycl.h:367
cl::sycl::nd_range< 1 > get_thread_range()
Definition TensorScanSycl.h:408
const Index & panel_size
Definition TensorScanSycl.h:355
const Index & scan_size
Definition TensorScanSycl.h:354
ScanParameters< Index > get_scan_parameter()
Definition TensorScanSycl.h:404
Index global_range
Definition TensorScanSycl.h:368
Op accumulator
Definition TensorScanSycl.h:93
typename::Eigen::internal::enable_if< sst==scan_step::first >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(InclusiveOp inclusive_op)
Definition TensorScanSycl.h:123
static EIGEN_CONSTEXPR int PacketSize
Definition TensorScanSycl.h:86
OutAccessor temp_accessor
Definition TensorScanSycl.h:91
Evaluator dev_eval
Definition TensorScanSycl.h:89
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition TensorScanSycl.h:85
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, OutAccessor out_accessor_, OutAccessor temp_accessor_, const ScanParameters< Index > scanParameters_, Op accumulator_, const bool inclusive_)
Definition TensorScanSycl.h:95
LocalAccessor scratch
Definition TensorScanSycl.h:88
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition TensorScanSycl.h:131
typename::Eigen::internal::enable_if< sst!=scan_step::first >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(InclusiveOp)
Definition TensorScanSycl.h:129
typename::Eigen::internal::enable_if< sst==scan_step::first, CoeffReturnType >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(const Input &inpt, Index global_id)
Definition TensorScanSycl.h:110
const bool inclusive
Definition TensorScanSycl.h:94
OutAccessor out_accessor
Definition TensorScanSycl.h:90
const ScanParameters< Index > scanParameters
Definition TensorScanSycl.h:92
typename::Eigen::internal::enable_if< sst!=scan_step::first, CoeffReturnType >::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(const Input &inpt, Index global_id)
Definition TensorScanSycl.h:117
Definition TensorScanSycl.h:432
static EIGEN_STRONG_INLINE void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, const Index total_size, const Index scan_size, const Index panel_size, const Index non_scan_size, const Index scan_stride, const Index non_scan_stride, const bool inclusive, const Eigen::SyclDevice &dev)
Definition TensorScanSycl.h:434
Definition TensorScanSycl.h:49
const index_t non_scan_stride
Definition TensorScanSycl.h:55
const index_t scan_stride
Definition TensorScanSycl.h:56
const index_t total_size
Definition TensorScanSycl.h:52
const index_t non_scan_size
Definition TensorScanSycl.h:53
const index_t block_threads
Definition TensorScanSycl.h:59
static EIGEN_CONSTEXPR index_t ScanPerThread
Definition TensorScanSycl.h:51
const index_t group_threads
Definition TensorScanSycl.h:58
const index_t panel_threads
Definition TensorScanSycl.h:57
ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
Definition TensorScanSycl.h:64
const index_t elements_per_group
Definition TensorScanSycl.h:60
const index_t elements_per_block
Definition TensorScanSycl.h:61
const index_t scan_size
Definition TensorScanSycl.h:54
const index_t loop_range
Definition TensorScanSycl.h:62
Self::EvaluatorPointerType EvaluatorPointerType
Definition TensorScanSycl.h:473
Self::CoeffReturnType CoeffReturnType
Definition TensorScanSycl.h:471
void operator()(Self &self, EvaluatorPointerType data)
Definition TensorScanSycl.h:474
Definition TensorScan.h:191
Definition ForwardDeclarations.h:17