TR-mbed 1.0
Loading...
Searching...
No Matches
TensorReductionSycl.h
Go to the documentation of this file.
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Mehdi Goli Codeplay Software Ltd.
5// Ralph Potter Codeplay Software Ltd.
6// Luke Iwanski Codeplay Software Ltd.
7// Contact: <eigen@codeplay.com>
8//
9// This Source Code Form is subject to the terms of the Mozilla
10// Public License v. 2.0. If a copy of the MPL was not distributed
11// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
12
13/*****************************************************************
14 * TensorReductionSycl.h
15 *
16 * \brief:
17 * This is the specialization of the reduction operation. Two phase reduction approach
18 * is used since the GPU does not have Global Synchronization for global memory among
19 * different work-group/thread block. To solve the problem, we need to create two kernels
20 * to reduce the data, where the first kernel reduce the data locally and each local
21 * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
22 * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element.
23 * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
24 * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
25 *
26 *****************************************************************/
27
28#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
29#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
30namespace Eigen {
31namespace TensorSycl {
32namespace internal {
33
34template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
35struct OpDefiner {
36 typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
37 typedef Op type;
38 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
39
41 const Index &) {
42 return accumulator;
43 }
44};
45
46template <typename CoeffReturnType, typename Index>
47struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
52
53 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
54 const Index &scale) {
56 return quotient_op(accumulator, CoeffReturnType(scale));
57 }
58};
59
60template <typename CoeffReturnType, typename Index>
61struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
62 typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
67
69 const Index &scale) {
70 return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
71 }
72};
73
74template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
75 Index local_range>
77 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
80 typedef typename OpDef::type Op;
82 InputAccessor aI;
83 OutputAccessor outAcc;
85 SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
86 : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
87
88 void operator()(cl::sycl::nd_item<1> itemID) {
89 // Our empirical research shows that the best performance will be achieved
90 // when there is only one element per thread to reduce in the second step.
91 // in this step the second step reduction time is almost negligible.
92 // Hence, in the second step of reduction the input size is fixed to the
93 // local size, thus, there is only one element read per thread. The
94 // algorithm must be changed if the number of reduce per thread in the
95 // second step is greater than 1. Otherwise, the result will be wrong.
96 const Index localid = itemID.get_local_id(0);
97 auto aInPtr = aI.get_pointer() + localid;
98 auto aOutPtr = outAcc.get_pointer();
99 CoeffReturnType *scratchptr = scratch.get_pointer();
100 CoeffReturnType accumulator = *aInPtr;
101
102 scratchptr[localid] = op.finalize(accumulator);
103 for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
104 itemID.barrier(cl::sycl::access::fence_space::local_space);
105 if (localid < offset) {
106 op.reduce(scratchptr[localid + offset], &accumulator);
107 scratchptr[localid] = op.finalize(accumulator);
108 }
109 }
110 if (localid == 0) *aOutPtr = op.finalize(accumulator);
111 }
112};
113
114// Full reduction first phase. In this version the vectorization is true and the reduction accept
115// any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ).
116template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
118 public:
119 typedef typename Evaluator::CoeffReturnType CoeffReturnType;
120 typedef typename Evaluator::Index Index;
121 typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
122 (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
124
125 typedef typename OpDef::type Op;
126 typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
127 typedef typename Evaluator::PacketReturnType PacketReturnType;
128 typedef
129 typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
131 typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
134 Evaluator evaluator;
138
139 FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
140 Index rng_, OpType op_)
141 : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
142
143 void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
144
145 template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
146 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
147 const cl::sycl::nd_item<1> &itemID) {
148 auto output_ptr = final_output.get_pointer();
149 Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
150 Index globalid = itemID.get_global_id(0);
151 Index localid = itemID.get_local_id(0);
152 Index step = Evaluator::PacketSize * itemID.get_global_range(0);
153 Index start = Evaluator::PacketSize * globalid;
154 // vectorizable parts
155 PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
156 for (Index i = start; i < VectorizedRange; i += step) {
157 op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
158 }
159 globalid += VectorizedRange;
160 // non vectorizable parts
161 for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
162 op.template reducePacket<PacketReturnType>(
164 evaluator.impl().coeff(i), op.initialize()),
165 &packetAccumulator);
166 }
167 scratch[localid] = packetAccumulator =
168 OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
169 // reduction parts // Local size is always power of 2
171 for (Index offset = local_range / 2; offset > 0; offset /= 2) {
172 itemID.barrier(cl::sycl::access::fence_space::local_space);
173 if (localid < offset) {
174 op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
175 scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
176 }
177 }
178 if (localid == 0) {
179 output_ptr[itemID.get_group(0)] =
180 op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
181 }
182 }
183
184 template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
185 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
186 const cl::sycl::nd_item<1> &itemID) {
187 auto output_ptr = final_output.get_pointer();
188 Index globalid = itemID.get_global_id(0);
189 Index localid = itemID.get_local_id(0);
190 // vectorizable parts
191 CoeffReturnType accumulator = op.initialize();
192 // non vectorizable parts
193 for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
194 op.reduce(evaluator.impl().coeff(i), &accumulator);
195 }
196 scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
197
198 // reduction parts. the local size is always power of 2
200 for (Index offset = local_range / 2; offset > 0; offset /= 2) {
201 itemID.barrier(cl::sycl::access::fence_space::local_space);
202 if (localid < offset) {
203 op.reduce(scratch[localid + offset], &accumulator);
204 scratch[localid] = op.finalize(accumulator);
205 }
206 }
207 if (localid == 0) {
208 output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
209 }
210 }
211};
212
213template <typename Evaluator, typename OpType>
215 public:
216 typedef typename Evaluator::CoeffReturnType CoeffReturnType;
217 typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
218 typedef typename Evaluator::Index Index;
220 typedef typename OpDef::type Op;
221 template <typename Scratch>
222 GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
223 Index range_, Index num_values_to_reduce_)
224 : evaluator(evaluator_),
225 output_accessor(output_accessor_),
226 functor(OpDef::get_op(functor_)),
227 range(range_),
228 num_values_to_reduce(num_values_to_reduce_) {}
229
230 void operator()(cl::sycl::nd_item<1> itemID) {
231 auto output_accessor_ptr = output_accessor.get_pointer();
233 Index globalid = static_cast<Index>(itemID.get_global_linear_id());
234 if (globalid < range) {
235 CoeffReturnType accum = functor.initialize();
237 evaluator, evaluator.firstInput(globalid), functor, &accum);
238 output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
239 }
240 }
241
242 private:
243 Evaluator evaluator;
244 EvaluatorPointerType output_accessor;
245 Op functor;
246 Index range;
247 Index num_values_to_reduce;
248};
249
251// default is preserver
252template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
254 typedef typename Evaluator::CoeffReturnType CoeffReturnType;
255 typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
256 typedef typename Evaluator::Index Index;
258 typedef typename OpDef::type Op;
259 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
262 Evaluator evaluator;
269
270 PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
271 const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
272 const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
273 : scratch(scratch_),
274 evaluator(evaluator_),
275 output_accessor(output_accessor_),
276 op(OpDef::get_op(op_)),
277 preserve_elements_num_groups(preserve_elements_num_groups_),
278 reduce_elements_num_groups(reduce_elements_num_groups_),
279 num_coeffs_to_preserve(num_coeffs_to_preserve_),
280 num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
281
283 CoeffReturnType &accumulator) {
284 if (globalPId >= num_coeffs_to_preserve) {
285 return;
286 }
287 Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
288 : globalRId + (globalPId * num_coeffs_to_reduce);
289 Index localOffset = globalRId;
290
291 const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
292 const Index per_thread_global_stride =
293 rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
294 for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
295 op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
296 localOffset += per_thread_local_stride;
297 global_offset += per_thread_global_stride;
298 }
299 }
300 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
301 const Index linearLocalThreadId = itemID.get_local_id(0);
302 Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
303 : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
304 Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
305 : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
306 const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
307 : itemID.get_group(0) / reduce_elements_num_groups;
308 const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
309 : itemID.get_group(0) % reduce_elements_num_groups;
310
311 Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
312 const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
313 auto scratchPtr = scratch.get_pointer().get();
314 auto outPtr =
315 output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
316 CoeffReturnType accumulator = op.initialize();
317
318 element_wise_reduce(globalRId, globalPId, accumulator);
319
320 accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
321 scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
322 accumulator;
323 if (rt == reduction_dim::inner_most) {
324 pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
325 rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
326 globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
327 }
328
329 /* Apply the reduction operation between the current local
330 * id and the one on the other half of the vector. */
331 auto out_scratch_ptr =
332 scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
333 itemID.barrier(cl::sycl::access::fence_space::local_space);
334 if (rt == reduction_dim::inner_most) {
335 accumulator = *out_scratch_ptr;
336 }
337 // The Local LocalThreadSizeR is always power of 2
339 for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
340 if (rLocalThreadId < offset) {
341 op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
342 // The result has already been divided for mean reducer in the
343 // previous reduction so no need to divide furthermore
344 *out_scratch_ptr = op.finalize(accumulator);
345 }
346 /* All threads collectively read from global memory into local.
347 * The barrier ensures all threads' IO is resolved before
348 * execution continues (strictly speaking, all threads within
349 * a single work-group - there is no co-ordination between
350 * work-groups, only work-items). */
351 itemID.barrier(cl::sycl::access::fence_space::local_space);
352 }
353
354 if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
355 outPtr[globalPId] = op.finalize(accumulator);
356 }
357 }
358};
359
360template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
363 typedef typename OpDef::type Op;
364 typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
366 InputAccessor input_accessor;
367 OutputAccessor output_accessor;
371
373 OutputAccessor output_accessor_, OpType op_,
374 const Index num_coeffs_to_preserve_,
375 const Index num_coeffs_to_reduce_)
376 : input_accessor(input_accessor_),
377 output_accessor(output_accessor_),
378 op(OpDef::get_op(op_)),
379 num_coeffs_to_preserve(num_coeffs_to_preserve_),
380 num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
381
382 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
383 const Index globalId = itemID.get_global_id(0);
384
385 if (globalId >= num_coeffs_to_preserve) return;
386
387 auto in_ptr = input_accessor.get_pointer() + globalId;
388
389 OutScalar accumulator = op.initialize();
390// num_coeffs_to_reduce is not bigger that 256
391 for (Index i = 0; i < num_coeffs_to_reduce; i++) {
392 op.reduce(*in_ptr, &accumulator);
393 in_ptr += num_coeffs_to_preserve;
394 }
395 output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
396 }
397}; // namespace internal
398
399template <typename Index, Index LTP, Index LTR, bool BC_>
405
406template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
408 typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
409 typedef typename Self::CoeffReturnType CoeffReturnType;
410 typedef typename Self::Storage Storage;
411 typedef typename Self::Index Index;
414
416
417 static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
418 Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
419 Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
420
421 // getPowerOfTwo makes sure local range is power of 2 and <=
422 // maxSyclThreadPerBlock this will help us to avoid extra check on the
423 // kernel
426 "The Local thread size must be a power of 2 for the reduction "
427 "operation");
428
430 // In this step, we force the code not to be more than 2-step reduction:
431 // Our empirical research shows that if each thread reduces at least 64
432 // elemnts individually, we get better performance. However, this can change
433 // on different platforms. In this step we force the code not to be
434 // morthan step reduction: Our empirical research shows that for inner_most
435 // dim reducer, it is better to have 8 group in a reduce dimension for sizes
436 // > 1024 to achieve the best performance.
437 const Index reductionPerThread = 64;
438 Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
439 const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
440 Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
441 const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
442 const Index globalRange = pNumGroups * rNumGroups * localRange;
443
444 EIGEN_CONSTEXPR Index scratchSize =
446 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
447 if (rNumGroups > 1) {
448 CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
449 dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
450 EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
451 dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
452 self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
453 num_coeffs_to_reduce);
454
456 SecondStepPartialReductionKernel;
457
458 dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
459 temp_accessor, output,
460 cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
461 reducer, num_coeffs_to_preserve, rNumGroups);
462
463 self.device().deallocate_temp(temp_pointer);
464 } else {
465 dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
466 self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
467 num_coeffs_to_reduce);
468 }
469 return false;
470 }
471};
472} // namespace internal
473} // namespace TensorSycl
474
475namespace internal {
476
477template <typename Self, typename Op, bool Vectorizable>
478struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
479 typedef typename Self::CoeffReturnType CoeffReturnType;
480 typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
482 static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
483 static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
487 "The Local thread size must be a power of 2 for the reduction "
488 "operation");
490
491 typename Self::Index inputSize = self.impl().dimensions().TotalSize();
492 // In this step we force the code not to be more than 2-step reduction:
493 // Our empirical research shows that if each thread reduces at least 512
494 // elemnts individually, we get better performance.
495 const Index reductionPerThread = 2048;
496 // const Index num_work_group =
497 Index reductionGroup = dev.getPowerOfTwo(
498 (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
499 const Index num_work_group = std::min(reductionGroup, local_range);
500 // 1
501 // ? local_range
502 // : 1);
503 const Index global_range = num_work_group * local_range;
504
505 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
507 if (num_work_group > 1) {
509 static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
510 typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
512 local_range, inputSize, reducer);
513
515 EvaluatorPointerType, Index, local_range>
519 cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
520 reducer);
521
522 dev.deallocate_temp(temp_pointer);
523 } else {
525 reducer);
526 }
527 }
528};
529// vectorizable inner_most most dim preserver
530// col reduction
531template <typename Self, typename Op>
532struct OuterReducer<Self, Op, Eigen::SyclDevice> {
534
535 static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
536 typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
537 typename Self::Index num_coeffs_to_preserve) {
538 return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
540 num_coeffs_to_reduce,
541 num_coeffs_to_preserve);
542 }
543};
544// row reduction
545template <typename Self, typename Op>
546struct InnerReducer<Self, Op, Eigen::SyclDevice> {
548
549 static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
550 typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
551 typename Self::Index num_coeffs_to_preserve) {
552 return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
554 num_coeffs_to_reduce,
555 num_coeffs_to_preserve);
556 }
557};
558
559// ArmgMax uses this kernel for partial reduction//
560// TODO(@mehdi.goli) come up with a better kernel
561// generic partial reduction
562template <typename Self, typename Op>
563struct GenericReducer<Self, Op, Eigen::SyclDevice> {
564 static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
565 static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
566 typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
567 typename Self::Index num_coeffs_to_preserve) {
568 typename Self::Index range, GRange, tileSize;
569 dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
570
571 dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
573 self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
574 reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
575 return false;
576 }
577};
578
579} // namespace internal
580} // namespace Eigen
581
582#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
int i
Definition BiCGSTAB_step_by_step.cpp:9
#define EIGEN_UNROLL_LOOP
Definition Macros.h:1461
#define EIGEN_CONSTEXPR
Definition Macros.h:787
#define EIGEN_DEVICE_FUNC
Definition Macros.h:976
#define EIGEN_STRONG_INLINE
Definition Macros.h:917
int data[]
Definition Map_placement_new.cpp:1
FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, Index rng_, OpType op_)
Definition TensorReductionSycl.h:139
EvaluatorPointerType final_output
Definition TensorReductionSycl.h:135
Evaluator::PacketReturnType PacketReturnType
Definition TensorReductionSycl.h:127
Evaluator::EvaluatorPointerType EvaluatorPointerType
Definition TensorReductionSycl.h:126
cl::sycl::accessor< OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition TensorReductionSycl.h:132
typename::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess &Evaluator::InputPacketAccess), PacketReturnType, CoeffReturnType >::type OutType
Definition TensorReductionSycl.h:130
Evaluator evaluator
Definition TensorReductionSycl.h:134
Evaluator::Index Index
Definition TensorReductionSycl.h:120
Index rng
Definition TensorReductionSycl.h:136
OpDef::type Op
Definition TensorReductionSycl.h:125
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ::Eigen::internal::enable_if<!Vect >::type compute_reduction(const cl::sycl::nd_item< 1 > &itemID)
Definition TensorReductionSycl.h:185
Evaluator::CoeffReturnType CoeffReturnType
Definition TensorReductionSycl.h:119
LocalAccessor scratch
Definition TensorReductionSycl.h:133
void operator()(cl::sycl::nd_item< 1 > itemID)
Definition TensorReductionSycl.h:143
Op op
Definition TensorReductionSycl.h:137
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ::Eigen::internal::enable_if< Vect >::type compute_reduction(const cl::sycl::nd_item< 1 > &itemID)
Definition TensorReductionSycl.h:146
GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, Index range_, Index num_values_to_reduce_)
Definition TensorReductionSycl.h:222
Evaluator::Index Index
Definition TensorReductionSycl.h:218
void operator()(cl::sycl::nd_item< 1 > itemID)
Definition TensorReductionSycl.h:230
Evaluator::CoeffReturnType CoeffReturnType
Definition TensorReductionSycl.h:216
OpDef::type Op
Definition TensorReductionSycl.h:220
OpDefiner< OpType, CoeffReturnType, Index, false > OpDef
Definition TensorReductionSycl.h:219
Evaluator::EvaluatorPointerType EvaluatorPointerType
Definition TensorReductionSycl.h:217
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics scale
Definition gnuplot_common_settings.hh:54
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Definition gnuplot_common_settings.hh:64
reduction_dim
Definition TensorReductionSycl.h:250
Namespace containing all symbols from the Eigen library.
Definition bench_norm.cpp:85
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
Definition BandTriangularSolver.h:13
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer< CoeffReturnType > &)
Definition TensorReductionSycl.h:64
Vectorise< CoeffReturnType, Eigen::SyclDevice, true >::PacketReturnType PacketReturnType
Definition TensorReductionSycl.h:62
Eigen::internal::SumReducer< CoeffReturnType > type
Definition TensorReductionSycl.h:63
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, const Index &scale)
Definition TensorReductionSycl.h:68
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer< CoeffReturnType > &)
Definition TensorReductionSycl.h:49
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator, const Index &scale)
Definition TensorReductionSycl.h:53
Eigen::internal::SumReducer< CoeffReturnType > type
Definition TensorReductionSycl.h:48
Definition TensorReductionSycl.h:35
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op)
Definition TensorReductionSycl.h:38
Op type
Definition TensorReductionSycl.h:37
Vectorise< CoeffReturnType, Eigen::SyclDevice, Vectorizable >::PacketReturnType PacketReturnType
Definition TensorReductionSycl.h:36
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, const Index &)
Definition TensorReductionSycl.h:40
Definition InteropHeaders.h:130
Definition TensorReductionSycl.h:407
Self::EvaluatorPointerType EvaluatorPointerType
Definition TensorReductionSycl.h:408
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, Index num_coeffs_to_reduce, Index num_coeffs_to_preserve)
Definition TensorReductionSycl.h:417
Self::Index Index
Definition TensorReductionSycl.h:411
Self::CoeffReturnType CoeffReturnType
Definition TensorReductionSycl.h:409
PartialReductionKernel< Self, Op, PannelParameters, rt > SyclReducerKerneType
Definition TensorReductionSycl.h:415
Self::Storage Storage
Definition TensorReductionSycl.h:410
ReductionPannel< typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true > PannelParameters
Definition TensorReductionSycl.h:413
Definition TensorReductionSycl.h:253
const Index preserve_elements_num_groups
Definition TensorReductionSycl.h:265
Evaluator::Index Index
Definition TensorReductionSycl.h:256
const Index num_coeffs_to_reduce
Definition TensorReductionSycl.h:268
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, CoeffReturnType &accumulator)
Definition TensorReductionSycl.h:282
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > ScratchAcc
Definition TensorReductionSycl.h:260
ScratchAcc scratch
Definition TensorReductionSycl.h:261
const Index num_coeffs_to_preserve
Definition TensorReductionSycl.h:267
EvaluatorPointerType output_accessor
Definition TensorReductionSycl.h:263
OpDef::type Op
Definition TensorReductionSycl.h:258
const Index reduce_elements_num_groups
Definition TensorReductionSycl.h:266
Evaluator::EvaluatorPointerType EvaluatorPointerType
Definition TensorReductionSycl.h:255
PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
Definition TensorReductionSycl.h:270
Op op
Definition TensorReductionSycl.h:264
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition TensorReductionSycl.h:300
Evaluator evaluator
Definition TensorReductionSycl.h:262
Evaluator::CoeffReturnType CoeffReturnType
Definition TensorReductionSycl.h:254
OpDefiner< OpType, CoeffReturnType, Index, false > OpDef
Definition TensorReductionSycl.h:257
Definition TensorReductionSycl.h:400
static EIGEN_CONSTEXPR Index LocalThreadSizeR
Definition TensorReductionSycl.h:402
static EIGEN_CONSTEXPR Index LocalThreadSizeP
Definition TensorReductionSycl.h:401
static EIGEN_CONSTEXPR bool BC
Definition TensorReductionSycl.h:403
Definition TensorReductionSycl.h:76
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
Definition TensorReductionSycl.h:85
void operator()(cl::sycl::nd_item< 1 > itemID)
Definition TensorReductionSycl.h:88
OpDefiner< OpType, CoeffReturnType, Index, true > OpDef
Definition TensorReductionSycl.h:79
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition TensorReductionSycl.h:78
Op op
Definition TensorReductionSycl.h:84
OutputAccessor outAcc
Definition TensorReductionSycl.h:83
OpDef::type Op
Definition TensorReductionSycl.h:80
LocalAccessor scratch
Definition TensorReductionSycl.h:81
InputAccessor aI
Definition TensorReductionSycl.h:82
OutputAccessor output_accessor
Definition TensorReductionSycl.h:367
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID)
Definition TensorReductionSycl.h:382
const Index num_coeffs_to_preserve
Definition TensorReductionSycl.h:369
cl::sycl::accessor< OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > ScratchAccessor
Definition TensorReductionSycl.h:365
OpDefiner< OpType, OutScalar, Index, false > OpDef
Definition TensorReductionSycl.h:362
const Index num_coeffs_to_reduce
Definition TensorReductionSycl.h:370
Op op
Definition TensorReductionSycl.h:368
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, OutputAccessor output_accessor_, OpType op_, const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
Definition TensorReductionSycl.h:372
InputAccessor input_accessor
Definition TensorReductionSycl.h:366
OpDef::type Op
Definition TensorReductionSycl.h:363
Self::CoeffReturnType CoeffReturnType
Definition TensorReductionSycl.h:479
static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data)
Definition TensorReductionSycl.h:483
Self::EvaluatorPointerType EvaluatorPointerType
Definition TensorReductionSycl.h:480
Definition TensorReduction.h:311
static const bool HasOptimizedImplementation
Definition TensorReduction.h:312
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &self, typename Self::Index firstIndex, Op &reducer, typename Self::CoeffReturnType *accum)
Definition TensorReduction.h:144
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve)
Definition TensorReductionSycl.h:565
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_coeffs_to_preserve)
Definition TensorReductionSycl.h:549
Definition TensorReduction.h:395
static const bool HasOptimizedImplementation
Definition TensorReduction.h:396
static EIGEN_DEVICE_FUNC bool run(const Self &, Op &, const Device &, typename Self::CoeffReturnType *, typename Self::Index, typename Self::Index)
Definition TensorReduction.h:398
Definition TensorFunctors.h:112
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_coeffs_to_preserve)
Definition TensorReductionSycl.h:535
Definition TensorReduction.h:406
static const bool HasOptimizedImplementation
Definition TensorReduction.h:407
static EIGEN_DEVICE_FUNC bool run(const Self &, Op &, const Device &, typename Self::CoeffReturnType *, typename Self::Index, typename Self::Index)
Definition TensorReduction.h:409
Definition ForwardDeclarations.h:17