#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

// Reducer function that enables multiple gpu threads to safely accumulate at the same
// output address. It reads the current value of the output variable, attempts to update
// it with the new value, and retries if another gpu thread updated the output in the
// meantime.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else {
    gpu_assert(0 && "Wordsize not supported");
  }
#else  // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
// Extend atomicExch to the extra data types we need.
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
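
// Illustration only (not part of Eigen): atomicReduce above is the classic lock-free
// compare-and-swap retry loop, and atomicExchCustom reuses the same bit-cast trick CUDA
// uses for 64-bit atomics. A minimal standalone sketch of the pattern, for a plain float
// sum and using only standard CUDA intrinsics, would look like this:
//
//   __device__ void atomicSumViaCAS(float* out, float v) {
//     unsigned int* addr = reinterpret_cast<unsigned int*>(out);
//     unsigned int assumed, old = *addr;
//     do {
//       assumed = old;
//       const float updated = __uint_as_float(assumed) + v;
//       old = atomicCAS(addr, assumed, __float_as_uint(updated));
//     } while (old != assumed);  // retry if another thread updated *out in the meantime
//   }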
#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}

// The reduction is performed atomically per half2 lane, not on the whole wide packet,
// so it relies on the reduction being associative.
template <typename R>
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
  half2* houtput = reinterpret_cast<half2*>(output);
  half2* haccum = reinterpret_cast<half2*>(&accum);
  for (int i = 0; i < 4; ++i) {
    atomicReduce(houtput + i, *(haccum + i), reducer);
  }
}
#endif  // EIGEN_HAS_GPU_FP16
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
#else  // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(
    const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
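
// Illustration only (not part of Eigen): ReductionInitKernel uses a grid-stride loop so that
// any grid size can initialize any number of outputs. The same pattern, reduced to its core:
//
//   __global__ void fillKernel(float* out, int n, float value) {
//     const int stride = blockDim.x * gridDim.x;
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
//       out[i] = value;  // each thread touches indices i, i+stride, i+2*stride, ...
//     }
//   }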
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(
    Reducer reducer, const Self input, Index num_coeffs,
    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  } else {
    // Use the semaphore to make sure only one block initializes the output.
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value.
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      } else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        } while (val < 2u);
      }
    }
  }

  __syncthreads();

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
  #if defined(EIGEN_HIPCC)
    // Cast to float/int to pick an unambiguous __shfl_down overload under hipcc.
    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
    } else {
      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    }
  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
  #else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
  #endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore.
    atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
    __threadfence_system();
#endif
  }
#else  // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs,
    packet_traits<Eigen::half>::type* scratch) {
  typedef packet_traits<Eigen::half>::type packet_type;
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<packet_type>::size);
  if (packet_remainder != 0) {
    half2* h2scratch = reinterpret_cast<half2*>(scratch);
    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
      *h2scratch =
          __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
      h2scratch++;
    }
    if ((num_coeffs & 1) != 0) {
      half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    }
  } else {
    *scratch = reducer.template initializePacket<packet_type>();
  }
}
template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  typedef packet_traits<Eigen::half>::type PacketType;

  const Index num_packets =
      num_coeffs / Index(unpacket_traits<PacketType>::size);
  PacketType* p_output = reinterpret_cast<PacketType*>(output);
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    p_output[i] = reducer.template initializePacket<PacketType>();
  }
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<PacketType>::size);
  if (thread_id < packet_remainder) {
    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
  }
}
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs,
    half* output, packet_traits<Eigen::half>::type* scratch) {
  typedef packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const Index first_index =
      blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;

  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    if (first_index == 0) {
      int rem = num_coeffs % packet_width;
      if (rem != 0) {
        half2* p_scratch = reinterpret_cast<half2*>(scratch);
        *scratch = reducer.template initializePacket<PacketType>();
        for (int i = 0; i < rem / 2; i++) {
          *p_scratch = __halves2half2(
              input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
              input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
          p_scratch++;
        }
        if ((num_coeffs & 1) != 0) {
          half last = input.m_impl.coeff(num_coeffs - 1);
          *p_scratch = __halves2half2(last, reducer.initialize());
        }
      } else {
        *scratch = reducer.template initializePacket<PacketType>();
      }
    }
    __syncthreads();
  }

  PacketType accum = reducer.template initializePacket<PacketType>();
  const Index max_iter =
      numext::mini<Index>((num_coeffs - first_index) / packet_width,
                          NumPerThread * BlockSize / packet_width);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + packet_width * i;
    PacketType val = input.m_impl.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
  #if defined(EIGEN_HIPCC)
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      // Workaround: hipcc has no half2 overload of __shfl_down, so shuffle the raw bits.
      union { int i; half2 h; } wka_in, wka_out;
      wka_in.h = hacc[i];
      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down(hacc[i], offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
  #else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
  #endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(scratch, accum, reducer);
  }

  __syncthreads();
  half2* rv1 = reinterpret_cast<half2*>(scratch);
  if (packet_width > 2) {
    reducer.reducePacket(rv1[2], rv1);
    reducer.reducePacket(rv1[3], rv1 + 1);
    reducer.reducePacket(rv1[1], rv1);
  }
  if (gridDim.x == 1) {
    if (first_index == 0) {
      half tmp = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &tmp);
      *output = tmp;
    }
  }
}
template <typename Op>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(
    Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
  typedef packet_traits<Eigen::half>::type packet_type;
  half2* pscratch = reinterpret_cast<half2*>(scratch);
  half tmp = __float2half(0.f);
  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
    reducer.reduce(__low2half(*pscratch), &tmp);
    reducer.reduce(__high2half(*pscratch), &tmp);
    pscratch++;
  }
  *output = tmp;
}

#endif  // EIGEN_HAS_GPU_FP16
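
// Illustration only (not part of Eigen): the kernels above all finish with the same warp-level
// tree reduction built on shuffle-down. A minimal sketch for a plain float sum with the
// CUDA 9+ *_sync intrinsics (the pre-9.0 and HIP branches differ only in the intrinsic used):
//
//   __device__ float warpSum(float v) {
//     for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//       v += __shfl_down_sync(0xFFFFFFFF, v, offset, warpSize);  // lane i adds lane i+offset
//     }
//     return v;  // the full warp sum ends up in lane 0
//   }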
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    gpu_assert(false && "Should only be called on doubles, floats and half floats");
  }
};
// Specialization for float and double.
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
        internal::is_same<float, OutputType>::value ||
        internal::is_same<double, OutputType>::value,
    void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};
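
// Illustration only (not part of this header): how a full reduction ends up in the launcher
// above. Everything below is ordinary public Tensor API; the sizes and the device pointer
// names (d_in, d_out) are made up for the example and assumed to be valid device allocations.
//
//   #define EIGEN_USE_GPU
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//
//   float* d_in;   // 1024*1024 floats, already filled on the device
//   float* d_out;  // a single float
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in(d_in, 1024, 1024);
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > gpu_out(d_out);
//
//   // Reducing over all dimensions dispatches to FullReducer, which calls
//   // FullReductionLauncher<..., float, ...>::run and launches FullReductionKernel.
//   gpu_out.device(gpu_device) = gpu_in.sum();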
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    typedef packet_traits<Eigen::half>::type PacketType;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    PacketType* scratch = static_cast<PacketType*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we
      // can't be sure that there won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                        1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // The optimized GPU path only handles floats, doubles and (with packet access) half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }
    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
        self, reducer, device, output, num_coeffs);
  }
};
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(
    Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
    typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
      #if defined(EIGEN_HIPCC)
        // Cast to float/int to pick an unambiguous __shfl_down overload under hipcc.
        if (std::is_floating_point<Type>::value) {
          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
        } else {
          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
        }
      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
      #else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
      #endif
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else  // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
    half* output) {
  typedef packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const int unroll_times = 16 / packet_width;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs;
         i += packet_width * num_threads) {
      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
      *poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    // Each iteration takes care of two adjacent output rows.
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin =
          packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col =
            col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j * packet_width;
          for (; col + packet_width <= num_coeffs_to_reduce;
               col += blockDim.x * packet_width) {
            const PacketType val1 = input.m_impl.template packet<Unaligned>(
                row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const PacketType val2 = input.m_impl.template packet<Unaligned>(
                (row + 1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            PacketType r1 = reducer.template initializePacket<PacketType>();
            PacketType r2 = reducer.template initializePacket<PacketType>();
            half2* hr1 = reinterpret_cast<half2*>(&r1);
            half2* hr2 = reinterpret_cast<half2*>(&r2);
            while (col + 1 < num_coeffs_to_reduce) {
              *hr1 = __halves2half2(
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
              *hr2 = __halves2half2(
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
              hr1++;
              hr2++;
              col += 2;
            }
            if (col < num_coeffs_to_reduce) {
              // Peel off the last (odd) coefficient of each row.
              const half last1 =
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
              *hr1 = __halves2half2(last1, reducer.initialize());
              const half last2 =
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
              *hr2 = __halves2half2(last2, reducer.initialize());
            }
            reducer.reducePacket(r1, &reduced_val1);
            reducer.reducePacket(r2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     row * num_coeffs_to_reduce + col),
                                 &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     (row + 1) * num_coeffs_to_reduce + col),
                                 &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
      #if defined(EIGEN_HIPCC)
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          // Workaround: hipcc has no half2 overload of __shfl_down, so shuffle the raw bits.
          union { int i; half2 h; } wka_in1, wka_out1;
          wka_in1.h = rv1[i];
          wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
          hr1[i] = wka_out1.h;

          union { int i; half2 h; } wka_in2, wka_out2;
          wka_in2.h = rv2[i];
          wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
          hr2[i] = wka_out2.h;
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
      #else
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] =
              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
          hr2[i] =
              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
      #endif
      }

      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
      half2 val;
      if (packet_width > 2) {
        reducer.reducePacket(rv1[2], rv1);
        reducer.reducePacket(rv1[3], rv1 + 1);
        reducer.reducePacket(rv1[1], rv1);
        reducer.reducePacket(rv2[2], rv2);
        reducer.reducePacket(rv2[3], rv2 + 1);
        reducer.reducePacket(rv2[1], rv2);
      }
      half val1 = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &val1);
      half val2 = __low2half(*rv2);
      reducer.reduce(__high2half(*rv2), &val2);
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}

#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*,
                                    typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};
// Specialization for float and double.
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
        internal::is_same<float, OutputType>::value ||
        internal::is_same<double, OutputType>::value,
    void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
                        num_blocks, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
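
// Illustration only (not part of Eigen): the block-count arithmetic above, worked through for
// made-up sizes. With num_coeffs_to_reduce = 1000 and num_preserved_vals = 512:
//   num_coeffs = 1000 * 512              = 512000
//   dyn_blocks = divup(512000, 256*128)  = divup(512000, 32768) = 16
// and on a device with 20 multiprocessors and 2048 threads per multiprocessor:
//   max_blocks = 20 * 2048 / 256         = 160
//   num_blocks = min(160, 16)            = 16
// so 16 blocks of 256 threads are launched, and since num_blocks > 1 the outputs are first
// initialized by a separate ReductionInitKernel launch.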
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path.
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // The optimized GPU path only handles floats, doubles and (with packet access) half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code path when there are only a few coefficients to reduce per output.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
        self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
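
// Illustration only (not part of this header): a reduction over the innermost (contiguous)
// dimension, which is the case InnerReducer is optimized for. It reuses the made-up
// gpu_device, d_in and d_out names from the sketch after FullReductionLauncher above.
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in(d_in, 1024, 512);  // ColMajor: dim 0 is innermost
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_out(d_out, 512);
//   Eigen::array<int, 1> reduce_dims{{0}};
//   // Reducing dimension 0 preserves 512 outputs and reduces 1024 coefficients per output,
//   // so num_coeffs_to_reduce > 128 and the launcher above handles the launch.
//   gpu_out.device(gpu_device) = gpu_in.sum(reduce_dims);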
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(
    Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
    typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }

    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
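
// Illustration only (not part of Eigen): in OuterReductionKernel the preserved index is the
// innermost one, so consecutive threads read consecutive (coalesced) inputs. With made-up
// values num_preserved_coeffs = 4 and NumPerThread = 2, iteration i = 5 works on
//   input_col = 5 % 4 = 1,   input_row = (5 / 4) * 2 = 2,
// i.e. it reduces the coefficients at linear indices 2*4+1 = 9 and 3*4+1 = 13 locally and
// then folds the partial result into output[1] with atomicReduce.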
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // The optimized GPU path only handles floats and doubles.
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);

  template <typename Device, typename OutputType>
  static
  #if !defined(EIGEN_HIPCC)
  // Marking this overload EIGEN_DEVICE_FUNC breaks the cxx11_tensor_reduction_gpu test
  // under hipcc, so the annotation is only applied for non-HIP builds.
  EIGEN_DEVICE_FUNC
  #endif
  bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code path when there are only a few coefficients to reduce per output.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
                        num_blocks, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

}  // end namespace internal
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H