TensorReductionGpu.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
11#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
12
13namespace Eigen {
14namespace internal {
15
16
17#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
18// Full reducers for GPU, don't vectorize for now
19
20// Reducer function that enables multiple gpu threads to safely accumulate at the same
21// output address. It reads the current value of the output variable and attempts to
22// update it with the new value. If another gpu thread updated the content of the
23// output address in the meantime, it tries again.
24template <typename T, typename R>
25__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
26#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
27 if (sizeof(T) == 4)
28 {
29 unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
30 unsigned int newval = oldval;
31 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
32 if (newval == oldval) {
33 return;
34 }
35 unsigned int readback;
36 while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
37 oldval = readback;
38 newval = oldval;
39 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
40 if (newval == oldval) {
41 return;
42 }
43 }
44 }
45 else if (sizeof(T) == 8) {
46 unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
47 unsigned long long newval = oldval;
48 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
49 if (newval == oldval) {
50 return;
51 }
52 unsigned long long readback;
53 while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
54 oldval = readback;
55 newval = oldval;
56 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
57 if (newval == oldval) {
58 return;
59 }
60 }
61 }
62 else {
63 gpu_assert(0 && "Wordsize not supported");
64 }
65#else // EIGEN_CUDA_ARCH >= 300
66 gpu_assert(0 && "Shouldn't be called on unsupported device");
67#endif // EIGEN_CUDA_ARCH >= 300
68}
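// A minimal sketch (editorial addition, compiled out via #if 0): the same compare-and-swap
// retry pattern as above, specialised by hand for a float "max" accumulation so the control
// flow is easier to follow. The helper name atomic_max_float is hypothetical and not part
// of Eigen.
#if 0
__device__ void atomic_max_float(float* output, float accum) {
  unsigned int* addr = reinterpret_cast<unsigned int*>(output);
  unsigned int oldbits = *addr;
  while (true) {
    const float merged = fmaxf(__uint_as_float(oldbits), accum);
    const unsigned int newbits = __float_as_uint(merged);
    if (newbits == oldbits) return;   // output already contains the reduced value
    const unsigned int seen = atomicCAS(addr, oldbits, newbits);
    if (seen == oldbits) return;      // our CAS won the race
    oldbits = seen;                   // another thread wrote first: merge again and retry
  }
}
#endif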
69
70// We extend atomicExch to support extra data types
71template <typename Type>
72__device__ inline Type atomicExchCustom(Type* address, Type val) {
73 return atomicExch(address, val);
74}
75
76template <>
77__device__ inline double atomicExchCustom(double* address, double val) {
78 unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
79 return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
80}
81
82#ifdef EIGEN_HAS_GPU_FP16
83template <typename R>
84__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
85 unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
86 unsigned int newval = oldval;
87 reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
88 if (newval == oldval) {
89 return;
90 }
91 unsigned int readback;
92 while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
93 oldval = readback;
94 newval = oldval;
95 reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
96 if (newval == oldval) {
97 return;
98 }
99 }
100}
101// The reduction must be associative: the wide packet is not reduced atomically as a whole, only each half2 lane is updated atomically.
102template <typename R>
103__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
104 half2* houtput = reinterpret_cast<half2*>(output);
105 half2* haccum = reinterpret_cast<half2*>(&accum);
106 for (int i = 0; i < 4; ++i) {
107   atomicReduce(houtput + i, *(haccum + i), reducer);
108 }
109}
110#endif // EIGEN_HAS_GPU_FP16
111
112template <>
113__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
114#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
115 atomicAdd(output, accum);
116#else // EIGEN_CUDA_ARCH >= 300
117 gpu_assert(0 && "Shouldn't be called on unsupported device");
118#endif // EIGEN_CUDA_ARCH >= 300
119}
120
121
122template <typename CoeffType, typename Index>
123__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
124 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
125 const Index num_threads = blockDim.x * gridDim.x;
126 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
127 output[i] = val;
128 }
129}
130
131
132template <int BlockSize, int NumPerThread, typename Self,
133 typename Reducer, typename Index>
134__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
135 typename Self::CoeffReturnType* output, unsigned int* semaphore) {
136#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
137 // Initialize the output value
138 const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
139 if (gridDim.x == 1) {
140 if (first_index == 0) {
141 *output = reducer.initialize();
142 }
143 }
144 else {
145 if (threadIdx.x == 0) {
146 unsigned int block = atomicCAS(semaphore, 0u, 1u);
147 if (block == 0) {
148 // We're the first block to run, initialize the output value
149 atomicExchCustom(output, reducer.initialize());
150 __threadfence();
151 atomicExch(semaphore, 2u);
152 }
153 else {
154 // Wait for the first block to initialize the output value.
155 // Use atomicCAS here to ensure that the reads aren't cached
156 unsigned int val;
157 do {
158 val = atomicCAS(semaphore, 2u, 2u);
159 }
160 while (val < 2u);
161 }
162 }
163 }
164
165 __syncthreads();
166
167 eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
168
169 typename Self::CoeffReturnType accum = reducer.initialize();
170 Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
171 for (Index i = 0; i < max_iter; i+=BlockSize) {
172 const Index index = first_index + i;
173 eigen_assert(index < num_coeffs);
174 typename Self::CoeffReturnType val = input.m_impl.coeff(index);
175 reducer.reduce(val, &accum);
176 }
177
178#pragma unroll
179 for (int offset = warpSize/2; offset > 0; offset /= 2) {
180 #if defined(EIGEN_HIPCC)
181 // use std::is_floating_point to determine the type of accum
182 // This is needed because when the coefficient type is double, hipcc gives a "call to __shfl_down is ambiguous"
183 // error and lists the float and int versions of __shfl_down as the candidate functions.
184 if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
185 reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
186 } else {
187 reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
188 }
189 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
190 reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
191 #else
192 reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
193 #endif
194 }
195
196 if ((threadIdx.x & (warpSize - 1)) == 0) {
197 atomicReduce(output, accum, reducer);
198 }
199
200 if (gridDim.x > 1 && threadIdx.x == 0) {
201 // Let the last block reset the semaphore
202 atomicInc(semaphore, gridDim.x + 1);
203#if defined(EIGEN_HIPCC)
204 __threadfence_system();
205#endif
206 }
207#else // EIGEN_CUDA_ARCH >= 300
208 gpu_assert(0 && "Shouldn't be called on unsupported device");
209#endif // EIGEN_CUDA_ARCH >= 300
210}
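// A minimal sketch (editorial addition, compiled out via #if 0): the warp-level shuffle
// reduction used in the loop above, written out for a plain float sum and a 32-lane warp.
// The helper name warp_sum is hypothetical. Each step halves the number of live lanes;
// after log2(32) = 5 steps lane 0 holds the sum of the whole warp, which is why only
// lane 0 of each warp ((threadIdx.x & (warpSize - 1)) == 0) calls atomicReduce above.
#if 0
__device__ float warp_sum(float val) {
  for (int offset = 16; offset > 0; offset /= 2) {
    // Lane i adds the value currently held by lane i + offset.
    val += __shfl_down_sync(0xFFFFFFFF, val, offset);
  }
  return val;  // only meaningful on lane 0
}
#endif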
211
212
213#ifdef EIGEN_HAS_GPU_FP16
214template <typename Self,
215 typename Reducer, typename Index>
216__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
217                                                                                    packet_traits<Eigen::half>::type* scratch) {
218 eigen_assert(blockDim.x == 1);
219 eigen_assert(gridDim.x == 1);
220 typedef packet_traits<Eigen::half>::type packet_type;
221 Index packet_remainder =
222     num_coeffs % Index(unpacket_traits<packet_type>::size);
223 if (packet_remainder != 0) {
224 half2* h2scratch = reinterpret_cast<half2*>(scratch);
225 for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
226 *h2scratch =
227 __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
228 h2scratch++;
229 }
230 if ((num_coeffs & 1) != 0) {
231 half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
232 *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
233 }
234 } else {
235 *scratch = reducer.template initializePacket<packet_type>();
236 }
237}
238
239template <typename Self,
240 typename Reducer, typename Index>
241__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
242 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
243 const Index num_threads = blockDim.x * gridDim.x;
244 typedef typename packet_traits<Eigen::half>::type PacketType;
245
246 const Index num_packets =
247     num_coeffs / Index(unpacket_traits<PacketType>::size);
248 PacketType* p_output = reinterpret_cast<PacketType*>(output);
249 for (Index i = thread_id; i < num_packets; i += num_threads) {
250 p_output[i] = reducer.template initializePacket<PacketType>();
251 }
252 Index packet_remainder =
253     num_coeffs % Index(unpacket_traits<PacketType>::size);
254 if (thread_id < packet_remainder) {
255 output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
256 }
257}
258
259template <int BlockSize, int NumPerThread, typename Self,
260 typename Reducer, typename Index>
261__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
262 half* output, packet_traits<Eigen::half>::type* scratch) {
263 typedef typename packet_traits<Eigen::half>::type PacketType;
264 const int packet_width = unpacket_traits<PacketType>::size;
265 eigen_assert(NumPerThread % packet_width == 0);
266 const Index first_index =
267 blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
268
269 // Initialize the output value if it wasn't initialized by the ReductionInitKernel
270
271 if (gridDim.x == 1) {
272 if (first_index == 0) {
273 int rem = num_coeffs % packet_width;
274 if (rem != 0) {
275 half2* p_scratch = reinterpret_cast<half2*>(scratch);
276 *scratch = reducer.template initializePacket<PacketType>();
277 for (int i = 0; i < rem / 2; i++) {
278 *p_scratch = __halves2half2(
279 input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
280 input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
281 p_scratch++;
282 }
283 if ((num_coeffs & 1) != 0) {
284 half last = input.m_impl.coeff(num_coeffs - 1);
285 *p_scratch = __halves2half2(last, reducer.initialize());
286 }
287 } else {
288 *scratch = reducer.template initializePacket<PacketType>();
289 }
290 }
291 __syncthreads();
292 }
293
294 PacketType accum = reducer.template initializePacket<PacketType>();
295 const Index max_iter =
296 numext::mini<Index>((num_coeffs - first_index) / packet_width,
297 NumPerThread * BlockSize / packet_width);
298 for (Index i = 0; i < max_iter; i += BlockSize) {
299 const Index index = first_index + packet_width * i;
300 eigen_assert(index + packet_width < num_coeffs);
301 PacketType val = input.m_impl.template packet<Unaligned>(index);
302 reducer.reducePacket(val, &accum);
303 }
304
305#pragma unroll
306 for (int offset = warpSize/2; offset > 0; offset /= 2) {
307 #if defined(EIGEN_HIPCC)
308 PacketType r1;
309 half2* hr = reinterpret_cast<half2*>(&r1);
310 half2* hacc = reinterpret_cast<half2*>(&accum);
311 for (int i = 0; i < packet_width / 2; i++) {
312 // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
313 union { int i; half2 h; } wka_in, wka_out;
314 wka_in.h = hacc[i];
315 wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
316 hr[i] = wka_out.h;
317 }
318 reducer.reducePacket(r1, &accum);
319 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
320 PacketType r1;
321 half2* hr = reinterpret_cast<half2*>(&r1);
322 half2* hacc = reinterpret_cast<half2*>(&accum);
323 for (int i = 0; i < packet_width / 2; i++) {
324 hr[i] = __shfl_down(hacc[i], offset, warpSize);
325 }
326 reducer.reducePacket(r1, &accum);
327 #else
328 PacketType r1;
329 half2* hr = reinterpret_cast<half2*>(&r1);
330 half2* hacc = reinterpret_cast<half2*>(&accum);
331 for (int i = 0; i < packet_width / 2; i++) {
332 hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
333 }
334 reducer.reducePacket(r1, &accum);
335
336 #endif
337 }
338
339 if ((threadIdx.x & (warpSize - 1)) == 0) {
340 atomicReduce(scratch, accum, reducer);
341 }
342
343 __syncthreads();
344 half2* rv1 = reinterpret_cast<half2*>(scratch);
345 if (packet_width > 2) {
346 reducer.reducePacket(rv1[2], rv1);
347 reducer.reducePacket(rv1[3], rv1 + 1);
348 reducer.reducePacket(rv1[1], rv1);
349 }
350 if (gridDim.x == 1) {
351 if (first_index == 0) {
352 half tmp = __low2half(*rv1);
353 reducer.reduce(__high2half(*rv1), &tmp);
354 *output = tmp;
355 }
356 }
357}
358
359template <typename Op>
360__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
361 eigen_assert(threadIdx.x == 1);
362 half2* pscratch = reinterpret_cast<half2*>(scratch);
363 half tmp = __float2half(0.f);
364 typedef packet_traits<Eigen::half>::type packet_type;
365 for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
366 reducer.reduce(__low2half(*pscratch), &tmp);
367 reducer.reduce(__high2half(*pscratch), &tmp);
368 pscratch++;
369 }
370 *output = tmp;
371}
372
373#endif // EIGEN_HAS_GPU_FP16
374
375template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
376struct FullReductionLauncher {
377 static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
378 gpu_assert(false && "Should only be called on doubles, floats and half floats");
379 }
380};
381
382// Specialization for float and double
383template <typename Self, typename Op, typename OutputType, bool PacketAccess>
384struct FullReductionLauncher<
385 Self, Op, OutputType, PacketAccess,
386 typename internal::enable_if<
387 internal::is_same<float, OutputType>::value ||
388 internal::is_same<double, OutputType>::value,
389 void>::type> {
390 static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
391
392 typedef typename Self::Index Index;
393 const int block_size = 256;
394 const int num_per_thread = 128;
395 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
396
397 unsigned int* semaphore = NULL;
398 if (num_blocks > 1) {
399 semaphore = device.semaphore();
400 }
401
402 LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
403 num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
404 }
405};
406
407#ifdef EIGEN_HAS_GPU_FP16
408template <typename Self, typename Op>
409struct FullReductionLauncher<Self, Op, Eigen::half, false> {
410 static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
411 gpu_assert(false && "Should not be called since there is no packet accessor");
412 }
413};
414
415template <typename Self, typename Op>
416struct FullReductionLauncher<Self, Op, Eigen::half, true> {
417 static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
418 typedef typename Self::Index Index;
419 typedef typename packet_traits<Eigen::half>::type PacketType;
420
421 const int block_size = 256;
422 const int num_per_thread = 128;
423 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
424 PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
425 // half2* scratch = static_cast<half2*>(device.scratchpad());
426
427 if (num_blocks > 1) {
428 // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
429 // won't be race conditions between multiple thread blocks.
430 LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
431 1, 1, 0, device, reducer, self, num_coeffs, scratch);
432 }
433
434 LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
435 num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
436
437 if (num_blocks > 1) {
438 LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
439 1, 1, 0, device, reducer, output, scratch);
440 }
441 }
442};
443#endif // EIGEN_HAS_GPU_FP16
444
445
446template <typename Self, typename Op, bool Vectorizable>
447struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
448 // Unfortunately nvidia doesn't support exotic types such as complex well,
449 // so reduce the scope of the optimized version of the code to the simple cases
450 // of doubles, floats and half floats
451#ifdef EIGEN_HAS_GPU_FP16
452 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
453     (internal::is_same<typename Self::CoeffReturnType, float>::value ||
454      internal::is_same<typename Self::CoeffReturnType, double>::value ||
455      (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
456#else // EIGEN_HAS_GPU_FP16
457 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
458     (internal::is_same<typename Self::CoeffReturnType, float>::value ||
459      internal::is_same<typename Self::CoeffReturnType, double>::value);
460#endif // EIGEN_HAS_GPU_FP16
461
462 template <typename OutputType>
463 static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
464 gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
465 const Index num_coeffs = array_prod(self.m_impl.dimensions());
466 // Don't crash when we're called with an input tensor of size 0.
467 if (num_coeffs == 0) {
468 return;
469 }
470
471 FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
472 }
473};
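// A minimal sketch (editorial addition, compiled out via #if 0) of the user-level code that
// eventually dispatches into FullReducer<...>::run above. It assumes CUDA, that the device
// buffers were allocated and filled elsewhere, and that the device-setup types
// (GpuStreamDevice / GpuDevice) match the Eigen version in use.
#if 0
#include <unsupported/Eigen/CXX11/Tensor>

void full_reduction_example(float* gpu_in /* 100 x 50 floats */, float* gpu_out /* 1 float */) {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in, 100, 50);
  Eigen::TensorMap<Eigen::Tensor<float, 0> > out(gpu_out);

  // Reduces all 5000 coefficients into a single scalar on the GPU.
  out.device(gpu_device) = in.sum();
}
#endif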
474
475
476template <int NumPerThread, typename Self,
477 typename Reducer, typename Index>
478__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
479 typename Self::CoeffReturnType* output) {
480#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
481 typedef typename Self::CoeffReturnType Type;
482 eigen_assert(blockDim.y == 1);
483 eigen_assert(blockDim.z == 1);
484 eigen_assert(gridDim.y == 1);
485 eigen_assert(gridDim.z == 1);
486
487 const int unroll_times = 16;
488 eigen_assert(NumPerThread % unroll_times == 0);
489
490 const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
491 const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
492
493 const Index num_threads = blockDim.x * gridDim.x;
494 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
495
496 // Initialize the output values if they weren't initialized by the ReductionInitKernel
497 if (gridDim.x == 1) {
498 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
499 output[i] = reducer.initialize();
500 }
501 __syncthreads();
502 }
503
504 for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
505 const Index row = i / input_col_blocks;
506
507 if (row < num_preserved_coeffs) {
508 const Index col_block = i % input_col_blocks;
509 const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
510
511 Type reduced_val = reducer.initialize();
512
513 for (Index j = 0; j < NumPerThread; j += unroll_times) {
514 const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
515 if (last_col >= num_coeffs_to_reduce) {
516 for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
517 const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
518 reducer.reduce(val, &reduced_val);
519 }
520 break;
521 } else {
522 // Faster version of the loop with no branches after unrolling.
523#pragma unroll
524 for (int k = 0; k < unroll_times; ++k) {
525 const Index col = col_begin + blockDim.x * (j + k);
526 reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
527 }
528 }
529 }
530
531#pragma unroll
532 for (int offset = warpSize/2; offset > 0; offset /= 2) {
533 #if defined(EIGEN_HIPCC)
534 // use std::is_floating_point to determine the type of reduced_val
535 // This is needed because when Type == double, hipcc gives a "call to __shfl_down is ambiguous" error
536 // and lists the float and int versions of __shfl_down as the candidate functions.
537 if (std::is_floating_point<Type>::value) {
538 reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
539 } else {
540 reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
541 }
542 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
543 reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
544 #else
545 reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
546 #endif
547 }
548
549 if ((threadIdx.x & (warpSize - 1)) == 0) {
550 atomicReduce(&(output[row]), reduced_val, reducer);
551 }
552 }
553 }
554#else // EIGEN_CUDA_ARCH >= 300
555 gpu_assert(0 && "Shouldn't be called on unsupported device");
556#endif // EIGEN_CUDA_ARCH >= 300
557}
558
559#ifdef EIGEN_HAS_GPU_FP16
560
561template <int NumPerThread, typename Self,
562 typename Reducer, typename Index>
563__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
564 half* output) {
565 eigen_assert(blockDim.y == 1);
566 eigen_assert(blockDim.z == 1);
567 eigen_assert(gridDim.y == 1);
568 eigen_assert(gridDim.z == 1);
569
570 typedef typename packet_traits<Eigen::half>::type PacketType;
571 const int packet_width = unpacket_traits<PacketType>::size;
572 const int unroll_times = 16 / packet_width;
573 eigen_assert(NumPerThread % unroll_times == 0);
574 eigen_assert(unroll_times % 2 == 0);
575
576 const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
577 const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
578
579 const Index num_threads = blockDim.x * gridDim.x;
580 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
581
582 // Initialize the output values if they weren't initialized by the ReductionInitKernel
583 if (gridDim.x == 1) {
584 Index i = packet_width * thread_id;
585 for (; i + packet_width <= num_preserved_coeffs;
586 i += packet_width * num_threads) {
587 PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
588 *poutput = reducer.template initializePacket<PacketType>();
589 }
590 if (i < num_preserved_coeffs) {
591 output[i] = reducer.initialize();
592 }
593 __syncthreads();
594 }
595
596 for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
597 const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
598
599 if (row + 1 < num_preserved_coeffs) {
600 const Index col_block = i % input_col_blocks;
601 const Index col_begin =
602 packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
603
604 PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
605 PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
606
607 for (Index j = 0; j < NumPerThread; j += unroll_times) {
608 const Index last_col =
609 col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
610 if (last_col >= num_coeffs_to_reduce) {
611 Index col = col_begin + blockDim.x * j;
612 for (; col + packet_width <= num_coeffs_to_reduce;
613 col += blockDim.x) {
614 const PacketType val1 = input.m_impl.template packet<Unaligned>(
615 row * num_coeffs_to_reduce + col);
616 reducer.reducePacket(val1, &reduced_val1);
617 const PacketType val2 = input.m_impl.template packet<Unaligned>(
618 (row + 1) * num_coeffs_to_reduce + col);
619 reducer.reducePacket(val2, &reduced_val2);
620 }
621 if (col < num_coeffs_to_reduce) {
622 PacketType r1 = reducer.template initializePacket<PacketType>();
623 PacketType r2 = reducer.template initializePacket<PacketType>();
624 half2* hr1 = reinterpret_cast<half2*>(&r1);
625 half2* hr2 = reinterpret_cast<half2*>(&r2);
626 while (col + 1 < num_coeffs_to_reduce) {
627 *hr1 = __halves2half2(
628 input.m_impl.coeff(row * num_coeffs_to_reduce + col),
629 input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
630 *hr2 = __halves2half2(
631 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
632 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
633 1));
634 hr1++;
635 hr2++;
636 col += 2;
637 }
638 if (col < num_coeffs_to_reduce) {
639 // Peel;
640 const half last1 =
641 input.m_impl.coeff(row * num_coeffs_to_reduce + col);
642 *hr1 = __halves2half2(last1, reducer.initialize());
643 const half last2 =
644 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
645 *hr2 = __halves2half2(last2, reducer.initialize());
646 }
647 reducer.reducePacket(r1, &reduced_val1);
648 reducer.reducePacket(r2, &reduced_val2);
649 }
650 break;
651 } else {
652 // Faster version of the loop with no branches after unrolling.
653#pragma unroll
654 for (int k = 0; k < unroll_times; ++k) {
655 const Index col = col_begin + blockDim.x * (j + k) * packet_width;
656 reducer.reducePacket(input.m_impl.template packet<Unaligned>(
657 row * num_coeffs_to_reduce + col),
658 &reduced_val1);
659 reducer.reducePacket(input.m_impl.template packet<Unaligned>(
660 (row + 1) * num_coeffs_to_reduce + col),
661 &reduced_val2);
662 }
663 }
664 }
665
666#pragma unroll
667 for (int offset = warpSize/2; offset > 0; offset /= 2) {
668 #if defined(EIGEN_HIPCC)
669 PacketType r1;
670 PacketType r2;
671 half2* hr1 = reinterpret_cast<half2*>(&r1);
672 half2* hr2 = reinterpret_cast<half2*>(&r2);
673 half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
674 half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
675 for (int i = 0; i < packet_width / 2; i++) {
676 // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
677 union { int i; half2 h; } wka_in1, wka_out1;
678 wka_in1.h = rv1[i];
679 wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
680 hr1[i] = wka_out1.h;
681
682 union { int i; half2 h; } wka_in2, wka_out2;
683 wka_in2.h = rv2[i];
684 wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
685 hr2[i] = wka_out2.h;
686 }
687 reducer.reducePacket(r1, &reduced_val1);
688 reducer.reducePacket(r2, &reduced_val2);
689 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
690 PacketType r1;
691 PacketType r2;
692 half2* hr1 = reinterpret_cast<half2*>(&r1);
693 half2* hr2 = reinterpret_cast<half2*>(&r2);
694 half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
695 half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
696 for (int i = 0; i < packet_width / 2; i++) {
697 hr1[i] = __shfl_down(rv1[i], offset, warpSize);
698 hr2[i] = __shfl_down(rv2[i], offset, warpSize);
699 }
700 reducer.reducePacket(r1, &reduced_val1);
701 reducer.reducePacket(r2, &reduced_val2);
702 #else
703 PacketType r1;
704 PacketType r2;
705 half2* hr1 = reinterpret_cast<half2*>(&r1);
706 half2* hr2 = reinterpret_cast<half2*>(&r2);
707 half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
708 half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
709 for (int i = 0; i < packet_width / 2; i++) {
710 hr1[i] =
711 __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
712 hr2[i] =
713 __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
714 }
715 reducer.reducePacket(r1, &reduced_val1);
716 reducer.reducePacket(r2, &reduced_val2);
717
718 #endif
719 }
720 half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
721 half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
722 half2 val;
723 if (packet_width > 2) {
724 reducer.reducePacket(rv1[2], rv1);
725 reducer.reducePacket(rv1[3], rv1 + 1);
726 reducer.reducePacket(rv1[1], rv1);
727 reducer.reducePacket(rv2[2], rv2);
728 reducer.reducePacket(rv2[3], rv2 + 1);
729 reducer.reducePacket(rv2[1], rv2);
730 }
731 half val1 = __low2half(*rv1);
732 reducer.reduce(__high2half(*rv1), &val1);
733 half val2 = __low2half(*rv2);
734 reducer.reduce(__high2half(*rv2), &val2);
735 val = __halves2half2(val1, val2);
736 if ((threadIdx.x & (warpSize - 1)) == 0) {
737 half* loc = output + row;
738 atomicReduce((half2*)loc, val, reducer);
739 }
740 }
741 }
742}
743
744#endif // EIGEN_HAS_GPU_FP16
745
746template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
747struct InnerReductionLauncher {
748 static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
749 gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
750 return true;
751 }
752};
753
754// Specialization for float and double
755template <typename Self, typename Op, typename OutputType, bool PacketAccess>
756struct InnerReductionLauncher<
757 Self, Op, OutputType, PacketAccess,
758 typename internal::enable_if<
759 internal::is_same<float, OutputType>::value ||
760 internal::is_same<double, OutputType>::value,
761 void>::type> {
762 static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
763 typedef typename Self::Index Index;
764
765 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
766 const int block_size = 256;
767 const int num_per_thread = 128;
768 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
769 const int max_blocks = device.getNumGpuMultiProcessors() *
770 device.maxGpuThreadsPerMultiProcessor() / block_size;
771 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
772
773 if (num_blocks > 1) {
774 // We initialize the outputs outside the reduction kernel when we can't be sure that there
775 // won't be race conditions between multiple thread blocks.
776 const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
777 const int max_blocks = device.getNumGpuMultiProcessors() *
778 device.maxGpuThreadsPerMultiProcessor() / 1024;
779 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
780 LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
781 num_blocks, 1024, 0, device, reducer.initialize(),
782 num_preserved_vals, output);
783 }
784
785 LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
786 num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
787
788 return false;
789 }
790};
791
792#ifdef EIGEN_HAS_GPU_FP16
793template <typename Self, typename Op>
794struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
795 static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
796 gpu_assert(false && "Should not be called since there is no packet accessor");
797 return true;
798 }
799};
800
801template <typename Self, typename Op>
802struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
803 static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
804 typedef typename Self::Index Index;
805
806 if (num_preserved_vals % 2 != 0) {
807 // Not supported yet, revert to the slower code path
808 return true;
809 }
810
811 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
812 const int block_size = /*256*/128;
813 const int num_per_thread = /*128*/64;
814 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
815 const int max_blocks = device.getNumGpuMultiProcessors() *
816 device.maxGpuThreadsPerMultiProcessor() / block_size;
817 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
818
819 if (num_blocks > 1) {
820 // We initialize the outputs outside the reduction kernel when we can't be sure that there
821 // won't be race conditions between multiple thread blocks.
822 LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
823 1, 1, 0, device, reducer, self, num_preserved_vals, output);
824 }
825
826 LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
827 num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
828
829 return false;
830 }
831};
832#endif // EIGEN_HAS_GPU_FP16
833
834
835template <typename Self, typename Op>
836struct InnerReducer<Self, Op, GpuDevice> {
837 // Unfortunately nvidia doesn't support exotic types such as complex well,
838 // so reduce the scope of the optimized version of the code to the simple case
839 // of floats and half floats.
840#ifdef EIGEN_HAS_GPU_FP16
841 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
842     (internal::is_same<typename Self::CoeffReturnType, float>::value ||
843      internal::is_same<typename Self::CoeffReturnType, double>::value ||
844      (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
845#else // EIGEN_HAS_GPU_FP16
846 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
847     (internal::is_same<typename Self::CoeffReturnType, float>::value ||
848      internal::is_same<typename Self::CoeffReturnType, double>::value);
849#endif // EIGEN_HAS_GPU_FP16
850
851 template <typename OutputType>
852 static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
853 gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
854 const Index num_coeffs = array_prod(self.m_impl.dimensions());
855 // Don't crash when we're called with an input tensor of size 0.
856 if (num_coeffs == 0) {
857 return true;
858 }
859 // It's faster to use the usual code.
860 if (num_coeffs_to_reduce <= 128) {
861 return true;
862 }
863
864 return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
865 }
866};
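// A minimal sketch (editorial addition, compiled out via #if 0) of a user-level reduction
// over the innermost dimension, which is the case InnerReducer::run above handles. Assumes
// the default column-major layout, where dimension 0 is innermost, and the same hypothetical
// device setup as the earlier sketch.
#if 0
#include <unsupported/Eigen/CXX11/Tensor>

void inner_reduction_example(float* gpu_in /* 1000 x 128 floats */, float* gpu_out /* 128 floats */) {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in, 1000, 128);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(gpu_out, 128);

  // Reduce the 1000 contiguous coefficients of dimension 0 for each of the
  // 128 preserved coefficients of dimension 1.
  Eigen::array<int, 1> reduce_dim = {{0}};
  out.device(gpu_device) = in.sum(reduce_dim);
}
#endif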
867
868template <int NumPerThread, typename Self,
869 typename Reducer, typename Index>
870__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
871 typename Self::CoeffReturnType* output) {
872 const Index num_threads = blockDim.x * gridDim.x;
873 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
874 // Initialize the output values if they weren't initialized by the ReductionInitKernel
875 if (gridDim.x == 1) {
876 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
877 output[i] = reducer.initialize();
878 }
879 __syncthreads();
880 }
881
882 // Do the reduction.
883 const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
884 for (Index i = thread_id; i < max_iter; i += num_threads) {
885 const Index input_col = i % num_preserved_coeffs;
886 const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
887 typename Self::CoeffReturnType reduced_val = reducer.initialize();
888 const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
889 for (Index j = input_row; j < max_row; j++) {
890 typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
891 reducer.reduce(val, &reduced_val);
892 }
893 atomicReduce(&(output[input_col]), reduced_val, reducer);
894 }
895}
896
897
898template <typename Self, typename Op>
899struct OuterReducer<Self, Op, GpuDevice> {
900 // Unfortunately nvidia doesn't support exotic types such as complex well,
901 // so reduce the scope of the optimized version of the code to the simple case
902 // of floats.
903 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
904     (internal::is_same<typename Self::CoeffReturnType, float>::value ||
905      internal::is_same<typename Self::CoeffReturnType, double>::value);
906 template <typename Device, typename OutputType>
907 static
908 #if !defined(EIGEN_HIPCC)
909 // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
910 // (in the cxx11_tensor_reduction_gpu test)
911 //
912 // terminate called after throwing an instance of 'std::runtime_error'
913 // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
914 //
915 // don't know why this happens (and why is it a runtime error instead of a compile time error)
916 //
917 // this will be fixed by HIP PR#457
918 EIGEN_DEVICE_FUNC
919 #endif
920 bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
921 gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
922 return true;
923 }
924
925 static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
926 typedef typename Self::Index Index;
927
928 // It's faster to use the usual code.
929 if (num_coeffs_to_reduce <= 32) {
930 return true;
931 }
932
933 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
934 const int block_size = 256;
935 const int num_per_thread = 16;
936 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
937 const int max_blocks = device.getNumGpuMultiProcessors() *
938 device.maxGpuThreadsPerMultiProcessor() / block_size;
939 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
940
941 if (num_blocks > 1) {
942 // We initialize the outputs outside the reduction kernel when we can't be sure that there
943 // won't be race conditions between multiple thread blocks.
944 const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
945 const int max_blocks = device.getNumGpuMultiProcessors() *
946 device.maxGpuThreadsPerMultiProcessor() / 1024;
947 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
948 LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
949 num_blocks, 1024, 0, device, reducer.initialize(),
950 num_preserved_vals, output);
951 }
952
953 LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
954 num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
955
956 return false;
957 }
958};
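// A minimal sketch (editorial addition, compiled out via #if 0) of a user-level reduction
// over the outermost dimension, the case OuterReducer::run above handles. Assumes the
// default column-major layout, where the last dimension is outermost, and the same
// hypothetical device setup as the earlier sketches.
#if 0
#include <unsupported/Eigen/CXX11/Tensor>

void outer_reduction_example(float* gpu_in /* 128 x 1000 floats */, float* gpu_out /* 128 floats */) {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in, 128, 1000);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(gpu_out, 128);

  // Reduce dimension 1 (strided accesses with stride 128), preserving dimension 0.
  Eigen::array<int, 1> reduce_dim = {{1}};
  out.device(gpu_device) = in.sum(reduce_dim);
}
#endif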
959
960#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
961
962
963} // end namespace internal
964} // end namespace Eigen
965
966#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H