TensorConvolution.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12
13namespace Eigen {
14
15 /** \class TensorConvolution
16  * \ingroup CXX11_Tensor_Module
17  *
18  * \brief Tensor convolution class.
19  *
20  *
21  */
22 namespace internal {
23
24 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
25 class IndexMapper {
26 public:
27 IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
28 const array<Index, NumKernelDims>& indices) {
29
30 array<Index, NumDims> dimensions = input_dims;
31 for (int i = 0; i < NumKernelDims; ++i) {
32 const Index index = indices[i];
33 const Index input_dim = input_dims[index];
34 const Index kernel_dim = kernel_dims[i];
35 const Index result_dim = input_dim - kernel_dim + 1;
36 dimensions[index] = result_dim;
37 }
38
39 array<Index, NumDims> inputStrides;
40 array<Index, NumDims> outputStrides;
41 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
42 inputStrides[0] = 1;
43 outputStrides[0] = 1;
44 for (int i = 1; i < NumDims; ++i) {
45 inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
46 outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
47 }
48 } else {
49 inputStrides[NumDims - 1] = 1;
50 outputStrides[NumDims - 1] = 1;
51 for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
52 inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
53 outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
54 }
55 }
56
57 array<Index, NumDims> gpuInputDimensions;
58 array<Index, NumDims> gpuOutputDimensions;
59 array<Index, NumDims> tmp = dimensions;
60 array<Index, NumDims> ordering;
61 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
62 ? 0
63 : NumDims - NumKernelDims;
64 for (int i = 0; i < NumKernelDims; ++i) {
65 const Index index = i + offset;
66 ordering[index] = indices[i];
67 tmp[indices[i]] = -1;
68 gpuInputDimensions[index] = input_dims[indices[i]];
69 gpuOutputDimensions[index] = dimensions[indices[i]];
70 }
71
72 int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
73 ? NumKernelDims
74 : 0;
75 for (int i = 0; i < NumDims; ++i) {
76 if (tmp[i] >= 0) {
77 ordering[written] = i;
78 gpuInputDimensions[written] = input_dims[i];
79 gpuOutputDimensions[written] = dimensions[i];
80 ++written;
81 }
82 }
83
84 for (int i = 0; i < NumDims; ++i) {
85 m_inputStrides[i] = inputStrides[ordering[i]];
86 m_outputStrides[i] = outputStrides[ordering[i]];
87 }
88
89 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
90 for (int i = 0; i < NumDims; ++i) {
91 if (i > NumKernelDims) {
92 m_gpuInputStrides[i] =
93 m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
94 m_gpuOutputStrides[i] =
95 m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
96 } else {
97 m_gpuInputStrides[i] = 1;
98 m_gpuOutputStrides[i] = 1;
99 }
100 }
101 } else {
102 for (int i = NumDims - 1; i >= 0; --i) {
103 if (static_cast<size_t>(i + 1) < offset) {
104 m_gpuInputStrides[i] =
105 m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
106 m_gpuOutputStrides[i] =
107 m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
108 } else {
109 m_gpuInputStrides[i] = 1;
110 m_gpuOutputStrides[i] = 1;
111 }
112 }
113 }
114 }
115
116 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
117 Index inputIndex = 0;
118 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
119 for (int d = NumDims - 1; d > NumKernelDims; --d) {
120 const Index idx = p / m_gpuInputStrides[d];
121 inputIndex += idx * m_inputStrides[d];
122 p -= idx * m_gpuInputStrides[d];
123 }
124 inputIndex += p * m_inputStrides[NumKernelDims];
125 } else {
126 std::ptrdiff_t limit = 0;
127 if (NumKernelDims < NumDims) {
128 limit = NumDims - NumKernelDims - 1;
129 }
130 for (int d = 0; d < limit; ++d) {
131 const Index idx = p / m_gpuInputStrides[d];
132 inputIndex += idx * m_inputStrides[d];
133 p -= idx * m_gpuInputStrides[d];
134 }
135 inputIndex += p * m_inputStrides[limit];
136 }
137 return inputIndex;
138 }
139
140 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
141 Index outputIndex = 0;
142 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
143 for (int d = NumDims - 1; d > NumKernelDims; --d) {
144 const Index idx = p / m_gpuOutputStrides[d];
145 outputIndex += idx * m_outputStrides[d];
146 p -= idx * m_gpuOutputStrides[d];
147 }
148 outputIndex += p * m_outputStrides[NumKernelDims];
149 } else {
150 std::ptrdiff_t limit = 0;
151 if (NumKernelDims < NumDims) {
152 limit = NumDims - NumKernelDims - 1;
153 }
154 for (int d = 0; d < limit; ++d) {
155 const Index idx = p / m_gpuOutputStrides[d];
156 outputIndex += idx * m_outputStrides[d];
157 p -= idx * m_gpuOutputStrides[d];
158 }
159 outputIndex += p * m_outputStrides[limit];
160 }
161 return outputIndex;
162 }
163
164 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
165 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
166 ? 0
167 : NumDims - NumKernelDims;
168 return i * m_inputStrides[offset];
169 }
170
171 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
172 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
173 ? 0
174 : NumDims - NumKernelDims;
175 return i * m_outputStrides[offset];
176 }
177
178 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
179 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
180 ? 0
181 : NumDims - NumKernelDims;
182 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
183 }
184
185 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
186 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
187 ? 0
188 : NumDims - NumKernelDims;
189 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
190 }
191
192 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
193 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
194 ? 0
195 : NumDims - NumKernelDims;
196 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
197 k * m_inputStrides[offset + 2];
198 }
199
200 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
201 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
202 ? 0
203 : NumDims - NumKernelDims;
204 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
205 k * m_outputStrides[offset + 2];
206 }
207
208 private:
209 static const int NumDims = internal::array_size<InputDims>::value;
210 array<Index, NumDims> m_inputStrides;
211 array<Index, NumDims> m_outputStrides;
212 array<Index, NumDims> m_gpuInputStrides;
213 array<Index, NumDims> m_gpuOutputStrides;
214};
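// Illustrative sketch, not from upstream Eigen: a small worked example of the
// dimension rule used in the constructor above, with hypothetical sizes. For each
// convolved dimension the mapper stores result_dim = input_dim - kernel_dim + 1
// ("valid" convolution, no padding):
//
//   input_dims  = {10, 8}, kernel_dims = {3, 3}, indices = {0, 1}
//   dimensions[0] = 10 - 3 + 1 = 8
//   dimensions[1] =  8 - 3 + 1 = 6
//
// Dimensions not listed in `indices` keep their input size, and the input/output
// strides are rebuilt from these sizes in the chosen Layout.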
215
216
217
218 template<typename Dimensions, typename InputXprType, typename KernelXprType>
219 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
220 {
221 // Type promotion to handle the case where the types of the lhs and the rhs are different.
222 typedef typename promote_storage_type<typename InputXprType::Scalar,
223 typename KernelXprType::Scalar>::ret Scalar;
224 typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
225 typename traits<KernelXprType>::StorageKind>::ret StorageKind;
226 typedef typename promote_index_type<typename traits<InputXprType>::Index,
227 typename traits<KernelXprType>::Index>::type Index;
228 typedef typename InputXprType::Nested LhsNested;
229 typedef typename KernelXprType::Nested RhsNested;
230 typedef typename remove_reference<LhsNested>::type _LhsNested;
231 typedef typename remove_reference<RhsNested>::type _RhsNested;
232 static const int NumDimensions = traits<InputXprType>::NumDimensions;
233 static const int Layout = traits<InputXprType>::Layout;
234 typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
235 typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
236
237 enum {
238 Flags = 0
239 };
240};
241
242 template<typename Dimensions, typename InputXprType, typename KernelXprType>
243 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
244 {
245 typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
246 };
247
248 template<typename Dimensions, typename InputXprType, typename KernelXprType>
249 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
250 {
251 typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
252 };
253
254} // end namespace internal
255
256
257
258template<typename Indices, typename InputXprType, typename KernelXprType>
259class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
260{
261 public:
262 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
263 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
264 typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
265 typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
266 typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
267 typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
268 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
269
270 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
271 : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
272
273 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
274 const Indices& indices() const { return m_indices; }
275
276 /** \returns the nested expressions */
277 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
278 const typename internal::remove_all<typename InputXprType::Nested>::type&
279 inputExpression() const { return m_input_xpr; }
280
281 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
282 const typename internal::remove_all<typename KernelXprType::Nested>::type&
283 kernelExpression() const { return m_kernel_xpr; }
284
285 protected:
286 typename InputXprType::Nested m_input_xpr;
287 typename KernelXprType::Nested m_kernel_xpr;
288 const Indices m_indices;
289};
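// Illustrative usage sketch, not from upstream Eigen: TensorConvolutionOp is
// normally created through TensorBase::convolve() rather than constructed
// directly. The tensor shapes below are hypothetical.
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 3> input(20, 30, 50);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   input.setRandom();
//   kernel.setRandom();
//
//   Eigen::array<ptrdiff_t, 2> dims({0, 1});   // convolve along dimensions 0 and 1
//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
//   // output dimensions: (20-3+1, 30-3+1, 50) = (18, 28, 50)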
290
291
292template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
293struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
294 {
295 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
296
297 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
298 static const int NumKernelDims = internal::array_size<Indices>::value;
299 typedef typename XprType::Index Index;
300 typedef DSizes<Index, NumDims> Dimensions;
301
302 typedef typename XprType::Scalar Scalar;
303 typedef typename XprType::CoeffReturnType CoeffReturnType;
304 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
305 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
306 typedef StorageMemory<Scalar, Device> Storage;
307 typedef typename Storage::Type EvaluatorPointerType;
308
309 enum {
310 IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
311 PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
312 BlockAccess = false,
313 PreferBlockAccess = false,
314 Layout = TensorEvaluator<InputArgType, Device>::Layout,
315 CoordAccess = false, // to be implemented
316 RawAccess = false
317 };
318
319 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
320 typedef internal::TensorBlockNotImplemented TensorBlock;
321 //===--------------------------------------------------------------------===//
322
323 EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
324 : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
325 {
326 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
327
328 const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
329 const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
330
331 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
332 m_inputStride[0] = 1;
333 for (int i = 1; i < NumDims; ++i) {
334 m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
335 }
336 } else {
337 m_inputStride[NumDims - 1] = 1;
338 for (int i = NumDims - 2; i >= 0; --i) {
339 m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
340 }
341 }
342
343 m_dimensions = m_inputImpl.dimensions();
344 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
345 for (int i = 0; i < NumKernelDims; ++i) {
346 const Index index = op.indices()[i];
347 const Index input_dim = input_dims[index];
348 const Index kernel_dim = kernel_dims[i];
349 const Index result_dim = input_dim - kernel_dim + 1;
350 m_dimensions[index] = result_dim;
351 if (i > 0) {
352 m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
353 } else {
354 m_kernelStride[0] = 1;
355 }
356 m_indexStride[i] = m_inputStride[index];
357 }
358
359 m_outputStride[0] = 1;
360 for (int i = 1; i < NumDims; ++i) {
361 m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
362 }
363 } else {
364 for (int i = NumKernelDims - 1; i >= 0; --i) {
365 const Index index = op.indices()[i];
366 const Index input_dim = input_dims[index];
367 const Index kernel_dim = kernel_dims[i];
368 const Index result_dim = input_dim - kernel_dim + 1;
369 m_dimensions[index] = result_dim;
370 if (i < NumKernelDims - 1) {
371 m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
372 } else {
373 m_kernelStride[NumKernelDims - 1] = 1;
374 }
375 m_indexStride[i] = m_inputStride[index];
376 }
377
378 m_outputStride[NumDims - 1] = 1;
379 for (int i = NumDims - 2; i >= 0; --i) {
380 m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
381 }
382 }
383 }
384
385 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
386
387 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
388 m_inputImpl.evalSubExprsIfNeeded(NULL);
389 preloadKernel();
390 return true;
391 }
392 EIGEN_STRONG_INLINE void cleanup() {
393 m_inputImpl.cleanup();
394 if (m_local_kernel) {
395 m_device.deallocate((void*)m_kernel);
396 m_local_kernel = false;
397 }
398 m_kernel = NULL;
399 }
400
401 void evalTo(typename XprType::Scalar* buffer) {
402 evalSubExprsIfNeeded(NULL);
403 for (int i = 0; i < dimensions().TotalSize(); ++i) {
404 buffer[i] += coeff(i);
405 }
406 cleanup();
407 }
408
409 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
410 {
411 CoeffReturnType result = CoeffReturnType(0);
412 convolve(firstInput(index), 0, NumKernelDims-1, result);
413 return result;
414 }
415
416 template<int LoadMode>
417 EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
418 {
419 Index indices[2] = {index, index+PacketSize-1};
420 Index startInputs[2] = {0, 0};
421 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
422 for (int i = NumDims - 1; i > 0; --i) {
423 const Index idx0 = indices[0] / m_outputStride[i];
424 const Index idx1 = indices[1] / m_outputStride[i];
425 startInputs[0] += idx0 * m_inputStride[i];
426 startInputs[1] += idx1 * m_inputStride[i];
427 indices[0] -= idx0 * m_outputStride[i];
428 indices[1] -= idx1 * m_outputStride[i];
429 }
430 } else {
431 for (int i = 0; i < NumDims - 1; ++i) {
432 const Index idx0 = indices[0] / m_outputStride[i];
433 const Index idx1 = indices[1] / m_outputStride[i];
434 startInputs[0] += idx0 * m_inputStride[i];
435 startInputs[1] += idx1 * m_inputStride[i];
436 indices[0] -= idx0 * m_outputStride[i];
437 indices[1] -= idx1 * m_outputStride[i];
438 }
439 }
440 startInputs[0] += indices[0];
441 startInputs[1] += indices[1];
442
443 if (startInputs[1]-startInputs[0] == PacketSize-1) {
444 PacketReturnType result = internal::pset1<PacketReturnType>(0);
445 convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
446 return result;
447 } else {
448 EIGEN_ALIGN_MAX Scalar data[PacketSize];
449 data[0] = Scalar(0);
450 convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
451 for (int i = 1; i < PacketSize-1; ++i) {
452 data[i] = Scalar(0);
453 convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
454 }
455 data[PacketSize-1] = Scalar(0);
456 convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
457 return internal::pload<PacketReturnType>(data);
458 }
459 }
460
461 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
462 costPerCoeff(bool vectorized) const {
463 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
464 // We ignore the use of fused multiply-add.
465 const double convolve_compute_cost =
466 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
467 const double firstIndex_compute_cost =
468 NumDims *
469 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
470 TensorOpCost::DivCost<Index>());
471 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
472 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
473 m_kernelImpl.costPerCoeff(vectorized) +
474 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
475 PacketSize));
476 }
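// Illustrative note, not from upstream Eigen: as a concrete reading of the cost
// model above, a hypothetical 3x3 kernel gives kernel_size = 9, so each output
// coefficient is charged roughly 9 * (AddCost + MulCost) of compute plus the
// per-output index arithmetic (firstIndex_compute_cost), on top of the cost of
// reading 9 input and 9 kernel coefficients.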
477
478 EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
479
480 private:
481 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
482 Index startInput = 0;
483 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
484 for (int i = NumDims - 1; i > 0; --i) {
485 const Index idx = index / m_outputStride[i];
486 startInput += idx * m_inputStride[i];
487 index -= idx * m_outputStride[i];
488 }
489 } else {
490 for (int i = 0; i < NumDims - 1; ++i) {
491 const Index idx = index / m_outputStride[i];
492 startInput += idx * m_inputStride[i];
493 index -= idx * m_outputStride[i];
494 }
495 }
496 startInput += index;
497 return startInput;
498 }
499
500 EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
501 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
502 const Index input = firstIndex + j * m_indexStride[DimIndex];
503 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
504 if (DimIndex > 0) {
505 convolve(input, kernel, DimIndex-1, accum);
506 } else {
507 accum += m_inputImpl.coeff(input) * m_kernel[kernel];
508 }
509 }
510 }
511
512 template <typename Packet>
513 EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
514 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
515 const Index input = firstIndex + j * m_indexStride[DimIndex];
516 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
517 if (DimIndex > 0) {
518 convolvePacket(input, kernel, DimIndex-1, accum);
519 } else {
520 accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
521 }
522 }
523 }
524
525 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
526 // Don't make a local copy of the kernel unless we have to (i.e. it's an
527 // expression that needs to be evaluated)
528 const Scalar* in_place = m_kernelImpl.data();
529 if (in_place) {
530 m_kernel = in_place;
531 m_local_kernel = false;
532 } else {
533 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
534 Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
535 typedef TensorEvalToOp<const KernelArgType> EvalTo;
536 EvalTo evalToTmp(local, m_kernelArg);
537 const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
538 internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
539
540 m_kernel = local;
541 m_local_kernel = true;
542 }
543 }
544
545 array<Index, NumDims> m_inputStride;
546 array<Index, NumDims> m_outputStride;
547
548 array<Index, NumKernelDims> m_indexStride;
549 array<Index, NumKernelDims> m_kernelStride;
550 TensorEvaluator<InputArgType, Device> m_inputImpl;
551 TensorEvaluator<KernelArgType, Device> m_kernelImpl;
552 Dimensions m_dimensions;
553
554 KernelArgType m_kernelArg;
555 const Scalar* m_kernel;
556 bool m_local_kernel;
557 const Device EIGEN_DEVICE_REF m_device;
558};
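// Illustrative sketch, not from upstream Eigen: for NumKernelDims == 2, the
// recursive convolve() above unrolls to the following equivalent loop nest, where
// `first_input` stands for firstInput(index) and `kdims` for
// m_kernelImpl.dimensions() (both names are placeholders for this sketch):
//
//   CoeffReturnType accum = CoeffReturnType(0);
//   for (Index j1 = 0; j1 < kdims[1]; ++j1) {
//     for (Index j0 = 0; j0 < kdims[0]; ++j0) {
//       const Index input  = first_input + j0 * m_indexStride[0] + j1 * m_indexStride[1];
//       const Index kernel = j0 * m_kernelStride[0] + j1 * m_kernelStride[1];
//       accum += m_inputImpl.coeff(input) * m_kernel[kernel];
//     }
//   }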
559
560
561
562
563// Use an optimized implementation of the evaluation code for GPUs whenever possible.
564#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
565
566template <int StaticKernelSize>
567struct GetKernelSize {
568 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
569 return StaticKernelSize;
570 }
571};
572template <>
573struct GetKernelSize<Dynamic> {
574 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
575 return kernelSize;
576 }
577};
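// Illustrative note, not from upstream Eigen: GetKernelSize lets the kernels below
// use a compile-time trip count when the kernel size is known statically, so the
// #pragma unroll loops can be fully unrolled, and falls back to the runtime value
// otherwise:
//
//   GetKernelSize<4>()(123)     == 4   // static size wins; the runtime argument is ignored
//   GetKernelSize<Dynamic>()(7) == 7   // dynamic case forwards the runtime size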
578
579template <typename InputEvaluator, typename Index, typename InputDims,
580 int StaticKernelSize>
581__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
582 InputEvaluator eval,
583 const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
584 indexMapper,
585 const float* __restrict kernel, const int numPlanes, const int numX,
586 const int maxX, const int kernelSize, float* buffer) {
587#if defined(EIGEN_HIPCC)
588 HIP_DYNAMIC_SHARED(float, s)
589#else
590 extern __shared__ float s[];
591#endif
592
593 const int first_x = blockIdx.x * maxX;
594 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
595 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
596 const int num_x_output = last_x - first_x + 1;
597
598 const int first_plane = blockIdx.y * blockDim.y;
599 const int plane_stride = blockDim.y * gridDim.y;
600
601 for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
602 // Load inputs to shared memory
603 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
604 const int plane_kernel_offset = threadIdx.y * num_x_input;
605 #pragma unroll
606 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
607 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
608 s[i + plane_kernel_offset] = eval.coeff(tensor_index);
609 }
610
611 __syncthreads();
612
613 // Compute the convolution
614 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
615
616 #pragma unroll
617 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
618 const int kernel_offset = plane_kernel_offset + i;
619 float result = 0.0f;
620 #pragma unroll
621 for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
622 result += s[k + kernel_offset] * kernel[k];
623 }
624 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
625 buffer[tensor_index] = result;
626 }
627 __syncthreads();
628 }
629};
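// Illustrative note, not from upstream Eigen: each block of the 1D kernel above
// computes a tile of up to maxX outputs per plane, but stages
// num_x_input = (last_x - first_x) + kernelSize inputs in shared memory, because
// every output needs kernelSize neighbouring inputs (a halo of kernelSize - 1).
// With hypothetical values maxX = 128 and kernelSize = 7, an unclipped tile loads
// 134 floats per plane and writes 128 results.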
630
631template <typename InputEvaluator, typename Index, typename InputDims,
632 int StaticKernelSizeX, int StaticKernelSizeY>
633__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
634 InputEvaluator eval,
635 const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
636 indexMapper,
637 const float* __restrict kernel, const int numPlanes, const int numX,
638 const int maxX, const int numY, const int maxY, const int kernelSizeX,
639 const int kernelSizeY, float* buffer) {
640#if defined(EIGEN_HIPCC)
641 HIP_DYNAMIC_SHARED(float, s)
642#else
643 extern __shared__ float s[];
644#endif
645
646 const int first_x = blockIdx.x * maxX;
647 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
648 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
649 const int num_x_output = last_x - first_x + 1;
650
651 const int first_y = blockIdx.y * maxY;
652 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
653 const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
654 const int num_y_output = last_y - first_y + 1;
655
656 const int first_plane = blockIdx.z * blockDim.z;
657 const int plane_stride = blockDim.z * gridDim.z;
658
659 for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
660
661 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
662 const int plane_kernel_offset = threadIdx.z * num_y_input;
663
664 // Load inputs to shared memory
665 #pragma unroll
666 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
667 const int input_offset = num_x_input * (j + plane_kernel_offset);
668 #pragma unroll
669 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
670 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
671 s[i + input_offset] = eval.coeff(tensor_index);
672 }
673 }
674
675 __syncthreads();
676
677 // Convolution
678 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
679
680 #pragma unroll
681 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
682 #pragma unroll
683 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
684 float result = 0.0f;
685 #pragma unroll
686 for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
687 const int kernel_offset = kernelSizeX * l;
688 const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
689 #pragma unroll
690 for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
691 result += s[k + input_offset] * kernel[k + kernel_offset];
692 }
693 }
694 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
695 buffer[tensor_index] = result;
696 }
697 }
698
699 __syncthreads();
700 }
701};
702
703template <typename InputEvaluator, typename Index, typename InputDims>
704__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
705 InputEvaluator eval,
706 const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
707 indexMapper,
708 const float* __restrict kernel, const size_t numPlanes, const size_t numX,
709 const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
710 const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
711 const size_t kernelSizeZ, float* buffer) {
712#if defined(EIGEN_HIPCC)
713 HIP_DYNAMIC_SHARED(float, s)
714#else
715 extern __shared__ float s[];
716#endif
717
718 // Load inputs to shared memory
719 const int first_x = blockIdx.x * maxX;
720 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
721 const int num_x_input = last_x - first_x + kernelSizeX;
722
723 const int first_y = blockIdx.y * maxY;
724 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
725 const int num_y_input = last_y - first_y + kernelSizeY;
726
727 const int first_z = blockIdx.z * maxZ;
728 const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
729 const int num_z_input = last_z - first_z + kernelSizeZ;
730
731 for (int p = 0; p < numPlanes; ++p) {
732
733 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
734 const int plane_kernel_offset = 0;
735
736 for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
737 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
738 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
739 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
740 s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
741 }
742 }
743 }
744
745 __syncthreads();
746
747 // Convolution
748 const int num_z_output = last_z - first_z + 1;
749 const int num_y_output = last_y - first_y + 1;
750 const int num_x_output = last_x - first_x + 1;
751 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
752
753 for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
754 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
755 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
756 float result = 0.0f;
757 for (int n = 0; n < kernelSizeZ; ++n) {
758 for (int m = 0; m < kernelSizeY; ++m) {
759 for (int l = 0; l < kernelSizeX; ++l) {
760 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
761 }
762 }
763 }
764 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
765 buffer[tensor_index] = result;
766 }
767 }
768 }
769 __syncthreads();
770 }
771};
772
773
774
775template<typename Indices, typename InputArgType, typename KernelArgType>
776struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
777{
778 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
779
780 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
781 static const int NumKernelDims = internal::array_size<Indices>::value;
782 typedef typename XprType::Index Index;
783 typedef DSizes<Index, NumDims> Dimensions;
784 typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
785
786 enum {
787 IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
788 PacketAccess = false,
789 BlockAccess = false,
790 PreferBlockAccess = false,
791 Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
792 CoordAccess = false, // to be implemented
793 RawAccess = false
794 };
795
796 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
797 typedef internal::TensorBlockNotImplemented TensorBlock;
798 //===--------------------------------------------------------------------===//
799
800 TensorEvaluator(const XprType& op, const GpuDevice& device)
801 : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
802 {
803 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
804
805 const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
806 const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
807
808 m_dimensions = m_inputImpl.dimensions();
809 for (int i = 0; i < NumKernelDims; ++i) {
810 const Index index = op.indices()[i];
811 const Index input_dim = input_dims[index];
812 const Index kernel_dim = kernel_dims[i];
813 const Index result_dim = input_dim - kernel_dim + 1;
814 m_dimensions[index] = result_dim;
815 }
816 }
817
818 typedef typename XprType::CoeffReturnType CoeffReturnType;
819 typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
820 typedef typename InputArgType::Scalar Scalar;
821 static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
822
823 EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
824
825 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
826 preloadKernel();
827 m_inputImpl.evalSubExprsIfNeeded(NULL);
828 if (data) {
829 executeEval(data);
830 return false;
831 } else {
832 m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
833 executeEval(m_buf);
834 return true;
835 }
836 }
837
838 EIGEN_STRONG_INLINE void cleanup() {
839 m_inputImpl.cleanup();
840 if (m_buf) {
841 m_device.deallocate(m_buf);
842 m_buf = NULL;
843 }
844 if (m_local_kernel) {
845 m_device.deallocate((void*)m_kernel);
846 m_local_kernel = false;
847 }
848 m_kernel = NULL;
849 }
850
851 EIGEN_STRONG_INLINE void preloadKernel() {
852 // Don't make a local copy of the kernel unless we have to (i.e. it's an
853 // expression that needs to be evaluated)
854 const Scalar* in_place = m_kernelImpl.data();
855 if (in_place) {
856 m_kernel = in_place;
857 m_local_kernel = false;
858 } else {
859 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
860 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
861 typedef TensorEvalToOp<const KernelArgType> EvalTo;
862 EvalTo evalToTmp(local, m_kernelArg);
863 const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
864 internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
865
866 m_kernel = local;
867 m_local_kernel = true;
868 }
869 }
870
871 static unsigned int ceil(unsigned int num, unsigned int denom) {
872 const unsigned int rounded_toward_zero = num / denom;
873 if (num > rounded_toward_zero * denom) {
874 return rounded_toward_zero + 1;
875 }
876 return rounded_toward_zero;
877 }
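// Illustrative note, not from upstream Eigen: this helper is an integer ceiling
// division, e.g. ceil(7, 2) == 4 and ceil(8, 2) == 4, so the block and grid counts
// computed from it always cover the full extent of each tiled dimension.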
878
879 void executeEval(Scalar* data) const {
880 typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
881
882 const int maxSharedMem = m_device.sharedMemPerBlock();
883 const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
884 const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
885 const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
886 const int warpSize = 32;
887
888 switch (NumKernelDims) {
889 case 1: {
890 const int kernel_size = m_kernelImpl.dimensions().TotalSize();
891
892 const int numX = dimensions()[m_indices[0]];
893 const int numP = dimensions().TotalSize() / numX;
894 int maxX;
895 dim3 block_size;
896
897 const int single_stride_dim =
898 static_cast<int>(Layout) == static_cast<int>(ColMajor)
899 ? 0
900 : m_inputImpl.dimensions().rank() - 1;
901 if (m_indices[0] == single_stride_dim) {
902 // Maximize the reuse
903 const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
904 maxX = numext::mini<int>(inner_dim, numX);
905 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
906 block_size.x = numext::mini(maxThreadsPerBlock, maxX);
907 block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
908 }
909 else {
910 // Read as much as possible alongside the innermost dimension, that is the plane
911 const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
912 const int maxP = numext::mini<int>(inner_dim, numP);
913 maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
914
915 block_size.x = numext::mini(warpSize, maxX);
916 block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
917 }
918
919 const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
920 gpu_assert(shared_mem <= maxSharedMem);
921
922 const int num_x_blocks = ceil(numX, maxX);
923 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
924 const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
925
926 dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
927
928
929 //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
930
931 const array<Index, 1> indices(m_indices[0]);
932 const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
933 internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
934 m_inputImpl.dimensions(), kernel_dims, indices);
935 switch(kernel_size) {
936 case 4: {
937 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
938 break;
939 }
940 case 7: {
941 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
942 break;
943 }
944 default: {
945 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
946 }
947 }
948 break;
949 }
950
951 case 2: {
952 const int idxX =
953 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
954 const int idxY =
955 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
956 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
957 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
958
959 const int numX = dimensions()[m_indices[idxX]];
960 const int numY = dimensions()[m_indices[idxY]];
961 const int numP = dimensions().TotalSize() / (numX*numY);
962
963 const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
964
965 // Snap maxX to warp size
966 int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
967 const int maxX = numext::mini<int>(inner_dim, numX);
968 const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
969 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
970
971 dim3 block_size;
972 block_size.x = numext::mini(1024, maxX);
973 block_size.y = numext::mini<int>(1024/block_size.x, maxY);
974 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
975
976 const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
977 gpu_assert(shared_mem <= maxSharedMem);
978
979 const int num_x_blocks = ceil(numX, maxX);
980 const int num_y_blocks = ceil(numY, maxY);
981 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
982 const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
983
984 dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
985
986
987 //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
988
989 const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
990 const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
991 m_kernelImpl.dimensions()[idxY]);
992 internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
993 m_inputImpl.dimensions(), kernel_dims, indices);
994 switch (kernel_size_x) {
995 case 4: {
996 switch (kernel_size_y) {
997 case 7: {
998 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
999 break;
1000 }
1001 default: {
1002 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
1003 break;
1004 }
1005 }
1006 break;
1007 }
1008 case 7: {
1009 switch (kernel_size_y) {
1010 case 4: {
1011 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
1012 break;
1013 }
1014 default: {
1015 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
1016 break;
1017 }
1018 }
1019 break;
1020 }
1021 default: {
1022 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1023 break;
1024 }
1025 }
1026 break;
1027 }
1028
1029 case 3: {
1030 const int idxX =
1031 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1032 const int idxY =
1033 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1034 const int idxZ =
1035 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1036
1037 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1038 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1039 const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1040
1041 const int numX = dimensions()[m_indices[idxX]];
1042 const int numY = dimensions()[m_indices[idxY]];
1043 const int numZ = dimensions()[m_indices[idxZ]];
1044 const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1045
1046 const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1047 const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1048 const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1049
1050 dim3 block_size;
1051 block_size.x = numext::mini(32, maxX);
1052 block_size.y = numext::mini(32, maxY);
1053 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1054 dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1055
1056 const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1057 gpu_assert(shared_mem <= maxSharedMem);
1058
1059 //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1060 const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1061 m_indices[idxZ]);
1062 const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1063 m_kernelImpl.dimensions()[idxY],
1064 m_kernelImpl.dimensions()[idxZ]);
1065 internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1066 m_inputImpl.dimensions(), kernel_dims, indices);
1067
1068 LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1069 break;
1070 }
1071
1072 default: {
1073 EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1074 }
1075 }
1076 }
1077 }
1078 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1079 {
1080 eigen_assert(m_buf);
1081 eigen_assert(index < m_dimensions.TotalSize());
1082 return m_buf[index];
1083 }
1084
1085 template<int LoadMode>
1086 EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
1087 {
1088 eigen_assert(m_buf);
1089 eigen_assert(index < m_dimensions.TotalSize());
1090 return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1091 }
1092
1093 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1094 costPerCoeff(bool vectorized) const {
1095 // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1096 // model.
1097 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1098 // We ignore the use of fused multiply-add.
1099 const double convolve_compute_cost =
1100 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1101 const double firstIndex_compute_cost =
1102 NumDims *
1103 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1104 TensorOpCost::DivCost<Index>());
1105 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1106 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1107 m_kernelImpl.costPerCoeff(vectorized) +
1108 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1109 PacketSize));
1110 }
1111
1112 private:
1113 // No assignment (copies are needed by the kernels)
1114 TensorEvaluator& operator = (const TensorEvaluator&);
1115
1116 TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1117 TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1118 KernelArgType m_kernelArg;
1119 Indices m_indices;
1120 Dimensions m_dimensions;
1121 Scalar* m_buf;
1122 const Scalar* m_kernel;
1123 bool m_local_kernel;
1124
1125 const GpuDevice& m_device;
1126};
1127#endif
1128
1129
1130} // end namespace Eigen
1131
1132#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H