#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
namespace Eigen {

namespace internal {

template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {
    // Each convolved dimension shrinks to input_dim - kernel_dim + 1 ("valid" convolution).
    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    // Strides of the input and output tensors in their native layout.
    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }

    // Reorder the dimensions as seen from the GPU kernels: the convolved dimensions
    // come first for ColMajor tensors and last for RowMajor tensors.
    array<Index, NumDims> gpuInputDimensions;
    array<Index, NumDims> gpuOutputDimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      gpuInputDimensions[index] = input_dims[indices[i]];
      gpuOutputDimensions[index] = dimensions[indices[i]];
    }
    // The non-convolved dimensions are appended in their original relative order.
    for (int i = 0; i < NumDims; ++i) {
      // ... (bookkeeping for the remaining dimensions elided in this excerpt)
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    // Strides of the reordered dimensions, used to convert GPU plane/kernel indices
    // back into tensor offsets.
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (static_cast<size_t>(i + 1) < offset) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    }
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }

 private:
  static const int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_gpuInputStrides;
  array<Index, NumDims> m_gpuOutputStrides;
};
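// Illustrative example (not part of the original header): convolving dimension 1 of a
// ColMajor input of size 5x7x11 with a kernel of size 3 yields an output of size 5x5x11,
// since each convolved dimension shrinks to input_dim - kernel_dim + 1 = 7 - 3 + 1 = 5.
// For RowMajor tensors the convolved dimensions are mapped to the last NumKernelDims
// positions of the GPU ordering, hence offset = NumDims - NumKernelDims = 3 - 1 = 2 above.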
// Traits of the convolution expression: scalar type promotion, storage kind, index type,
// number of dimensions and layout are derived from the two operands.
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // ...
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
              typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

}  // end namespace internal
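// Usage sketch (illustrative, not part of the original header). The expression below is
// normally built through TensorBase::convolve(), which returns a TensorConvolutionOp
// performing a "valid" convolution over the listed dimensions:
//
//   Eigen::Tensor<float, 4> input(3, 3, 7, 11);
//   Eigen::Tensor<float, 2> kernel(2, 2);
//   Eigen::Tensor<float, 4> output(3, 2, 6, 11);
//   input.setRandom();
//   kernel.setRandom();
//
//   Eigen::array<ptrdiff_t, 2> dims({1, 2});  // convolve the second and third dimension
//   output = input.convolve(kernel, dims);    // sizes: 3 - 2 + 1 = 2 and 7 - 2 + 1 = 6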
template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
  public:
    // ... (typedefs for Scalar, CoeffReturnType, Index, etc. elided in this excerpt)

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all<typename InputXprType::Nested>::type& inputExpression() const { return m_input_xpr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all<typename KernelXprType::Nested>::type& kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  // ... (evaluator flags and block-access typedefs elided in this excerpt)

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims - 1, result);
    return result;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
    Index indices[2] = {index, index + PacketSize - 1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1] - startInputs[0] == PacketSize - 1) {
      // The whole packet maps to contiguous input locations: convolve it in one go.
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
      return result;
    } else {
      // Fall back to one scalar convolution per packet lane.
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
      for (int i = 1; i < PacketSize - 1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
      }
      data[PacketSize - 1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
      return internal::pload<PacketReturnType>(data);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add instructions here.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }

 private:
  // Maps an output coefficient index to the offset of the first input coefficient
  // that contributes to it.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }

  // Accumulates the convolution recursively, one kernel dimension at a time.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex - 1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }

  template <typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex - 1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }

  // Evaluates the kernel expression into a contiguous buffer unless it is already
  // materialized in memory.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      // ... (a TensorExecutor run over evalToTmp fills the buffer; elided in this excerpt)
      m_kernel = local;
      m_local_kernel = true;
    }
  }

  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;

  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device EIGEN_DEVICE_REF m_device;
};
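// For intuition (illustrative sketch, not the author's code): for a 2-D kernel the
// recursive convolve() above is equivalent to the following nested loops, where
// first_input is the offset returned by firstInput(index) and kernel_dims are the
// kernel's dimensions:
//
//   CoeffReturnType accum = CoeffReturnType(0);
//   for (Index j1 = 0; j1 < kernel_dims[1]; ++j1) {
//     for (Index j0 = 0; j0 < kernel_dims[0]; ++j0) {
//       const Index input  = first_input + j0 * m_indexStride[0] + j1 * m_indexStride[1];
//       const Index kernel = j0 * m_kernelStride[0] + j1 * m_kernelStride[1];
//       accum += m_inputImpl.coeff(input) * m_kernel[kernel];
//     }
//   }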
// Use an optimized GPU implementation of the evaluation whenever possible.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC int operator()(const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC int operator()(const int kernelSize) const {
    return kernelSize;
  }
};

template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load the inputs needed by this block into shared memory.
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution from shared memory.
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
      buffer[tensor_index] = result;
    }

    __syncthreads();
  }
}
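// Shared-memory footprint of the 1-D kernel above (illustrative arithmetic): a block that
// covers maxX outputs loads maxX + kernelSize - 1 inputs per plane, e.g. 128 outputs with a
// kernel of size 7 need 134 floats (536 bytes) per plane, times the block_size.y planes
// handled by the block. This matches the shared_mem value computed in executeEval() below.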
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load the inputs needed by this block into shared memory.
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Compute the convolution from shared memory.
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  // Load the inputs needed by this block into shared memory.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y, k + first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Compute the convolution from shared memory.
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y, k + first_z);
          buffer[tensor_index] = result;
        }
      }
    }

    __syncthreads();
  }
}
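// GPU usage sketch (illustrative; assumes the Tensor module's GpuDevice helpers and a
// translation unit compiled by a GPU compiler). The GpuDevice evaluator below launches
// the kernels above and materializes the whole result into a temporary device buffer:
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   // d_in, d_kernel and d_out are hypothetical device pointers wrapped in TensorMaps.
//   Eigen::TensorMap<Eigen::Tensor<float, 3> > in(d_in, 40, 50, 70);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel(d_kernel, 4);
//   Eigen::TensorMap<Eigen::Tensor<float, 3> > out(d_out, 37, 50, 70);  // 40 - 4 + 1 = 37
//   Eigen::array<ptrdiff_t, 1> dims({0});
//   out.device(gpu_device) = in.convolve(kernel, dims);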
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename InputArgType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  // ... (evaluator flags elided in this excerpt)

  //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===-------------------------------------------------------------------===//

  TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL),
        m_local_kernel(false), m_device(device)
  {
    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      // ... (a TensorExecutor run over evalToTmp fills the buffer; elided in this excerpt)
      m_kernel = local;
      m_local_kernel = true;
    }
  }

  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }

  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse of the loaded inputs.
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        } else {
          // Read as much as possible along the innermost dimension, i.e. the plane.
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size) {
          case 4: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }

      case 2: {
        const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX * numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to the warp size.
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }

      case 3: {
        const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX * numY * numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
                                      m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // For now this mirrors the CPU cost model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add instructions here.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }

 private:
  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const GpuDevice EIGEN_DEVICE_REF m_device;
};
#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H