TR-mbed 1.0
Loading...
Searching...
No Matches
TensorConvolutionSycl.h
Go to the documentation of this file.
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Mehdi Goli Codeplay Software Ltd.
5// Ralph Potter Codeplay Software Ltd.
6// Luke Iwanski Codeplay Software Ltd.
7// Contact: <eigen@codeplay.com>
8// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
9
10//
11// This Source Code Form is subject to the terms of the Mozilla
12// Public License v. 2.0. If a copy of the MPL was not distributed
13// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
14
15#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
16#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
17
18namespace Eigen {
19
// EigenConvolutionKernel: family of SYCL device functors that compute a tensor
// convolution.  Each work-group first stages the input tile it needs into
// local (on-chip) memory, synchronises with a barrier, and then each work-item
// accumulates one output coefficient from the staged tile and the filter.
//
// NOTE(review): this is a Doxygen text dump; lines that were rendered as
// hyperlinks were dropped during extraction (here: the primary template's
// struct declaration line, the `Local_accessor` typedef name, the `local_acc`,
// `device_evaluator` and `indexMapper` member declarations, and the
// `indexMapper_` constructor parameter).  Restore them from the upstream Eigen
// source before attempting to compile this text.
29template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
30 typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
// 1D specialization: nd_item dimension 0 walks the convolved axis and
// dimension 1 walks the independent "planes" (the product of all other axes).
32template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
33 typename Kernel_accessor, typename Buffer_accessor>
34struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
35 Buffer_accessor, convolution_type::CONV1D> {
36 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
40 Kernel_accessor kernel_filter; // accessor over the (pre-evaluated) filter coefficients
41 Buffer_accessor buffer_acc; // destination buffer for the convolved results
43 const size_t kernelSize; // number of filter taps
44 const cl::sycl::range<2> input_range; // {output extent of the convolved dim, number of planes} -- see executeEval's input_dim
45 EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
46 Buffer_accessor buffer_acc_,
48 const size_t kernelSize_, const cl::sycl::range<2> input_range_)
49 : local_acc(local_acc_),
50 device_evaluator(device_evaluator_),
51 kernel_filter(kernel_filter_),
52 buffer_acc(buffer_acc_),
53 indexMapper(indexMapper_),
54 kernelSize(kernelSize_),
55 input_range(input_range_) {}
56
 // Returns true iff every per-dimension comparison in `boolean_check` holds,
 // i.e. the work-item's global id lies inside the valid output range.
57 template <typename BooleanDim2>
58 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
59 return (boolean_check[0] && boolean_check[1]);
60 }
61 void operator()(cl::sycl::nd_item<2> itemID) {
62 auto buffer_ptr = buffer_acc.get_pointer();
63 auto kernel_ptr = kernel_filter.get_pointer();
64 // number of input elements each work-group must stage per plane in shared (local) memory:
 // a tile of local_range[0] outputs needs local_range[0] + kernelSize - 1 inputs
65 const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
66 const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
67 const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
68 const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
 // Phase 1: cooperatively stage the input tile; out-of-range reads become 0 padding.
70 for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
71 const size_t local_index = i + plane_kernel_offset;
72 const size_t tensor_index =
73 plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
74
75 local_acc[local_index] =
76 (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
77 ? device_evaluator.coeff(tensor_index)
78 : CoeffReturnType(0);
79 }
80
 // Make the staged tile visible to every work-item in the group.
81 itemID.barrier(cl::sycl::access::fence_space::local_space);
82
83 // Phase 2: each in-range work-item accumulates its output coefficient.
 // first_output_start is the x-offset of this group's first output.
84 const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
85 if (boundary_check(itemID.get_global_id() < input_range)) {
86 CoeffReturnType result = static_cast<CoeffReturnType>(0);
87 const size_t index = plane_kernel_offset + itemID.get_local_id(0);
88 for (size_t k = 0; k < kernelSize; ++k) {
89 result += (local_acc[k + index] * kernel_ptr[k]);
90 }
91 const size_t tensor_index =
92 indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
93 indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
94 buffer_ptr[tensor_index] = result;
95 }
96 }
97};
98
// 2D specialization: nd_item dimensions 0 and 1 walk the two convolved axes,
// dimension 2 walks the independent planes.
//
// NOTE(review): Doxygen extraction dropped the hyperlinked lines of this
// struct (the `Local_accessor` typedef name, the `local_acc` and
// `device_evaluator` member declarations, the `indexMapper` member, and the
// `indexMapper_` constructor parameter).  Restore them from upstream Eigen
// before compiling.
99template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
100 typename Kernel_accessor, typename Buffer_accessor>
101struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
102 Buffer_accessor, convolution_type::CONV2D> {
103 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
107 Kernel_accessor kernel_filter; // accessor over the (pre-evaluated) filter coefficients
108 Buffer_acc buffer_acc;
110 const cl::sycl::range<2> kernel_size; // filter extent along each convolved dim
111 const cl::sycl::range<3> input_range; // {output extents of the two convolved dims, number of planes}
112 EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
113 Buffer_accessor buffer_acc_,
115 const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
116 : local_acc(local_acc_),
117 device_evaluator(device_evaluator_),
118 kernel_filter(kernel_filter_),
119 buffer_acc(buffer_acc_),
120 indexMapper(indexMapper_),
121 kernel_size(kernel_size_),
122 input_range(input_range_) {}
 // True iff the work-item's global id is inside the valid output range in all
 // three nd_item dimensions.
123 template <typename BooleanDim3>
124 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
125 return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
126 }
127
128 void operator()(cl::sycl::nd_item<3> itemID) {
129 auto buffer_ptr = buffer_acc.get_pointer();
130 auto kernel_ptr = kernel_filter.get_pointer();
131 // 2D tile of inputs each work-group must stage per plane in shared (local)
 // memory: (local_range[d] + kernel_size[d] - 1) elements per convolved dim d
132 const auto num_input = cl::sycl::range<2>{
133 (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
134
135 const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
136 const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
137
138 const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
139 itemID.get_group(1) * itemID.get_local_range()[1]};
140
141 // Phase 1: cooperatively fill the local-memory tile; out-of-range reads become 0 padding.
142 bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
143 for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
144 const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
145 bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
146 for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
147 const size_t local_index = i + local_input_offset;
148 const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
149 i + input_offset[0], j + input_offset[1]);
150 local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
151 in_range_dim1 && in_range_dim2)
152 ? device_evaluator.coeff(tensor_index)
153 : CoeffReturnType(0);
154 }
155 }
156
 // Make the staged tile visible to every work-item in the group.
157 itemID.barrier(cl::sycl::access::fence_space::local_space);
158
159 // Phase 2: output offset (start of this group's output tile) for each thread
160 const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
161 itemID.get_group(1) * itemID.get_local_range()[1]};
162
163 if (boundary_check(itemID.get_global_id() < input_range)) {
164 CoeffReturnType result = static_cast<CoeffReturnType>(0);
 // Accumulate over the 2D filter window staged in local memory.
165
166 for (size_t j = 0; j < kernel_size[1]; j++) {
167 size_t kernel_offset = kernel_size[0] * j;
168 const size_t index =
169 (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
170 for (size_t i = 0; i < kernel_size[0]; i++) {
171 result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
172 }
173 }
174 const size_t tensor_index =
175 indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
176 indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
177 itemID.get_local_id(1) + output_offset[1]);
178
179 buffer_ptr[tensor_index] = result;
180 }
181 }
182};
183
// 3D specialization: all three nd_item dimensions walk convolved axes; the
// independent planes are iterated *serially* by every work-group (the `p`
// loop below), with the local-memory tile re-staged for each plane.
//
// NOTE(review): Doxygen extraction dropped the hyperlinked lines of this
// struct (the `Local_accessor` typedef name, the `local_acc` and
// `device_evaluator` member declarations, the `indexMapper` member, and the
// `indexMapper_` constructor parameter).  Restore them from upstream Eigen
// before compiling.
184template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
185 typename Kernel_accessor, typename Buffer_accessor>
186struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
187 Buffer_accessor, convolution_type::CONV3D> {
188 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
192 Kernel_accessor kernel_filter; // accessor over the (pre-evaluated) filter coefficients
193 Buffer_accessor buffer_acc; // destination buffer for the convolved results
195 const cl::sycl::range<3> kernel_size; // filter extent along each convolved dim
196 const cl::sycl::range<3> input_range; // output extents of the three convolved dims
197 const size_t numP; // number of independent planes, iterated serially below
198
199 EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
200 Buffer_accessor buffer_acc_,
202 const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
203 const size_t numP_)
204 : local_acc(local_acc_),
205 device_evaluator(device_evaluator_),
206 kernel_filter(kernel_filter_),
207 buffer_acc(buffer_acc_),
208 indexMapper(indexMapper_),
209 kernel_size(kernel_size_),
210 input_range(input_range_),
211 numP(numP_) {}
 // True iff the work-item's global id is inside the valid output range in all
 // three dimensions.
212 template <typename BooleanDim3>
213 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
214 return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
215 }
216 void operator()(cl::sycl::nd_item<3> itemID) {
217 auto buffer_ptr = buffer_acc.get_pointer();
218 auto kernel_ptr = kernel_filter.get_pointer();
 // 3D local-memory tile extent: (local_range[d] + kernel_size[d] - 1) per dim.
219 const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
220
221 const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
222
223 const auto output_offset =
224 cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
225
 // One iteration per plane; the same local tile is re-filled each time.
226 for (size_t p = 0; p < numP; p++) {
 // Phase 1: cooperatively stage this plane's input tile (0-padded out of range).
228 const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
229 for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
230 size_t local_index_dim2 = num_input[0] * num_input[1] * k;
231 bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
232 for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
233 bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
234 size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
235 for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
236 bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
237 const size_t local_index = local_index_dim1 + i;
238 const size_t tensor_index =
239 plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
240 i + input_offset[0], j + input_offset[1], k + input_offset[2]);
241 local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
242 }
243 }
244 }
 // Staged tile must be complete before any work-item reads it.
245 itemID.barrier(cl::sycl::access::fence_space::local_space);
246
247 // Phase 2: accumulate over the 3D filter window and write the result.
248
249 if (boundary_check(itemID.get_global_id() < input_range)) {
250 CoeffReturnType result = static_cast<CoeffReturnType>(0);
251 for (size_t k = 0; k < kernel_size[2]; k++) {
252 for (size_t j = 0; j < kernel_size[1]; j++) {
253 for (size_t i = 0; i < kernel_size[0]; i++) {
254 const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
255 const size_t local_index =
256 ((i + itemID.get_local_id(0)) +
257 num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
258
259 result += (local_acc[local_index] * kernel_ptr[kernel_index]);
260 }
261 }
262 }
263 const size_t tensor_index =
264 indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
265 indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
266 buffer_ptr[tensor_index] = result;
267 }
268
 // Second barrier: the tile may not be overwritten for the next plane until
 // every work-item has finished reading it.
269 itemID.barrier(cl::sycl::access::fence_space::local_space);
270 }
271 }
272};
273
// TensorEvaluator specialization that evaluates a TensorConvolutionOp on a
// SYCL device by dispatching to the EigenConvolutionKernel functors above.
//
// NOTE(review): Doxygen extraction dropped many hyperlinked lines of this
// struct: several typedefs (XprType, Dimensions, KernelDimensions,
// CoeffReturnType, PacketReturnType, storage types), part of the feature
// enum, the TensorBlock typedef, the signatures of evalSubExprsIfNeeded,
// cleanup, preloadKernel, executeEval, coeff, packet and costPerCoeff, the
// static-assert condition, the EvalTo/executor lines in preloadKernel, and
// the m_inputImpl/m_kernelImpl/m_buf member declarations.  Restore them from
// upstream Eigen before compiling.
274template <typename Indices, typename InputArgType, typename KernelArgType>
275struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
277
278 static const int NumDims =
280 static const int NumKernelDims = internal::array_size<Indices>::value;
281 typedef typename XprType::Index Index;
284 typedef const Eigen::SyclDevice Device;
287 typedef typename InputArgType::Scalar Scalar;
292
293 enum {
297 BlockAccess = false,
300 CoordAccess = false, // to be implemented
301 RawAccess = false
302 };
303
304 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
306 //===--------------------------------------------------------------------===//
307
 // Builds the input/kernel sub-evaluators and computes the output dimensions
 // of a "valid" convolution: result_dim = input_dim - kernel_dim + 1 along
 // each convolved index; all other dimensions are copied from the input.
308 TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
309 : m_inputImpl(op.inputExpression(), device),
310 m_kernelArg(op.kernelExpression()),
311 m_kernelImpl(op.kernelExpression(), device),
312 m_indices(op.indices()),
313 m_buf(NULL),
314 m_kernel(NULL),
315 m_local_kernel(false),
316 m_device(device) {
319 YOU_MADE_A_PROGRAMMING_MISTAKE);
320
321 const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
323 m_kernelImpl.dimensions();
324
325 m_dimensions = m_inputImpl.dimensions();
326 for (int i = 0; i < NumKernelDims; ++i) {
327 const Index index = op.indices()[i];
328 const Index input_dim = input_dims[index];
329 const Index kernel_dim = kernel_dims[i];
330 const Index result_dim = input_dim - kernel_dim + 1;
331 m_dimensions[index] = result_dim;
332 }
333 }
334
 // Output dimensions of the convolution result.
335 EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
336
 // Evaluates into `data` when the caller provides a buffer (returns false);
 // otherwise allocates a temporary device buffer, evaluates into it, and
 // returns true so the caller knows coeff()/packet() access is needed.
338 preloadKernel();
339 m_inputImpl.evalSubExprsIfNeeded(NULL);
340 if (data) {
341 executeEval(data);
342 return false;
343 } else {
344 m_buf = (EvaluatorPointerType)m_device.get(
345 (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
346 executeEval(m_buf);
347 return true;
348 }
349 }
350
 // Releases the temporary result buffer and the local kernel copy, if owned.
352 m_inputImpl.cleanup();
353 if (m_buf) {
354 m_device.deallocate_temp(m_buf);
355 m_buf = NULL;
356 }
357 if (m_local_kernel) {
358 m_device.deallocate_temp(m_kernel);
359 m_local_kernel = false;
360 }
361 m_kernel = NULL;
362 }
367
 // Ensures m_kernel points at materialized filter coefficients.
369 // Don't make a local copy of the kernel unless we have to (i.e. it's an
370 // expression that needs to be evaluated)
371 typename KernelStorage::Type in_place = m_kernelImpl.data();
372 if (in_place) {
373 m_kernel = in_place;
374 m_local_kernel = false;
375 } else {
 // Kernel is an unevaluated expression: evaluate it into a temporary
 // device buffer that cleanup() will later release.
376 ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
377 EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
379 EvalTo evalToTmp(m_device.get(local), m_kernelArg);
382 m_kernel = local;
383 m_local_kernel = true;
384 }
385 }
386
 // Launches the convolution kernel matching NumKernelDims (1, 2 or 3):
 // computes global/local nd-ranges, checks the local-memory tile fits in the
 // device's shared memory, builds the IndexMapper, and dispatches via
 // binary_kernel_launcher.
389 typedef typename InputEvaluator::Dimensions InputDims;
390 switch (NumKernelDims) {
391 case 1: {
392 const size_t numX = dimensions()[m_indices[0]];
393 const size_t numP = dimensions().TotalSize() / numX;
394 const auto input_dim = std::array<size_t, 2>{numX, numP};
395 auto global_range = cl::sycl::range<2>{};
396 auto local_range = cl::sycl::range<2>{};
397 const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
398
399 m_device.parallel_for_setup(input_dim, global_range, local_range);
 // Local tile: (outputs per group + taps - 1) elements for each plane row.
400 const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
401 gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
402 const array<Index, 1> indices{{m_indices[0]}};
403 const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
404 internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
405
406 typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
408 ConvKernel;
409
410 m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
411 m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
412 indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
413 break;
414 }
415
416 case 2: {
 // Map logical kernel dims to storage order: reversed for RowMajor.
417 auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
418 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
419 auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
420 (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
421 const size_t numX = dimensions()[m_indices[kernel_index[0]]];
422 const size_t numY = dimensions()[m_indices[kernel_index[1]]];
423 const size_t numP = dimensions().TotalSize() / (numX * numY);
424 auto input_dim = std::array<size_t, 3>{numX, numY, numP};
425
426 auto global_range = cl::sycl::range<3>{};
427 auto local_range = cl::sycl::range<3>{};
428
429 m_device.parallel_for_setup(input_dim, global_range, local_range);
430
431 const size_t local_memory_size =
432 (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
433 gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
434 const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
435 const array<Index, 2> kernel_dims{
436 {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
437 internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
438 typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
440 ConvKernel;
441 m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
442 m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
443 indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
444 break;
445 }
446
447 case 3: {
 // Map logical kernel dims to storage order: reversed for RowMajor.
448 auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
449 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
450 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
451
452 auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
453 (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
454 (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
455
456 const size_t numX = dimensions()[m_indices[kernel_index[0]]];
457 const size_t numY = dimensions()[m_indices[kernel_index[1]]];
458 const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
459 auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
460 const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
461
462 const array<Index, 3> indices{
463 {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
464 const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
465 m_kernelImpl.dimensions()[kernel_index[1]],
466 m_kernelImpl.dimensions()[kernel_index[2]]}};
467
468 internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
469
470 auto global_range = cl::sycl::range<3>{};
471 auto local_range = cl::sycl::range<3>{};
472
473 m_device.parallel_for_setup(input_dim, global_range, local_range);
474 auto local_memory_range = (local_range + kernel_size - 1);
475 const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
476
477 gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
478 typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
480 ConvKernel;
481 m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
482 m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
483 indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
484 break;
485 }
486
487 default: {
488 EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
489 THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
490 }
491 }
492 }
493
 // Reads one already-computed coefficient from the temporary result buffer;
 // only valid after evalSubExprsIfNeeded() allocated m_buf.
495 eigen_assert(m_buf != NULL);
496 eigen_assert(index < m_dimensions.TotalSize());
497 return m_buf[index];
498 }
499
 // Packet read from the temporary result buffer (body truncated in this dump).
500 template <int LoadMode>
502 eigen_assert(m_buf != NULL);
503 eigen_assert(index < m_dimensions.TotalSize());
505 }
506
508 // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
509 // model.
510 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
511 // We ignore the use of fused multiply-add.
512 const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
513 const double firstIndex_compute_cost =
514 NumDims *
515 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
516 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
517 kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
518 TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
519 }
520 // binding placeholder accessors to a command group handler for SYCL
521 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
522 m_kernelImpl.bind(cgh);
523 m_inputImpl.bind(cgh);
524 m_buf.bind(cgh);
525 m_kernel.bind(cgh);
526 }
527
528 private:
529 // No assignment (copies are needed by the kernels)
530 TensorEvaluator &operator=(const TensorEvaluator &);
532 KernelArgType m_kernelArg;
534 Indices m_indices;
535 Dimensions m_dimensions;
537 typename KernelStorage::Type m_kernel; // filter data; owned iff m_local_kernel
538 bool m_local_kernel; // true when m_kernel was allocated by preloadKernel()
539 const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
540}; // end TensorEvaluator<const TensorConvolutionOp<...>, Eigen::SyclDevice>
541
542} // end namespace Eigen
543
544#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
int i
Definition BiCGSTAB_step_by_step.cpp:9
#define EIGEN_DEVICE_FUNC
Definition Macros.h:976
#define eigen_assert(x)
Definition Macros.h:1037
#define EIGEN_STRONG_INLINE
Definition Macros.h:917
#define EIGEN_STATIC_ASSERT(CONDITION, MSG)
Definition StaticAssert.h:127
#define EIGEN_DEVICE_REF
Definition TensorMacros.h:50
float * p
Definition Tutorial_Map_using.cpp:9
Definition TensorConvolution.h:260
Eigen::internal::traits< TensorConvolutionOp >::Index Index
Definition TensorConvolution.h:268
internal::promote_storage_type< typenameInputXprType::CoeffReturnType, typenameKernelXprType::CoeffReturnType >::ret CoeffReturnType
Definition TensorConvolution.h:265
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices & indices() const
Definition TensorConvolution.h:274
Definition TensorEvalTo.h:69
Definition TensorCostModel.h:25
Definition EmulateArray.h:21
Definition TensorBlock.h:617
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Expression &expr, const Device &device=Device())
Definition TensorExecutor.h:96
@ ColMajor
Definition Constants.h:319
Namespace containing all symbols from the Eigen library.
Definition bench_norm.cpp:85
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
convolution_type
Definition TensorConvolutionSycl.h:28
Definition TensorDimensions.h:263
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > Local_accessor
Definition TensorConvolutionSycl.h:104
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check)
Definition TensorConvolutionSycl.h:124
internal::IndexMapper< Index, InputDims, 2, Evaluator::Layout > indexMapper
Definition TensorConvolutionSycl.h:109
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, Buffer_accessor buffer_acc_, internal::IndexMapper< Index, InputDims, 2, Evaluator::Layout > indexMapper_, const cl::sycl::range< 2 > kernel_size_, const cl::sycl::range< 3 > input_range_)
Definition TensorConvolutionSycl.h:112
internal::IndexMapper< Index, InputDims, 1, Evaluator::Layout > indexMapper
Definition TensorConvolutionSycl.h:42
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, Buffer_accessor buffer_acc_, internal::IndexMapper< Index, InputDims, 1, Evaluator::Layout > indexMapper_, const size_t kernelSize_, const cl::sycl::range< 2 > input_range_)
Definition TensorConvolutionSycl.h:45
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check)
Definition TensorConvolutionSycl.h:58
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > Local_accessor
Definition TensorConvolutionSycl.h:37
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > Local_accessor
Definition TensorConvolutionSycl.h:189
internal::IndexMapper< Index, InputDims, 3, Evaluator::Layout > indexMapper
Definition TensorConvolutionSycl.h:194
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check)
Definition TensorConvolutionSycl.h:213
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, Buffer_accessor buffer_acc_, internal::IndexMapper< Index, InputDims, 3, Evaluator::Layout > indexMapper_, const cl::sycl::range< 3 > kernel_size_, const cl::sycl::range< 3 > input_range_, const size_t numP_)
Definition TensorConvolutionSycl.h:199
Definition TensorConvolutionSycl.h:31
Definition TensorMeta.h:50
Definition TensorForwardDeclarations.h:37
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
Definition TensorConvolutionSycl.h:337
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const
Definition TensorConvolutionSycl.h:521
PacketType< CoeffReturnType, Eigen::SyclDevice >::type PacketReturnType
Definition TensorConvolutionSycl.h:286
TensorEvaluator< KernelArgType, Eigen::SyclDevice >::Dimensions KernelDimensions
Definition TensorConvolutionSycl.h:283
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const
Definition TensorConvolutionSycl.h:507
internal::TensorBlockNotImplemented TensorBlock
Definition TensorConvolutionSycl.h:305
StorageMemory< const CoeffReturnType, Eigen::SyclDevice > KernelStorage
Definition TensorConvolutionSycl.h:291
EIGEN_DEVICE_FUNC const Dimensions & dimensions() const
Definition TensorConvolutionSycl.h:335
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
used by sycl in order to build the sycl buffer
Definition TensorConvolutionSycl.h:366
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device & device() const
used by sycl in order to build the sycl buffer
Definition TensorConvolutionSycl.h:364
TensorConvolutionOp< Indices, InputArgType, KernelArgType > XprType
Definition TensorConvolutionSycl.h:276
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const
Definition TensorConvolutionSycl.h:387
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
Definition TensorConvolutionSycl.h:501
TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
Definition TensorConvolutionSycl.h:308
StorageMemory< CoeffReturnType, Eigen::SyclDevice > Storage
Definition TensorConvolutionSycl.h:289
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel()
Definition TensorConvolutionSycl.h:368
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
Definition TensorConvolutionSycl.h:494
A cost model used to limit the number of threads used for evaluating tensor expression.
Definition TensorEvaluator.h:29
Derived::Scalar Scalar
Definition TensorEvaluator.h:31
const Device EIGEN_DEVICE_REF m_device
Definition TensorEvaluator.h:192
Storage::Type EvaluatorPointerType
Definition TensorEvaluator.h:39
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions & dimensions() const
Definition TensorEvaluator.h:73
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const
Definition TensorEvaluator.h:181
Derived::Scalar CoeffReturnType
Definition TensorEvaluator.h:32
@ BlockAccess
Definition TensorEvaluator.h:48
@ PreferBlockAccess
Definition TensorEvaluator.h:49
@ PacketAccess
Definition TensorEvaluator.h:47
@ Layout
Definition TensorEvaluator.h:50
@ IsAligned
Definition TensorEvaluator.h:46
Derived::Index Index
Definition TensorEvaluator.h:30
Derived::Dimensions Dimensions
Definition TensorEvaluator.h:34
static const int PacketSize
Definition TensorEvaluator.h:36
Definition TensorForwardDeclarations.h:147
Definition Meta.h:445
Definition ForwardDeclarations.h:17
std::ptrdiff_t j
Definition tut_arithmetic_redux_minmax.cpp:2