TensorDeviceSycl.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Mehdi Goli Codeplay Software Ltd.
5// Ralph Potter Codeplay Software Ltd.
6// Luke Iwanski Codeplay Software Ltd.
7// Contact: <eigen@codeplay.com>
8// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
9
10//
11// This Source Code Form is subject to the terms of the Mozilla
12// Public License v. 2.0. If a copy of the MPL was not distributed
13// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
14
15#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
16#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
17#include <unordered_set>
18
19namespace Eigen {
20
21namespace TensorSycl {
22namespace internal {
23
25struct SyclDeviceInfo {
26 SyclDeviceInfo(cl::sycl::queue queue)
27 : local_mem_type(
28 queue.get_device()
29 .template get_info<cl::sycl::info::device::local_mem_type>()),
30 max_work_item_sizes(
31 queue.get_device()
32 .template get_info<
33 cl::sycl::info::device::max_work_item_sizes>()),
34 max_mem_alloc_size(
35 queue.get_device()
36 .template get_info<
37 cl::sycl::info::device::max_mem_alloc_size>()),
38 max_compute_units(queue.get_device()
39 .template get_info<
40 cl::sycl::info::device::max_compute_units>()),
41 max_work_group_size(
42 queue.get_device()
43 .template get_info<
44 cl::sycl::info::device::max_work_group_size>()),
45 local_mem_size(
46 queue.get_device()
47 .template get_info<cl::sycl::info::device::local_mem_size>()),
48 platform_name(queue.get_device()
49 .get_platform()
50 .template get_info<cl::sycl::info::platform::name>()),
51 device_name(queue.get_device()
52 .template get_info<cl::sycl::info::device::name>()),
53 device_vendor(
54 queue.get_device()
55 .template get_info<cl::sycl::info::device::vendor>()) {}
56
57 cl::sycl::info::local_mem_type local_mem_type;
58 cl::sycl::id<3> max_work_item_sizes;
59 unsigned long max_mem_alloc_size;
60 unsigned long max_compute_units;
61 unsigned long max_work_group_size;
62 size_t local_mem_size;
63 std::string platform_name;
64 std::string device_name;
65 std::string device_vendor;
66};
67
68} // end namespace internal
69} // end namespace TensorSycl
70
71typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
72// All devices (even an AMD CPU with the Intel OpenCL runtime) that support OpenCL
73// and can consume SPIR or SPIR-V can use the Eigen SYCL backend, and consequently
74// TensorFlow via the Eigen SYCL backend.
75EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
76 -> decltype(cl::sycl::device::get_devices()) {
77#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
78 return {cl::sycl::device(cl::sycl::default_selector())};
79#else
80 std::vector<cl::sycl::device> supported_devices;
81 auto platform_list = cl::sycl::platform::get_platforms();
82 for (const auto &platform : platform_list) {
83 auto device_list = platform.get_devices();
84 auto platform_name =
85 platform.template get_info<cl::sycl::info::platform::name>();
86 std::transform(platform_name.begin(), platform_name.end(),
87 platform_name.begin(), ::tolower);
88 for (const auto &device : device_list) {
89 auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
90 std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
91 bool unsupported_condition =
92 (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
93 vendor.find("apu") == std::string::npos) ||
94 (platform_name.find("experimental") != std::string::npos) ||
95 device.is_host();
96 if (!unsupported_condition) {
97 supported_devices.push_back(device);
98 }
99 }
100 }
101 return supported_devices;
102#endif
103}
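// A minimal usage sketch, assuming at least one supported device is present:
// take a device from the list above, wrap it in a QueueInterface (defined
// below) and hand it to a SyclDevice.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   Eigen::QueueInterface queue_interface(devices[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);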
104
105class QueueInterface {
106 public:
108 template <typename DeviceOrSelector>
109 explicit QueueInterface(
110 const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
111 unsigned num_threads = std::thread::hardware_concurrency())
112 : m_queue(dev_or_sel, handler),
113#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
114 m_prog(m_queue.get_context(), get_sycl_supported_devices()),
115#endif
116 m_thread_pool(num_threads),
117 m_device_info(m_queue) {
118#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
119 m_prog.build_with_kernel_type<DeviceOrSelector>();
120 auto f = [&](cl::sycl::handler &cgh) {
121 cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
122                                        [=]() {});
123 };
124 EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
125#endif
126 }
127
128 template <typename DeviceOrSelector>
129 explicit QueueInterface(
130 const DeviceOrSelector &dev_or_sel,
131 unsigned num_threads = std::thread::hardware_concurrency())
132 : QueueInterface(dev_or_sel,
133 [this](cl::sycl::exception_list l) {
134 this->exception_caught_ = this->sycl_async_handler(l);
135 },
136 num_threads) {}
137
138#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
139 EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
140#endif
141
143 EIGEN_STRONG_INLINE void *attach_buffer(
144 cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
145 std::lock_guard<std::mutex> lock(pmapper_mutex_);
146 return static_cast<void *>(pMapper.add_pointer(buf));
147 }
148
150 EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
151 std::lock_guard<std::mutex> lock(pmapper_mutex_);
152 TensorSycl::internal::SYCLfree<false>(p, pMapper);
153 }
154
163 EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
164#if EIGEN_MAX_ALIGN_BYTES > 0
165 size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
166 if (align > 0) {
167 num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
168 }
169#endif
170 std::lock_guard<std::mutex> lock(pmapper_mutex_);
171 return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
172 }
173
174 EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
175#if EIGEN_MAX_ALIGN_BYTES > 0
176 size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
177 if (align > 0) {
178 num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
179 }
180#endif
181 std::lock_guard<std::mutex> lock(pmapper_mutex_);
182#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
183 if (scratch_buffers.empty()) {
184 return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
186 } else {
187 for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
188 auto buff = pMapper.get_buffer(*it);
189 if (buff.get_size() >= num_bytes) {
190 auto ptr = *it;
191 scratch_buffers.erase(it);
192 return ptr;
193 } else {
194 ++it;
195 }
196 }
197 return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
198 }
199#else
200 return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
201#endif
202 }
203 template <typename data_t>
204 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
205 cl::sycl::access::mode::read_write, data_t>
206 get(data_t *data) const {
207 return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
208 }
209 template <typename data_t>
210 EIGEN_STRONG_INLINE data_t *get(
211     TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
212 data_t>
213 data) const {
214 return static_cast<data_t *>(data.get_virtual_pointer());
215 }
216
217 EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
218 std::lock_guard<std::mutex> lock(pmapper_mutex_);
219#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
220 scratch_buffers.insert(p);
221#else
222 TensorSycl::internal::SYCLfree(p, pMapper);
223#endif
224 }
225 template <cl::sycl::access::mode AcMd, typename T>
226 EIGEN_STRONG_INLINE void deallocate_temp(
227 const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
228 deallocate_temp(p.get_virtual_pointer());
229 }
230
233 EIGEN_STRONG_INLINE void deallocate(void *p) const {
234 std::lock_guard<std::mutex> lock(pmapper_mutex_);
235 TensorSycl::internal::SYCLfree(p, pMapper);
236 }
237
238 EIGEN_STRONG_INLINE void deallocate_all() const {
239 std::lock_guard<std::mutex> lock(pmapper_mutex_);
240 TensorSycl::internal::SYCLfreeAll(pMapper);
241#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
242 scratch_buffers.clear();
243#endif
244 }
245
250 EIGEN_STRONG_INLINE void memcpyHostToDevice(
251 void *dst, const void *src, size_t n,
252 std::function<void()> callback) const {
253 static const auto write_mode = cl::sycl::access::mode::discard_write;
254 static const auto global_access = cl::sycl::access::target::global_buffer;
255 typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
256 write_accessor;
257 if (n == 0) {
258 if (callback) callback();
259 return;
260 }
261 n /= sizeof(buffer_scalar_t);
262 auto f = [&](cl::sycl::handler &cgh) {
263 write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
264 buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
265 auto non_deleter = [](buffer_scalar_t const *) {};
266 std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
267 cgh.copy(s_ptr, dst_acc);
268 };
269 cl::sycl::event e;
270 EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
271 synchronize_and_callback(e, callback);
272 }
273
278 EIGEN_STRONG_INLINE void memcpyDeviceToHost(
279 void *dst, const void *src, size_t n,
280 std::function<void()> callback) const {
281 static const auto read_mode = cl::sycl::access::mode::read;
282 static const auto global_access = cl::sycl::access::target::global_buffer;
283 typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
284 read_accessor;
285 if (n == 0) {
286 if (callback) callback();
287 return;
288 }
289 n /= sizeof(buffer_scalar_t);
290 auto f = [&](cl::sycl::handler &cgh) {
291 read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
292 buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
293 auto non_deleter = [](buffer_scalar_t *) {};
294 std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
295 cgh.copy(src_acc, s_ptr);
296 };
297 cl::sycl::event e;
298 EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
299 synchronize_and_callback(e, callback);
300 }
301
305 EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
306 static const auto read_mode = cl::sycl::access::mode::read;
307 static const auto write_mode = cl::sycl::access::mode::discard_write;
308 if (n == 0) {
309 return;
310 }
311 n /= sizeof(buffer_scalar_t);
312 auto f = [&](cl::sycl::handler &cgh) {
313 auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
314 auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
315 cgh.copy(src_acc, dst_acc);
316 };
317 cl::sycl::event e;
318 EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
319 async_synchronize(e);
320 }
321
325 EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
326 static const auto write_mode = cl::sycl::access::mode::discard_write;
327 if (n == 0) {
328 return;
329 }
330 n /= sizeof(buffer_scalar_t);
331 auto f = [&](cl::sycl::handler &cgh) {
332 auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
333 // The cast to uint8_t is here to match the behaviour of the standard
334 // memset. The cast to buffer_scalar_t is needed to match the type of the
335 // accessor (in case buffer_scalar_t is not uint8_t)
336 cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
337 };
338 cl::sycl::event e;
339 EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
340 async_synchronize(e);
341 }
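// For example, memset(ptr, 0xAB, 16) fills 16 / sizeof(buffer_scalar_t)
// elements with the byte value 0xAB; when buffer_scalar_t is a single byte
// (the usual case) this matches the behaviour of a host-side std::memset.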
342
350 template <cl::sycl::access::mode AcMd, typename T>
351 EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
352 get_range_accessor(const void *ptr) const {
353 static const auto global_access = cl::sycl::access::target::global_buffer;
354 static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
355 typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
356 typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
357
358 std::lock_guard<std::mutex> lock(pmapper_mutex_);
359
360 auto original_buffer = pMapper.get_buffer(ptr);
361 const ptrdiff_t offset = pMapper.get_offset(ptr);
362 const ptrdiff_t typed_offset = offset / sizeof(T);
363 eigen_assert(typed_offset >= 0);
364 const auto typed_size = original_buffer.get_size() / sizeof(T);
365 auto buffer = original_buffer.template reinterpret<
366     typename Eigen::internal::remove_const<T>::type>(
367     cl::sycl::range<1>(typed_size));
368 const ptrdiff_t size = buffer.get_count() - typed_offset;
369 eigen_assert(size >= 0);
370 typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
371 1, AcMd, global_access, is_place_holder>
372 placeholder_accessor_t;
373 const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
374 return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
375 cl::sycl::id<1>(typed_offset)),
376 static_cast<size_t>(typed_offset),
377 reinterpret_cast<std::intptr_t>(start_ptr));
378 }
379
382 template <cl::sycl::access::mode AcMd, typename Index>
383 EIGEN_STRONG_INLINE cl::sycl::accessor<
384 buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
385 get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
386 const Index n_bytes) const {
387 static const auto global_access = cl::sycl::access::target::global_buffer;
388 eigen_assert(n_bytes >= 0);
389 std::lock_guard<std::mutex> lock(pmapper_mutex_);
390 auto buffer = pMapper.get_buffer(ptr);
391 const ptrdiff_t offset = pMapper.get_offset(ptr);
392 eigen_assert(offset >= 0);
393 eigen_assert(offset + n_bytes <= buffer.get_size());
394 return buffer.template get_access<AcMd, global_access>(
395 cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
396 }
397
402 template <cl::sycl::access::mode AcMd>
403 EIGEN_STRONG_INLINE cl::sycl::accessor<
404 buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
405 get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
406 std::lock_guard<std::mutex> lock(pmapper_mutex_);
407 return pMapper.get_buffer(ptr)
408 .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
409 cgh);
410 }
411
412 EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
413 const void *ptr) const {
414 std::lock_guard<std::mutex> lock(pmapper_mutex_);
415 return pMapper.get_buffer(ptr);
416 }
417
418 EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
419 std::lock_guard<std::mutex> lock(pmapper_mutex_);
420 return pMapper.get_offset(ptr);
421 }
422
423 template <typename OutScalar, typename sycl_kernel, typename Lhs,
424 typename Rhs, typename OutPtr, typename Range, typename Index,
425 typename... T>
426 EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
427 const Rhs &rhs, OutPtr outptr,
428 Range thread_range,
429 Index scratchSize,
430 T... var) const {
431 auto kernel_functor = [=](cl::sycl::handler &cgh) {
432 // binding the placeholder accessors to a commandgroup handler
433 lhs.bind(cgh);
434 rhs.bind(cgh);
435 outptr.bind(cgh);
436 typedef cl::sycl::accessor<OutScalar, 1,
437 cl::sycl::access::mode::read_write,
438 cl::sycl::access::target::local>
439 LocalAccessor;
440
441 LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
442 cgh.parallel_for(
443#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
444 program().template get_kernel<sycl_kernel>(),
445#endif
446 thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
447 };
448 cl::sycl::event e;
449 EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
450 async_synchronize(e);
451 }
452
453 template <typename OutScalar, typename sycl_kernel, typename InPtr,
454 typename OutPtr, typename Range, typename Index, typename... T>
455 EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
456 OutPtr &outptr,
457 Range thread_range,
458 Index scratchSize,
459 T... var) const {
460 auto kernel_functor = [=](cl::sycl::handler &cgh) {
461 // binding the placeholder accessors to a commandgroup handler
462 inptr.bind(cgh);
463 outptr.bind(cgh);
464 typedef cl::sycl::accessor<OutScalar, 1,
465 cl::sycl::access::mode::read_write,
466 cl::sycl::access::target::local>
467 LocalAccessor;
468
469 LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
470 cgh.parallel_for(
471#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
472 program().template get_kernel<sycl_kernel>(),
473#endif
474 thread_range, sycl_kernel(scratch, inptr, outptr, var...));
475 };
476 cl::sycl::event e;
477 EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
478 async_synchronize(e);
479 }
480
481 template <typename OutScalar, typename sycl_kernel, typename InPtr,
482 typename Range, typename Index, typename... T>
483 EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
484 Range thread_range,
485 Index scratchSize,
486 T... var) const {
487 auto kernel_functor = [=](cl::sycl::handler &cgh) {
488 // binding the placeholder accessors to a commandgroup handler
489 inptr.bind(cgh);
490 typedef cl::sycl::accessor<OutScalar, 1,
491 cl::sycl::access::mode::read_write,
492 cl::sycl::access::target::local>
493 LocalAccessor;
494
495 LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
496 cgh.parallel_for(
497#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
498 program().template get_kernel<sycl_kernel>(),
499#endif
500 thread_range, sycl_kernel(scratch, inptr, var...));
501 };
502 cl::sycl::event e;
503 EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
504 async_synchronize(e);
505 }
506
507
508 EIGEN_STRONG_INLINE void synchronize() const {
509#ifdef EIGEN_EXCEPTIONS
510 m_queue.wait_and_throw();
511#else
512 m_queue.wait();
513#endif
514 }
515
516
517 EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
518 set_latest_event(e);
519#ifndef EIGEN_SYCL_ASYNC_EXECUTION
520 synchronize();
521#endif
522 }
523
524 template <typename Index>
525 EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
526 Index &rng, Index &GRange) const {
527 tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
528 tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
529 EIGEN_SYCL_LOCAL_THREAD_DIM1),
530 static_cast<Index>(tileSize));
531 rng = n;
532 if (rng == 0) rng = static_cast<Index>(1);
533 GRange = rng;
534 if (tileSize > GRange)
535 tileSize = GRange;
536 else if (GRange > tileSize) {
537 Index xMode = static_cast<Index>(GRange % tileSize);
538 if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
539 }
540 }
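// Worked example for the 1-D overload above, assuming the default
// EIGEN_SYCL_LOCAL_THREAD_DIM0/1 of 16x16 and a device whose power-of-two
// work-group size is at least 256: for n = 1000 this gives tileSize = 256,
// rng = 1000, and GRange rounded up to the next multiple of tileSize, 1024.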
541
544 template <typename Index>
545 EIGEN_STRONG_INLINE void parallel_for_setup(
546 const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
547 cl::sycl::range<2> &local_range) const {
548 std::array<Index, 2> input_range = input_dim;
549 Index max_workgroup_Size =
550 static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
551 max_workgroup_Size =
552 std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
553 EIGEN_SYCL_LOCAL_THREAD_DIM1),
554 static_cast<Index>(max_workgroup_Size));
555 Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
556 local_range[1] =
557 static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
558 input_range[1] = input_dim[1];
559 if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
560 global_range[1] = input_range[1];
561 if (local_range[1] > global_range[1])
562 local_range[1] = global_range[1];
563 else if (global_range[1] > local_range[1]) {
564 Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
565 if (xMode != 0)
566 global_range[1] += static_cast<Index>(local_range[1] - xMode);
567 }
568 local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
569 input_range[0] = input_dim[0];
570 if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
571 global_range[0] = input_range[0];
572 if (local_range[0] > global_range[0])
573 local_range[0] = global_range[0];
574 else if (global_range[0] > local_range[0]) {
575 Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
576 if (xMode != 0)
577 global_range[0] += static_cast<Index>(local_range[0] - xMode);
578 }
579 }
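// Worked example for the 2-D overload above, under the same assumptions as
// the 1-D case (256 work-items per group): for input_dim = {100, 100} the
// local range becomes {16, 16} and each global dimension is rounded up to a
// multiple of 16, giving a global range of {112, 112}.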
580
583 template <typename Index>
584 EIGEN_STRONG_INLINE void parallel_for_setup(
585 const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
586 cl::sycl::range<3> &local_range) const {
587 std::array<Index, 3> input_range = input_dim;
588 Index max_workgroup_Size =
589 static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
590 max_workgroup_Size =
591 std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
592 EIGEN_SYCL_LOCAL_THREAD_DIM1),
593 static_cast<Index>(max_workgroup_Size));
594 Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
595 local_range[2] =
596 static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
597 input_range[2] = input_dim[2];
598 if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
599 global_range[2] = input_range[2];
600 if (local_range[2] > global_range[2])
601 local_range[2] = global_range[2];
602 else if (global_range[2] > local_range[2]) {
603 Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
604 if (xMode != 0)
605 global_range[2] += static_cast<Index>(local_range[2] - xMode);
606 }
607 pow_of_2 = static_cast<Index>(
608 std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
609 local_range[1] =
610 static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
611 input_range[1] = input_dim[1];
612 if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
613 global_range[1] = input_range[1];
614 if (local_range[1] > global_range[1])
615 local_range[1] = global_range[1];
616 else if (global_range[1] > local_range[1]) {
617 Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
618 if (xMode != 0)
619 global_range[1] += static_cast<Index>(local_range[1] - xMode);
620 }
621 local_range[0] = static_cast<Index>(max_workgroup_Size /
622 (local_range[1] * local_range[2]));
623 input_range[0] = input_dim[0];
624 if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
625 global_range[0] = input_range[0];
626 if (local_range[0] > global_range[0])
627 local_range[0] = global_range[0];
628 else if (global_range[0] > local_range[0]) {
629 Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
630 if (xMode != 0)
631 global_range[0] += static_cast<Index>(local_range[0] - xMode);
632 }
633 }
634
635 EIGEN_STRONG_INLINE bool has_local_memory() const {
636#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
637 return false;
638#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
639 return true;
640#else
641 return m_device_info.local_mem_type ==
642 cl::sycl::info::local_mem_type::local;
643#endif
644 }
645
646 EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
647 return m_device_info.max_mem_alloc_size;
648 }
649
650 EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
651 return m_device_info.max_compute_units;
652 }
653
654 EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
655 return m_device_info.max_work_group_size;
656 }
657
658 EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
659 return m_device_info.max_work_item_sizes;
660 }
661
663 EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
664
665 EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
666 // OpenCL does not have such a concept
667 return 2;
668 }
669
670 EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
671 return m_device_info.local_mem_size;
672 }
673
674 // This function returns the nearest power-of-2 work-group size which is <= the
675 // maximum device work-group size.
676 EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
677 return getPowerOfTwo(m_device_info.max_work_group_size, false);
678 }
679
680 EIGEN_STRONG_INLINE std::string getPlatformName() const {
681 return m_device_info.platform_name;
682 }
683
684 EIGEN_STRONG_INLINE std::string getDeviceName() const {
685 return m_device_info.device_name;
686 }
687
688 EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
689 return m_device_info.device_vendor;
690 }
691
692 // This function returns the nearest power of 2 to wGSize:
693 // if roundUp is true, the result is >= wGSize;
694 // otherwise the result is <= wGSize.
695 EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
696 if (roundUp) --wGSize;
697 wGSize |= (wGSize >> 1);
698 wGSize |= (wGSize >> 2);
699 wGSize |= (wGSize >> 4);
700 wGSize |= (wGSize >> 8);
701 wGSize |= (wGSize >> 16);
702#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
703 wGSize |= (wGSize >> 32);
704#endif
705 return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
706 }
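// Worked example: for wGSize = 48 the bit-smearing above yields 63, so
// getPowerOfTwo(48, false) returns 63 - 31 = 32 (the largest power of two
// <= 48), while getPowerOfTwo(48, true) first decrements to 47, smears to 63
// and then increments, returning 64 (the smallest power of two >= 48).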
707
708 EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
709
710 // This function checks if the runtime recorded an error for the
711 // underlying stream device.
712 EIGEN_STRONG_INLINE bool ok() const {
713 if (!exception_caught_) {
714 synchronize();
715 }
716 return !exception_caught_;
717 }
718
719 EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
720#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
721 std::lock_guard<std::mutex> lock(event_mutex_);
722 return latest_events_[std::this_thread::get_id()];
723#else
724 eigen_assert(false);
725 return cl::sycl::event();
726#endif
727 }
728
729 // destructor
730 ~QueueInterface() {
731 pMapper.clear();
732#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
733 scratch_buffers.clear();
734#endif
735 }
736
737 protected:
738 EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
739#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
740 std::lock_guard<std::mutex> lock(event_mutex_);
741 latest_events_[std::this_thread::get_id()] = e;
742#else
743    EIGEN_UNUSED_VARIABLE(e);
744#endif
745 }
746
747 void synchronize_and_callback(cl::sycl::event e,
748 const std::function<void()> &callback) const {
749 set_latest_event(e);
750 if (callback) {
751 auto callback_ = [=]() {
752#ifdef EIGEN_EXCEPTIONS
753 cl::sycl::event(e).wait_and_throw();
754#else
755 cl::sycl::event(e).wait();
756#endif
757 callback();
758 };
759 m_thread_pool.Schedule(std::move(callback_));
760 } else {
761#ifdef EIGEN_EXCEPTIONS
762 m_queue.wait_and_throw();
763#else
764 m_queue.wait();
765#endif
766 }
767 }
768
769 bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
770 bool exception_caught = false;
771 for (const auto &e : exceptions) {
772 if (e) {
773        exception_caught = true;
774        EIGEN_THROW_X(e);
775      }
776 }
777 return exception_caught;
778 }
779
781 bool exception_caught_ = false;
782
783 mutable std::mutex pmapper_mutex_;
784
785#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
786 mutable std::mutex event_mutex_;
787 mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
788#endif
789
794 mutable TensorSycl::internal::PointerMapper pMapper;
795#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
796 mutable std::unordered_set<void *> scratch_buffers;
797#endif
799 mutable cl::sycl::queue m_queue;
800#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
801 mutable cl::sycl::program m_prog;
802#endif
803
806 mutable Eigen::ThreadPool m_thread_pool;
807
808 const TensorSycl::internal::SyclDeviceInfo m_device_info;
809};
810
811struct SyclDeviceBase {
814 const QueueInterface *m_queue_stream;
815 explicit SyclDeviceBase(const QueueInterface *queue_stream)
816 : m_queue_stream(queue_stream) {}
817 EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
818 return m_queue_stream;
819 }
820};
821
822// Here is a SYCL device struct which accepts the SYCL queue interface
823// as an input.
824struct SyclDevice : public SyclDeviceBase {
825 explicit SyclDevice(const QueueInterface *queue_stream)
826 : SyclDeviceBase(queue_stream) {}
827
828 // this is the accessor used to construct the evaluator
829 template <cl::sycl::access::mode AcMd, typename T>
830 EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
831 get_range_accessor(const void *ptr) const {
832 return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
833 }
834
835 // get sycl accessor
836 template <cl::sycl::access::mode AcMd>
837 EIGEN_STRONG_INLINE cl::sycl::accessor<
838 buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
839 get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
840 return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
841 }
842
844 EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
845 const void *ptr) const {
846 return queue_stream()->get_sycl_buffer(ptr);
847 }
848
851 template <typename Index>
852 EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
853 Index &rng, Index &GRange) const {
854 queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
855 }
856
859 template <typename Index>
860 EIGEN_STRONG_INLINE void parallel_for_setup(
861 const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
862 cl::sycl::range<2> &local_range) const {
863 queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
864 }
865
868 template <typename Index>
869 EIGEN_STRONG_INLINE void parallel_for_setup(
870 const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
871 cl::sycl::range<3> &local_range) const {
872 queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
873 }
874
876 EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
877 return queue_stream()->allocate(num_bytes);
878 }
879
880 EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
881 return queue_stream()->allocate_temp(num_bytes);
882 }
883
885 EIGEN_STRONG_INLINE void deallocate(void *p) const {
886 queue_stream()->deallocate(p);
887 }
888
889 EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
890 queue_stream()->deallocate_temp(buffer);
891 }
892 template <cl::sycl::access::mode AcMd, typename T>
893 EIGEN_STRONG_INLINE void deallocate_temp(
894 const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
895 queue_stream()->deallocate_temp(buffer);
896 }
897 EIGEN_STRONG_INLINE void deallocate_all() const {
898 queue_stream()->deallocate_all();
899 }
900
901 template <typename data_t>
902 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
903 cl::sycl::access::mode::read_write, data_t>
904 get(data_t *data) const {
905 return queue_stream()->get(data);
906 }
907 template <typename data_t>
908 EIGEN_STRONG_INLINE data_t *get(
909     TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
910 data_t>
911 data) const {
912 return queue_stream()->get(data);
913 }
914
916 EIGEN_STRONG_INLINE void *attach_buffer(
917 cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
918 return queue_stream()->attach_buffer(buf);
919 }
921 EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
922 queue_stream()->detach_buffer(p);
923 }
924 EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
925 return queue_stream()->get_offset(ptr);
926 }
927
928 // some runtime conditions that can be applied here
929 EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
930
932 template <typename Index>
933 EIGEN_STRONG_INLINE void memcpyHostToDevice(
934 Index *dst, const Index *src, size_t n,
935 std::function<void()> callback = {}) const {
936 queue_stream()->memcpyHostToDevice(dst, src, n, callback);
937 }
939 template <typename Index>
940 EIGEN_STRONG_INLINE void memcpyDeviceToHost(
941 void *dst, const Index *src, size_t n,
942 std::function<void()> callback = {}) const {
943 queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
944 }
946 template <typename Index>
947 EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
948 queue_stream()->memcpy(dst, src, n);
949 }
951 EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
952 queue_stream()->memset(data, c, n);
953 }
955 EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
956 return queue_stream()->sycl_queue();
957 }
958#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
959 EIGEN_STRONG_INLINE cl::sycl::program &program() const {
960 return queue_stream()->program();
961 }
962#endif
963
964 EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
965
966 EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
967 // We won't try to take advantage of the l2 cache for the time being, and
968 // there is no l3 cache on sycl devices.
969 return firstLevelCacheSize();
970 }
971 EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
972 return queue_stream()->getNumSyclMultiProcessors();
973 }
974 EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
975 return queue_stream()->maxSyclThreadsPerBlock();
976 }
977 EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
978 return queue_stream()->maxWorkItemSizes();
979 }
980 EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
981 // OpenCL does not have such a concept
982 return queue_stream()->maxSyclThreadsPerMultiProcessor();
983 }
984 EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
985 return queue_stream()->sharedMemPerBlock();
986 }
987 EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
988 return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
989 }
990
991 EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
992 return queue_stream()->getPowerOfTwo(val, roundUp);
993 }
995 EIGEN_STRONG_INLINE int majorDeviceVersion() const {
996 return queue_stream()->majorDeviceVersion();
997 }
998
999 EIGEN_STRONG_INLINE void synchronize() const {
1000 queue_stream()->synchronize();
1001 }
1002 EIGEN_STRONG_INLINE void async_synchronize(
1003 cl::sycl::event e = cl::sycl::event()) const {
1004 queue_stream()->async_synchronize(e);
1005 }
1006 EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
1007 return queue_stream()->get_latest_event();
1008 }
1009
1010 // This function checks if the runtime recorded an error for the
1011 // underlying stream device.
1012 EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
1013
1014 EIGEN_STRONG_INLINE bool has_local_memory() const {
1015 return queue_stream()->has_local_memory();
1016 }
1017 EIGEN_STRONG_INLINE long max_buffer_size() const {
1018 return queue_stream()->max_buffer_size();
1019 }
1020 EIGEN_STRONG_INLINE std::string getPlatformName() const {
1021 return queue_stream()->getPlatformName();
1022 }
1023 EIGEN_STRONG_INLINE std::string getDeviceName() const {
1024 return queue_stream()->getDeviceName();
1025 }
1026 EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
1027 return queue_stream()->getDeviceVendor();
1028 }
1029 template <typename OutScalar, typename KernelType, typename... T>
1030 EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
1031 queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
1032 var...);
1033 }
1034 template <typename OutScalar, typename KernelType, typename... T>
1035 EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
1036 queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
1037 var...);
1038 }
1039
1040 template <typename OutScalar, typename KernelType, typename... T>
1041 EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
1042 queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
1043 var...);
1044 }
1045};
1046} // end namespace Eigen
1047
1048#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
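// A minimal end-to-end sketch of how SyclDevice is typically used with the
// unsupported Tensor module. It assumes a SYCL implementation on the include
// path and the header unsupported/Eigen/CXX11/Tensor; it is an illustration,
// not part of this header.
//
// #define EIGEN_USE_SYCL
// #include <unsupported/Eigen/CXX11/Tensor>
//
// int main() {
//   // Build a queue from the first supported SYCL device and wrap it.
//   Eigen::QueueInterface queue_interface(Eigen::get_sycl_supported_devices()[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//
//   // Host tensors.
//   Eigen::Tensor<float, 1> in(64), out(64);
//   in.setRandom();
//
//   // Device buffers managed by the pointer mapper above.
//   float *gpu_in = static_cast<float *>(sycl_device.allocate(in.size() * sizeof(float)));
//   float *gpu_out = static_cast<float *>(sycl_device.allocate(out.size() * sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in_map(gpu_in, 64);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out_map(gpu_out, 64);
//
//   // Copy in, evaluate an expression on the SYCL device, copy back.
//   sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
//   gpu_out_map.device(sycl_device) = gpu_in_map * 2.0f;
//   sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size() * sizeof(float));
//   sycl_device.synchronize();
//
//   sycl_device.deallocate(gpu_in);
//   sycl_device.deallocate(gpu_out);
//   return 0;
// }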