#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>
namespace Eigen {

namespace TensorSycl {
namespace internal {

/// Cache all the device information needed.
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device()
                          .get_platform()
                          .template get_info<
                              cl::sycl::info::platform::name>()),
        device_name(queue.get_device()
                        .template get_info<
                            cl::sycl::info::device::name>()),
        device_vendor(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

}  // end namespace internal
}  // end namespace TensorSycl
typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
    -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name =
        platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(),
                   platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      bool unsupported_condition =
          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
           vendor.find("apu") == std::string::npos) ||
          (platform_name.find("experimental") != std::string::npos) ||
          device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
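
// Usage sketch (illustrative only, not part of the original header): picking
// one of the supported devices and wiring it into the QueueInterface /
// SyclDevice pair defined below. The variable names are hypothetical.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   Eigen::QueueInterface queue_interface(devices[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);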
class QueueInterface {
 public:
  /// Creating device by using cl::sycl::selector or cl::sycl::device.
  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
      unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(dev_or_sel, handler),
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
#endif
        m_thread_pool(num_threads),
        m_device_info(m_queue) {
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
    m_prog.build_with_kernel_type<DeviceOrSelector>();
    auto f = [&](cl::sycl::handler &cgh) {
      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
                                        [=]() {});
    };
    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
#endif
  }

  template <typename DeviceOrSelector>
  explicit QueueInterface(
      const DeviceOrSelector &dev_or_sel,
      unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(dev_or_sel,
                       [this](cl::sycl::exception_list l) {
                         this->exception_caught_ = this->sycl_async_handler(l);
                       },
                       num_threads) {}

#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
#endif
  /// Attach an existing sycl buffer to the pointer mapper and return the
  /// associated virtual pointer.
  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return static_cast<void *>(pMapper.add_pointer(buf));
  }

  /// Detach a previously attached buffer without freeing it.
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree<false>(p, pMapper);
  }
  /// Allocating device memory. The returned pointer is a host-side key used to
  /// look up the underlying sycl buffer.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
    if (align > 0) num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
#endif
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    if (scratch_buffers.empty()) {
      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
    }
    // Reuse an existing scratch buffer if one is large enough.
    for (auto it = scratch_buffers.begin(); it != scratch_buffers.end(); ++it) {
      auto buff = pMapper.get_buffer(*it);
      if (buff.get_size() >= num_bytes) {
        void *ptr = *it;
        scratch_buffers.erase(it);
        return ptr;
      }
    }
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#else
    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
#endif
  }

  template <typename data_t>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(
        data);
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return static_cast<data_t *>(data.get_virtual_pointer());
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.insert(p);
#else
    TensorSycl::internal::SYCLfree(p, pMapper);
#endif
  }

  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
    deallocate_temp(p.get_virtual_pointer());
  }

  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfree(p, pMapper);
  }

  EIGEN_STRONG_INLINE void deallocate_all() const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    TensorSycl::internal::SYCLfreeAll(pMapper);
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }
  /// Copies n bytes from the host pointer src to the device virtual pointer
  /// dst.
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
        write_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
      auto non_deleter = [](buffer_scalar_t const *) {};
      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(s_ptr, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
  /// Copies n bytes from the device virtual pointer src to the host pointer
  /// dst.
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const void *src, size_t n,
      std::function<void()> callback) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto global_access = cl::sycl::access::target::global_buffer;
    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
        read_accessor;
    if (n == 0) {
      if (callback) callback();
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
      auto non_deleter = [](buffer_scalar_t *) {};
      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
      cgh.copy(src_acc, s_ptr);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    synchronize_and_callback(e, callback);
  }
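
  // Usage sketch (illustrative only): round-tripping a host array through a
  // virtual device pointer via the SyclDevice wrapper defined later in this
  // file. Sizes are in bytes; the callback defaults to empty, in which case
  // the copy blocks until completion. The variable names are hypothetical.
  //
  //   float host_in[16] = {1.f, 2.f /* ... */}, host_out[16];
  //   float *gpu_ptr =
  //       static_cast<float *>(sycl_device.allocate(sizeof(host_in)));
  //   sycl_device.memcpyHostToDevice(gpu_ptr, host_in, sizeof(host_in));
  //   sycl_device.memcpyDeviceToHost(host_out, gpu_ptr, sizeof(host_out));
  //   sycl_device.deallocate(gpu_ptr);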
  /// Device to device copy of n bytes between two virtual pointers.
  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    static const auto read_mode = cl::sycl::access::mode::read;
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
      cgh.copy(src_acc, dst_acc);
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
  /// Fills n bytes of the device memory behind data with the byte value c.
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    static const auto write_mode = cl::sycl::access::mode::discard_write;
    if (n == 0) {
      return;
    }
    n /= sizeof(buffer_scalar_t);
    auto f = [&](cl::sycl::handler &cgh) {
      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
      // The cast to uint8_t matches the behaviour of std::memset; the cast to
      // buffer_scalar_t matches the type of the accessor.
      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
    async_synchronize(e);
  }
  /// Creating a placeholder range accessor for the virtual pointer.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;

    std::lock_guard<std::mutex> lock(pmapper_mutex_);

    auto original_buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    const ptrdiff_t typed_offset = offset / sizeof(T);
    eigen_assert(typed_offset >= 0);
    const auto typed_size = original_buffer.get_size() / sizeof(T);
    auto buffer = original_buffer.template reinterpret<
        typename Eigen::internal::remove_const<T>::type>(
        cl::sycl::range<1>(typed_size));
    const ptrdiff_t size = buffer.get_count() - typed_offset;
    eigen_assert(size >= 0);
    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
                               1, AcMd, global_access, is_place_holder>
        placeholder_accessor_t;
    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
                                           cl::sycl::id<1>(typed_offset)),
                    static_cast<size_t>(typed_offset),
                    reinterpret_cast<std::intptr_t>(start_ptr));
  }
  /// Get a range accessor to the n_bytes behind the virtual pointer, bound to
  /// the given command group handler.
  template <cl::sycl::access::mode AcMd, typename Index>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
                     const Index n_bytes) const {
    static const auto global_access = cl::sycl::access::target::global_buffer;
    eigen_assert(n_bytes >= 0);
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    auto buffer = pMapper.get_buffer(ptr);
    const ptrdiff_t offset = pMapper.get_offset(ptr);
    eigen_assert(offset >= 0);
    return buffer.template get_access<AcMd, global_access>(
        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
  }
  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr)
        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
            cgh);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_buffer(ptr);
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    std::lock_guard<std::mutex> lock(pmapper_mutex_);
    return pMapper.get_offset(ptr);
  }
  template <typename OutScalar, typename sycl_kernel, typename Lhs,
            typename Rhs, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
                                                  const Rhs &rhs, OutPtr outptr,
                                                  Range thread_range,
                                                  Index scratchSize,
                                                  T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // bind the placeholder accessors to the command group handler
      lhs.bind(cgh);
      rhs.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename OutPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
                                                 OutPtr &outptr,
                                                 Range thread_range,
                                                 Index scratchSize,
                                                 T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // bind the placeholder accessors to the command group handler
      inptr.bind(cgh);
      outptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }
  template <typename OutScalar, typename sycl_kernel, typename InPtr,
            typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
                                                   Range thread_range,
                                                   Index scratchSize,
                                                   T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      // bind the placeholder accessor to the command group handler
      inptr.bind(cgh);
      typedef cl::sycl::accessor<OutScalar, 1,
                                 cl::sycl::access::mode::read_write,
                                 cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
          program().template get_kernel<sycl_kernel>(),
#endif
          thread_range, sycl_kernel(scratch, inptr, var...));
    };
    cl::sycl::event e;
    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
    async_synchronize(e);
  }

  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
    set_latest_event(e);
#ifndef EIGEN_SYCL_ASYNC_EXECUTION
    synchronize();
#endif
  }
  /// Splitting an n-element 1D range into a work-group size (tileSize) and a
  /// rounded-up global range (GRange).
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
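
  // Worked example (assuming getNearestPowerOfTwoWorkGroupSize() == 256 and a
  // 16x16 EIGEN_SYCL_LOCAL_THREAD_DIM0/1 configuration): for n == 1000 this
  // yields tileSize == 256, rng == 1000 and GRange == 1024, i.e. the global
  // range is rounded up to the next multiple of the tile size.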
  /// Splitting a 2D input into local and global ranges.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
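
  // Worked example (same 256 work-item assumption as above): for
  // input_dim == {100, 50} the split gives local_range == {16, 16} and
  // global_range == {112, 64}, each global dimension rounded up to a multiple
  // of the corresponding local dimension.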
  /// Splitting a 3D input into local and global ranges.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size =
        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size =
        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
                 static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0)
        global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(
        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] =
        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0)
        global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size /
                                        (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0)
        global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }
  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type ==
           cl::sycl::info::local_mem_type::local;
#endif
  }
  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return m_device_info.max_mem_alloc_size;
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return m_device_info.max_compute_units;
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return m_device_info.max_work_group_size;
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return m_device_info.max_work_item_sizes;
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return m_device_info.local_mem_size; }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const { return m_device_info.platform_name; }
  EIGEN_STRONG_INLINE std::string getDeviceName() const { return m_device_info.device_name; }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return m_device_info.device_vendor; }

  /// Rounds wGSize down (roundUp == false) or up (roundUp == true) to the
  /// nearest power of two.
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
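
  // Worked example: getPowerOfTwo(48, false) smears the set bits to 63 and
  // returns 63 - 31 == 32 (the largest power of two <= 48), while
  // getPowerOfTwo(48, true) first decrements to 47, smears to 63 and returns
  // 64 (the smallest power of two >= 48).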
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }
  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    return latest_events_[std::this_thread::get_id()];
#else
    return cl::sycl::event();
#endif
  }
  ~QueueInterface() {
    pMapper.clear();
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
    scratch_buffers.clear();
#endif
  }

 protected:
  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
    std::lock_guard<std::mutex> lock(event_mutex_);
    latest_events_[std::this_thread::get_id()] = e;
#else
    EIGEN_UNUSED_VARIABLE(e);
#endif
  }
  void synchronize_and_callback(cl::sycl::event e,
                                const std::function<void()> &callback) const {
    set_latest_event(e);
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }
  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }

  bool exception_caught_ = false;
  mutable std::mutex pmapper_mutex_;

#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
  mutable std::mutex event_mutex_;
  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
#endif

  mutable TensorSycl::internal::PointerMapper pMapper;
#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
  mutable std::unordered_set<void *> scratch_buffers;
#endif
  mutable cl::sycl::queue m_queue;
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  mutable cl::sycl::program m_prog;
#endif

  /// Thread pool used to wait on events and run the memcpy callbacks.
  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};
struct SyclDeviceBase {
  /// The QueueInterface is not owned; it is the caller's responsibility to
  /// destroy it.
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream)
      : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
    return m_queue_stream;
  }
};
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream)
      : SyclDeviceBase(queue_stream) {}
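
  // Usage sketch (illustrative only): evaluating a tensor expression on the
  // device through .device(). Buffer sizes and names are hypothetical.
  //
  //   Eigen::QueueInterface queue_interface(cl::sycl::default_selector());
  //   Eigen::SyclDevice sycl_device(&queue_interface);
  //   float *in = static_cast<float *>(sycl_device.allocate(64 * sizeof(float)));
  //   float *out = static_cast<float *>(sycl_device.allocate(64 * sizeof(float)));
  //   Eigen::TensorMap<Eigen::Tensor<float, 1>> t_in(in, 64), t_out(out, 64);
  //   t_out.device(sycl_device) = t_in * 2.0f;
  //   sycl_device.synchronize();
  //   sycl_device.deallocate(in);
  //   sycl_device.deallocate(out);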
  /// The range accessor used to construct the evaluators.
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
  get_range_accessor(const void *ptr) const {
    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
  }

  template <cl::sycl::access::mode AcMd>
  EIGEN_STRONG_INLINE cl::sycl::accessor<
      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
  }

  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
      const void *ptr) const {
    return queue_stream()->get_sycl_buffer(ptr);
  }
  /// This is used to prepare the number of threads and also the number of
  /// threads per block for sycl kernels.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
                                              Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
      cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(
      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
      cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }
  /// Allocating and deallocating device memory.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
    return queue_stream()->allocate(num_bytes);
  }
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return queue_stream()->allocate_temp(num_bytes);
  }
  EIGEN_STRONG_INLINE void deallocate(void *p) const {
    queue_stream()->deallocate(p);
  }
  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  template <cl::sycl::access::mode AcMd, typename T>
  EIGEN_STRONG_INLINE void deallocate_temp(
      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
    queue_stream()->deallocate_temp(buffer);
  }
  EIGEN_STRONG_INLINE void deallocate_all() const {
    queue_stream()->deallocate_all();
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
      cl::sycl::access::mode::read_write, data_t>
  get(data_t *data) const {
    return queue_stream()->get(data);
  }
  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(
      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
                                        data_t>
          data) const {
    return queue_stream()->get(data);
  }

  EIGEN_STRONG_INLINE void *attach_buffer(
      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
    return queue_stream()->attach_buffer(buf);
  }
  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
    queue_stream()->detach_buffer(p);
  }
  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
    return queue_stream()->get_offset(ptr);
  }
  /// memcpy to the device; n is in bytes.
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(
      Index *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }
  /// memcpy to the host; n is in bytes.
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
      void *dst, const Index *src, size_t n,
      std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }
  /// device to device memcpy.
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }
  /// the memset function.
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    queue_stream()->memset(data, c, n);
  }
  /// Returning the underlying sycl queue.
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
    return queue_stream()->sycl_queue();
  }
#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
    return queue_stream()->program();
  }
#endif

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    return firstLevelCacheSize();
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
    return queue_stream()->maxSyclThreadsPerBlock();
  }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
    return queue_stream()->maxWorkItemSizes();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
    return queue_stream()->sharedMemPerBlock();
  }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return queue_stream()->majorDeviceVersion();
  }
  EIGEN_STRONG_INLINE void synchronize() const {
    queue_stream()->synchronize();
  }
  EIGEN_STRONG_INLINE void async_synchronize(
      cl::sycl::event e = cl::sycl::event()) const {
    queue_stream()->async_synchronize(e);
  }
  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
    return queue_stream()->get_latest_event();
  }
  EIGEN_STRONG_INLINE bool has_local_memory() const {
    return queue_stream()->has_local_memory();
  }
  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
    return queue_stream()->max_buffer_size();
  }
  EIGEN_STRONG_INLINE std::string getPlatformName() const {
    return queue_stream()->getPlatformName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceName() const {
    return queue_stream()->getDeviceName();
  }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
    return queue_stream()->getDeviceVendor();
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
        var...);
  }
};  // struct SyclDevice

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H