1#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
2#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
5#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
7#include "unsupported/Eigen/CXX11/Tensor"
// Shorthand that expands to BENCHMARK(bench)->Range(lo, hi): it applies the
// BENCHMARK macro to `bench` and calls Range(lo, hi) on the result.
10#define BENCHMARK_RANGE(bench, lo, hi) \
11 BENCHMARK(bench)->Range(lo, hi)
// Member-initializer list of BenchmarkSuite(device, m, k, n): m_, k_ and n_
// each take the argument of the same name, and device_ is initialized from
// `device`.
21 : m_(
m), k_(k), n_(
n), device_(device) {
// Member-initializer list of BenchmarkSuite(device, m): the single size `m`
// initializes m_, k_ and n_ alike.
26 : m_(
m), k_(
m), n_(
m), device_(device) {
// Member-initializer list of BenchmarkSuite(device, m, k): m_ is fixed to 1,
// k_ takes `k`, and the `m` argument initializes n_ (not m_).
31 : m_(1), k_(k), n_(
m), device_(device) {
// ~BenchmarkSuite(): passes each of the buffers a_, b_ and c_ to
// device_.deallocate().
36 device_.deallocate(a_);
37 device_.deallocate(b_);
38 device_.deallocate(c_);
// memcpy(num_iters): copies m_ * m_ elements of T from a_ to c_ through
// device_.memcpy.
// First pass: a fixed 10 iterations. NOTE(review): presumably a warm-up; any
// guard or timing call around it is not visible in this excerpt.
44 for (
int iter = 0; iter < 10; ++iter) {
45 device_.memcpy(c_, a_, m_ * m_ *
sizeof(
T));
// Second pass: the same copy repeated num_iters times.
49 for (
int iter = 0; iter < num_iters; ++iter) {
50 device_.memcpy(c_, a_, m_ * m_ *
sizeof(
T));
// Reports m_ * m_ * num_iters items; the first operand is widened to int64_t
// before the multiplication.
53 finalizeBenchmark(
static_cast<int64_t
>(m_) * m_ * num_iters);
// typeCasting(num_iters): evaluates A.template cast<T>() into B on device_.
// The size setup depends on how sizeof(T) compares with sizeof(int).
59 if (
sizeof(
T) >=
sizeof(
int)) {
// Sizes derived from m_ and k_ scaled by sizeof(T) / sizeof(int).
// NOTE(review): the source lines between the condition above and these two
// assignments are missing from this excerpt, so which branch they belong to
// cannot be confirmed here.
63 sizes[0] = m_ *
sizeof(
T) /
sizeof(
int);
64 sizes[1] = k_ *
sizeof(
T) /
sizeof(
int);
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
69 for (
int iter = 0; iter < 10; ++iter) {
70 B.device(device_) =
A.template cast<T>();
// Second pass: the same cast expression repeated num_iters times.
74 for (
int iter = 0; iter < num_iters; ++iter) {
75 B.device(device_) =
A.template cast<T>();
// Reports m_ * k_ * num_iters items, with the first operand widened to int64_t.
78 finalizeBenchmark(
static_cast<int64_t
>(m_) * k_ * num_iters);
// random(num_iters): assigns C.random() to C on device_.
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
88 for (
int iter = 0; iter < 10; ++iter) {
89 C.device(device_) =
C.random();
// Second pass: the same assignment repeated num_iters times.
93 for (
int iter = 0; iter < num_iters; ++iter) {
94 C.device(device_) =
C.random();
// Reports m_ * m_ * num_iters items, with the first operand widened to int64_t.
97 finalizeBenchmark(
static_cast<int64_t
>(m_) * m_ * num_iters);
// slicing(num_iters): writes four slices of extent quarter_sizes into C. The
// first and third quadrant slices are read from A, the second and fourth from
// B, each at the same offset as its destination. The definitions of the
// *_quadrant offsets and quarter_sizes are not visible in this excerpt.
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
115 for (
int iter = 0; iter < 10; ++iter) {
116 C.slice(first_quadrant, quarter_sizes).device(device_) =
117 A.slice(first_quadrant, quarter_sizes);
118 C.slice(second_quadrant, quarter_sizes).device(device_) =
119 B.slice(second_quadrant, quarter_sizes);
120 C.slice(third_quadrant, quarter_sizes).device(device_) =
121 A.slice(third_quadrant, quarter_sizes);
122 C.slice(fourth_quadrant, quarter_sizes).device(device_) =
123 B.slice(fourth_quadrant, quarter_sizes);
// Second pass: the same four slice assignments repeated num_iters times.
127 for (
int iter = 0; iter < num_iters; ++iter) {
128 C.slice(first_quadrant, quarter_sizes).device(device_) =
129 A.slice(first_quadrant, quarter_sizes);
130 C.slice(second_quadrant, quarter_sizes).device(device_) =
131 B.slice(second_quadrant, quarter_sizes);
132 C.slice(third_quadrant, quarter_sizes).device(device_) =
133 A.slice(third_quadrant, quarter_sizes);
134 C.slice(fourth_quadrant, quarter_sizes).device(device_) =
135 B.slice(fourth_quadrant, quarter_sizes);
// Reports m_ * m_ * num_iters items, with the first operand widened to int64_t.
139 finalizeBenchmark(
static_cast<int64_t
>(m_) * m_ * num_iters);
// rowChip(num_iters): assigns B.chip(iter % k_, 0) to C, i.e. a chip along
// dimension 0 whose offset cycles through 0..k_-1 as iter advances.
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
151 for (
int iter = 0; iter < 10; ++iter) {
152 C.device(device_) =
B.chip(iter % k_, 0);
// Second pass: the same chip assignment repeated num_iters times.
156 for (
int iter = 0; iter < num_iters; ++iter) {
157 C.device(device_) =
B.chip(iter % k_, 0);
// Reports n_ * num_iters items, with the first operand widened to int64_t.
160 finalizeBenchmark(
static_cast<int64_t
>(n_) * num_iters);
// colChip(num_iters): assigns B.chip(iter % n_, 1) to C, i.e. a chip along
// dimension 1 whose offset cycles through 0..n_-1 as iter advances.
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
172 for (
int iter = 0; iter < 10; ++iter) {
173 C.device(device_) =
B.chip(iter % n_, 1);
// Second pass: the same chip assignment repeated num_iters times.
177 for (
int iter = 0; iter < num_iters; ++iter) {
178 C.device(device_) =
B.chip(iter % n_, 1);
// Reports n_ * num_iters items, with the first operand widened to int64_t.
181 finalizeBenchmark(
static_cast<int64_t
>(n_) * num_iters);
// shuffling(num_iters): assigns A.shuffle(shuffle) to B on device_. The
// definition of the `shuffle` permutation is not visible in this excerpt.
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
199 for (
int iter = 0; iter < 10; ++iter) {
200 B.device(device_) =
A.shuffle(shuffle);
// Second pass: the same shuffle assignment repeated num_iters times.
204 for (
int iter = 0; iter < num_iters; ++iter) {
205 B.device(device_) =
A.shuffle(shuffle);
// Reports m_ * k_ * num_iters items, with the first operand widened to int64_t.
208 finalizeBenchmark(
static_cast<int64_t
>(m_) * k_ * num_iters);
// padding(num_iters): assigns A.pad(paddings) to B on device_.
// When EIGEN_HAS_INDEX_LIST is defined, `paddings` is a compile-time
// Eigen::IndexPairList holding the pairs (0, 0) and (2, 1). Any alternative
// branch of this conditional is not visible in this excerpt.
222#if defined(EIGEN_HAS_INDEX_LIST)
223 Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
224 Eigen::type2indexpair<2, 1> > paddings;
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
231 for (
int iter = 0; iter < 10; ++iter) {
232 B.device(device_) =
A.pad(paddings);
// Second pass: the same pad assignment repeated num_iters times.
236 for (
int iter = 0; iter < num_iters; ++iter) {
237 B.device(device_) =
A.pad(paddings);
// Reports m_ * k_ * num_iters items, with the first operand widened to int64_t.
240 finalizeBenchmark(
static_cast<int64_t
>(m_) * k_ * num_iters);
// striding(num_iters): assigns A.stride(strides) to B on device_.
// NOTE(review): the lines between this #ifndef and the IndexList declaration
// are missing from this excerpt; the declaration below is presumably the
// alternative taken when EIGEN_HAS_INDEX_LIST is defined — confirm.
254#ifndef EIGEN_HAS_INDEX_LIST
// Compile-time strides of 1 and 2 for the two dimensions.
261 Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
265 for (
int iter = 0; iter < 10; ++iter) {
266 B.device(device_) =
A.stride(strides);
// Second pass: the same stride assignment repeated num_iters times.
270 for (
int iter = 0; iter < num_iters; ++iter) {
271 B.device(device_) =
A.stride(strides);
// Reports m_ * k_ * num_iters items, with the first operand widened to int64_t.
274 finalizeBenchmark(
static_cast<int64_t
>(m_) * k_ * num_iters);
// broadcasting(num_iters): assigns A.broadcast(broadcast) to C on device_.
// NOTE(review): the lines between this #ifndef and the IndexList declaration
// are missing from this excerpt; the declaration below is presumably the
// alternative taken when EIGEN_HAS_INDEX_LIST is defined — confirm.
288#ifndef EIGEN_HAS_INDEX_LIST
// Broadcast factors: a compile-time 1 for dimension 0 and a runtime value for
// dimension 1, which is set to n_ on the following line.
295 Eigen::IndexList<Eigen::type2index<1>,
int> broadcast;
296 broadcast.set(1, n_);
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
300 for (
int iter = 0; iter < 10; ++iter) {
301 C.device(device_) =
A.broadcast(broadcast);
// Second pass: the same broadcast assignment repeated num_iters times.
305 for (
int iter = 0; iter < num_iters; ++iter) {
306 C.device(device_) =
A.broadcast(broadcast);
// Reports m_ * n_ * num_iters items, with the first operand widened to int64_t.
309 finalizeBenchmark(
static_cast<int64_t
>(m_) * n_ * num_iters);
// coeffWiseOp(num_iters): evaluates C = A * 3.14 + B * 2.7, where each scalar
// is cast to T and expanded with constant() on the tensor it multiplies.
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
321 for (
int iter = 0; iter < 10; ++iter) {
322 C.device(device_) =
A *
A.constant(
static_cast<T>(3.14)) +
B *
B.constant(
static_cast<T>(2.7));
// Second pass: the same expression repeated num_iters times.
326 for (
int iter = 0; iter < num_iters; ++iter) {
327 C.device(device_) =
A *
A.constant(
static_cast<T>(3.14)) +
B *
B.constant(
static_cast<T>(2.7));
// Reports 3 * m_ * m_ * num_iters; the factor 3 matches the two
// multiplications and one addition in the expression above.
331 finalizeBenchmark(
static_cast<int64_t
>(3) * m_ * m_ * num_iters);
// algebraicFunc(num_iters): evaluates C = A.rsqrt() + B.sqrt() * B.square()
// on device_.
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
344for (
int iter = 0; iter < 10; ++iter) {
345 C.device(device_) =
A.rsqrt() +
B.sqrt() *
B.square();
// Second pass: the same expression repeated num_iters times.
349 for (
int iter = 0; iter < num_iters; ++iter) {
350 C.device(device_) =
A.rsqrt() +
B.sqrt() *
B.square();
// Reports m_ * m_ * num_iters items, with the first operand widened to int64_t.
354 finalizeBenchmark(
static_cast<int64_t
>(m_) * m_ * num_iters);
// transcendentalFunc(num_iters): evaluates C = A.exp() + B.log() on device_.
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
366 for (
int iter = 0; iter < 10; ++iter) {
367 C.device(device_) =
A.exp() +
B.log();
// Second pass: the same expression repeated num_iters times.
371 for (
int iter = 0; iter < num_iters; ++iter) {
372 C.device(device_) =
A.exp() +
B.log();
// Reports m_ * m_ * num_iters items, with the first operand widened to int64_t.
376 finalizeBenchmark(
static_cast<int64_t
>(m_) * m_ * num_iters);
// rowReduction(num_iters): assigns B.sum(sum_along_dim) to C, reducing over
// dimension 0.
// Two spellings of the reduction axis appear below: a runtime element
// assignment and a compile-time Eigen::IndexList. NOTE(review): the
// declaration used by the first and the #else/#endif separating them are not
// visible in this excerpt.
389#ifndef EIGEN_HAS_INDEX_LIST
391 sum_along_dim[0] = 0;
395 Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
398 for (
int iter = 0; iter < 10; ++iter) {
399 C.device(device_) =
B.sum(sum_along_dim);
// Second pass: the same reduction repeated num_iters times.
403 for (
int iter = 0; iter < num_iters; ++iter) {
404 C.device(device_) =
B.sum(sum_along_dim);
// Reports k_ * n_ * num_iters items, with the first operand widened to int64_t.
408 finalizeBenchmark(
static_cast<int64_t
>(k_) * n_ * num_iters);
// colReduction(num_iters): assigns B.sum(sum_along_dim) to A (not C),
// reducing over dimension 1.
// Two spellings of the reduction axis appear below: a runtime element
// assignment and a compile-time Eigen::IndexList. NOTE(review): the
// declaration used by the first and the #else/#endif separating them are not
// visible in this excerpt.
423#ifndef EIGEN_HAS_INDEX_LIST
425 sum_along_dim[0] = 1;
429 Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
432 for (
int iter = 0; iter < 10; ++iter) {
433 A.device(device_) =
B.sum(sum_along_dim);
// Second pass: the same reduction repeated num_iters times.
437 for (
int iter = 0; iter < num_iters; ++iter) {
438 A.device(device_) =
B.sum(sum_along_dim);
// Reports k_ * n_ * num_iters items, with the first operand widened to int64_t.
442 finalizeBenchmark(
static_cast<int64_t
>(k_) * n_ * num_iters);
// fullReduction(num_iters): assigns B.sum() (no dimension argument) to C on
// device_.
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
456 for (
int iter = 0; iter < 10; ++iter) {
457 C.device(device_) =
B.sum();
// Second pass: the same reduction repeated num_iters times.
461 for (
int iter = 0; iter < num_iters; ++iter) {
462 C.device(device_) =
B.sum();
// Reports k_ * n_ * num_iters items, with the first operand widened to int64_t.
466 finalizeBenchmark(
static_cast<int64_t
>(k_) * n_ * num_iters);
// contraction(num_iters): forwards to the three-argument contraction with the
// Eigen::ColMajor layout as template argument and both transpose flags false.
473 contraction<static_cast<int>(
Eigen::ColMajor)>(num_iters,
false,
false);
// contractionRowMajor(num_iters): forwards to the three-argument contraction
// with the Eigen::RowMajor layout and both transpose flags false.
477 contraction<static_cast<int>(
Eigen::RowMajor)>(num_iters,
false,
false);
// contractionRowMajorAT(num_iters): Eigen::RowMajor layout with trans_a = true
// and trans_b = false.
481 contraction<static_cast<int>(
Eigen::RowMajor)>(num_iters,
true,
false);
// contractionRowMajorBT(num_iters): Eigen::RowMajor layout with trans_a = false
// and trans_b = true.
485 contraction<static_cast<int>(
Eigen::RowMajor)>(num_iters,
false,
true);
// convolution(num_iters, kernel_x, kernel_y): assigns A.convolve(B, dims) to C
// on device_. The declarations of kernel_sizes, result_sizes, dims and the
// tensors are not visible in this excerpt.
// Kernel extent is kernel_x by kernel_y.
498 kernel_sizes[0] = kernel_x;
499 kernel_sizes[1] = kernel_y;
// Result extent shrinks by (kernel - 1) in each dimension.
502 result_sizes[0] = m_ - kernel_x + 1;
503 result_sizes[1] = n_ - kernel_y + 1;
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
509 for (
int iter = 0; iter < 10; ++iter) {
510 C.device(device_) =
A.convolve(
B, dims);
// Second pass: the same convolution repeated num_iters times.
514 for (
int iter = 0; iter < num_iters; ++iter) {
515 C.device(device_) =
A.convolve(
B, dims);
// Reports 2 * (output elements) * (kernel elements) * num_iters.
// NOTE(review): the factor 2 presumably counts one multiply and one add per
// kernel tap — confirm.
519 finalizeBenchmark(
static_cast<int64_t
>(2) *
520 (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
// contraction(num_iters, trans_a, trans_b): assigns A.contract(B, dims) to C
// on device_. The template header, tensor declarations and the computation of
// a_contract_dim / b_contract_dim are not visible in this excerpt.
526 void contraction(
int num_iters,
bool trans_a,
bool trans_b) {
// A is sized (m_, k_), or (k_, m_) when trans_a is set.
528 sizeA[0] = (trans_a ? k_: m_);
529 sizeA[1] = (trans_a ? m_: k_);
// B is sized (k_, n_), or (n_, k_) when trans_b is set.
531 sizeB[0] = (trans_b ? n_: k_);
532 sizeB[1] = (trans_b ? k_: n_);
// Single contraction pair: one dimension of A against one dimension of B.
545 dims[0] =
DimPair(a_contract_dim, b_contract_dim);
// First pass: a fixed 10 iterations (presumably a warm-up; surrounding guard
// not visible).
547 for (
int iter = 0; iter < 10; ++iter) {
548 C.device(device_) =
A.contract(
B, dims);
// Second pass: the same contraction repeated num_iters times.
552 for (
int iter = 0; iter < num_iters; ++iter) {
553 C.device(device_) =
A.contract(
B, dims);
// Reports 2 * m_ * n_ * k_ * num_iters, with the leading 2 widened to int64_t.
557 finalizeBenchmark(
static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
// Buffer setup (the enclosing function header is not visible in this excerpt).
// Allocates the three buffers through device_: a_ gets room for m_ * k_
// elements of T, b_ for k_ * n_, and c_ for m_ * n_.
561 a_ = (
T *) device_.allocate(m_ * k_ *
sizeof(
T));
562 b_ = (
T *) device_.allocate(k_ * n_ *
sizeof(
T));
563 c_ = (
T *) device_.allocate(m_ * n_ *
sizeof(
T));
// Calls device_.memset over the full byte size of each buffer with the values
// 12, 23 and 31 respectively. NOTE(review): presumably byte-wise like
// std::memset, so the contents are deterministic but not meaningful values of
// T — confirm.
567 device_.memset(a_, 12, m_ * k_ *
sizeof(
T));
568 device_.memset(b_, 23, k_ * n_ *
sizeof(
T));
569 device_.memset(c_, 31, m_ * n_ *
sizeof(
T));
// finalizeBenchmark(num_items): called by every benchmark method after its
// measured loop. In the visible part it calls device_.synchronize() when built
// with EIGEN_USE_GPU under __CUDACC__, or with EIGEN_USE_SYCL.
// NOTE(review): the rest of the body, including how num_items is consumed, is
// missing from this excerpt; the synchronize presumably lets outstanding
// device work finish before timing stops — confirm.
573 inline void finalizeBenchmark(int64_t num_items) {
574#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
576 device_.synchronize();
578#elif defined(EIGEN_USE_SYCL)
580 device_.synchronize();
Matrix3f m
Definition AngleAxis_mimic_euler.cpp:1
int n
Definition BiCGSTAB_simple.cpp:1
#define eigen_assert(x)
Definition Macros.h:1037
Eigen::Triplet< double > T
Definition Tutorial_sparse_example.cpp:6
Matrix< Scalar, Dynamic, Dynamic > C
Definition bench_gemm.cpp:50
Matrix< SCALARA, Dynamic, Dynamic, opt_A > A
Definition bench_gemm.cpp:48
Matrix< SCALARB, Dynamic, Dynamic, opt_B > B
Definition bench_gemm.cpp:49
void SetBenchmarkFlopsProcessed(int64_t)
Definition benchmark_main.cc:193
void StopBenchmarkTiming()
Definition benchmark_main.cc:196
void StartBenchmarkTiming()
Definition benchmark_main.cc:202
Definition tensor_benchmarks.h:18
void padding(int num_iters)
Definition tensor_benchmarks.h:211
BenchmarkSuite(const Device &device, size_t m, size_t k)
Definition tensor_benchmarks.h:30
void typeCasting(int num_iters)
Definition tensor_benchmarks.h:56
void colReduction(int num_iters)
Definition tensor_benchmarks.h:412
void rowChip(int num_iters)
Definition tensor_benchmarks.h:142
void contractionRowMajorABT(int num_iters)
Definition tensor_benchmarks.h:488
~BenchmarkSuite()
Definition tensor_benchmarks.h:35
void striding(int num_iters)
Definition tensor_benchmarks.h:243
void slicing(int num_iters)
Definition tensor_benchmarks.h:100
void coeffWiseOp(int num_iters)
Definition tensor_benchmarks.h:312
void contraction(int num_iters)
Definition tensor_benchmarks.h:472
void rowReduction(int num_iters)
Definition tensor_benchmarks.h:380
void transcendentalFunc(int num_iters)
Definition tensor_benchmarks.h:357
BenchmarkSuite(const Device &device, size_t m, size_t k, size_t n)
Definition tensor_benchmarks.h:20
void broadcasting(int num_iters)
Definition tensor_benchmarks.h:278
void memcpy(int num_iters)
Definition tensor_benchmarks.h:41
void fullReduction(int num_iters)
Definition tensor_benchmarks.h:446
void algebraicFunc(int num_iters)
Definition tensor_benchmarks.h:334
void contractionRowMajor(int num_iters)
Definition tensor_benchmarks.h:476
void contractionRowMajorAT(int num_iters)
Definition tensor_benchmarks.h:480
BenchmarkSuite(const Device &device, size_t m)
Definition tensor_benchmarks.h:25
void contractionRowMajorBT(int num_iters)
Definition tensor_benchmarks.h:484
void shuffling(int num_iters)
Definition tensor_benchmarks.h:184
void random(int num_iters)
Definition tensor_benchmarks.h:81
void colChip(int num_iters)
Definition tensor_benchmarks.h:163
void convolution(int num_iters, int kernel_x, int kernel_y)
Definition tensor_benchmarks.h:492
The matrix class, also used for vectors and row-vectors.
Definition Matrix.h:180
A tensor expression mapping an existing array of data.
Definition TensorMap.h:30
The tensor class.
Definition Tensor.h:64
Definition EmulateArray.h:21
Tensor< float, 1 >::DimensionPair DimPair
Definition cxx11_tensor_contraction.cpp:17
std::vector< Array2i > sizes
Definition dense_solvers.cpp:12
@ Aligned
Definition Constants.h:240
@ ColMajor
Definition Constants.h:319
@ RowMajor
Definition Constants.h:321
::int64_t int64_t
Definition Meta.h:59
Definition TensorDimensions.h:263
Definition TensorMeta.h:247
int TensorIndex
Definition tensor_benchmarks.h:4