TR-mbed 1.0
Loading...
Searching...
No Matches
TensorCostModel.h
Go to the documentation of this file.
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
11#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
12
13namespace Eigen {
14
23// Class storing the cost of evaluating a tensor expression in terms of the
24// estimated number of operand bytes loaded, bytes stored, and compute cycles.
26 public:
27 // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
28 // model based on minimal reciprocal throughput numbers from Intel or
29 // Agner Fog's tables would be better than what is there now.
30 template <typename ArgType>
35 template <typename ArgType>
39 template <typename ArgType>
44 template <typename ArgType>
48 template <typename SrcType, typename TargetType>
53
55 TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
58 : bytes_loaded_(bytes_loaded),
59 bytes_stored_(bytes_stored),
60 compute_cycles_(compute_cycles) {}
61
64 bool vectorized, double packet_size)
65 : bytes_loaded_(bytes_loaded),
66 bytes_stored_(bytes_stored),
67 compute_cycles_(vectorized ? compute_cycles / packet_size
72 }
73
75 return bytes_loaded_;
76 }
78 return bytes_stored_;
79 }
81 return compute_cycles_;
82 }
84 double load_cost, double store_cost, double compute_cost) const {
85 return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
86 compute_cost * compute_cycles_;
87 }
88
89 // Drop memory access component. Intended for cases when memory accesses are
90 // sequential or are completely masked by computations.
92 bytes_loaded_ = 0;
93 bytes_stored_ = 0;
94 }
95
96 // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
98 const TensorOpCost& rhs) const {
99 double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
100 double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
101 double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
103 }
104
105 // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
107 const TensorOpCost& rhs) const {
108 double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
109 double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
110 double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
112 }
113
115 const TensorOpCost& rhs) {
116 bytes_loaded_ += rhs.bytes_loaded();
117 bytes_stored_ += rhs.bytes_stored();
118 compute_cycles_ += rhs.compute_cycles();
119 return *this;
120 }
121
123 bytes_loaded_ *= rhs;
124 bytes_stored_ *= rhs;
125 compute_cycles_ *= rhs;
126 return *this;
127 }
128
130 TensorOpCost lhs, const TensorOpCost& rhs) {
131 lhs += rhs;
132 return lhs;
133 }
135 TensorOpCost lhs, double rhs) {
136 lhs *= rhs;
137 return lhs;
138 }
140 double lhs, TensorOpCost rhs) {
141 rhs *= lhs;
142 return rhs;
143 }
144
145 friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
146 return os << "[bytes_loaded = " << tc.bytes_loaded()
147 << ", bytes_stored = " << tc.bytes_stored()
148 << ", compute_cycles = " << tc.compute_cycles() << "]";
149 }
150
151 private:
152 double bytes_loaded_;
153 double bytes_stored_;
154 double compute_cycles_;
155};
156
157// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
158// in [1:max_threads] instead of just switching multi-threading off for small
159// work units.
160template <typename Device>
162 public:
163 // Scaling from Eigen compute cost to device cycles.
164 static const int kDeviceCyclesPerComputeCycle = 1;
165
166 // Costs in device cycles.
167 static const int kStartupCycles = 100000;
168 static const int kPerThreadCycles = 100000;
169 static const int kTaskSize = 40000;
170
171 // Returns the number of threads in [1:max_threads] to use for
172 // evaluating an expression with the given output size and cost per
173 // coefficient.
175 double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
176 double cost = totalCost(output_size, cost_per_coeff);
177 double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
178 // Make sure we don't invoke undefined behavior when we convert to an int.
179 threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
180 return numext::mini(max_threads,
181 numext::maxi<int>(1, static_cast<int>(threads)));
182 }
183
184 // taskSize assesses parallel task size.
185 // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
186 // granularity needs to be increased to mitigate parallelization overheads.
188 double output_size, const TensorOpCost& cost_per_coeff) {
189 return totalCost(output_size, cost_per_coeff) / kTaskSize;
190 }
191
193 double output_size, const TensorOpCost& cost_per_coeff) {
194 // Cost of memory fetches from L2 cache. 64 is a typical cache line size.
195 // 11 is the L2 cache latency on Haswell.
196 // We don't know whether data is in L1, L2 or L3. But we are most interested
197 // in single-threaded computational time around 100us-10ms (smaller time
198 // is too small for parallelization, larger time is not interesting
199 // either because we are probably using all available threads already).
200 // And for the target time range, L2 seems to be what matters. Data set
201 // fitting into L1 is too small to take noticeable time. Data set fitting
202 // only into L3 presumably will take more than 10ms to load and process.
203 const double kLoadCycles = 1.0 / 64 * 11;
204 const double kStoreCycles = 1.0 / 64 * 11;
205 // Scaling from Eigen compute cost to device cycles.
206 return output_size *
207 cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
209 }
210};
211
212} // namespace Eigen
213
214#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_DEVICE_FUNC
Definition Macros.h:976
#define eigen_assert(x)
Definition Macros.h:1037
#define EIGEN_STRONG_INLINE
Definition Macros.h:917
Definition TensorCostModel.h:161
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
Definition TensorCostModel.h:174
static const int kDeviceCyclesPerComputeCycle
Definition TensorCostModel.h:164
static const int kPerThreadCycles
Definition TensorCostModel.h:168
static const int kStartupCycles
Definition TensorCostModel.h:167
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
Definition TensorCostModel.h:187
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size, const TensorOpCost &cost_per_coeff)
Definition TensorCostModel.h:192
static const int kTaskSize
Definition TensorCostModel.h:169
Definition TensorCostModel.h:25
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost &rhs) const
Definition TensorCostModel.h:97
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
Definition TensorCostModel.h:57
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost &rhs) const
Definition TensorCostModel.h:106
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost &rhs)
Definition TensorCostModel.h:129
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const
Definition TensorCostModel.h:77
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs)
Definition TensorCostModel.h:139
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost()
Definition TensorCostModel.h:31
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost()
Definition TensorCostModel.h:45
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator*=(double rhs)
Definition TensorCostModel.h:122
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator+=(const TensorOpCost &rhs)
Definition TensorCostModel.h:114
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost()
Definition TensorCostModel.h:36
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs)
Definition TensorCostModel.h:134
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost()
Definition TensorCostModel.h:49
EIGEN_DEVICE_FUNC TensorOpCost()
Definition TensorCostModel.h:55
friend std::ostream & operator<<(std::ostream &os, const TensorOpCost &tc)
Definition TensorCostModel.h:145
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const
Definition TensorCostModel.h:74
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost()
Definition TensorCostModel.h:40
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const
Definition TensorCostModel.h:80
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size)
Definition TensorCostModel.h:63
EIGEN_DEVICE_FUNC void dropMemoryCost()
Definition TensorCostModel.h:91
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost, double compute_cost) const
Definition TensorCostModel.h:83
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool() isfinite(const Eigen::bfloat16 &h)
Definition BFloat16.h:671
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
Definition MathFunctions.h:1091
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition MathFunctions.h:1083
Namespace containing all symbols from the Eigen library.
Definition bench_norm.cpp:85
Definition NumTraits.h:153
Definition XprHelper.h:176
Definition ForwardDeclarations.h:17