Halide 22.0.0
Halide compiler and libraries
Func.h
Go to the documentation of this file.
1#ifndef HALIDE_FUNC_H
2#define HALIDE_FUNC_H
3
4/** \file
5 *
6 * Defines Func - the front-end handle on a halide function, and related classes.
7 */
8
9#include "Argument.h"
10#include "Expr.h"
11#include "JITModule.h"
12#include "Module.h"
13#include "Param.h"
14#include "Pipeline.h"
15#include "RDom.h"
16#include "Target.h"
17#include "Tuple.h"
18#include "Var.h"
19
20#include <map>
21#include <utility>
22
23namespace Halide {
24
25class OutputImageParam;
26
27/** A class that can represent Vars or RVars. Used for reorder calls
28 * which can accept a mix of either. */
29struct VarOrRVar {
30 VarOrRVar(const std::string &n, bool r)
31 : var(n), rvar(n), is_rvar(r) {
32 }
33 VarOrRVar(const Var &v)
34 : var(v), is_rvar(false) {
35 }
36 VarOrRVar(const RVar &r)
37 : rvar(r), is_rvar(true) {
38 }
39 VarOrRVar(const RDom &r)
40 : rvar(RVar(r)), is_rvar(true) {
41 }
42 template<int N>
44 : var(u), is_rvar(false) {
45 }
46
47 const std::string &name() const {
48 if (is_rvar) {
49 return rvar.name();
50 } else {
51 return var.name();
52 }
53 }
54
57 bool is_rvar;
58};
59
60class ImageParam;
61
62namespace Internal {
63struct AssociativeOp;
64class Function;
65struct Split;
66struct StorageDim;
67} // namespace Internal
68
69/** A single definition of a Func. May be a pure or update definition. */
70class Stage {
71 /** Reference to the Function this stage (or definition) belongs to. */
72 Internal::Function function;
73 Internal::Definition definition;
74 /** Indicate which stage the definition belongs to (0 for initial
75 * definition, 1 for first update, etc.). */
76 size_t stage_index;
77 /** Pure Vars of the Function (from the init definition). */
78 std::vector<Var> dim_vars;
79
80 void set_dim_type(const VarOrRVar &var, Internal::ForType t);
81 void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
82 void split(const std::string &old, const std::string &outer, const std::string &inner,
83 const Expr &factor, bool exact, TailStrategy tail);
84 void remove(const std::string &var);
85
86 const std::vector<Internal::StorageDim> &storage_dims() const {
87 return function.schedule().storage_dims();
88 }
89
90 Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
91
92 std::pair<std::vector<Internal::Split>, std::vector<Internal::Split>>
93 rfactor_validate_args(const std::vector<std::pair<RVar, Var>> &preserved, const Internal::AssociativeOp &prover_result);
94
95public:
// NOTE(review): the Stage constructor signature (doc line 96) appears to
// have been dropped by this doc extraction. The initializer list below
// suggests something like Stage(Internal::Function f, Internal::Definition d,
// size_t stage_index) — confirm against upstream Func.h before compiling.
97 : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
98 internal_assert(definition.defined());
99
100 dim_vars.reserve(function.args().size());
101 for (const auto &arg : function.args()) {
102 dim_vars.emplace_back(arg);
103 }
104 internal_assert(definition.args().size() == dim_vars.size());
105 }
106
107 /** Return the current StageSchedule associated with this Stage. For
108 * introspection only: to modify schedule, use the Func interface. */
// NOTE(review): the accessor's signature (doc line 109) is missing here;
// presumably `const Internal::StageSchedule &schedule() const {` — confirm.
110 return definition.schedule();
111 }
112
113 /** Return a string describing the current var list taking into
114 * account all the splits, reorders, and tiles. */
115 std::string dump_argument_list() const;
116
117 /** Return the name of this stage, e.g. "f.update(2)" */
118 std::string name() const;
119
120 /** Calling rfactor() on an associative update definition a Func will split
121 * the update into an intermediate which computes the partial results and
122 * replaces the current update definition with a new definition which merges
123 * the partial results. If called on a init/pure definition, this will
124 * throw an error. rfactor() will automatically infer the associative reduction
125 * operator and identity of the operator. If it can't prove the operation
126 * is associative or if it cannot find an identity for that operator, this
127 * will throw an error. In addition, commutativity of the operator is required
128 * if rfactor() is called on the inner dimension but excluding the outer
129 * dimensions.
130 *
131 * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
132 * The rvars not listed in 'preserved' are removed from the original Func and
133 * are lifted to the intermediate Func. The remaining rvars (the ones in
134 * 'preserved') are made pure in the intermediate Func. The intermediate Func's
135 * update definition inherits all scheduling directives (e.g. split,fuse, etc.)
136 * applied to the original Func's update definition. The loop order of the
137 * intermediate Func's update definition is the same as the original, although
138 * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
139 * intermediate Func's init definition from innermost to outermost is the args'
140 * order of the original Func's init definition followed by the new pure Vars.
141 *
142 * The intermediate Func also inherits storage order from the original Func
143 * with the new pure Vars added to the outermost.
144 *
145 * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
146 \code
147 f(x, y) = 0;
148 f(x, y) += g(r.x, r.y);
149 \endcode
150 * into a pipeline like this:
151 \code
152 f_intm(x, y, u) = 0;
153 f_intm(x, y, u) += g(r.x, u);
154
155 f(x, y) = 0;
156 f(x, y) += f_intm(x, y, r.y);
157 \endcode
158 *
159 * This has a variety of uses. You can use it to split computation of an associative reduction:
160 \code
161 f(x, y) = 10;
162 RDom r(0, 96);
163 f(x, y) = max(f(x, y), g(x, y, r.x));
164 f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
165 f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
166 \endcode
167 *
168 *, which is equivalent to:
169 \code
170 parallel for u = 0 to 11:
171 for y:
172 for x:
173 f_intm(x, y, u) = -inf
174 parallel for x:
175 for y:
176 parallel for u = 0 to 11:
177 for rxi = 0 to 7:
178 f_intm(x, y, u) = max(f_intm(x, y, u), g(8*u + rxi))
179 for y:
180 for x:
181 f(x, y) = 10
182 parallel for x:
183 for y:
184 for rxo = 0 to 11:
185 f(x, y) = max(f(x, y), f_intm(x, y, rxo))
186 \endcode
187 *
188 */
189 // @{
190 Func rfactor(const std::vector<std::pair<RVar, Var>> &preserved);
191 Func rfactor(const RVar &r, const Var &v);
192 // @}
193
194 /** Schedule the iteration over this stage to be fused with another
195 * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
196 * be computed AFTER 's' in the innermost fused dimension. There should not
197 * be any dependencies between those two fused stages. If either of the
198 * stages being fused is a stage of an extern Func, this will throw an error.
199 *
200 * Note that the two stages that are fused together should have the same
201 * exact schedule from the outermost to the innermost fused dimension, and
202 * the stage we are calling compute_with on should not have specializations,
203 * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
204 *
205 * Also, if a producer is desired to be computed at the fused loop level,
206 * the function passed to the compute_at() needs to be the "parent". Consider
207 * the following code:
208 \code
209 input(x, y) = x + y;
210 f(x, y) = input(x, y);
211 f(x, y) += 5;
212 g(x, y) = x - y;
213 g(x, y) += 10;
214 f.compute_with(g, y);
215 f.update().compute_with(g.update(), y);
216 \endcode
217 *
218 * To compute 'input' at the fused loop level at dimension y, we specify
219 * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
220 * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
221 * is computed). On the other hand, to compute 'input' at the innermost
222 * dimension of 'f', we specify input.compute_at(f, x) instead of
223 * input.compute_at(g, x) since the x dimension of 'f' is not fused
224 * (only the y dimension is).
225 *
226 * Given the constraints, this has a variety of uses. Consider the
227 * following code:
228 \code
229 f(x, y) = x + y;
230 g(x, y) = x - y;
231 h(x, y) = f(x, y) + g(x, y);
232 f.compute_root();
233 g.compute_root();
234 f.split(x, xo, xi, 8);
235 g.split(x, xo, xi, 8);
236 g.compute_with(f, xo);
237 \endcode
238 *
239 * This is equivalent to:
240 \code
241 for y:
242 for xo:
243 for xi:
244 f(8*xo + xi) = (8*xo + xi) + y
245 for xi:
246 g(8*xo + xi) = (8*xo + xi) - y
247 for y:
248 for x:
249 h(x, y) = f(x, y) + g(x, y)
250 \endcode
251 *
252 * The size of the dimensions of the stages computed_with do not have
253 * to match. Consider the following code where 'g' is half the size of 'f':
254 \code
255 Image<int> f_im(size, size), g_im(size/2, size/2);
256 input(x, y) = x + y;
257 f(x, y) = input(x, y);
258 g(x, y) = input(2*x, 2*y);
259 g.compute_with(f, y);
260 input.compute_at(f, y);
261 Pipeline({f, g}).realize({f_im, g_im});
262 \endcode
263 *
264 * This is equivalent to:
265 \code
266 for y = 0 to size-1:
267 for x = 0 to size-1:
268 input(x, y) = x + y;
269 for x = 0 to size-1:
270 f(x, y) = input(x, y)
271 for x = 0 to size/2-1:
272 if (y < size/2-1):
273 g(x, y) = input(2*x, 2*y)
274 \endcode
275 *
276 * 'align' specifies how the loop iteration of each dimension of the
277 * two stages being fused should be aligned in the fused loop nests
278 * (see LoopAlignStrategy for options). Consider the following loop nests:
279 \code
280 for z = f_min_z to f_max_z:
281 for y = f_min_y to f_max_y:
282 for x = f_min_x to f_max_x:
283 f(x, y, z) = x + y + z
284 for z = g_min_z to g_max_z:
285 for y = g_min_y to g_max_y:
286 for x = g_min_x to g_max_x:
287 g(x, y, z) = x - y - z
288 \endcode
289 *
290 * If no alignment strategy is specified, the following loop nest will be
291 * generated:
292 \code
293 for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
294 for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
295 for x = f_min_x to f_max_x:
296 if (f_min_z <= z <= f_max_z):
297 if (f_min_y <= y <= f_max_y):
298 f(x, y, z) = x + y + z
299 for x = g_min_x to g_max_x:
300 if (g_min_z <= z <= g_max_z):
301 if (g_min_y <= y <= g_max_y):
302 g(x, y, z) = x - y - z
303 \endcode
304 *
305 * Instead, these alignment strategies:
306 \code
307 g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
308 \endcode
309 * will produce the following loop nest:
310 \code
311 f_loop_min_z = f_min_z
312 f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
313 for z = f_min_z to f_loop_max_z:
314 f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
315 f_loop_max_y = f_max_y
316 for y = f_loop_min_y to f_loop_max_y:
317 for x = f_min_x to f_max_x:
318 if (f_loop_min_z <= z <= f_loop_max_z):
319 if (f_loop_min_y <= y <= f_loop_max_y):
320 f(x, y, z) = x + y + z
321 for x = g_min_x to g_max_x:
322 g_shift_z = g_min_z - f_loop_min_z
323 g_shift_y = g_max_y - f_loop_max_y
324 if (g_min_z <= (z + g_shift_z) <= g_max_z):
325 if (g_min_y <= (y + g_shift_y) <= g_max_y):
326 g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
327 \endcode
328 *
329 * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
330 * of 'g' at dimension z so that its starting value matches that of 'f'.
331 * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
332 * iteration of 'g' at dimension y so that its end value matches that of 'f'.
333 */
334 // @{
// NOTE(review): the embedded numbering jumps 335 -> 337 -> 339 — at least one
// compute_with overload (likely the (const Stage &, const VarOrRVar &,
// LoopAlignStrategy) form) was dropped by doc extraction. Confirm upstream.
335 Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
337 Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
339 // @}
340
341 /** Scheduling calls that control how the domain of this stage is
342 * traversed. See the documentation for Func for the meanings. */
343 // @{
344
345 Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
346 Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
347 Stage &serial(const VarOrRVar &var);
// NOTE(review): numbering jumps 347 -> 350 — the single-argument parallel()
// and vectorize() overloads appear to have been dropped here. Confirm upstream.
350 Stage &unroll(const VarOrRVar &var);
351 Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
352 Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
353 Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
354 Stage &partition(const VarOrRVar &var, Partition partition_policy);
356 Stage &never_partition(const std::vector<VarOrRVar> &vars);
358 Stage &always_partition(const std::vector<VarOrRVar> &vars);
359
// NOTE(review): several tile() declarations below are missing their trailing
// TailStrategy parameter lines (numbering gaps 362->364, 366->368, 376->378,
// 380->382) — extraction artifact, not real signatures. Confirm upstream.
360 Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
361 const VarOrRVar &xo, const VarOrRVar &yo,
362 const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
364 Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
365 const VarOrRVar &xi, const VarOrRVar &yi,
366 const Expr &xfactor, const Expr &yfactor,
368 Stage &tile(const std::vector<VarOrRVar> &previous,
369 const std::vector<VarOrRVar> &outers,
370 const std::vector<VarOrRVar> &inners,
371 const std::vector<Expr> &factors,
372 const std::vector<TailStrategy> &tails);
373 Stage &tile(const std::vector<VarOrRVar> &previous,
374 const std::vector<VarOrRVar> &outers,
375 const std::vector<VarOrRVar> &inners,
376 const std::vector<Expr> &factors,
378 Stage &tile(const std::vector<VarOrRVar> &previous,
379 const std::vector<VarOrRVar> &inners,
380 const std::vector<Expr> &factors,
382 Stage &reorder(const std::vector<VarOrRVar> &vars);
383
384 template<typename... Args>
385 HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>
386 reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
387 std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
388 return reorder(collected_args);
389 }
390
391 template<typename... Args>
392 HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>
393 never_partition(const VarOrRVar &x, Args &&...args) {
394 std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
395 return never_partition(collected_args);
396 }
397
398 template<typename... Args>
399 HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>
400 always_partition(const VarOrRVar &x, Args &&...args) {
401 std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
402 return always_partition(collected_args);
403 }
404
405 Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
406 Stage specialize(const Expr &condition);
407 void specialize_fail(const std::string &message);
408
// NOTE(review): numbering gaps (409, 413, 415, 417) indicate the single-arg
// gpu_threads(), gpu_lanes(), gpu_single_thread(), and single-arg gpu_blocks()
// declarations were dropped by doc extraction. Confirm against upstream.
410 Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
411 Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
412
414
416
418 Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
419 Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
420
421 Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
422 Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
423 const VarOrRVar &thread_x, const VarOrRVar &thread_y,
424 DeviceAPI device_api = DeviceAPI::Default_GPU);
425 Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
426 const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
427 DeviceAPI device_api = DeviceAPI::Default_GPU);
428
// NOTE(review): the gpu_tile() declarations below are missing their
// TailStrategy parameter lines (gaps at 430, 434, 440, 446, 453, 458) —
// extraction artifact. Confirm upstream before treating as real signatures.
429 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
431 DeviceAPI device_api = DeviceAPI::Default_GPU);
432
433 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
435 DeviceAPI device_api = DeviceAPI::Default_GPU);
436 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
437 const VarOrRVar &bx, const VarOrRVar &by,
438 const VarOrRVar &tx, const VarOrRVar &ty,
439 const Expr &x_size, const Expr &y_size,
441 DeviceAPI device_api = DeviceAPI::Default_GPU);
442
443 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
444 const VarOrRVar &tx, const VarOrRVar &ty,
445 const Expr &x_size, const Expr &y_size,
447 DeviceAPI device_api = DeviceAPI::Default_GPU);
448
449 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
450 const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
451 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
452 const Expr &x_size, const Expr &y_size, const Expr &z_size,
454 DeviceAPI device_api = DeviceAPI::Default_GPU);
455 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
456 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
457 const Expr &x_size, const Expr &y_size, const Expr &z_size,
459 DeviceAPI device_api = DeviceAPI::Default_GPU);
460
462 Stage &atomic(bool override_associativity_test = false);
463
465
// NOTE(review): the prefetch() declarations are missing their trailing
// PrefetchBoundStrategy parameter lines (gaps at 467, 469, 472). Confirm.
466 Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
468 Stage &prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
470 template<typename T>
471 Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
473 return prefetch(image.parameter(), at, from, std::move(offset), strategy);
474 }
475 // @}
476
477 /** Get the Vars and RVars of this definition, from innermost out, with
478 * splits applied. This represents all the potentially-valid compute_at
479 * sites for this Stage. The RVars returned will be symbolic and not tied to
480 * a particular reduction domain, like the naked RVar objects used as split
481 * outputs. Note that this list by default will end with the sentinel
482 * Var::outermost. */
483 std::vector<VarOrRVar> split_vars() const;
484
485 /** Assert that this stage has intentionally been given no schedule, and
486 * suppress the warning about unscheduled update definitions that would
487 * otherwise fire. This counts as a schedule, so calling this twice on the
488 * same Stage will fail the assertion. */
// NOTE(review): the declaration the comment above documents (doc line 489,
// presumably `void unscheduled();`) was dropped by doc extraction. Confirm.
490};
491
492// For backwards compatibility, keep the ScheduleHandle name.
494
496
497/** A fragment of front-end syntax of the form f(x, y, z), where x, y,
498 * z are Vars or Exprs. If could be the left hand side of a definition or
499 * an update definition, or it could be a call to a function. We don't know
500 * until we see how this object gets used.
501 */
502class FuncRef {
// NOTE(review): doc line 503 is missing — by the `return func;` accessor body
// near the end of this class, it presumably declared the private member
// `Internal::Function func;`. Confirm against upstream Func.h.
504 int implicit_placeholder_pos;
505 int implicit_count;
506 std::vector<Expr> args;
507 std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
508
509 /** Helper for function update by Tuple. If the function does not
510 * already have a pure definition, init_val will be used as RHS of
511 * each tuple element in the initial function definition. */
512 template<typename BinaryOp>
513 Stage func_ref_update(const Tuple &e, int init_val);
514
515 /** Helper for function update by Expr. If the function does not
516 * already have a pure definition, init_val will be used as RHS in
517 * the initial function definition. */
518 template<typename BinaryOp>
519 Stage func_ref_update(const Expr &e, int init_val);
520
521public:
522 FuncRef(const Internal::Function &, const std::vector<Expr> &,
523 int placeholder_pos = -1, int count = 0);
524 FuncRef(Internal::Function, const std::vector<Var> &,
525 int placeholder_pos = -1, int count = 0);
526
527 /** Use this as the left-hand-side of a definition or an update definition
528 * (see \ref RDom).
529 */
// NOTE(review): the operator declarations documented throughout this class
// (assignment from Expr at 530, from Tuple at 534, the +=/-=/*=//= overload
// sets at 542-544/553-555/564-566/575-577, copy-assignment at 583, and
// operator[] at 593) were dropped by doc extraction — only the // @{ // @}
// group markers survive below. Confirm signatures against upstream Func.h.
531
532 /** Use this as the left-hand-side of a definition or an update definition
533 * for a Func with multiple outputs. */
535
536 /** Define a stage that adds the given expression to this Func. If the
537 * expression refers to some RDom, this performs a sum reduction of the
538 * expression over the domain. If the function does not already have a
539 * pure definition, this sets it to zero.
540 */
541 // @{
545 // @}
546
547 /** Define a stage that adds the negative of the given expression to this
548 * Func. If the expression refers to some RDom, this performs a sum reduction
549 * of the negative of the expression over the domain. If the function does
550 * not already have a pure definition, this sets it to zero.
551 */
552 // @{
556 // @}
557
558 /** Define a stage that multiplies this Func by the given expression. If the
559 * expression refers to some RDom, this performs a product reduction of the
560 * expression over the domain. If the function does not already have a pure
561 * definition, this sets it to 1.
562 */
563 // @{
567 // @}
568
569 /** Define a stage that divides this Func by the given expression.
570 * If the expression refers to some RDom, this performs a product
571 * reduction of the inverse of the expression over the domain. If the
572 * function does not already have a pure definition, this sets it to 1.
573 */
574 // @{
578 // @}
579
580 /* Override the usual assignment operator, so that
581 * f(x, y) = g(x, y) defines f.
582 */
584
585 /** Use this as a call to the function, and not the left-hand-side
586 * of a definition. Only works for single-output Funcs. */
587 operator Expr() const;
588
589 /** When a FuncRef refers to a function that provides multiple
590 * outputs, you can access each output as an Expr using
591 * operator[].
592 */
594
595 /** How many outputs does the function this refers to produce. */
596 size_t size() const;
597
598 /** Is this FuncRef syntactically equivalent to another one? */
599 bool equivalent_to(const FuncRef &other) const;
600
601 /** What function is this calling? */
// NOTE(review): the accessor's signature (doc line 602) is missing;
// presumably `Internal::Function function() const {`. Confirm upstream.
603 return func;
604 }
605};
606
607/** Explicit overloads of min and max for FuncRef. These exist to
608 * disambiguate calls to min on FuncRefs when a user has pulled both
609 * Halide::min and std::min into their namespace. */
610// @{
611inline Expr min(const FuncRef &a, const FuncRef &b) {
612 return min(Expr(a), Expr(b));
613}
614inline Expr max(const FuncRef &a, const FuncRef &b) {
615 return max(Expr(a), Expr(b));
616}
617// @}
618
619/** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
620 * z are Vars or Exprs. If could be the left hand side of an update
621 * definition, or it could be a call to a function. We don't know
622 * until we see how this object gets used.
623 */
// NOTE(review): the class header line (doc line 624, presumably
// `class FuncTupleElementRef {`) was dropped by doc extraction just above
// this point — these are the class's members. Confirm against upstream Func.h.
625 FuncRef func_ref;
626 std::vector<Expr> args; // args to the function
627 int idx; // Index to function outputs
628
629 /** Helper function that generates a Tuple where element at 'idx' is set
630 * to 'e' and the rests are undef. */
631 Tuple values_with_undefs(const Expr &e) const;
632
633public:
634 FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
635
636 /** Use this as the left-hand-side of an update definition of Tuple
637 * component 'idx' of a Func (see \ref RDom). The function must
638 * already have an initial definition.
639 */
// NOTE(review): the operator declarations documented in this class
// (assignment at 640, += at 647, -= at 655, *= at 663, /= at 671, and
// copy-assignment at 676) were dropped by doc extraction. Confirm upstream.
641
642 /** Define a stage that adds the given expression to Tuple component 'idx'
643 * of this Func. The other Tuple components are unchanged. If the expression
644 * refers to some RDom, this performs a sum reduction of the expression over
645 * the domain. The function must already have an initial definition.
646 */
648
649 /** Define a stage that adds the negative of the given expression to Tuple
650 * component 'idx' of this Func. The other Tuple components are unchanged.
651 * If the expression refers to some RDom, this performs a sum reduction of
652 * the negative of the expression over the domain. The function must already
653 * have an initial definition.
654 */
656
657 /** Define a stage that multiplies Tuple component 'idx' of this Func by
658 * the given expression. The other Tuple components are unchanged. If the
659 * expression refers to some RDom, this performs a product reduction of
660 * the expression over the domain. The function must already have an
661 * initial definition.
662 */
664
665 /** Define a stage that divides Tuple component 'idx' of this Func by
666 * the given expression. The other Tuple components are unchanged.
667 * If the expression refers to some RDom, this performs a product
668 * reduction of the inverse of the expression over the domain. The function
669 * must already have an initial definition.
670 */
672
673 /* Override the usual assignment operator, so that
674 * f(x, y)[index] = g(x, y) defines f.
675 */
677
678 /** Use this as a call to Tuple component 'idx' of a Func, and not the
679 * left-hand-side of a definition. */
680 operator Expr() const;
681
682 /** What function is this calling? */
// NOTE(review): the accessor's signature (doc line 683) is missing;
// presumably `Internal::Function function() const {`. Confirm upstream.
684 return func_ref.function();
685 }
686
687 /** Return index to the function outputs. */
688 int index() const {
689 return idx;
690 }
691};
692
693namespace Internal {
694class IRMutator;
695} // namespace Internal
696
697/** Helper class for identifying purpose of an Expr passed to memoize.
698 */
// NOTE(review): the class header (doc line 699, presumably
// `class EvictionKey {`) and the protected member it guards (doc line 701,
// presumably `Expr key;` given the `: key(expr)` initializer below) were
// dropped by doc extraction. Confirm against upstream Func.h.
700protected:
702 friend class Func;
703
704public:
705 explicit EvictionKey(const Expr &expr = Expr())
706 : key(expr) {
707 }
708};
709
710/** A halide function. This class represents one stage in a Halide
711 * pipeline, and is the unit by which we schedule things. By default
712 * they are aggressively inlined, so you are encouraged to make lots
713 * of little functions, rather than storing things in Exprs. */
714class Func {
715
716 /** A handle on the internal halide function that this
717 * represents */
719
720 /** When you make a reference to this function with fewer
721 * arguments than it has dimensions, the argument list is bulked
722 * up with 'implicit' vars with canonical names. This lets you
723 * pass around partially applied Halide functions. */
724 // @{
725 std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
726 std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
727 // @}
728
729 /** The imaging pipeline that outputs this Func alone. */
730 Pipeline pipeline_;
731
732 /** Get the imaging pipeline that outputs this Func alone,
733 * creating it (and freezing the Func) if necessary. */
734 Pipeline pipeline();
735
736 // Helper function for recursive reordering support
737 Func &reorder_storage(const std::vector<Var> &dims, size_t start);
738
739 void invalidate_cache();
740
741public:
742 /** Declare a new undefined function with the given name */
743 explicit Func(const std::string &name);
744
745 /** Declare a new undefined function with the given name.
746 * The function will be constrained to represent Exprs of required_type.
747 * If required_dims is not AnyDims, the function will be constrained to exactly
748 * that many dimensions. */
749 explicit Func(const Type &required_type, int required_dims, const std::string &name);
750
751 /** Declare a new undefined function with the given name.
752 * If required_types is not empty, the function will be constrained to represent
753 * Tuples of the same arity and types. (If required_types is empty, there is no constraint.)
754 * If required_dims is not AnyDims, the function will be constrained to exactly
755 * that many dimensions. */
756 explicit Func(const std::vector<Type> &required_types, int required_dims, const std::string &name);
757
758 /** Declare a new undefined function with an
759 * automatically-generated unique name */
761
762 /** Declare a new function with an automatically-generated unique
763 * name, and define it to return the given expression (which may
764 * not contain free variables). */
765 explicit Func(const Expr &e);
766
767 /** Construct a new Func to wrap an existing, already-defined
768 * Function object. */
770
771 /** Construct a new Func to wrap a Buffer. */
772 template<typename T, int Dims>
774 : Func() {
775 (*this)(_) = im(_);
776 }
777
778 /** Evaluate this function over some rectangular domain and return
779 * the resulting buffer or buffers. Performs compilation if the
780 * Func has not previously been realized and compile_jit has not
781 * been called. If the final stage of the pipeline is on the GPU,
782 * data is copied back to the host before being returned. The
783 * returned Realization should probably be instantly converted to
784 * a Buffer class of the appropriate type. That is, do this:
785 *
786 \code
787 f(x) = sin(x);
788 Buffer<float> im = f.realize(...);
789 \endcode
790 *
791 * If your Func has multiple values, because you defined it using
792 * a Tuple, then casting the result of a realize call to a buffer
793 * or image will produce a run-time error. Instead you should do the
794 * following:
795 *
796 \code
797 f(x) = Tuple(x, sin(x));
798 Realization r = f.realize(...);
799 Buffer<int> im0 = r[0];
800 Buffer<float> im1 = r[1];
801 \endcode
802 *
803 * In Halide formal arguments of a computation are specified using
804 * Param<T> and ImageParam objects in the expressions defining the
805 * computation. Note that this method is not thread-safe, in that
806 * Param<T> and ImageParam are globals shared by all threads; to call
807 * jitted code in a thread-safe manner, use compile_to_callable() instead.
808 *
809 \code
810 Param<int32> p(42);
811 ImageParam img(Int(32), 1);
812 f(x) = img(x) + p;
813
814 Buffer<int32_t) arg_img(10, 10);
815 <fill in arg_img...>
816
817 Target t = get_jit_target_from_environment();
818 Buffer<int32_t> result = f.realize({10, 10}, t);
819 \endcode
820 *
821 * Alternatively, an initializer list can be used
822 * directly in the realize call to pass this information:
823 *
824 \code
825 Param<int32> p(42);
826 ImageParam img(Int(32), 1);
827 f(x) = img(x) + p;
828
829 Buffer<int32_t) arg_img(10, 10);
830 <fill in arg_img...>
831
832 Target t = get_jit_target_from_environment();
833 Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
834 \endcode
835 *
836 * If the Func cannot be realized into a buffer of the given size
837 * due to scheduling constraints on scattering update definitions,
838 * it will be realized into a larger buffer of the minimum size
839 * possible, and a cropped view at the requested size will be
840 * returned. It is thus not safe to assume the returned buffers
841 * are contiguous in memory. This behavior can be disabled with
842 * the NoBoundsQuery target flag, in which case an error about
843 * writing out of bounds on the output buffer will trigger
844 * instead.
845 *
846 */
847 Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target());
848
849 /** Same as above, but takes a custom user-provided context to be
850 * passed to runtime functions. This can be used to pass state to
851 * runtime overrides in a thread-safe manner. A nullptr context is
852 * legal, and is equivalent to calling the variant of realize
853 * that does not take a context. */
855 std::vector<int32_t> sizes = {},
856 const Target &target = Target());
857
858 /** Evaluate this function into an existing allocated buffer or
859 * buffers. If the buffer is also one of the arguments to the
860 * function, strange things may happen, as the pipeline isn't
861 * necessarily safe to run in-place. If you pass multiple buffers,
862 * they must have matching sizes. This form of realize does *not*
863 * automatically copy data back from the GPU. */
865
866 /** Same as above, but takes a custom user-provided context to be
867 * passed to runtime functions. This can be used to pass state to
868 * runtime overrides in a thread-safe manner. A nullptr context is
869 * legal, and is equivalent to calling the variant of realize
870 * that does not take a context. */
871 void realize(JITUserContext *context,
873 const Target &target = Target());
874
875 /** For a given size of output, or a given output buffer,
876 * determine the bounds required of all unbound ImageParams
877 * referenced. Communicates the result by allocating new buffers
878 * of the appropriate size and binding them to the unbound
879 * ImageParams.
880 */
881 // @{
882 void infer_input_bounds(const std::vector<int32_t> &sizes,
883 const Target &target = get_jit_target_from_environment());
885 const Target &target = get_jit_target_from_environment());
886 // @}
887
888 /** Versions of infer_input_bounds that take a custom user context
889 * to pass to runtime functions. */
890 // @{
892 const std::vector<int32_t> &sizes,
893 const Target &target = get_jit_target_from_environment());
896 const Target &target = get_jit_target_from_environment());
897 // @}
898 /** Statically compile this function to llvm bitcode, with the
899 * given filename (which should probably end in .bc), type
900 * signature, and C function name (which defaults to the same name
901 * as this halide function) */
902 //@{
903 void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
904 const Target &target = get_target_from_environment());
905 void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
906 const Target &target = get_target_from_environment());
907 // @}
908
909 /** Statically compile this function to llvm assembly, with the
910 * given filename (which should probably end in .ll), type
911 * signature, and C function name (which defaults to the same name
912 * as this halide function) */
913 //@{
914 void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
915 const Target &target = get_target_from_environment());
916 void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
917 const Target &target = get_target_from_environment());
918 // @}
919
920 /** Statically compile this function to an object file, with the
921 * given filename (which should probably end in .o or .obj), type
922 * signature, and C function name (which defaults to the same name
923 * as this halide function. You probably don't want to use this
924 * directly; call compile_to_static_library or compile_to_file instead. */
925 //@{
926 void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
927 const Target &target = get_target_from_environment());
928 void compile_to_object(const std::string &filename, const std::vector<Argument> &,
929 const Target &target = get_target_from_environment());
930 // @}
931
932 /** Emit a header file with the given filename for this
933 * function. The header will define a function with the type
934 * signature given by the second argument, and a name given by the
935 * third. The name defaults to the same name as this halide
936 * function. You don't actually have to have defined this function
937 * yet to call this. You probably don't want to use this directly;
938 * call compile_to_static_library or compile_to_file instead. */
939 void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
940 const Target &target = get_target_from_environment());
941
942 /** Statically compile this function to text assembly equivalent
943 * to the object file generated by compile_to_object. This is
944 * useful for checking what Halide is producing without having to
945 * disassemble anything, or if you need to feed the assembly into
946 * some custom toolchain to produce an object file (e.g. iOS) */
947 //@{
948 void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
949 const Target &target = get_target_from_environment());
950 void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
951 const Target &target = get_target_from_environment());
952 // @}
953
954 /** Statically compile this function to C source code. This is
955 * useful for providing fallback code paths that will compile on
956 * many platforms. Vectorization will fail, and parallelization
957 * will produce serial code. */
958 void compile_to_c(const std::string &filename,
959 const std::vector<Argument> &,
960 const std::string &fn_name = "",
961 const Target &target = get_target_from_environment());
962
963 /** Write out an internal representation of lowered code. Useful
964 * for analyzing and debugging scheduling. Can emit html or plain
965 * text. */
966 void compile_to_lowered_stmt(const std::string &filename,
967 const std::vector<Argument> &args,
969 const Target &target = get_target_from_environment());
970
971 /** Write out a conceptual representation of lowered code, before any parallel loops
972 * get factored out into separate functions, or GPU loops are offloaded to kernel code.
973 * Useful for analyzing and debugging scheduling. Can emit html or plain text. */
974 void compile_to_conceptual_stmt(const std::string &filename,
975 const std::vector<Argument> &args,
977 const Target &target = get_target_from_environment());
978
979 /** Write out the loop nests specified by the schedule for this
980 * Function. Helpful for understanding what a schedule is
981 * doing. */
983
984 /** Compile to object file and header pair, with the given
985 * arguments. The name defaults to the same name as this halide
986 * function.
987 */
988 void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
989 const std::string &fn_name = "",
990 const Target &target = get_target_from_environment());
991
992 /** Compile to static-library file and header pair, with the given
993 * arguments. The name defaults to the same name as this halide
994 * function.
995 */
996 void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
997 const std::string &fn_name = "",
998 const Target &target = get_target_from_environment());
999
1000 /** Compile to static-library file and header pair once for each target;
1001 * each resulting function will be considered (in order) via halide_can_use_target_features()
1002 * at runtime, with the first appropriate match being selected for subsequent use.
1003 * This is typically useful for specializations that may vary unpredictably by machine
1004 * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
1005 * All targets must have identical arch-os-bits.
1006 */
1007 void compile_to_multitarget_static_library(const std::string &filename_prefix,
1008 const std::vector<Argument> &args,
1009 const std::vector<Target> &targets);
1010
1011 /** Like compile_to_multitarget_static_library(), except that the object files
1012 * are all output as object files (rather than bundled into a static library).
1013 *
1014 * `suffixes` is an optional list of strings to use for as the suffix for each object
1015 * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
1016 * will be used for each suffix.)
1017 *
1018 * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
1019 * will be generated with the filename `${filename_prefix}_wrapper.o`
1020 *
1021 * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
1022 * will be generated with the filename `${filename_prefix}_runtime.o`
1023 */
1024 void compile_to_multitarget_object_files(const std::string &filename_prefix,
1025 const std::vector<Argument> &args,
1026 const std::vector<Target> &targets,
1027 const std::vector<std::string> &suffixes);
1028
1029 /** Store an internal representation of lowered code as a self
1030 * contained Module suitable for further compilation. */
1031 Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
1032 const Target &target = get_target_from_environment());
1033
1034 /** Compile and generate multiple target files with single call.
1035 * Deduces target files based on filenames specified in
1036 * output_files map.
1037 */
1038 void compile_to(const std::map<OutputFileType, std::string> &output_files,
1039 const std::vector<Argument> &args,
1040 const std::string &fn_name,
1041 const Target &target = get_target_from_environment());
1042
1043 /** Eagerly jit compile the function to machine code. This
1044 * normally happens on the first call to realize. If you're
1045 * running your halide pipeline inside time-sensitive code and
1046 * wish to avoid including the time taken to compile a pipeline,
1047 * then you can call this ahead of time. Default is to use the Target
1048 * returned from Halide::get_jit_target_from_environment()
1049 */
1051
1052 /** Get a struct containing the currently set custom functions
1053 * used by JIT. This can be mutated. Changes will take effect the
1054 * next time this Func is realized. */
1056
1057 /** Eagerly jit compile the function to machine code and return a callable
1058 * struct that behaves like a function pointer. The calling convention
1059 * will exactly match that of an AOT-compiled version of this Func
1060 * with the same Argument list.
1061 */
1062 Callable compile_to_callable(const std::vector<Argument> &args,
1063 const Target &target = get_jit_target_from_environment());
1064
1065 /** Add a custom pass to be used during lowering. It is run after
1066 * all other lowering passes. Can be used to verify properties of
1067 * the lowered Stmt, instrument it with extra code, or otherwise
1068 * modify it. The Func takes ownership of the pass, and will call
1069 * delete on it when the Func goes out of scope. So don't pass a
1070 * stack object, or share pass instances between multiple
1071 * Funcs. */
1072 template<typename T>
1074 // Template instantiate a custom deleter for this type, then
1075 // wrap in a lambda. The custom deleter lives in user code, so
1076 // that deletion is on the same heap as construction (I hate Windows).
1077 add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
1078 }
1079
1080 /** Add a custom pass to be used during lowering, with the
1081 * function that will be called to delete it also passed in. Set
1082 * it to nullptr if you wish to retain ownership of the object. */
1083 void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
1084
1085 /** Remove all previously-set custom lowering passes */
1087
1088 /** Get the custom lowering passes. */
1089 const std::vector<CustomLoweringPass> &custom_lowering_passes();
1090
1091 /** When this function is compiled, include code that dumps its
1092 * values to a file after it is realized, for the purpose of
1093 * debugging.
1094 *
1095 * If filename ends in ".tif" or ".tiff" (case insensitive) the file
1096 * is in TIFF format and can be read by standard tools. Otherwise, the
1097 * file format is as follows:
1098 *
1099 * All data is in the byte-order of the target platform. First, a
1100 * 20 byte-header containing four 32-bit ints, giving the extents
1101 * of the first four dimensions. Dimensions beyond four are
1102 * folded into the fourth. Then, a fifth 32-bit int giving the
1103 * data type of the function. The typecodes are given by: float =
1104 * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1105 * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1106 * data follows the header, as a densely packed array of the given
1107 * size and the given type. If given the extension .tmp, this file
1108 * format can be natively read by the program ImageStack. */
1109 void debug_to_file(const std::string &filename);
1110
1111 /** The name of this function, either given during construction,
1112 * or automatically generated. */
1113 const std::string &name() const;
1114
1115 /** Get the pure arguments. */
1116 std::vector<Var> args() const;
1117
1118 /** The right-hand-side value of the pure definition of this
1119 * function. Causes an error if there's no pure definition, or if
1120 * the function is defined to return multiple values. */
1121 Expr value() const;
1122
1123 /** The values returned by this function. An error if the function
1124 * has not been defined. Returns a Tuple with one element for
1125 * functions defined to return a single value. */
1126 Tuple values() const;
1127
1128 /** Does this function have at least a pure definition. */
1129 bool defined() const;
1130
1131 /** Get the left-hand-side of the update definition. An empty
1132 * vector if there's no update definition. If there are
1133 * multiple update definitions for this function, use the
1134 * argument to select which one you want. */
1135 const std::vector<Expr> &update_args(int idx = 0) const;
1136
1137 /** Get the right-hand-side of an update definition. An error if
1138 * there's no update definition. If there are multiple
1139 * update definitions for this function, use the argument to
1140 * select which one you want. */
1141 Expr update_value(int idx = 0) const;
1142
1143 /** Get the right-hand-side of an update definition for
1144 * functions that returns multiple values. An error if there's no
1145 * update definition. Returns a Tuple with one element for
1146 * functions that return a single value. */
1147 Tuple update_values(int idx = 0) const;
1148
1149 /** Get the RVars of the reduction domain for an update definition, if there is
1150 * one. */
1151 std::vector<RVar> rvars(int idx = 0) const;
1152
1153 /** Does this function have at least one update definition? */
1155
1156 /** How many update definitions does this function have? */
1158
1159 /** Is this function an external stage? That is, was it defined
1160 * using define_extern? */
1161 bool is_extern() const;
1162
1163 /** Add an extern definition for this Func. This lets you define a
1164 * Func that represents an external pipeline stage. You can, for
1165 * example, use it to wrap a call to an extern library such as
1166 * fftw. */
1167 // @{
1168 void define_extern(const std::string &function_name,
1169 const std::vector<ExternFuncArgument> &params, Type t,
1170 int dimensionality,
1172 DeviceAPI device_api = DeviceAPI::Host) {
1173 define_extern(function_name, params, t,
1174 Internal::make_argument_list(dimensionality), mangling,
1175 device_api);
1176 }
1177
1178 void define_extern(const std::string &function_name,
1179 const std::vector<ExternFuncArgument> &params,
1180 const std::vector<Type> &types, int dimensionality,
1182 DeviceAPI device_api = DeviceAPI::Host) {
1183 define_extern(function_name, params, types,
1184 Internal::make_argument_list(dimensionality), mangling,
1185 device_api);
1186 }
1187
1188 void define_extern(const std::string &function_name,
1189 const std::vector<ExternFuncArgument> &params, Type t,
1190 const std::vector<Var> &arguments,
1192 DeviceAPI device_api = DeviceAPI::Host) {
1193 define_extern(function_name, params, std::vector<Type>{t}, arguments,
1194 mangling, device_api);
1195 }
1196
1197 void define_extern(const std::string &function_name,
1198 const std::vector<ExternFuncArgument> &params,
1199 const std::vector<Type> &types,
1200 const std::vector<Var> &arguments,
1202 DeviceAPI device_api = DeviceAPI::Host);
1203 // @}
1204
1205 /** Get the type(s) of the outputs of this Func.
1206 *
1207 * It is not legal to call type() unless the Func has non-Tuple elements.
1208 *
1209 * If the Func isn't yet defined, and was not specified with required types,
1210 * a runtime error will occur.
1211 *
1212 * If the Func isn't yet defined, but *was* specified with required types,
1213 * the requirements will be returned. */
1214 // @{
1215 const Type &type() const;
1216 const std::vector<Type> &types() const;
1217 // @}
1218
1219 /** Get the number of outputs of this Func. Corresponds to the
1220 * size of the Tuple this Func was defined to return.
1221 * If the Func isn't yet defined, but was specified with required types,
1222 * the number of outputs specified in the requirements will be returned. */
1223 int outputs() const;
1224
1225 /** Get the name of the extern function called for an extern
1226 * definition. */
1227 const std::string &extern_function_name() const;
1228
1229 /** The dimensionality (number of arguments) of this function.
1230 * If the Func isn't yet defined, but was specified with required dimensionality,
1231 * the dimensionality specified in the requirements will be returned. */
1232 int dimensions() const;
1233
1234 /** Construct either the left-hand-side of a definition, or a call
1235 * to a function that happens to only contain vars as
1236 * arguments. If the function has already been defined, and fewer
1237 * arguments are given than the function has dimensions, then
1238 * enough implicit vars are added to the end of the argument list
1239 * to make up the difference (see \ref Var::implicit) */
1240 // @{
1241 FuncRef operator()(std::vector<Var>) const;
1242
1243 template<typename... Args>
1245 operator()(Args &&...args) const {
1246 std::vector<Var> collected_args{std::forward<Args>(args)...};
1247 return this->operator()(collected_args);
1248 }
1249 // @}
1250
1251 /** Either calls to the function, or the left-hand-side of
1252 * an update definition (see \ref RDom). If the function has
1253 * already been defined, and fewer arguments are given than the
1254 * function has dimensions, then enough implicit vars are added to
1255 * the end of the argument list to make up the difference. (see
1256 * \ref Var::implicit)*/
1257 // @{
1258 FuncRef operator()(std::vector<Expr>) const;
1259
1260 template<typename... Args>
1262 operator()(const Expr &x, Args &&...args) const {
1263 std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1264 return (*this)(collected_args);
1265 }
1266 // @}
1267
1268 /** Creates and returns a new identity Func that wraps this Func. During
1269 * compilation, Halide replaces all calls to this Func done by 'f'
1270 * with calls to the wrapper. If this Func is already wrapped for
1271 * use in 'f', will return the existing wrapper.
1272 *
1273 * For example, g.in(f) would rewrite a pipeline like this:
1274 \code
1275 g(x, y) = ...
1276 f(x, y) = ... g(x, y) ...
1277 \endcode
1278 * into a pipeline like this:
1279 \code
1280 g(x, y) = ...
1281 g_wrap(x, y) = g(x, y)
1282 f(x, y) = ... g_wrap(x, y)
1283 \endcode
1284 *
1285 * This has a variety of uses. You can use it to schedule this
1286 * Func differently in the different places it is used:
1287 \code
1288 g(x, y) = ...
1289 f1(x, y) = ... g(x, y) ...
1290 f2(x, y) = ... g(x, y) ...
1291 g.in(f1).compute_at(f1, y).vectorize(x, 8);
1292 g.in(f2).compute_at(f2, x).unroll(x);
1293 \endcode
1294 *
1295 * You can also use it to stage loads from this Func via some
1296 * intermediate buffer (perhaps on the stack as in
1297 * test/performance/block_transpose.cpp, or in shared GPU memory
1298 * as in test/performance/wrap.cpp). In this case we compute the
1299 * wrapper at tiles of the consuming Funcs like so:
1300 \code
1301 g.compute_root()...
1302 g.in(f).compute_at(f, tiles)...
1303 \endcode
1304 *
1305 * Func::in() can also be used to compute pieces of a Func into a
1306 * smaller scratch buffer (perhaps on the GPU) and then copy them
1307 * into a larger output buffer one tile at a time. See
1308 * apps/interpolate/interpolate.cpp for an example of this. In
1309 * this case we compute the Func at tiles of its own wrapper:
1310 \code
1311 f.in(g).compute_root().gpu_tile(...)...
1312 f.compute_at(f.in(g), tiles)...
1313 \endcode
1314 *
1315 * A similar use of Func::in() is wrapping Funcs with multiple update
1316 * stages in a pure wrapper. The following code:
1317 \code
1318 f(x, y) = x + y;
1319 f(x, y) += 5;
1320 g(x, y) = f(x, y);
1321 f.compute_root();
1322 \endcode
1323 *
1324 * Is equivalent to:
1325 \code
1326 for y:
1327 for x:
1328 f(x, y) = x + y;
1329 for y:
1330 for x:
1331 f(x, y) += 5
1332 for y:
1333 for x:
1334 g(x, y) = f(x, y)
1335 \endcode
1336 * Using Func::in(), we can write:
1337 \code
1338 f(x, y) = x + y;
1339 f(x, y) += 5;
1340 g(x, y) = f(x, y);
1341 f.in(g).compute_root();
1342 \endcode
1343 * which instead produces:
1344 \code
1345 for y:
1346 for x:
1347 f(x, y) = x + y;
1348 f(x, y) += 5
1349 f_wrap(x, y) = f(x, y)
1350 for y:
1351 for x:
1352 g(x, y) = f_wrap(x, y)
1353 \endcode
1354 */
1355 Func in(const Func &f);
1356
1357 /** Create and return an identity wrapper shared by all the Funcs in
1358 * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1359 * this will throw an error. */
1360 Func in(const std::vector<Func> &fs);
1361
1362 /** Create and return a global identity wrapper, which wraps all calls to
1363 * this Func by any other Func. If a global wrapper already exists,
1364 * returns it. The global identity wrapper is only used by callers for
1365 * which no custom wrapper has been specified.
1366 */
1368
1369 /** Similar to \ref Func::in; however, instead of replacing the call to
1370 * this Func with an identity Func that refers to it, this replaces the
1371 * call with a clone of this Func.
1372 *
1373 * For example, f.clone_in(g) would rewrite a pipeline like this:
1374 \code
1375 f(x, y) = x + y;
1376 g(x, y) = f(x, y) + 2;
1377 h(x, y) = f(x, y) - 3;
1378 \endcode
1379 * into a pipeline like this:
1380 \code
1381 f(x, y) = x + y;
1382 f_clone(x, y) = x + y;
1383 g(x, y) = f_clone(x, y) + 2;
1384 h(x, y) = f(x, y) - 3;
1385 \endcode
1386 *
1387 */
1388 //@{
1389 Func clone_in(const Func &f);
1390 Func clone_in(const std::vector<Func> &fs);
1391 //@}
1392
1393 /** Declare that this function should be implemented by a call to
1394 * halide_buffer_copy with the given target device API. Asserts
1395 * that the Func has a pure definition which is a simple call to a
1396 * single input, and no update definitions. The wrapper Funcs
1397 * returned by in() are suitable candidates. Consumes all pure
1398 * variables, and rewrites the Func to have an extern definition
1399 * that calls halide_buffer_copy. */
1401
1402 /** Declare that this function should be implemented by a call to
1403 * halide_buffer_copy with a NULL target device API. Equivalent to
1404 * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1405 * pure definition which is a simple call to a single input, and
1406 * no update definitions. The wrapper Funcs returned by in() are
1407 * suitable candidates. Consumes all pure variables, and rewrites
1408 * the Func to have an extern definition that calls
1409 * halide_buffer_copy.
1410 *
1411 * Note that if the source Func is already valid in host memory,
1412 * this compiles to code that does the minimum number of calls to
1413 * memcpy.
1414 */
1416
1417 /** Split a dimension into inner and outer subdimensions with the
1418 * given names, where the inner dimension iterates from 0 to
1419 * factor-1. The inner and outer subdimensions can then be dealt
1420 * with using the other scheduling calls. It's ok to reuse the old
1421 * variable name as either the inner or outer variable. The final
1422 * argument specifies how the tail should be handled if the split
1423 * factor does not provably divide the extent. */
1424 Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1425
1426 /** Join two dimensions into a single fused dimension. The fused dimension
1427 * covers the product of the extents of the inner and outer dimensions
1428 * given. The loop type (e.g. parallel, vectorized) of the resulting fused
1429 * dimension is inherited from the first argument. */
1430 Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
1431
1432 /** Mark a dimension to be traversed serially. This is the default. */
1433 Func &serial(const VarOrRVar &var);
1434
1435 /** Mark a dimension to be traversed in parallel */
1437
1438 /** Split a dimension by the given task_size, and then parallelize the
1439 * outer dimension. This creates parallel tasks that have size
1440 * task_size. After this call, var refers to the outer dimension of
1441 * the split. The inner dimension has a new anonymous name. If you
1442 * wish to mutate it, or schedule with respect to it, do the split
1443 * manually. */
1444 Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
1445
1446 /** Mark a dimension to be computed all-at-once as a single
1447 * vector. The dimension should have constant extent -
1448 * e.g. because it is the inner dimension following a split by a
1449 * constant factor. For most uses of vectorize you want the two
1450 * argument form. The variable to be vectorized should be the
1451 * innermost one. */
1453
1454 /** Mark a dimension to be completely unrolled. The dimension
1455 * should have constant extent - e.g. because it is the inner
1456 * dimension following a split by a constant factor. For most uses
1457 * of unroll you want the two-argument form. */
1458 Func &unroll(const VarOrRVar &var);
1459
1460 /** Split a dimension by the given factor, then vectorize the
1461 * inner dimension. This is how you vectorize a loop of unknown
1462 * size. The variable to be vectorized should be the innermost
1463 * one. After this call, var refers to the outer dimension of the
1464 * split. 'factor' must be an integer. */
1465 Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1466
1467 /** Split a dimension by the given factor, then unroll the inner
1468 * dimension. This is how you unroll a loop of unknown size by
1469 * some constant factor. After this call, var refers to the outer
1470 * dimension of the split. 'factor' must be an integer. */
1471 Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1472
1473 /** Set the loop partition policy. Loop partitioning can be useful to
1474 * optimize boundary conditions (such as clamp_edge). Loop partitioning
1475 * splits a for loop into three for loops: a prologue, a steady-state,
1476 * and an epilogue.
1477 * The default policy is Auto. */
1478 Func &partition(const VarOrRVar &var, Partition partition_policy);
1479
1480 /** Set the loop partition policy to Never for a vector of Vars and
1481 * RVars. */
1482 Func &never_partition(const std::vector<VarOrRVar> &vars);
1483
1484 /** Set the loop partition policy to Never for some number of Vars and RVars. */
1485 template<typename... Args>
1487 never_partition(const VarOrRVar &x, Args &&...args) {
1488 std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
1489 return never_partition(collected_args);
1490 }
1491
1492 /** Set the loop partition policy to Never for all Vars and RVar of the
1493 * initial definition of the Func. It must be called separately on any
1494 * update definitions. */
1496
1497 /** Set the loop partition policy to Always for a vector of Vars and
1498 * RVars. */
1499 Func &always_partition(const std::vector<VarOrRVar> &vars);
1500
1501 /** Set the loop partition policy to Always for some number of Vars and RVars. */
1502 template<typename... Args>
1504 always_partition(const VarOrRVar &x, Args &&...args) {
1505 std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
1506 return always_partition(collected_args);
1507 }
1508
1509 /** Set the loop partition policy to Always for all Vars and RVar of the
1510 * initial definition of the Func. It must be called separately on any
1511 * update definitions. */
1513
1514 /** Statically declare that the range over which a function should
1515 * be evaluated is given by the second and third arguments. This
1516 * can let Halide perform some optimizations. E.g. if you know
1517 * there are going to be 4 color channels, you can completely
1518 * vectorize the color channel dimension without the overhead of
1519 * splitting it up. If bounds inference decides that it requires
1520 * more of this function than the bounds you have stated, a
1521 * runtime error will occur when you try to run your pipeline. */
1522 Func &bound(const Var &var, Expr min, Expr extent);
1523
1524 /** Statically declare the range over which the function will be
1525 * evaluated in the general case. This provides a basis for the auto
1526 * scheduler to make trade-offs and scheduling decisions. The auto
1527 * generated schedules might break when the sizes of the dimensions are
1528 * very different from the estimates specified. These estimates are used
1529 * only by the auto scheduler if the function is a pipeline output. */
1530 Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1531
1532 /** Set (min, extent) estimates for all dimensions in the Func
1533 * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1534 * repeatedly, but slightly terser. The size of the estimates vector
1535 * must match the dimensionality of the Func. */
1536 Func &set_estimates(const Region &estimates);
1537
1538 /** Expand the region computed so that the min coordinate is
1539 * congruent to 'remainder' modulo 'modulus', and the extent is a
1540 * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1541 * the min and extent realized to be even, and calling
1542 * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1543 * to be even. The region computed always contains the region that
1544 * would have been computed without this directive, so no
1545 * assertions are injected.
1546 */
1547 Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1548
1549 /** Expand the region computed so that the extent is a
1550 * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
1551 * the extent realized to be even. The region computed always contains the
1552 * region that would have been computed without this directive, so no
1553 * assertions are injected. (This is essentially equivalent to align_bounds(),
1554 * but always leaving the min untouched.)
1555 */
1556 Func &align_extent(const Var &var, Expr modulus);
1557
1558 /** Bound the extent of a Func's realization, but not its
1559 * min. This means the dimension can be unrolled or vectorized
1560 * even when its min is not fixed (for example because it is
1561 * compute_at tiles of another Func). This can also be useful for
1562 * forcing a function's allocation to be a fixed size, which often
1563 * means it can go on the stack. */
1564 Func &bound_extent(const Var &var, Expr extent);
1565
1566 /** Split two dimensions at once by the given factors, and then
1567 * reorder the resulting dimensions to be xi, yi, xo, yo from
1568 * innermost outwards. This gives a tiled traversal. */
1569 Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1570 const VarOrRVar &xo, const VarOrRVar &yo,
1571 const VarOrRVar &xi, const VarOrRVar &yi,
1572 const Expr &xfactor, const Expr &yfactor,
1574
1575 /** A shorter form of tile, which reuses the old variable names as
1576 * the new outer dimensions */
1577 Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1578 const VarOrRVar &xi, const VarOrRVar &yi,
1579 const Expr &xfactor, const Expr &yfactor,
1581
1582 /** A more general form of tile, which defines tiles of any dimensionality. */
1583 Func &tile(const std::vector<VarOrRVar> &previous,
1584 const std::vector<VarOrRVar> &outers,
1585 const std::vector<VarOrRVar> &inners,
1586 const std::vector<Expr> &factors,
1587 const std::vector<TailStrategy> &tails);
1588
1589 /** The generalized tile, with a single tail strategy to apply to all vars. */
1590 Func &tile(const std::vector<VarOrRVar> &previous,
1591 const std::vector<VarOrRVar> &outers,
1592 const std::vector<VarOrRVar> &inners,
1593 const std::vector<Expr> &factors,
1595
1596 /** Generalized tiling, reusing the previous names as the outer names. */
1597 Func &tile(const std::vector<VarOrRVar> &previous,
1598 const std::vector<VarOrRVar> &inners,
1599 const std::vector<Expr> &factors,
1601
1602 /** Reorder variables to have the given nesting order, from
1603 * innermost out */
1604 Func &reorder(const std::vector<VarOrRVar> &vars);
1605
1606 template<typename... Args>
1608 reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
1609 std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
1610 return reorder(collected_args);
1611 }
1612
1613 /** Get the Vars of the pure definition, with splits applied. This
1614 * represents all the potentially-valid compute_at sites for this stage of
1615 * this Func. Note that this, by default, will end with the sentinel
1616 * Var::outermost. */
1617 std::vector<Var> split_vars() const;
1618
1619 /** Rename a dimension. Equivalent to split with an inner size of one. */
1620 Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1621
1622 /** Specify that race conditions are permitted for this Func,
1623 * which enables parallelizing over RVars even when Halide cannot
1624 * prove that it is safe to do so. Use this with great caution,
1625 * and only if you can prove to yourself that this is safe, as it
1626 * may result in a non-deterministic routine that returns
1627 * different values at different times or on different machines. */
1629
1630 /** Issue atomic updates for this Func. This allows parallelization
1631 * on associative RVars. The function throws a compile error when
1632 * Halide fails to prove associativity. Use override_associativity_test
1633 * to disable the associativity test if you believe the function is
1634 * associative or the order of reduction variable execution does not
1635 * matter.
1636 * Halide compiles this into hardware atomic operations whenever possible,
1637 * and falls back to a mutex lock per storage element if it is impossible
1638 * to atomically update.
1639 * There are three possible outcomes of the compiled code:
1640 * atomic add, compare-and-swap loop, and mutex lock.
1641 * For example:
1642 *
1643 * hist(x) = 0;
1644 * hist(im(r)) += 1;
1645 * hist.compute_root();
1646 * hist.update().atomic().parallel();
1647 *
1648 * will be compiled to atomic add operations.
1649 *
1650 * hist(x) = 0;
1651 * hist(im(r)) = min(hist(im(r)) + 1, 100);
1652 * hist.compute_root();
1653 * hist.update().atomic().parallel();
1654 *
1655 * will be compiled to compare-and-swap loops.
1656 *
1657 * arg_max() = {0, im(0)};
1658 * Expr old_index = arg_max()[0];
1659 * Expr old_max = arg_max()[1];
1660 * Expr new_index = select(old_max < im(r), r, old_index);
1661 * Expr new_max = max(im(r), old_max);
1662 * arg_max() = {new_index, new_max};
1663 * arg_max.compute_root();
1664 * arg_max.update().atomic().parallel();
1665 *
1666 * will be compiled to updates guarded by a mutex lock,
1667 * since it is impossible to atomically update two different locations.
1668 *
1669 * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
1670 * Compiling to other backends results in a compile error.
1671 * If an operation is compiled into a mutex lock, and is vectorized or is
1672 * compiled to CUDA or OpenCL, it also results in a compile error,
1673 * since per-element mutex lock on vectorized operation leads to a
1674 * deadlock.
1675 * Vectorization of predicated RVars (through rdom.where()) on CPU
1676 * is also not yet supported (see https://github.com/halide/Halide/issues/4298).
1677 * 8-bit and 16-bit atomics on GPU are also not supported. */
1678 Func &atomic(bool override_associativity_test = false);
1679
1680 /** Specialize a Func. This creates a special-case version of the
1681 * Func where the given condition is true. The most effective
1682 * conditions are those of the form param == value, and boolean
1683 * Params. Consider a simple example:
1684 \code
1685 f(x) = x + select(cond, 0, 1);
1686 f.compute_root();
1687 \endcode
1688 * This is equivalent to:
1689 \code
1690 for (int x = 0; x < width; x++) {
1691 f[x] = x + (cond ? 0 : 1);
1692 }
1693 \endcode
1694 * Adding the scheduling directive:
1695 \code
1696 f.specialize(cond)
1697 \endcode
1698 * makes it equivalent to:
1699 \code
1700 if (cond) {
1701 for (int x = 0; x < width; x++) {
1702 f[x] = x;
1703 }
1704 } else {
1705 for (int x = 0; x < width; x++) {
1706 f[x] = x + 1;
1707 }
1708 }
1709 \endcode
1710 * Note that the inner loops have been simplified. In the first
1711 * path Halide knows that cond is true, and in the second path
1712 * Halide knows that it is false.
1713 *
1714 * The specialized version gets its own schedule, which inherits
1715 * every directive made about the parent Func's schedule so far
1716 * except for its specializations. This method returns a handle to
1717 * the new schedule. If you wish to retrieve the specialized
1718 * sub-schedule again later, you can call this method with the
1719 * same condition. Consider the following example of scheduling
1720 * the specialized version:
1721 *
1722 \code
1723 f(x) = x;
1724 f.compute_root();
1725 f.specialize(width > 1).unroll(x, 2);
1726 \endcode
1727 * Assuming for simplicity that width is even, this is equivalent to:
1728 \code
1729 if (width > 1) {
1730 for (int x = 0; x < width/2; x++) {
1731 f[2*x] = 2*x;
1732 f[2*x + 1] = 2*x + 1;
1733 }
1734 } else {
1735 for (int x = 0; x < width; x++) {
1736 f[x] = x;
1737 }
1738 }
1739 \endcode
1740 * For this case, it may be better to schedule the un-specialized
1741 * case instead:
1742 \code
1743 f(x) = x;
1744 f.compute_root();
1745 f.specialize(width == 1); // Creates a copy of the schedule so far.
1746 f.unroll(x, 2); // Only applies to the unspecialized case.
1747 \endcode
1748 * This is equivalent to:
1749 \code
1750 if (width == 1) {
1751 f[0] = 0;
1752 } else {
1753 for (int x = 0; x < width/2; x++) {
1754 f[2*x] = 2*x;
1755 f[2*x + 1] = 2*x + 1;
1756 }
1757 }
1758 \endcode
1759 * This can be a good way to write a pipeline that splits,
1760 * vectorizes, or tiles, but can still handle small inputs.
1761 *
1762 * If a Func has several specializations, the first matching one
1763 * will be used, so the order in which you define specializations
1764 * is significant. For example:
1765 *
1766 \code
1767 f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1768 f.specialize(cond1);
1769 f.specialize(cond2);
1770 \endcode
1771 * is equivalent to:
1772 \code
1773 if (cond1) {
1774 for (int x = 0; x < width; x++) {
1775 f[x] = x + a - (cond2 ? c : d);
1776 }
1777 } else if (cond2) {
1778 for (int x = 0; x < width; x++) {
1779 f[x] = x + b - c;
1780 }
1781 } else {
1782 for (int x = 0; x < width; x++) {
1783 f[x] = x + b - d;
1784 }
1785 }
1786 \endcode
1787 *
1788 * Specializations may in turn be specialized, which creates a
1789 * nested if statement in the generated code.
1790 *
1791 \code
1792 f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1793 f.specialize(cond1).specialize(cond2);
1794 \endcode
1795 * This is equivalent to:
1796 \code
1797 if (cond1) {
1798 if (cond2) {
1799 for (int x = 0; x < width; x++) {
1800 f[x] = x + a - c;
1801 }
1802 } else {
1803 for (int x = 0; x < width; x++) {
1804 f[x] = x + a - d;
1805 }
1806 }
1807 } else {
1808 for (int x = 0; x < width; x++) {
1809 f[x] = x + b - (cond2 ? c : d);
1810 }
1811 }
1812 \endcode
1813 * To create a 4-way if statement that simplifies away all of the
1814 * ternary operators above, you could say:
1815 \code
1816 f.specialize(cond1).specialize(cond2);
1817 f.specialize(cond2);
1818 \endcode
1819 * or
1820 \code
1821 f.specialize(cond1 && cond2);
1822 f.specialize(cond1);
1823 f.specialize(cond2);
1824 \endcode
1825 *
1826 * Any prior Func which is compute_at some variable of this Func
1827 * gets separately included in all paths of the generated if
1828 * statement. The Var in the compute_at call must exist in all
1829 * paths, but it may have been generated via a different path of
1830 * splits, fuses, and renames. This can be used somewhat
1831 * creatively. Consider the following code:
1832 \code
1833 g(x, y) = 8*x;
1834 f(x, y) = g(x, y) + 1;
1835 f.compute_root().specialize(cond);
1836 Var g_loop;
1837 f.specialize(cond).rename(y, g_loop);
1838 f.rename(x, g_loop);
1839 g.compute_at(f, g_loop);
1840 \endcode
1841 * When cond is true, this is equivalent to g.compute_at(f,y).
1842 * When it is false, this is equivalent to g.compute_at(f,x).
1843 */
1844 Stage specialize(const Expr &condition);
1845
1846 /** Add a specialization to a Func that always terminates execution
1847 * with a call to halide_error(). By itself, this is of limited use,
1848 * but can be useful to terminate chains of specialize() calls where
1849 * no "default" case is expected (thus avoiding unnecessary code generation).
1850 *
1851 * For instance, say we want to optimize a pipeline to process images
1852 * in planar and interleaved format; we might typically do something like:
1853 \code
1854 ImageParam im(UInt(8), 3);
1855 Func f = do_something_with(im);
1856 f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1857 f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1858 \endcode
1859 * This code will vectorize along rows for the planar case, and across pixel
1860 * components for the interleaved case... but there is an implicit "else"
1861 * for the unhandled cases, which generates unoptimized code. If we never
1862 * anticipate passing any other sort of images to this, we could streamline
1863 * our code by adding specialize_fail():
1864 \code
1865 ImageParam im(UInt(8), 3);
1866 Func f = do_something(im);
1867 f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1868 f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1869 f.specialize_fail("Unhandled image format");
1870 \endcode
1871 * Conceptually, this produces code like:
1872 \code
1873 if (im.dim(0).stride() == 1) {
1874 do_something_planar();
1875 } else if (im.dim(2).stride() == 1) {
1876 do_something_interleaved();
1877 } else {
1878 halide_error("Unhandled image format");
1879 }
1880 \endcode
1881 *
1882 * Note that calling specialize_fail() terminates the specialization chain
1883 * for a given Func; you cannot create new specializations for the Func
1884 * afterwards (though you can retrieve handles to previous specializations).
1885 */
1886 void specialize_fail(const std::string &message);
1887
1888 /** Tell Halide that the following dimensions correspond to GPU
1889 * thread indices. This is useful if you compute a producer
1890 * function within the block indices of a consumer function, and
1891 * want to control how that function's dimensions map to GPU
1892 * threads. If the selected target is not an appropriate GPU, this
1893 * just marks those dimensions as parallel. */
1894 // @{
1896 Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1897 Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1898 // @}
1899
1900 /** The given dimension corresponds to the lanes in a GPU
1901 * warp. GPU warp lanes are distinguished from GPU threads by the
1902 * fact that all warp lanes run together in lockstep, which
1903 * permits lightweight communication of data from one lane to
1904 * another. */
1906
1907 /** Tell Halide to run this stage using a single gpu thread and
1908 * block. This is not an efficient use of your GPU, but it can be
1909 * useful to avoid copy-back for intermediate update stages that
1910 * touch a very small part of your Func. */
1912
1913 /** Tell Halide that the following dimensions correspond to GPU
1914 * block indices. This is useful for scheduling stages that will
1915 * run serially within each GPU block. If the selected target is
1916 * not ptx, this just marks those dimensions as parallel. */
1917 // @{
1919 Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1920 Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1921 // @}
1922
1923 /** Tell Halide that the following dimensions correspond to GPU
1924 * block indices and thread indices. If the selected target is not
1925 * ptx, these just mark the given dimensions as parallel. The
1926 * dimensions are consumed by this call, so do all other
1927 * unrolling, reordering, etc first. */
1928 // @{
1929 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1930 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1931 const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1932 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1933 const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1934 // @}
1935
1936 /** Short-hand for tiling a domain and mapping the tile indices
1937 * to GPU block indices and the coordinates within each tile to
1938 * GPU thread indices. Consumes the variables given, so do all
1939 * other scheduling first. */
1940 // @{
1941 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1943 DeviceAPI device_api = DeviceAPI::Default_GPU);
1944
1945 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1947 DeviceAPI device_api = DeviceAPI::Default_GPU);
1948 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1949 const VarOrRVar &bx, const VarOrRVar &by,
1950 const VarOrRVar &tx, const VarOrRVar &ty,
1951 const Expr &x_size, const Expr &y_size,
1953 DeviceAPI device_api = DeviceAPI::Default_GPU);
1954
1955 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1956 const VarOrRVar &tx, const VarOrRVar &ty,
1957 const Expr &x_size, const Expr &y_size,
1959 DeviceAPI device_api = DeviceAPI::Default_GPU);
1960
1961 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1962 const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1963 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1964 const Expr &x_size, const Expr &y_size, const Expr &z_size,
1966 DeviceAPI device_api = DeviceAPI::Default_GPU);
1967 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1968 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1969 const Expr &x_size, const Expr &y_size, const Expr &z_size,
1971 DeviceAPI device_api = DeviceAPI::Default_GPU);
1972 // @}
1973
1974 /** Schedule for execution on Hexagon. When a loop is marked with
1975 * Hexagon, that loop is executed on a Hexagon DSP. */
1977
1978 /** Prefetch data written to or read from a Func or an ImageParam by a
1979 * subsequent loop iteration, at an optionally specified iteration offset. You may
1980 * specify different vars for the location of the prefetch() instruction
1981 * vs. the location that is being prefetched:
1982 *
1983 * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
1984 * - the second var specified, 'from', determines the var used to find the bounds to prefetch
1985 * (in conjunction with 'offset')
1986 *
1987 * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at.'
1988 * Note that the value for 'offset' applies only to 'from', not 'at'.
1989 *
1990 * The final argument specifies how prefetch of region outside bounds
1991 * should be handled.
1992 *
1993 * For example, consider this pipeline:
1994 \code
1995 Func f, g, h;
1996 Var x, y, z;
1997 f(x, y) = x + y;
1998 g(x, y) = 2 * f(x, y);
1999 h(x, y) = 3 * f(x, y);
2000 \endcode
2001 *
2002 * The following schedule:
2003 \code
2004 f.compute_root();
2005 g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
2006 h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
2007 \endcode
2008 *
2009 * will inject prefetch call at the innermost loop of 'g' and 'h' and generate
2010 * the following loop nest:
2011 \code
2012 for y = ...
2013 for x = ...
2014 f(x, y) = x + y
2015 for y = ..
2016 for x = ...
2017 prefetch(&f[x + 2, y], 1, 16);
2018 g(x, y) = 2 * f(x, y)
2019 for y = ..
2020 for x = ...
2021 prefetch(&f[x, y + 2], 1, 16);
2022 h(x, y) = 3 * f(x, y)
2023 \endcode
2024 *
2025 * Note that the 'from' nesting level need not be adjacent to 'at':
2026 \code
2027 Func f, g;
2028 Var x, y, z, w;
2029 f(x, y, z, w) = x + y + z + w;
2030 g(x, y, z, w) = 2 * f(x, y, z, w);
2031 \endcode
2032 *
2033 * The following schedule:
2034 \code
2035 f.compute_root();
2036 g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
2037 \endcode
2038 *
2039 * will produce code that prefetches a tile of data:
2040 \code
2041 for w = ...
2042 for z = ...
2043 for y = ...
2044 for x = ...
2045 f(x, y, z, w) = x + y + z + w
2046 for w = ...
2047 for z = ...
2048 for y = ...
2049 for x0 = ...
2050 prefetch(&f[x0, y, z, w + 2], 1, 16);
2051 for x = ...
2052 g(x, y, z, w) = 2 * f(x, y, z, w)
2053 \endcode
2054 *
2055 * Note that calling prefetch() with the same var for both 'at' and 'from'
2056 * is equivalent to calling prefetch() with that var.
2057 */
2058 // @{
2059 Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2061 Func &prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2063 template<typename T>
2064 Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2066 return prefetch(image.parameter(), at, from, std::move(offset), strategy);
2067 }
2068 // @}
2069
2070 /** Specify how the storage for the function is laid out. These
2071 * calls let you specify the nesting order of the dimensions. For
2072 * example, foo.reorder_storage(y, x) tells Halide to use
2073 * column-major storage for any realizations of foo, without
2074 * changing how you refer to foo in the code. You may want to do
2075 * this if you intend to vectorize across y. When representing
2076 * color images, foo.reorder_storage(c, x, y) specifies packed
2077 * storage (red, green, and blue values adjacent in memory), and
2078 * foo.reorder_storage(x, y, c) specifies planar storage (entire
2079 * red, green, and blue images one after the other in memory).
2080 *
2081 * If you leave out some dimensions, those remain in the same
2082 * positions in the nesting order while the specified variables
2083 * are reordered around them. */
2084 // @{
2085 Func &reorder_storage(const std::vector<Var> &dims);
2086
2087 Func &reorder_storage(const Var &x, const Var &y);
2088 template<typename... Args>
2090 reorder_storage(const Var &x, const Var &y, Args &&...args) {
2091 std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
2092 return reorder_storage(collected_args);
2093 }
2094 // @}
2095
2096 /** Pad the storage extent of a particular dimension of
2097 * realizations of this function up to be a multiple of the
2098 * specified alignment. This guarantees that the strides for the
2099 * dimensions stored outside of dim will be multiples of the
2100 * specified alignment, where the strides and alignment are
2101 * measured in numbers of elements.
2102 *
2103 * For example, to guarantee that a function foo(x, y, c)
2104 * representing an image has scanlines starting on offsets
2105 * aligned to multiples of 16, use foo.align_storage(x, 16). */
2106 Func &align_storage(const Var &dim, const Expr &alignment);
2107
2108 /** Store realizations of this function in a circular buffer of a
2109 * given extent. This is more efficient when the extent of the
2110 * circular buffer is a power of 2. If the fold factor is too
2111 * small, or the dimension is not accessed monotonically, the
2112 * pipeline will generate an error at runtime.
2113 *
2114 * The fold_forward option indicates that the new values of the
2115 * producer are accessed by the consumer in a monotonically
2116 * increasing order. Folding storage of producers is also
2117 * supported if the new values are accessed in a monotonically
2118 * decreasing order by setting fold_forward to false.
2119 *
2120 * For example, consider the pipeline:
2121 \code
2122 Func f, g;
2123 Var x, y;
2124 g(x, y) = x*y;
2125 f(x, y) = g(x, y) + g(x, y+1);
2126 \endcode
2127 *
2128 * If we schedule f like so:
2129 *
2130 \code
2131 g.compute_at(f, y).store_root().fold_storage(y, 2);
2132 \endcode
2133 *
2134 * Then g will be computed at each row of f and stored in a buffer
2135 * with an extent in y of 2, alternately storing each computed row
2136 * of g in row y=0 or y=1.
2137 */
2138 Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2139
2140 /** Compute this function as needed for each unique value of the
2141 * given var for the given calling function f.
2142 *
2143 * For example, consider the simple pipeline:
2144 \code
2145 Func f, g;
2146 Var x, y;
2147 g(x, y) = x*y;
2148 f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2149 \endcode
2150 *
2151 * If we schedule f like so:
2152 *
2153 \code
2154 g.compute_at(f, x);
2155 \endcode
2156 *
2157 * Then the C code equivalent to this pipeline will look like this
2158 *
2159 \code
2160
2161 int f[height][width];
2162 for (int y = 0; y < height; y++) {
2163 for (int x = 0; x < width; x++) {
2164 int g[2][2];
2165 g[0][0] = x*y;
2166 g[0][1] = (x+1)*y;
2167 g[1][0] = x*(y+1);
2168 g[1][1] = (x+1)*(y+1);
2169 f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2170 }
2171 }
2172
2173 \endcode
2174 *
2175 * The allocation and computation of g is within f's loop over x,
2176 * and enough of g is computed to satisfy all that f will need for
2177 * that iteration. This has excellent locality - values of g are
2178 * used as soon as they are computed, but it does redundant
2179 * work. Each value of g ends up getting computed four times. If
2180 * we instead schedule f like so:
2181 *
2182 \code
2183 g.compute_at(f, y);
2184 \endcode
2185 *
2186 * The equivalent C code is:
2187 *
2188 \code
2189 int f[height][width];
2190 for (int y = 0; y < height; y++) {
2191 int g[2][width+1];
2192 for (int x = 0; x < width; x++) {
2193 g[0][x] = x*y;
2194 g[1][x] = x*(y+1);
2195 }
2196 for (int x = 0; x < width; x++) {
2197 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2198 }
2199 }
2200 \endcode
2201 *
2202 * The allocation and computation of g is within f's loop over y,
2203 * and enough of g is computed to satisfy all that f will need for
2204 * that iteration. This does less redundant work (each point in g
2205 * ends up being evaluated twice), but the locality is not quite
2206 * as good, and we have to allocate more temporary memory to store
2207 * g.
2208 */
2209 Func &compute_at(const Func &f, const Var &var);
2210
2211 /** Schedule a function to be computed within the iteration over
2212 * some dimension of an update domain. Produces equivalent code
2213 * to the version of compute_at that takes a Var. */
2214 Func &compute_at(const Func &f, const RVar &var);
2215
2216 /** Schedule a function to be computed within the iteration over
2217 * a given LoopLevel. */
2219
2220 /** Schedule the iteration over the initial definition of this function
2221 * to be fused with another stage 's' from outermost loop to a
2222 * given LoopLevel. */
2223 // @{
2224 Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2226 Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2228
2229 /** Compute all of this function once ahead of time. Reusing
2230 * the example in \ref Func::compute_at :
2231 *
2232 \code
2233 Func f, g;
2234 Var x, y;
2235 g(x, y) = x*y;
2236 f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2237
2238 g.compute_root();
2239 \endcode
2240 *
2241 * is equivalent to
2242 *
2243 \code
2244 int f[height][width];
2245 int g[height+1][width+1];
2246 for (int y = 0; y < height+1; y++) {
2247 for (int x = 0; x < width+1; x++) {
2248 g[y][x] = x*y;
2249 }
2250 }
2251 for (int y = 0; y < height; y++) {
2252 for (int x = 0; x < width; x++) {
2253 f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2254 }
2255 }
2256 \endcode
2257 *
2258 * g is computed once ahead of time, and enough is computed to
2259 * satisfy all uses of it. This does no redundant work (each point
2260 * in g is evaluated once), but has poor locality (values of g are
2261 * probably not still in cache when they are used by f), and
2262 * allocates lots of temporary memory to store g.
2263 */
2265
2266 /** Use the halide_memoization_cache_... interface to store a
2267 * computed version of this function across invocations of the
2268 * Func.
2269 *
2270 * If an eviction_key is provided, it must be constructed with
2271 * Expr of integer or handle type. The key Expr will be promoted
2272 * to a uint64_t and can be used with halide_memoization_cache_evict
2273 * to remove memoized entries using this eviction key from the
2274 * cache. Memoized computations that do not provide an eviction
2275 * key will never be evicted by this mechanism.
2276 *
2277 * It is invalid to memoize the output of a Pipeline; attempting
2278 * to do so will issue an error. To cache an entire pipeline,
2279 * either implement a caching mechanism outside of Halide or
2280 * explicitly copy out of the cache with another output Func.
2281 */
2282 Func &memoize(const EvictionKey &eviction_key = EvictionKey());
2283
2284 /** Produce this Func asynchronously in a separate
2285 * thread. Consumers will be run by the task system when the
2286 * production is complete. If this Func's store level is different
2287 * to its compute level, consumers will be run concurrently,
2288 * blocking as necessary to prevent reading ahead of what the
2289 * producer has computed. If storage is folded, then the producer
2290 * will additionally not be permitted to run too far ahead of the
2291 * consumer, to avoid clobbering data that has not yet been
2292 * used.
2293 *
2294 * Take special care when combining this with custom thread pool
2295 * implementations, as avoiding deadlock with producer-consumer
2296 * parallelism requires a much more sophisticated parallel runtime
2297 * than with data parallelism alone. It is strongly recommended
2298 * you just use Halide's default thread pool, which guarantees no
2299 * deadlock and a bound on the number of threads launched.
2300 */
2302
2303 /** Expands the storage of the function by an extra dimension
2304 * to enable ring buffering. For this to be useful the storage
2305 * of the function has to be hoisted to an upper loop level using
2306 * \ref Func::hoist_storage. The index for the new ring buffer dimension
2307 * is calculated implicitly based on a linear combination of all of
2308 * the loop variables between hoist_storage and compute_at/store_at
2309 * loop levels. Scheduling a function with ring_buffer increases the
2310 * amount of memory required for this function by a factor of *extent*.
2311 * ring_buffer is especially useful in combination with \ref Func::async,
2312 * but can be used without it.
2313 *
2314 * The extent is expected to be a positive integer.
2315 */
2317
2318 /** Bound the extent of a Func's storage, but not extent of its
2319 * compute. This can be useful for forcing a function's allocation
2320 * to be a fixed size, which often means it can go on the stack.
2321 * If bounds inference decides that it requires more storage for
2322 * this function than the allocation size you have stated, a runtime
2323 * error will occur when you try to run the pipeline. */
2324 Func &bound_storage(const Var &dim, const Expr &bound);
2325
2326 /** Allocate storage for this function within f's loop over
2327 * var. Scheduling storage is optional, and can be used to
2328 * separate the loop level at which storage occurs from the loop
2329 * level at which computation occurs to trade off between locality
2330 * and redundant work. This can open the door for two types of
2331 * optimization.
2332 *
2333 * Consider again the pipeline from \ref Func::compute_at :
2334 \code
2335 Func f, g;
2336 Var x, y;
2337 g(x, y) = x*y;
2338 f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2339 \endcode
2340 *
2341 * If we schedule it like so:
2342 *
2343 \code
2344 g.compute_at(f, x).store_at(f, y);
2345 \endcode
2346 *
2347 * Then the computation of g takes place within the loop over x,
2348 * but the storage takes place within the loop over y:
2349 *
2350 \code
2351 int f[height][width];
2352 for (int y = 0; y < height; y++) {
2353 int g[2][width+1];
2354 for (int x = 0; x < width; x++) {
2355 g[0][x] = x*y;
2356 g[0][x+1] = (x+1)*y;
2357 g[1][x] = x*(y+1);
2358 g[1][x+1] = (x+1)*(y+1);
2359 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2360 }
2361 }
2362 \endcode
2363 *
2364 * Provided the for loop over x is serial, halide then
2365 * automatically performs the following sliding window
2366 * optimization:
2367 *
2368 \code
2369 int f[height][width];
2370 for (int y = 0; y < height; y++) {
2371 int g[2][width+1];
2372 for (int x = 0; x < width; x++) {
2373 if (x == 0) {
2374 g[0][x] = x*y;
2375 g[1][x] = x*(y+1);
2376 }
2377 g[0][x+1] = (x+1)*y;
2378 g[1][x+1] = (x+1)*(y+1);
2379 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2380 }
2381 }
2382 \endcode
2383 *
2384 * Two of the assignments to g only need to be done when x is
2385 * zero. The rest of the time, those sites have already been
2386 * filled in by a previous iteration. This version has the
2387 * locality of compute_at(f, x), but allocates more memory and
2388 * does much less redundant work.
2389 *
2390 * Halide then further optimizes this pipeline like so:
2391 *
2392 \code
2393 int f[height][width];
2394 for (int y = 0; y < height; y++) {
2395 int g[2][2];
2396 for (int x = 0; x < width; x++) {
2397 if (x == 0) {
2398 g[0][0] = x*y;
2399 g[1][0] = x*(y+1);
2400 }
2401 g[0][(x+1)%2] = (x+1)*y;
2402 g[1][(x+1)%2] = (x+1)*(y+1);
2403 f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2404 }
2405 }
2406 \endcode
2407 *
2408 * Halide has detected that it's possible to use a circular buffer
2409 * to represent g, and has reduced all accesses to g modulo 2 in
2410 * the x dimension. This optimization only triggers if the for
2411 * loop over x is serial, and if halide can statically determine
2412 * some power of two large enough to cover the range needed. For
2413 * powers of two, the modulo operator compiles to more efficient
2414 * bit-masking. This optimization reduces memory usage, and also
2415 * improves locality by reusing recently-accessed memory instead
2416 * of pulling new memory into cache.
2417 *
2418 */
2419 Func &store_at(const Func &f, const Var &var);
2420
2421 /** Equivalent to the version of store_at that takes a Var, but
2422 * schedules storage within the loop over a dimension of a
2423 * reduction domain */
2424 Func &store_at(const Func &f, const RVar &var);
2425
2426 /** Equivalent to the version of store_at that takes a Var, but
2427 * schedules storage at a given LoopLevel. */
2429
2430 /** Equivalent to \ref Func::store_at, but schedules storage
2431 * outside the outermost loop. */
2433
2434 /** Hoist storage for this function within f's loop over
2435 * var. This is different from \ref Func::store_at, because hoist_storage
2436 * simply moves an actual allocation to a given loop level and
2437 * doesn't trigger any of the optimizations such as sliding window.
2438 * Hoisting storage is optional and can be used as an optimization
2439 * to avoid unnecessary allocations by moving it out from an inner
2440 * loop.
2441 *
2442 * Consider again the pipeline from \ref Func::compute_at :
2443 \code
2444 Func f, g;
2445 Var x, y;
2446 g(x, y) = x*y;
2447 f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2448 \endcode
2449 *
2450 * If we schedule f like so:
2451 *
2452 \code
2453 g.compute_at(f, x);
2454 \endcode
2455 *
2456 * Then the C code equivalent to this pipeline will look like this
2457 *
2458 \code
2459
2460 int f[height][width];
2461 for (int y = 0; y < height; y++) {
2462 for (int x = 0; x < width; x++) {
2463 int g[2][2];
2464 g[0][0] = x*y;
2465 g[0][1] = (x+1)*y;
2466 g[1][0] = x*(y+1);
2467 g[1][1] = (x+1)*(y+1);
2468 f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2469 }
2470 }
2471
2472 \endcode
2473 *
2474 * Note the allocation for g inside of the loop over variable x which
2475 * can happen for each iteration of the inner loop (in total height * width times).
2476 * In some cases allocation can be expensive, so it might be better to do it once
2477 * and reuse allocated memory across all iterations of the loop.
2478 *
2479 * This can be done by scheduling g like so:
2480 *
2481 \code
2482 g.compute_at(f, x).hoist_storage(f, Var::outermost());
2483 \endcode
2484 *
2485 * Then the C code equivalent to this pipeline will look like this
2486 *
2487 \code
2488
2489 int f[height][width];
2490 int g[2][2];
2491 for (int y = 0; y < height; y++) {
2492 for (int x = 0; x < width; x++) {
2493 g[0][0] = x*y;
2494 g[0][1] = (x+1)*y;
2495 g[1][0] = x*(y+1);
2496 g[1][1] = (x+1)*(y+1);
2497 f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2498 }
2499 }
2500
2501 \endcode
2502 *
2503 * hoist_storage can be used together with \ref Func::store_at and
2504 * \ref Func::fold_storage (for example, to hoist the storage allocated
2505 * after sliding window optimization).
2506 *
2507 */
2508 Func &hoist_storage(const Func &f, const Var &var);
2509
2510 /** Equivalent to the version of hoist_storage that takes a Var, but
2511 * schedules storage within the loop over a dimension of a
2512 * reduction domain */
2513 Func &hoist_storage(const Func &f, const RVar &var);
2514
2515 /** Equivalent to the version of hoist_storage that takes a Var, but
2516 * schedules storage at a given LoopLevel. */
 2518    Func &hoist_storage(LoopLevel loop_level);
 2519    /** Equivalent to \ref Func::hoist_storage, but schedules storage
 2520     * outside the outermost loop. */
 2522    Func &hoist_storage_root();
2523 /** Aggressively inline all uses of this function. This is the
2524 * default schedule, so you're unlikely to need to call this. For
2525 * a Func with an update definition, that means it gets computed
2526 * as close to the innermost loop as possible.
2527 *
2528 * Consider once more the pipeline from \ref Func::compute_at :
2529 *
2530 \code
2531 Func f, g;
2532 Var x, y;
2533 g(x, y) = x*y;
2534 f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2535 \endcode
2536 *
2537 * Leaving g as inline, this compiles to code equivalent to the following C:
2538 *
2539 \code
2540 int f[height][width];
2541 for (int y = 0; y < height; y++) {
2542 for (int x = 0; x < width; x++) {
2543 f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2544 }
2545 }
2546 \endcode
2547 */
 2549    Func &compute_inline();
2550 /** Get a handle on an update step for the purposes of scheduling
2551 * it. */
2552 Stage update(int idx = 0);
2553
2554 /** Set the type of memory this Func should be stored in. Controls
2555 * whether allocations go on the stack or the heap on the CPU, and
2556 * in global vs shared vs local on the GPU. See the documentation
2557 * on MemoryType for more detail. */
2558 Func &store_in(MemoryType memory_type);
2559
2560 /** Trace all loads from this Func by emitting calls to
2561 * halide_trace. If the Func is inlined, this has no
2562 * effect. */
 2564    Func &trace_loads();
2565 /** Trace all stores to the buffer backing this Func by emitting
2566 * calls to halide_trace. If the Func is inlined, this call
2567 * has no effect. */
 2569    Func &trace_stores();
2570 /** Trace all realizations of this Func by emitting calls to
2571 * halide_trace. */
 2573    Func &trace_realizations();
2574 /** Add a string of arbitrary text that will be passed thru to trace
2575 * inspection code if the Func is realized in trace mode. (Funcs that are
2576 * inlined won't have their tags emitted.) Ignored entirely if
2577 * tracing is not enabled for the Func (or globally).
2578 */
2579 Func &add_trace_tag(const std::string &trace_tag);
2580
2581 /** Marks this function as a function that should not be profiled
2582 * when using the target feature Profile or ProfileByTimer.
 2583     * This is useful when this function does too little work at once
2584 * such that the overhead of setting the profiling token might
2585 * become significant, or that the measured time is not representative
2586 * due to modern processors (instruction level parallelism, out-of-order
2587 * execution). */
 2589    Func &no_profiling();
2590 /** Get a handle on the internal halide function that this Func
2591 * represents. Useful if you want to do introspection on Halide
2592 * functions */
 2593    Internal::Function function() const {
 2594        return func;
2595 }
2596
2597 /** You can cast a Func to its pure stage for the purposes of
2598 * scheduling it. */
2599 operator Stage() const;
2600
2601 /** Get a handle on the output buffer for this Func. Only relevant
2602 * if this is the output Func in a pipeline. Useful for making
2603 * static promises about strides, mins, and extents. */
2604 // @{
2606 std::vector<OutputImageParam> output_buffers() const;
2607 // @}
2608
2609 /** Use a Func as an argument to an external stage. */
2610 operator ExternFuncArgument() const;
2611
2612 /** Infer the arguments to the Func, sorted into a canonical order:
2613 * all buffers (sorted alphabetically by name), followed by all non-buffers
2614 * (sorted alphabetically by name).
2615 This lets you write things like:
2616 \code
2617 func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2618 \endcode
2619 */
2620 std::vector<Argument> infer_arguments() const;
2621
2622 /** Return the current StageSchedule associated with this initial
2623 * Stage of this Func. For introspection only: to modify schedule,
2624 * use the Func interface. */
 2625    const Internal::StageSchedule &get_schedule() const {
 2626        return Stage(*this).get_schedule();
2627 }
2628};
2629
2630namespace Internal {
2631
2632template<typename Last>
2633inline void check_types(const Tuple &t, int idx) {
2634 using T = std::remove_pointer_t<std::remove_reference_t<Last>>;
2635 user_assert(t[idx].type() == type_of<T>())
2636 << "Can't evaluate expression "
2637 << t[idx] << " of type " << t[idx].type()
2638 << " as a scalar of type " << type_of<T>() << "\n";
2639}
2640
2641template<typename First, typename Second, typename... Rest>
2642inline void check_types(const Tuple &t, int idx) {
2643 check_types<First>(t, idx);
2644 check_types<Second, Rest...>(t, idx + 1);
2645}
2646
2647template<typename Last>
2648inline void assign_results(Realization &r, int idx, Last last) {
2649 using T = std::remove_pointer_t<std::remove_reference_t<Last>>;
2650 *last = Buffer<T>(r[idx])();
2651}
2652
2653template<typename First, typename Second, typename... Rest>
2654inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
2655 assign_results<First>(r, idx, first);
2656 assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2657}
2658
2659} // namespace Internal
2660
2661/** JIT-Compile and run enough code to evaluate a Halide
2662 * expression. This can be thought of as a scalar version of
2663 * \ref Func::realize */
2664template<typename T>
2666 user_assert(e.type() == type_of<T>())
2667 << "Can't evaluate expression "
2668 << e << " of type " << e.type()
2669 << " as a scalar of type " << type_of<T>() << "\n";
2670 Func f;
2671 f() = e;
2672 Buffer<T, 0> im = f.realize(ctx);
2673 return im();
2674}
2675
2676/** evaluate with a default user context */
2677template<typename T>
2679 return evaluate<T>(nullptr, e);
2680}
2681
2682/** JIT-compile and run enough code to evaluate a Halide Tuple. */
2683template<typename First, typename... Rest>
2684HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
2685 Internal::check_types<First, Rest...>(t, 0);
2686
2687 Func f;
2688 f() = t;
2689 Realization r = f.realize(ctx);
2690 Internal::assign_results(r, 0, first, rest...);
2691}
2692
2693/** JIT-compile and run enough code to evaluate a Halide Tuple. */
2694template<typename First, typename... Rest>
2695HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
2696 evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest...>(rest...));
2697}
2698
2699namespace Internal {
2700
2701inline void schedule_scalar(Func f) {
2703 if (t.has_gpu_feature()) {
2705 }
2706 if (t.has_feature(Target::HVX)) {
2707 f.hexagon();
2708 }
2709}
2710
2711} // namespace Internal
2712
2713/** JIT-Compile and run enough code to evaluate a Halide
2714 * expression. This can be thought of as a scalar version of
2715 * \ref Func::realize. Can use GPU if jit target from environment
2716 * specifies one.
2717 */
2718template<typename T>
2720 user_assert(e.type() == type_of<T>())
2721 << "Can't evaluate expression "
2722 << e << " of type " << e.type()
2723 << " as a scalar of type " << type_of<T>() << "\n";
2724 Func f;
2725 f() = e;
2727 Buffer<T, 0> im = f.realize();
2728 return im();
2729}
2730
2731/** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2732 * use GPU if jit target from environment specifies one. */
2733// @{
2734template<typename First, typename... Rest>
2735HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
2736 Internal::check_types<First, Rest...>(t, 0);
2737
2738 Func f;
2739 f() = t;
2741 Realization r = f.realize();
2742 Internal::assign_results(r, 0, first, rest...);
2743}
2744// @}
2745
2746} // namespace Halide
2747
2748#endif
Defines a type used for expressing the type signature of a generated halide pipeline.
#define internal_assert(c)
Definition: Error.h:232
#define user_assert(c)
Definition: Error.h:233
Base classes for Halide expressions (Halide::Expr) and statements (Halide::Internal::Stmt)
Defines the struct representing lifetime and dependencies of a JIT compiled halide pipeline.
Defines Module, an IR container that fully describes a Halide program.
Classes for declaring scalar parameters to halide pipelines.
Defines the front-end class representing an entire Halide imaging pipeline.
Defines the front-end syntax for reduction domains and reduction variables.
Defines the structure that describes a Halide target.
Defines Tuple - the front-end handle on small arrays of expressions.
#define HALIDE_NO_USER_CODE_INLINE
Definition: Util.h:47
Defines the Var - the front-end variable.
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Buffer.h:122
Helper class for identifying purpose of an Expr passed to memoize.
Definition: Func.h:699
EvictionKey(const Expr &expr=Expr())
Definition: Func.h:705
A halide function.
Definition: Func.h:714
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices.
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func's realization, but not its min.
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
Func & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then unroll the inner dimension.
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
FuncRef operator()(std::vector< Expr >) const
Either calls to the function, or the left-hand-side of an update definition (see RDom).
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func(const std::string &name)
Declare a new undefined function with the given name.
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
Func & memoize(const EvictionKey &eviction_key=EvictionKey())
Use the halide_memoization_cache_... interface to store a computed version of this function across in...
Func & partition(const VarOrRVar &var, Partition partition_policy)
Set the loop partition policy.
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
Func & never_partition_all()
Set the loop partition policy to Never for all Vars and RVar of the initial definition of the Func.
Func & prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
Func & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then vectorize the inner dimension.
Func & compute_at(const Func &f, const RVar &var)
Schedule a function to be computed within the iteration over some dimension of an update domain.
HALIDE_NO_USER_CODE_INLINE std::enable_if_t< Internal::all_are_convertible< Var, Args... >::value, FuncRef > operator()(Args &&...args) const
Definition: Func.h:1245
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
The generalized tile, with a single tail strategy to apply to all vars.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
Internal::Function function() const
Get a handle on the internal halide function that this Func represents.
Definition: Func.h:2593
bool has_update_definition() const
Does this function have at least one update definition?
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Func & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Func()
Declare a new undefined function with an automatically-generated unique name.
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
const Type & type() const
Get the type(s) of the outputs of this Func.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
std::vector< Var > split_vars() const
Get the Vars of the pure definition, with splits applied.
void infer_input_bounds(Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment())
Func & async()
Produce this Func asynchronously in a separate thread.
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case.
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func(const Type &required_type, int required_dims, const std::string &name)
Declare a new undefined function with the given name.
bool defined() const
Does this function have at least a pure definition.
Func(const std::vector< Type > &required_types, int required_dims, const std::string &name)
Declare a new undefined function with the given name.
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
void infer_input_bounds(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment())
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinates is congruent to 'remainder' modulo 'modulus',...
Func & reorder_storage(const Var &x, const Var &y)
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
HALIDE_NO_USER_CODE_INLINE std::enable_if_t< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & > never_partition(const VarOrRVar &x, Args &&...args)
Set the loop partition policy to Never for some number of Vars and RVars.
Definition: Func.h:1487
Func(const Expr &e)
Declare a new function with an automatically-generated unique name, and define it to return the given...
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
int dimensions() const
The dimensionality (number of arguments) of this function.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
void compile_to_conceptual_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out a conceptual representation of lowered code, before any parallel loop get factored out into...
const std::vector< Type > & types() const
HALIDE_NO_USER_CODE_INLINE std::enable_if_t< Internal::all_are_convertible< Expr, Args... >::value, FuncRef > operator()(const Expr &x, Args &&...args) const
Definition: Func.h:1262
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
A more general form of tile, which defines tiles of any dimensionality.
HALIDE_NO_USER_CODE_INLINE std::enable_if_t< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & > always_partition(const VarOrRVar &x, Args &&...args)
Set the loop partition policy to Always for some number of Vars and RVars.
Definition: Func.h:1504
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
Realization realize(JITUserContext *context, std::vector< int32_t > sizes={}, const Target &target=Target())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
int outputs() const
Get the number of outputs of this Func.
Func & compute_root()
Compute all of this function once ahead of time.
Func & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
std::vector< Var > args() const
Get the pure arguments.
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that returns multiple values.
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
int num_update_definitions() const
How many update definitions does this function have?
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
A shorter form of tile, which reuses the old variable names as the new outer dimensions.
Func & never_partition(const std::vector< VarOrRVar > &vars)
Set the loop partition policy to Never for a vector of Vars and RVars.
Stage specialize(const Expr &condition)
Specialize a Func.
Callable compile_to_callable(const std::vector< Argument > &args, const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code and return a callable struct that behaves like a fun...
Func & ring_buffer(Expr extent)
Expands the storage of the function by an extra dimension to enable ring buffering.
Func & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Func & store_at(LoopLevel loop_level)
Equivalent to the version of store_at that takes a Var, but schedules storage at a given LoopLevel.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T, Dims > &im)
Construct a new Func to wrap a Buffer.
Definition: Func.h:773
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Schedule the iteration over the initial definition of this function to be fused with another stage 's...
Expr value() const
The right-hand-side value of the pure definition of this function.
Func & align_extent(const Var &var, Expr modulus)
Expand the region computed so that the extent is a multiple of 'modulus'.
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Func clone_in(const std::vector< Func > &fs)
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self contained Module suitable for further comp...
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1178
void realize(Pipeline::RealizationArg outputs, const Target &target=Target())
Evaluate this function into an existing allocated buffer or buffers.
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func.
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
void compile_to(const std::map< OutputFileType, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with single call.
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition: Func.h:1073
Func in(const std::vector< Func > &fs)
Create and return an identity wrapper shared by all the Funcs in 'fs'.
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
Func & hoist_storage_root()
Equivalent to Func::hoist_storage_root, but schedules storage outside the outermost loop.
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
void realize(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=Target())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
Func & compute_at(LoopLevel loop_level)
Schedule a function to be computed within the iteration over a given LoopLevel.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
Func & always_partition(const std::vector< VarOrRVar > &vars)
Set the loop partition policy to Always for a vector of Vars and RVars.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1168
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
void add_custom_lowering_pass(Internal::IRMutator *pass, std::function< void()> deleter)
Add a custom pass to be used during lowering, with the function that will be called to delete it also...
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed thru to trace inspection code if the Func is reali...
Func & store_at(const Func &f, const RVar &var)
Equivalent to the version of store_at that takes a Var, but schedules storage within the loop over a ...
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Generalized tiling, reusing the previous names as the outer names.
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE std::enable_if_t< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & > reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:1608
Func & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration,...
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
Func & hoist_storage(const Func &f, const Var &var)
Hoist storage for this function within f's loop over var.
Func & compute_inline()
Aggressively inline all uses of this function.
Func(Internal::Function f)
Construct a new Func to wrap an existing, already-defined Function object.
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
Func & bound_storage(const Var &dim, const Expr &bound)
Bound the extent of a Func's storage, but not extent of its compute.
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
Tuple values() const
The values returned by this function.
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & hoist_storage(LoopLevel loop_level)
Equivalent to the version of hoist_storage that takes a Var, but schedules storage at a given LoopLev...
Func & reorder_storage(const std::vector< Var > &dims)
Specify how the storage for the function is laid out.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & no_profiling()
Marks this function as a function that should not be profiled when using the target feature Profile o...
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1188
HALIDE_NO_USER_CODE_INLINE std::enable_if_t< Internal::all_are_convertible< Var, Args... >::value, Func & > reorder_storage(const Var &x, const Var &y, Args &&...args)
Definition: Func.h:2090
void infer_input_bounds(JITUserContext *context, const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment())
Versions of infer_input_bounds that take a custom user context to pass to runtime functions.
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized,...
Func & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given task_size, and the parallelize the outer dimension.
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.
Func in(const Func &f)
Creates and returns a new identity Func that wraps this Func.
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
std::vector< OutputImageParam > output_buffers() const
Func & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:2064
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
Func & hoist_storage(const Func &f, const RVar &var)
Equivalent to the version of hoist_storage that takes a Var, but schedules storage within the loop ov...
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a functions that happens to only co...
Func & always_partition_all()
Set the loop partition policy to Always for all Vars and RVar of the initial definition of the Func.
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func.
Definition: Func.h:2625
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition: Func.h:502
Stage operator*=(const FuncRef &)
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Stage operator/=(const Expr &)
Define a stage that divides this Func by the given expression.
Stage operator-=(const FuncRef &)
size_t size() const
How many outputs does the function this refers to produce.
Internal::Function function() const
What function is this calling?
Definition: Func.h:602
Stage operator-=(const Tuple &)
Stage operator+=(const FuncRef &)
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Stage operator=(const FuncRef &)
FuncRef(Internal::Function, const std::vector< Var > &, int placeholder_pos=-1, int count=0)
Stage operator+=(const Tuple &)
Stage operator-=(const Expr &)
Define a stage that adds the negative of the given expression to this Func.
Stage operator*=(const Expr &)
Define a stage that multiplies this Func by the given expression.
Stage operator+=(const Expr &)
Define a stage that adds the given expression to this Func.
bool equivalent_to(const FuncRef &other) const
Is this FuncRef syntactically equivalent to another one?
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Stage operator/=(const FuncRef &)
Stage operator*=(const Tuple &)
Stage operator/=(const Tuple &)
Stage operator=(const Tuple &)
Use this as the left-hand-side of a definition or an update definition for a Func with multiple outpu...
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs.
Definition: Func.h:624
int index() const
Return index to the function outputs.
Definition: Func.h:688
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom).
Stage operator=(const FuncRef &e)
Internal::Function function() const
What function is this calling?
Definition: Func.h:683
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func.
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
An Image parameter to a halide pipeline.
Definition: ImageParam.h:23
A Function definition which can either represent a init or an update definition.
Definition: Definition.h:38
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition.
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
bool defined() const
Definition objects are nullable.
const std::vector< StorageDim > & storage_dims() const
The list and order of dimensions used to store this function.
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:39
FuncSchedule & schedule()
Get a handle to the function-specific schedule for the purpose of modifying it.
const std::vector< std::string > & args() const
Get the pure arguments.
A base class for passes over the IR which modify it (e.g.
Definition: IRMutator.h:28
A schedule for a single stage of a Halide pipeline.
Definition: Schedule.h:680
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:203
A halide module.
Definition: Module.h:142
A handle on the output buffer of a pipeline.
A reference-counted handle to a parameter to a halide pipeline.
Definition: Parameter.h:40
A class representing a Halide pipeline.
Definition: Pipeline.h:107
A multi-dimensional domain over which to iterate.
Definition: RDom.h:193
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition: RDom.h:29
const std::string & name() const
The name of this reduction variable.
A Realization is a vector of references to existing Buffer objects.
Definition: Realization.h:19
A single definition of a Func.
Definition: Func.h:70
Stage & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
std::string name() const
Return the name of this stage, e.g.
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Func rfactor(const RVar &r, const Var &v)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & vectorize(const VarOrRVar &var)
HALIDE_NO_USER_CODE_INLINE std::enable_if_t< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & > reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:386
Stage & never_partition(const std::vector< VarOrRVar > &vars)
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & unroll(const VarOrRVar &var)
Stage & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Schedule the iteration over this stage to be fused with another stage 's' from outermost loop to a gi...
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::vector< VarOrRVar > split_vars() const
Get the Vars and RVars of this definition, from innermost out, with splits applied.
Stage & allow_race_conditions()
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & partition(const VarOrRVar &var, Partition partition_policy)
Stage & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage specialize(const Expr &condition)
Stage & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:471
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Stage & reorder(const std::vector< VarOrRVar > &vars)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition: Func.h:96
HALIDE_NO_USER_CODE_INLINE std::enable_if_t< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & > always_partition(const VarOrRVar &x, Args &&...args)
Definition: Func.h:400
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
Stage & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & parallel(const VarOrRVar &var)
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition: Func.h:109
Stage & serial(const VarOrRVar &var)
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Stage & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
void specialize_fail(const std::string &message)
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Stage & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Scheduling calls that control how the domain of this stage is traversed.
Stage & always_partition_all()
Stage & never_partition_all()
Stage & atomic(bool override_associativity_test=false)
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func rfactor(const std::vector< std::pair< RVar, Var > > &preserved)
Calling rfactor() on an associative update definition a Func will split the update into an intermedia...
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits,...
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
void unscheduled()
Assert that this stage has intentionally been given no schedule, and suppress the warning about unsch...
Stage & always_partition(const std::vector< VarOrRVar > &vars)
HALIDE_NO_USER_CODE_INLINE std::enable_if_t< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & > never_partition(const VarOrRVar &x, Args &&...args)
Definition: Func.h:393
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition: Tuple.h:18
A Halide variable, to be used when defining functions.
Definition: Var.h:19
const std::string & name() const
Get the name of a Var.
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition: Var.h:162
void schedule_scalar(Func f)
Definition: Func.h:2701
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
void assign_results(Realization &r, int idx, Last last)
Definition: Func.h:2648
void check_types(const Tuple &t, int idx)
Definition: Func.h:2633
ForType
An enum describing a type of loop traversal.
Definition: Expr.h:406
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
PrefetchBoundStrategy
Different ways to handle accesses outside the original extents in a prefetch.
@ GuardWithIf
Guard the prefetch with if-guards that ignores the prefetch if any of the prefetched region ever goes...
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2719
TailStrategy
Different ways to handle a tail case in a split when the factor does not provably divide the extent.
Definition: Schedule.h:33
@ Auto
For pure definitions use ShiftInwards.
LoopAlignStrategy
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition: Schedule.h:137
@ Auto
By default, LoopAlignStrategy is set to NoAlign.
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:611
NameMangling
An enum to specify calling convention for extern stages.
Definition: Function.h:26
@ Default
Match whatever is specified in the Target.
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
DeviceAPI
An enum describing a type of device API.
Definition: DeviceAPI.h:15
@ Host
Used to denote for loops that run on the same device as the containing code.
Target get_target_from_environment()
Return the target that Halide will use.
StmtOutputFormat
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition: Pipeline.h:72
@ Text
Definition: Pipeline.h:73
Stage ScheduleHandle
Definition: Func.h:493
std::vector< Range > Region
A multi-dimensional box.
Definition: Expr.h:350
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:614
MemoryType
An enum describing different address spaces to be used with Func::store_in.
Definition: Expr.h:353
Partition
Different ways to handle loops with a potentially optimizable boundary conditions.
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2665
A fragment of Halide syntax.
Definition: Expr.h:258
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition: Expr.h:327
An argument to an extern-defined Func.
Represent the equivalent associative op of an update definition.
Definition: Associativity.h:61
A set of custom overrides of runtime functions.
Definition: JITModule.h:35
A context to be passed to Pipeline::realize.
Definition: JITModule.h:136
A struct representing a target machine and os to generate code for.
Definition: Target.h:19
bool has_gpu_feature() const
Is a fully feature GPU compute runtime enabled? I.e.
bool has_feature(Feature f) const
Types in the halide type system.
Definition: Type.h:281
A class that can represent Vars or RVars.
Definition: Func.h:29
bool is_rvar
Definition: Func.h:57
VarOrRVar(const Var &v)
Definition: Func.h:33
VarOrRVar(const RVar &r)
Definition: Func.h:36
VarOrRVar(const std::string &n, bool r)
Definition: Func.h:30
VarOrRVar(const ImplicitVar< N > &u)
Definition: Func.h:43
const std::string & name() const
Definition: Func.h:47
VarOrRVar(const RDom &r)
Definition: Func.h:39