HalideBuffer.h
/** \file
 * Defines a Buffer type that wraps halide_buffer_t and adds
 * functionality, and methods for more conveniently iterating over the
 * samples in a halide_buffer_t outside of Halide code. */

#ifndef HALIDE_RUNTIME_BUFFER_H
#define HALIDE_RUNTIME_BUFFER_H

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>
#include <type_traits>
#include <vector>

#ifdef __APPLE__
#include <AvailabilityVersions.h>
#include <TargetConditionals.h>
#endif

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif
#endif

#include "HalideRuntime.h"

#ifdef _MSC_VER
#include <malloc.h>
#define HALIDE_ALLOCA _alloca
#else
#define HALIDE_ALLOCA __builtin_alloca
#endif

// gcc 5.1 has a false positive warning on this code
#if __GNUC__ == 5 && __GNUC_MINOR__ == 1
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

#ifndef HALIDE_RUNTIME_BUFFER_CHECK_INDICES
#define HALIDE_RUNTIME_BUFFER_CHECK_INDICES 0
#endif
#ifndef HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT
// Conservatively align buffer allocations to 128 bytes by default.
// This is enough alignment for all the platforms currently in use.
// Redefine this in your compiler settings if you desire more/less alignment
// (e.g. -DHALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT=64 on the command line).
#define HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT 128
#endif

static_assert(((HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT & (HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT - 1)) == 0),
              "HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT must be a power of 2.");

// Unfortunately, not all C++17 runtimes support aligned_alloc
// (it may depend on the OS/SDK version); this is provided as an opt-out
// if you are compiling on a platform that doesn't provide a (good)
// implementation. (Note that we actually use the C11 `::aligned_alloc()`
// rather than the C++17 `std::aligned_alloc()` because at least one platform
// we found supports the former but not the latter.)
#ifndef HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC

#ifdef _WIN32

// Windows (regardless of compiler) doesn't implement aligned_alloc(),
// even in C++17 mode, and has stated it probably never will; the problem
// is that free() must be able to free pointers returned by both malloc()
// and aligned_alloc(). So, always default it off here.
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#elif defined(__ANDROID_API__) && __ANDROID_API__ < 28

// Android doesn't provide aligned_alloc until API 28
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#elif defined(__APPLE__)

#if TARGET_OS_OSX && (__MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_15)

// macOS doesn't provide aligned_alloc until 10.15
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#elif TARGET_OS_IPHONE && (__IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_14_0)

// iOS doesn't provide aligned_alloc until 14.0
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#else

// Assume it's ok on all other Apple targets
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1

#endif

#else

#if defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)

// The ARM GNU-A baremetal compiler doesn't provide aligned_alloc as of 12.2
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#else

// Not Windows, Android, or Apple: just assume it's ok
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1

#endif

#endif

#endif  // HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC

namespace Halide {
namespace Runtime {

// Forward-declare our Buffer class
template<typename T, int Dims, int InClassDimStorage>
class Buffer;

// A helper to check if a parameter pack is entirely implicitly
// int-convertible, for use with std::enable_if
template<typename... Args>
struct AllInts : std::false_type {};

template<>
struct AllInts<> : std::true_type {};

template<typename T, typename... Args>
struct AllInts<T, Args...> {
    static const bool value = std::is_convertible_v<T, int> && AllInts<Args...>::value;
};

// Floats and doubles are technically implicitly int-convertible, but
// doing so produces a warning we treat as an error, so just disallow
// it here.
template<typename... Args>
struct AllInts<float, Args...> : std::false_type {};

template<typename... Args>
struct AllInts<double, Args...> : std::false_type {};
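
// For example (a usage sketch, not part of this header's API):
//
//     static_assert(AllInts<int, long, char>::value);
//     static_assert(!AllInts<int, float>::value);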

namespace Internal {
// A helper to detect if there are any zeros in a container
template<typename Container>
bool any_zero(const Container &c) {
    for (int i : c) {
        if (i == 0) {
            return true;
        }
    }
    return false;
}

struct DefaultAllocatorFns {
    static inline void *(*default_allocate_fn)(size_t) = nullptr;
    static inline void (*default_deallocate_fn)(void *) = nullptr;
};
}  // namespace Internal

/** A struct acting as a header for allocations owned by the Buffer
 * class itself. */
struct AllocationHeader {
    void (*deallocate_fn)(void *);
    std::atomic<int> ref_count;

    // Note that ref_count always starts at 1
    explicit AllocationHeader(void (*deallocate_fn)(void *))
        : deallocate_fn(deallocate_fn), ref_count(1) {
    }
};

/** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */
enum struct BufferDeviceOwnership : int {
    Allocated,               ///< halide_device_free will be called when device ref count goes to zero
    WrappedNative,           ///< halide_device_detach_native will be called when device ref count goes to zero
    Unmanaged,               ///< No free routine will be called when device ref count goes to zero
    AllocatedDeviceAndHost,  ///< Call device_and_host_free when DevRefCount goes to zero.
    Cropped,                 ///< Call halide_device_release_crop when DevRefCount goes to zero.
};

/** A similar struct for managing device allocations. */
struct DeviceRefCount {
    // This is only ever constructed when there's something to manage,
    // so start at one.
    std::atomic<int> count{1};
    BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated};
};

constexpr int AnyDims = -1;
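
// A usage sketch of the Buffer class defined below (illustrative only, not
// part of this header):
//
//     Halide::Runtime::Buffer<float, 2> img(640, 480);                  // statically 2-D, owns its memory
//     Halide::Runtime::Buffer<void> raw(halide_type_of<int32_t>(), 16); // element type chosen at runtime
//     Halide::Runtime::Buffer<float, AnyDims> dyn = img;                // dimensionality checked at runtime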
194
195/** A templated Buffer class that wraps halide_buffer_t and adds
196 * functionality. When using Halide from C++, this is the preferred
197 * way to create input and output buffers. The overhead of using this
198 * class relative to a naked halide_buffer_t is minimal - it uses another
199 * ~16 bytes on the stack, and does no dynamic allocations when using
200 * it to represent existing memory of a known maximum dimensionality.
201 *
202 * The template parameter T is the element type. For buffers where the
203 * element type is unknown, or may vary, use void or const void.
204 *
205 * The template parameter Dims is the number of dimensions. For buffers where
206 * the dimensionality type is unknown at, or may vary, use AnyDims.
207 *
208 * InClassDimStorage is the maximum number of dimensions that can be represented
209 * using space inside the class itself. Set it to the maximum dimensionality
210 * you expect this buffer to be. If the actual dimensionality exceeds
211 * this, heap storage is allocated to track the shape of the buffer.
212 * InClassDimStorage defaults to 4, which should cover nearly all usage.
213 *
214 * The class optionally allocates and owns memory for the image using
215 * a shared pointer allocated with the provided allocator. If they are
216 * null, malloc and free are used. Any device-side allocation is
217 * considered as owned if and only if the host-side allocation is
218 * owned. */
template<typename T = void,
         int Dims = AnyDims,
         int InClassDimStorage = (Dims == AnyDims ? 4 : std::max(Dims, 1))>
class Buffer {
    /** The underlying halide_buffer_t */
    halide_buffer_t buf = {};

    /** Some in-class storage for the shape of the dimensions. */
    halide_dimension_t shape[InClassDimStorage];

    /** The allocation owned by this Buffer. NULL if the Buffer does not
     * own the memory. */
    AllocationHeader *alloc = nullptr;

    /** A reference count for the device allocation owned by this
     * buffer. */
    mutable DeviceRefCount *dev_ref_count = nullptr;

    /** True if T is of type void or const void */
    static const bool T_is_void = std::is_same_v<std::remove_const_t<T>, void>;

    /** A type function that adds a const qualifier if T is a const type. */
    template<typename T2>
    using add_const_if_T_is_const = std::conditional_t<std::is_const_v<T>, const T2, T2>;

    /** T unless T is (const) void, in which case (const)
     * uint8_t. Useful for providing return types for operator() */
    using not_void_T = std::conditional_t<T_is_void,
                                          add_const_if_T_is_const<uint8_t>,
                                          T>;

    /** T with constness removed. Useful for return type of copy(). */
    using not_const_T = std::remove_const_t<T>;

    /** The type the elements are stored as. Equal to not_void_T
     * unless T is a pointer, in which case uint64_t. Halide stores
     * all pointer types as uint64s internally, even on 32-bit
     * systems. */
    using storage_T = std::conditional_t<std::is_pointer_v<T>, uint64_t, not_void_T>;

public:
    /** True if the Halide type is not void (or const void). */
    static constexpr bool has_static_halide_type = !T_is_void;

    /** Get the Halide type of T. Callers should not use the result if
     * has_static_halide_type is false. */
    static constexpr halide_type_t static_halide_type() {
        return halide_type_of<std::remove_cv_t<not_void_T>>();
    }

    /** Does this Buffer own the host memory it refers to? */
    bool owns_host_memory() const {
        return alloc != nullptr;
    }

    static constexpr bool has_static_dimensions = (Dims != AnyDims);

    /** Callers should not use the result if
     * has_static_dimensions is false. */
    static constexpr int static_dimensions() {
        return Dims;
    }

    static_assert(!has_static_dimensions || static_dimensions() >= 0);

private:
    /** Increment the reference count of any owned allocation */
    void incref() const {
        if (owns_host_memory()) {
            alloc->ref_count++;
        }
        if (buf.device) {
            if (!dev_ref_count) {
                // I seem to have a non-zero dev field but no
                // reference count for it. I must have been given a
                // device allocation by a Halide pipeline, and have
                // never been copied from since. Take sole ownership
                // of it.
                dev_ref_count = new DeviceRefCount;
            }
            dev_ref_count->count++;
        }
    }

    // Note that this is called "cropped", but it can also encompass a slice
    // or embed operation.
    struct DevRefCountCropped : DeviceRefCount {
        // We will only store Buffers that have a dynamic number of dimensions.
        // Buffers that are cropped or sliced from must first be converted to
        // one with a variable number of dimensions, because we cannot possibly
        // know the actual dimensionality of the buffer that this is a crop or
        // slice from. Since cropping a sliced buffer is also possible, no
        // optimizations can be made for cropped buffers either.
        Buffer<T, AnyDims> cropped_from;
        explicit DevRefCountCropped(const Buffer<T, AnyDims> &cropped_from)
            : cropped_from(cropped_from) {
            ownership = BufferDeviceOwnership::Cropped;
        }
    };

    /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc.) of cropped_from */
    void crop_from(const Buffer<T, AnyDims> &cropped_from) {
        assert(dev_ref_count == nullptr);
        dev_ref_count = new DevRefCountCropped(cropped_from);
    }

    /** Decrement the reference count of any owned allocation and free host
     * and device memory if it hits zero. Sets alloc to nullptr. */
    void decref(bool device_only = false) {
        if (owns_host_memory() && !device_only) {
            int new_count = --(alloc->ref_count);
            if (new_count == 0) {
                void (*fn)(void *) = alloc->deallocate_fn;
                alloc->~AllocationHeader();
                fn(alloc);
            }
            buf.host = nullptr;
            alloc = nullptr;
            set_host_dirty(false);
        }
        int new_count = 0;
        if (dev_ref_count) {
            new_count = --(dev_ref_count->count);
        }
        if (new_count == 0) {
            if (buf.device) {
                assert(!(alloc && device_dirty()) &&
                       "Implicitly freeing a dirty device allocation while a host allocation still lives. "
                       "Call device_free explicitly if you want to drop dirty device-side data. "
                       "Call copy_to_host explicitly if you want the data copied to the host allocation "
                       "before the device allocation is freed.");
                int result = halide_error_code_success;
                if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
                    result = buf.device_interface->detach_native(nullptr, &buf);
                } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
                    result = buf.device_interface->device_and_host_free(nullptr, &buf);
                } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                    result = buf.device_interface->device_release_crop(nullptr, &buf);
                } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
                    result = buf.device_interface->device_free(nullptr, &buf);
                }
                // No reasonable way to return the error, but we can at least assert-fail in debug builds.
                assert((result == halide_error_code_success) && "device_interface call returned a nonzero result in Buffer::decref()");
                (void)result;
            }
            if (dev_ref_count) {
                if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                    delete (DevRefCountCropped *)dev_ref_count;
                } else {
                    delete dev_ref_count;
                }
            }
        }
        dev_ref_count = nullptr;
        buf.device = 0;
        buf.device_interface = nullptr;
    }

    void free_shape_storage() {
        if (buf.dim != shape) {
            delete[] buf.dim;
            buf.dim = nullptr;
        }
    }

    template<int DimsSpecified>
    void make_static_shape_storage() {
        static_assert(Dims == AnyDims || Dims == DimsSpecified,
                      "Number of arguments to Buffer() does not match static dimensionality");
        buf.dimensions = DimsSpecified;
        if constexpr (Dims == AnyDims) {
            if constexpr (DimsSpecified <= InClassDimStorage) {
                buf.dim = shape;
            } else {
                static_assert(DimsSpecified >= 1);
                buf.dim = new halide_dimension_t[DimsSpecified];
            }
        } else {
            static_assert(InClassDimStorage >= Dims);
            buf.dim = shape;
        }
    }

    void make_shape_storage(const int dimensions) {
        if (Dims != AnyDims && Dims != dimensions) {
            assert(false && "Number of arguments to Buffer() does not match static dimensionality");
        }
        // This should usually be inlined, so if dimensions is statically known,
        // we can skip the call to new
        buf.dimensions = dimensions;
        buf.dim = (dimensions <= InClassDimStorage) ? shape : new halide_dimension_t[dimensions];
    }

    void copy_shape_from(const halide_buffer_t &other) {
        // All callers of this ensure that buf.dimensions == other.dimensions.
        make_shape_storage(other.dimensions);
        std::copy(other.dim, other.dim + other.dimensions, buf.dim);
    }

    template<typename T2, int D2, int S2>
    void move_shape_from(Buffer<T2, D2, S2> &&other) {
        if (other.shape == other.buf.dim) {
            copy_shape_from(other.buf);
        } else {
            buf.dim = other.buf.dim;
            other.buf.dim = nullptr;
        }
        other.buf = halide_buffer_t();
    }

    /** Initialize the shape from a halide_buffer_t. */
    void initialize_from_buffer(const halide_buffer_t &b,
                                BufferDeviceOwnership ownership) {
        memcpy(&buf, &b, sizeof(halide_buffer_t));
        copy_shape_from(b);
        if (b.device) {
            dev_ref_count = new DeviceRefCount;
            dev_ref_count->ownership = ownership;
        }
    }

    /** Initialize the shape from an array of ints */
    void initialize_shape(const int *sizes) {
        for (int i = 0; i < buf.dimensions; i++) {
            buf.dim[i].min = 0;
            buf.dim[i].extent = sizes[i];
            if (i == 0) {
                buf.dim[i].stride = 1;
            } else {
                buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
            }
        }
    }
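    // For example, for sizes {640, 480, 3} this produces the dense row-major
    // strides {1, 640, 640 * 480} = {1, 640, 307200}: each stride is the
    // product of the extents of the dimensions inside it.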

    /** Initialize the shape from a vector of extents */
    void initialize_shape(const std::vector<int> &sizes) {
        assert(buf.dimensions == (int)sizes.size());
        initialize_shape(sizes.data());
    }

    /** Initialize the shape from the static shape of an array */
    template<typename Array, size_t N>
    void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
        buf.dim[next].min = 0;
        buf.dim[next].extent = (int)N;
        if (next == 0) {
            buf.dim[next].stride = 1;
        } else {
            initialize_shape_from_array_shape(next - 1, vals[0]);
            buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
        }
    }

    /** Base case for the template recursion above. */
    template<typename T2>
    void initialize_shape_from_array_shape(int, const T2 &) {
    }

    /** Get the dimensionality of a multi-dimensional C array */
    template<typename Array, size_t N>
    static int dimensionality_of_array(Array (&vals)[N]) {
        return dimensionality_of_array(vals[0]) + 1;
    }

    template<typename T2>
    static int dimensionality_of_array(const T2 &) {
        return 0;
    }

    /** Get the underlying halide_type_t of an array's element type. */
    template<typename Array, size_t N>
    static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
        return scalar_type_of_array(vals[0]);
    }

    template<typename T2>
    static halide_type_t scalar_type_of_array(const T2 &) {
        return halide_type_of<std::remove_cv_t<T2>>();
    }

    /** Crop a single dimension without handling device allocation. */
    void crop_host(int d, int min, int extent) {
        assert(dim(d).min() <= min);
        assert(dim(d).max() >= min + extent - 1);
        ptrdiff_t shift = min - dim(d).min();
        if (buf.host != nullptr) {
            buf.host += (shift * dim(d).stride()) * type().bytes();
        }
        buf.dim[d].min = min;
        buf.dim[d].extent = extent;
    }

    /** Crop as many dimensions as are in rect, without handling device allocation. */
    void crop_host(const std::vector<std::pair<int, int>> &rect) {
        assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
        int limit = (int)rect.size();
        assert(limit <= dimensions());
        for (int i = 0; i < limit; i++) {
            crop_host(i, rect[i].first, rect[i].second);
        }
    }

    void complete_device_crop(Buffer<T, Dims, InClassDimStorage> &result_host_cropped) const {
        assert(buf.device_interface != nullptr);
        if (buf.device_interface->device_crop(nullptr, &buf, &result_host_cropped.buf) == halide_error_code_success) {
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // Is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. in the internal logic of crop, incref might have been
            // called.)
            if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                result_host_cropped.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
            } else {
                result_host_cropped.crop_from(*this);
            }
        }
    }

    /** Slice a single dimension without handling device allocation. */
    void slice_host(int d, int pos) {
        static_assert(Dims == AnyDims);
        assert(dimensions() > 0);
        assert(d >= 0 && d < dimensions());
        assert(pos >= dim(d).min() && pos <= dim(d).max());
        buf.dimensions--;
        ptrdiff_t shift = pos - buf.dim[d].min;
        if (buf.host != nullptr) {
            buf.host += (shift * buf.dim[d].stride) * type().bytes();
        }
        for (int i = d; i < buf.dimensions; i++) {
            buf.dim[i] = buf.dim[i + 1];
        }
        buf.dim[buf.dimensions] = {0, 0, 0};
    }

    void complete_device_slice(Buffer<T, AnyDims, InClassDimStorage> &result_host_sliced, int d, int pos) const {
        assert(buf.device_interface != nullptr);
        if (buf.device_interface->device_slice(nullptr, &buf, d, pos, &result_host_sliced.buf) == halide_error_code_success) {
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // Is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. in the internal logic of slice, incref might have been
            // called.)
            if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                // crop_from() is correct here, despite the fact that we are slicing.
                result_host_sliced.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
            } else {
                // crop_from() is correct here, despite the fact that we are slicing.
                result_host_sliced.crop_from(*this);
            }
        }
    }

public:
    typedef T ElemType;

    /** Read-only access to the shape */
    class Dimension {
        const halide_dimension_t &d;

    public:
        /** The lowest coordinate in this dimension */
        HALIDE_ALWAYS_INLINE int min() const {
            return d.min;
        }

        /** The number of elements in memory you have to step over to
         * increment this coordinate by one. */
        HALIDE_ALWAYS_INLINE int stride() const {
            return d.stride;
        }

        /** The extent of the image along this dimension */
        HALIDE_ALWAYS_INLINE int extent() const {
            return d.extent;
        }

        /** The highest coordinate in this dimension */
        HALIDE_ALWAYS_INLINE int max() const {
            return min() + extent() - 1;
        }

        /** An iterator class, so that you can iterate over
         * coordinates in a dimension using a range-based for loop. */
        struct iterator {
            int val;
            int operator*() const {
                return val;
            }
            bool operator!=(const iterator &other) const {
                return val != other.val;
            }
            iterator &operator++() {
                val++;
                return *this;
            }
        };

        /** An iterator that points to the min coordinate */
        HALIDE_ALWAYS_INLINE iterator begin() const {
            return {min()};
        }

        /** An iterator that points to one past the max coordinate */
        HALIDE_ALWAYS_INLINE iterator end() const {
            return {min() + extent()};
        }

        explicit Dimension(const halide_dimension_t &dim)
            : d(dim) {
        }
    };

    /** Access the shape of the buffer */
    HALIDE_ALWAYS_INLINE Dimension dim(int i) const {
        assert(i >= 0 && i < this->dimensions());
        return Dimension(buf.dim[i]);
    }

    /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
    // @{
    int min(int i) const {
        return dim(i).min();
    }
    int extent(int i) const {
        return dim(i).extent();
    }
    int stride(int i) const {
        return dim(i).stride();
    }
    // @}
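
    // For example, Dimension's begin()/end() iterators allow visiting every
    // in-bounds coordinate with range-based for loops (a sketch; `im` is a
    // hypothetical 2-D Buffer):
    //
    //     for (int y : im.dim(1)) {
    //         for (int x : im.dim(0)) {
    //             // im(x, y) is in bounds here
    //         }
    //     }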

    /** The total number of elements this buffer represents. Equal to
     * the product of the extents */
    size_t number_of_elements() const {
        return buf.number_of_elements();
    }

    /** Get the dimensionality of the buffer. */
    int dimensions() const {
        if constexpr (has_static_dimensions) {
            return Dims;
        } else {
            return buf.dimensions;
        }
    }

    /** Get the type of the elements. */
    halide_type_t type() const {
        return buf.type;
    }

    /** A pointer to the element with the lowest address. If all
     * strides are positive, equal to the host pointer. */
    T *begin() const {
        assert(buf.host != nullptr);  // Cannot call begin() on an unallocated Buffer.
        return (T *)buf.begin();
    }

    /** A pointer to one beyond the element with the highest address. */
    T *end() const {
        assert(buf.host != nullptr);  // Cannot call end() on an unallocated Buffer.
        return (T *)buf.end();
    }

    /** The total number of bytes spanned by the data in memory. */
    size_t size_in_bytes() const {
        return buf.size_in_bytes();
    }

    /** Reset the Buffer to be equivalent to a default-constructed Buffer
     * of the same static type (if any); Buffer<void> will have its runtime
     * type reset to uint8. */
    void reset() {
        *this = Buffer();
    }

    Buffer()
        : shape() {
        buf.type = static_halide_type();
        // If Dims is statically known, create storage for that many dimensions;
        // otherwise, make a zero-dimensional buffer.
        constexpr int buf_dimensions = (Dims == AnyDims) ? 0 : Dims;
        make_static_shape_storage<buf_dimensions>();
    }

    /** Make a Buffer from a halide_buffer_t */
    explicit Buffer(const halide_buffer_t &buf,
                    BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) {
        assert(T_is_void || buf.type == static_halide_type());
        initialize_from_buffer(buf, ownership);
    }

    /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
    template<typename T2, int D2, int S2>
    friend class Buffer;

private:
    template<typename T2, int D2, int S2>
    static void static_assert_can_convert_from() {
        static_assert((!std::is_const_v<T2> || std::is_const_v<T>),
                      "Can't convert from a Buffer<const T> to a Buffer<T>");
        static_assert(std::is_same_v<std::remove_const_t<T>, std::remove_const_t<T2>> ||
                          T_is_void || Buffer<T2, D2, S2>::T_is_void,
                      "type mismatch constructing Buffer");
        static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2,
                      "Can't convert from a Buffer with static dimensionality to a Buffer with different static dimensionality");
    }

public:
    static void set_default_allocate_fn(void *(*allocate_fn)(size_t)) {
        Internal::DefaultAllocatorFns::default_allocate_fn = allocate_fn;
    }
    static void set_default_deallocate_fn(void (*deallocate_fn)(void *)) {
        Internal::DefaultAllocatorFns::default_deallocate_fn = deallocate_fn;
    }
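    // For example, to route all Buffer heap allocations through a custom
    // allocator (a sketch; my_malloc and my_free are hypothetical
    // application-provided functions with malloc/free signatures):
    //
    //     Halide::Runtime::Buffer<>::set_default_allocate_fn(my_malloc);
    //     Halide::Runtime::Buffer<>::set_default_deallocate_fn(my_free);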

    /** Determine if a Buffer<T, Dims, InClassDimStorage> can be constructed from some other Buffer type.
     * If this can be determined at compile time, fail with a static assert; otherwise
     * return a boolean based on runtime typing. */
    template<typename T2, int D2, int S2>
    static bool can_convert_from(const Buffer<T2, D2, S2> &other) {
        static_assert_can_convert_from<T2, D2, S2>();
        if (Buffer<T2, D2, S2>::T_is_void && !T_is_void) {
            if (other.type() != static_halide_type()) {
                return false;
            }
        }
        if (Dims != AnyDims) {
            if (other.dimensions() != Dims) {
                return false;
            }
        }
        return true;
    }

    /** Fail an assertion at runtime or compile-time if a Buffer<T, Dims, InClassDimStorage>
     * cannot be constructed from some other Buffer type. */
    template<typename T2, int D2, int S2>
    static void assert_can_convert_from(const Buffer<T2, D2, S2> &other) {
        // Explicitly call static_assert_can_convert_from() here so
        // that we always get compile-time checking, even if compiling with
        // assertions disabled.
        static_assert_can_convert_from<T2, D2, S2>();
        assert(can_convert_from(other));
    }

    /** Copy constructor. Does not copy underlying data. */
    Buffer(const Buffer<T, Dims, InClassDimStorage> &other)
        : buf(other.buf),
          alloc(other.alloc) {
        other.incref();
        dev_ref_count = other.dev_ref_count;
        copy_shape_from(other.buf);
    }

    /** Construct a Buffer from a Buffer of different dimensionality
     * and type. Asserts that the type and dimensionality matches (at runtime,
     * if one of the types is void). Note that this constructor is
     * implicit. This, for example, lets you pass things like
     * Buffer<T> or Buffer<const void> to functions expecting
     * Buffer<const T>. */
    template<typename T2, int D2, int S2>
    Buffer(const Buffer<T2, D2, S2> &other)
        : buf(other.buf),
          alloc(other.alloc) {
        assert_can_convert_from(other);
        other.incref();
        dev_ref_count = other.dev_ref_count;
        copy_shape_from(other.buf);
    }

    /** Move constructor */
    Buffer(Buffer<T, Dims, InClassDimStorage> &&other) noexcept
        : buf(other.buf),
          alloc(other.alloc),
          dev_ref_count(other.dev_ref_count) {
        other.dev_ref_count = nullptr;
        other.alloc = nullptr;
        move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
    }

    /** Move-construct a Buffer from a Buffer of different
     * dimensionality and type. Asserts that the types match (at
     * runtime if one of the types is void). */
    template<typename T2, int D2, int S2>
    Buffer(Buffer<T2, D2, S2> &&other)
        : buf(other.buf),
          alloc(other.alloc),
          dev_ref_count(other.dev_ref_count) {
        assert_can_convert_from(other);
        other.dev_ref_count = nullptr;
        other.alloc = nullptr;
        move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
    }

    /** Assign from another Buffer of possibly-different
     * dimensionality and type. Asserts that the types match (at
     * runtime if one of the types is void). */
    template<typename T2, int D2, int S2>
    Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T2, D2, S2> &other) {
        if ((const void *)this == (const void *)&other) {
            return *this;
        }
        assert_can_convert_from(other);
        other.incref();
        decref();
        dev_ref_count = other.dev_ref_count;
        alloc = other.alloc;
        free_shape_storage();
        buf = other.buf;
        copy_shape_from(other.buf);
        return *this;
    }

    /** Standard assignment operator */
    Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T, Dims, InClassDimStorage> &other) {
        // The cast to void* here is just to satisfy clang-tidy
        if ((const void *)this == (const void *)&other) {
            return *this;
        }
        other.incref();
        decref();
        dev_ref_count = other.dev_ref_count;
        alloc = other.alloc;
        free_shape_storage();
        buf = other.buf;
        copy_shape_from(other.buf);
        return *this;
    }

    /** Move from another Buffer of possibly-different
     * dimensionality and type. Asserts that the types match (at
     * runtime if one of the types is void). */
    template<typename T2, int D2, int S2>
    Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T2, D2, S2> &&other) {
        assert_can_convert_from(other);
        decref();
        alloc = other.alloc;
        other.alloc = nullptr;
        dev_ref_count = other.dev_ref_count;
        other.dev_ref_count = nullptr;
        free_shape_storage();
        buf = other.buf;
        move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
        return *this;
    }

    /** Standard move-assignment operator */
    Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T, Dims, InClassDimStorage> &&other) noexcept {
        decref();
        alloc = other.alloc;
        other.alloc = nullptr;
        dev_ref_count = other.dev_ref_count;
        other.dev_ref_count = nullptr;
        free_shape_storage();
        buf = other.buf;
        move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
        return *this;
    }

    /** Check that the product of the extents fits in memory. */
    void check_overflow() {
        size_t size = type().bytes();
        for (int i = 0; i < dimensions(); i++) {
            size *= dim(i).extent();
        }
        // We allow 2^31 or 2^63 bytes, so drop the top bit.
        size = (size << 1) >> 1;
        for (int i = 0; i < dimensions(); i++) {
            size /= dim(i).extent();
        }
        assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
    }

    /** Allocate memory for this Buffer. Drops the reference to any
     * owned memory. */
    void allocate(void *(*allocate_fn)(size_t) = nullptr,
                  void (*deallocate_fn)(void *) = nullptr) {
        // Drop any existing allocation
        deallocate();

        // Conservatively align images to (usually) 128 bytes. This is enough
        // alignment for all the platforms we might use. Also ensure that the allocation
        // is such that the logical size is an integral multiple of 128 bytes (or a bit more).
        constexpr size_t alignment = HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT;

        const auto align_up = [=](size_t value) -> size_t {
            return (value + alignment - 1) & ~(alignment - 1);
        };

        size_t size = size_in_bytes();

#if HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
        // Only use aligned_alloc() if no custom allocators are specified.
        if (!allocate_fn && !deallocate_fn) {
            // As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes
            // on any supported platform, so we will just overallocate by 'alignment'
            // so that the user storage also starts at an aligned point. This is a bit
            // wasteful, but probably not a big deal.
            static_assert(sizeof(AllocationHeader) <= alignment);
            void *alloc_storage = ::aligned_alloc(alignment, align_up(size) + alignment);
            assert((uintptr_t)alloc_storage == align_up((uintptr_t)alloc_storage));
            alloc = new (alloc_storage) AllocationHeader(free);
            buf.host = (uint8_t *)((uintptr_t)alloc_storage + alignment);
            return;
        }
        // else fall thru
#endif
        if (!allocate_fn) {
            allocate_fn = Internal::DefaultAllocatorFns::default_allocate_fn;
            if (!allocate_fn) {
                allocate_fn = malloc;
            }
        }
        if (!deallocate_fn) {
            deallocate_fn = Internal::DefaultAllocatorFns::default_deallocate_fn;
            if (!deallocate_fn) {
                deallocate_fn = free;
            }
        }

        static_assert(sizeof(AllocationHeader) <= alignment);

        // malloc() and friends must return a pointer aligned to at least alignof(std::max_align_t);
        // make sure this is OK for AllocationHeader, since it always goes at the start
        static_assert(alignof(AllocationHeader) <= alignof(std::max_align_t));

        const size_t requested_size = align_up(size + alignment +
                                               std::max(0, (int)sizeof(AllocationHeader) -
                                                               (int)sizeof(std::max_align_t)));
        void *alloc_storage = allocate_fn(requested_size);
        alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
        uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
        buf.host = (uint8_t *)align_up((uintptr_t)unaligned_ptr);
    }

    /** Drop reference to any owned host or device memory, possibly
     * freeing it, if this buffer held the last reference to
     * it. Retains the shape of the buffer. Does nothing if this
     * buffer did not allocate its own memory. */
    void deallocate() {
        decref();
    }

    /** Drop reference to any owned device memory, possibly freeing it
     * if this buffer held the last reference to it. Asserts that
     * device_dirty is false. */
    void device_deallocate() {
        decref(true);
    }

    /** Allocate a new image of the given size with a runtime
     * type. Only used when you do know what size you want but you
     * don't know statically what type the elements are. Pass zeros
     * to make a buffer suitable for bounds query calls. */
    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    Buffer(halide_type_t t, int first, Args... rest) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        int extents[] = {first, (int)rest...};
        buf.type = t;
        constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
        if (!Internal::any_zero(extents)) {
            check_overflow();
            allocate();
        }
    }

    /** Allocate a new image of the given size. Pass zeros to make a
     * buffer suitable for bounds query calls. */
    // @{

    // The overload with one argument is 'explicit', so that
    // (say) int is not implicitly convertible to Buffer<int>
    explicit Buffer(int first) {
        static_assert(!T_is_void,
                      "To construct a Buffer<void>, pass a halide_type_t as the first argument to the constructor");
        int extents[] = {first};
        buf.type = static_halide_type();
        constexpr int buf_dimensions = 1;
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
        if (first != 0) {
            check_overflow();
            allocate();
        }
    }

    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    Buffer(int first, int second, Args... rest) {
        static_assert(!T_is_void,
                      "To construct a Buffer<void>, pass a halide_type_t as the first argument to the constructor");
        int extents[] = {first, second, (int)rest...};
        buf.type = static_halide_type();
        constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
        if (!Internal::any_zero(extents)) {
            check_overflow();
            allocate();
        }
    }
    // @}

    /** Allocate a new image of unknown type using a vector of ints as the size. */
    Buffer(halide_type_t t, const std::vector<int> &sizes) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        buf.type = t;
        // make_shape_storage() will do a runtime check that dimensionality matches.
        make_shape_storage((int)sizes.size());
        initialize_shape(sizes);
        if (!Internal::any_zero(sizes)) {
            check_overflow();
            allocate();
        }
    }

    /** Allocate a new image of known type using a vector of ints as the size. */
    explicit Buffer(const std::vector<int> &sizes)
        : Buffer(static_halide_type(), sizes) {
    }

private:
    // Create a copy of the sizes vector, ordered as specified by order.
    static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
        assert(order.size() == sizes.size());
        std::vector<int> ordered_sizes(sizes.size());
        for (size_t i = 0; i < sizes.size(); ++i) {
            ordered_sizes[i] = sizes.at(order[i]);
        }
        return ordered_sizes;
    }

public:
    /** Allocate a new image of unknown type using a vector of ints as the size and
     * a vector of indices indicating the storage order for each dimension. The
     * length of the sizes vector and the storage-order vector must match. For instance,
     * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
    Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
        : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
        transpose(storage_order);
    }

    Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
        : Buffer(static_halide_type(), sizes, storage_order) {
    }
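
    // For example, an interleaved 640x480 RGB image of uint8 (a sketch):
    //
    //     Buffer<uint8_t> rgb({640, 480, 3}, {2, 0, 1});
    //     // rgb.dim(0).stride() == 3     (x steps over r,g,b triples)
    //     // rgb.dim(1).stride() == 1920  (one row of 640 triples)
    //     // rgb.dim(2).stride() == 1     (channels are adjacent in memory)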

    /** Make a Buffer that refers to a statically sized array. Does not
     * take ownership of the data, and does not set the host_dirty flag. */
    template<typename Array, size_t N>
    explicit Buffer(Array (&vals)[N]) {
        const int buf_dimensions = dimensionality_of_array(vals);
        buf.type = scalar_type_of_array(vals);
        buf.host = (uint8_t *)vals;
        make_shape_storage(buf_dimensions);
        initialize_shape_from_array_shape(buf.dimensions - 1, vals);
    }

    /** Initialize a Buffer of runtime type from a pointer and some
     * sizes. Assumes dense row-major packing and a min coordinate of
     * zero. Does not take ownership of the data and does not set the
     * host_dirty flag. */
    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&...rest) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        int extents[] = {first, (int)rest...};
        buf.type = t;
        buf.host = (uint8_t *)const_cast<void *>(data);
        constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
    }

    /** Initialize a Buffer from a pointer and some sizes. Assumes
     * dense row-major packing and a min coordinate of zero. Does not
     * take ownership of the data and does not set the host_dirty flag. */
    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    explicit Buffer(T *data, int first, Args &&...rest) {
        int extents[] = {first, (int)rest...};
        buf.type = static_halide_type();
        buf.host = (uint8_t *)const_cast<std::remove_const_t<T> *>(data);
        constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
    }
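
    // For example, to wrap caller-owned memory without copying it (a sketch;
    // `pixels` is a hypothetical array owned by the caller):
    //
    //     uint8_t pixels[480 * 640];
    //     Buffer<uint8_t> wrap(pixels, 640, 480);  // wrap does not own `pixels`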

    /** Initialize a Buffer from a pointer and a vector of
     * sizes. Assumes dense row-major packing and a min coordinate of
     * zero. Does not take ownership of the data and does not set the
     * host_dirty flag. */
    explicit Buffer(T *data, const std::vector<int> &sizes) {
        buf.type = static_halide_type();
        buf.host = (uint8_t *)const_cast<std::remove_const_t<T> *>(data);
        make_shape_storage((int)sizes.size());
        initialize_shape(sizes);
    }

    /** Initialize a Buffer of runtime type from a pointer and a
     * vector of sizes. Assumes dense row-major packing and a min
     * coordinate of zero. Does not take ownership of the data and
     * does not set the host_dirty flag. */
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        buf.type = t;
        buf.host = (uint8_t *)const_cast<void *>(data);
        make_shape_storage((int)sizes.size());
        initialize_shape(sizes);
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * an array describing the shape. Does not take ownership of the
     * data, and does not set the host_dirty flag. */
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        buf.type = t;
        buf.host = (uint8_t *)const_cast<void *>(data);
        make_shape_storage(d);
        for (int i = 0; i < d; i++) {
            buf.dim[i] = shape[i];
        }
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * a vector describing the shape. Does not take ownership of the
     * data, and does not set the host_dirty flag. */
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
                    const std::vector<halide_dimension_t> &shape)
        : Buffer(t, data, (int)shape.size(), shape.data()) {
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * an array describing the shape. Does not take ownership of the
     * data and does not set the host_dirty flag. */
    explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
        buf.type = static_halide_type();
        buf.host = (uint8_t *)const_cast<std::remove_const_t<T> *>(data);
        make_shape_storage(d);
        for (int i = 0; i < d; i++) {
            buf.dim[i] = shape[i];
        }
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * a vector describing the shape. Does not take ownership of the
     * data, and does not set the host_dirty flag. */
    explicit Buffer(T *data, const std::vector<halide_dimension_t> &shape)
        : Buffer(data, (int)shape.size(), shape.data()) {
    }

    /** Destructor. Will release any underlying owned allocation if
     * this is the last reference to it. Will assert fail if there are
     * weak references to this Buffer outstanding. */
    ~Buffer() {
        decref();
        free_shape_storage();
    }

    /** Get a pointer to the raw halide_buffer_t this wraps. */
    // @{
    halide_buffer_t *raw_buffer() {
        return &buf;
    }

    const halide_buffer_t *raw_buffer() const {
        return &buf;
    }
    // @}

    /** Provide a cast operator to halide_buffer_t *, so that
     * instances can be passed directly to Halide filters. */
    operator halide_buffer_t *() {
        return &buf;
    }

    /** Return a typed reference to this Buffer. Useful for converting
     * a reference to a Buffer<void> to a reference to, for example, a
     * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
     * You can also optionally specify a new value for Dims; this is useful
     * mainly for removing the dimensionality constraint on a Buffer with
     * explicit dimensionality. Does a runtime assert if the source buffer type
     * is void or the new dimensionality is incompatible. */
    template<typename T2, int D2 = Dims>
    HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> &as() & {
        Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
        return *reinterpret_cast<Buffer<T2, D2, InClassDimStorage> *>(this);
    }

    /** Return a const typed reference to this Buffer. Useful for converting
     * a reference to a Buffer<void> to a reference to, for example, a
     * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
     * You can also optionally specify a new value for Dims; this is useful
     * mainly for removing the dimensionality constraint on a Buffer with
     * explicit dimensionality. Does a runtime assert if the source buffer type
     * is void or the new dimensionality is incompatible. */
    template<typename T2, int D2 = Dims>
    HALIDE_ALWAYS_INLINE const Buffer<T2, D2, InClassDimStorage> &as() const & {
        Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
        return *reinterpret_cast<const Buffer<T2, D2, InClassDimStorage> *>(this);
    }

    /** Return an rval reference to this Buffer. Useful for converting
     * a reference to a Buffer<void> to a reference to, for example, a
     * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
     * You can also optionally specify a new value for Dims; this is useful
     * mainly for removing the dimensionality constraint on a Buffer with
     * explicit dimensionality. Does a runtime assert if the source buffer type
     * is void or the new dimensionality is incompatible. */
    template<typename T2, int D2 = Dims>
    HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> as() && {
        Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
        return *reinterpret_cast<Buffer<T2, D2, InClassDimStorage> *>(this);
    }

    /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
     * to recapitulate the type argument. */
    // @{
    HALIDE_ALWAYS_INLINE
    Buffer<std::add_const_t<T>, Dims, InClassDimStorage> &as_const() & {
        // Note that we can skip the assert_can_convert_from(), since T -> const T
        // conversion is always legal.
        return *reinterpret_cast<Buffer<std::add_const_t<T>, Dims, InClassDimStorage> *>(this);
    }

    HALIDE_ALWAYS_INLINE
    const Buffer<std::add_const_t<T>, Dims, InClassDimStorage> &as_const() const & {
        return *reinterpret_cast<const Buffer<std::add_const_t<T>, Dims, InClassDimStorage> *>(this);
    }

    HALIDE_ALWAYS_INLINE
    Buffer<std::add_const_t<T>, Dims, InClassDimStorage> as_const() && {
        return *reinterpret_cast<Buffer<std::add_const_t<T>, Dims, InClassDimStorage> *>(this);
    }
    // @}

    /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<const T>& when
     * passing arguments */
    template<typename T2 = T, typename = std::enable_if_t<!std::is_const_v<T2>>>
    operator Buffer<const T2, Dims, InClassDimStorage> &() & {
        return as_const();
    }

    /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<void>& when
     * passing arguments */
    template<typename TVoid,
             typename T2 = T,
             typename = std::enable_if_t<std::is_same_v<TVoid, void> &&
                                         !std::is_void_v<T2> &&
                                         !std::is_const_v<T2>>>
    operator Buffer<TVoid, Dims, InClassDimStorage> &() & {
        return as<TVoid, Dims>();
    }

    /** Add some syntactic sugar to allow autoconversion from Buffer<const T> to Buffer<const void>& when
     * passing arguments */
    template<typename TVoid,
             typename T2 = T,
             typename = std::enable_if_t<std::is_same_v<TVoid, void> &&
                                         !std::is_void_v<T2> &&
                                         std::is_const_v<T2>>>
    operator Buffer<const TVoid, Dims, InClassDimStorage> &() & {
        return as<const TVoid, Dims>();
    }

    /** Conventional names for the first three dimensions. */
    // @{
    int width() const {
        return (dimensions() > 0) ? dim(0).extent() : 1;
    }
    int height() const {
        return (dimensions() > 1) ? dim(1).extent() : 1;
    }
    int channels() const {
        return (dimensions() > 2) ? dim(2).extent() : 1;
    }
    // @}

    /** Conventional names for the min and max value of each dimension */
    // @{
    int left() const {
        return dim(0).min();
    }

    int right() const {
        return dim(0).max();
    }

    int top() const {
        return dim(1).min();
    }

    int bottom() const {
        return dim(1).max();
    }
    // @}

    /** Make a new image which is a deep copy of this image. Use crop
     * or slice followed by copy to make a copy of only a portion of
     * the image. The new image has the same nesting order of dimensions
     * (e.g. channels innermost), but resets the strides to the default
     * (each stride is the product of the extents of the inner dimensions).
     * Note that this means any strides of zero get broadcast into a non-zero stride.
     *
     * Note that the returned Buffer is always of a non-const type T (i.e.:
     *
     * Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
     *
     * which is always safe, since we are making a deep copy. (The caller
     * can easily cast it back to Buffer<const T> if desired, which is
     * always safe and free.)
     */
    Buffer<not_const_T, Dims, InClassDimStorage> copy(void *(*allocate_fn)(size_t) = nullptr,
                                                      void (*deallocate_fn)(void *) = nullptr) const {
        Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_with_shape_of(*this, allocate_fn, deallocate_fn);
        dst.copy_from(*this);
        return dst;
    }
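
    // For example (a sketch): deep-copy a const view into a mutable buffer:
    //
    //     Buffer<const float> src = ...;
    //     Buffer<float> dup = src.copy();  // dup owns fresh, densely-packed storage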

    /** Like copy(), but the copy is created in interleaved memory layout
     * (vs. keeping the same memory layout as the original). Requires that 'this'
     * has exactly 3 dimensions.
     */
    Buffer<not_const_T, Dims, InClassDimStorage> copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr,
                                                                     void (*deallocate_fn)(void *) = nullptr) const {
        static_assert(Dims == AnyDims || Dims == 3);
        assert(dimensions() == 3);
        Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_interleaved(nullptr, width(), height(), channels());
        dst.set_min(min(0), min(1), min(2));
        dst.allocate(allocate_fn, deallocate_fn);
        dst.copy_from(*this);
        return dst;
    }

    /** Like copy(), but the copy is created in planar memory layout
     * (vs. keeping the same memory layout as the original).
     */
    Buffer<not_const_T, Dims, InClassDimStorage> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
                                                                void (*deallocate_fn)(void *) = nullptr) const {
        std::vector<int> mins, extents;
        const int dims = dimensions();
        mins.reserve(dims);
        extents.reserve(dims);
        for (int d = 0; d < dims; ++d) {
            mins.push_back(dim(d).min());
            extents.push_back(dim(d).extent());
        }
        Buffer<not_const_T, Dims, InClassDimStorage> dst(nullptr, extents);
        dst.set_min(mins);
        dst.allocate(allocate_fn, deallocate_fn);
        dst.copy_from(*this);
        return dst;
    }

    /** Make a copy of the Buffer which shares the underlying host and/or device
     * allocations as the existing Buffer. This is purely syntactic sugar for
     * cases where you have a const reference to a Buffer but need a temporary
     * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
     * inline way to create a temporary. \code
     * void call_my_func(const Buffer<const uint8_t>& input) {
     *     my_func(input.alias(), output);
     * }\endcode
     */
    Buffer<T, Dims, InClassDimStorage> alias() const {
        return *this;
    }

    /** Fill a Buffer with the values at the same coordinates in
     * another Buffer. Restricts itself to coordinates contained
     * within the intersection of the two buffers. If the two Buffers
     * are not in the same coordinate system, you will need to
     * translate the argument Buffer first. E.g. if you're blitting a
     * sprite onto a framebuffer, you'll want to translate the sprite
     * to the correct location first like so: \code
     * framebuffer.copy_from(sprite.translated({x, y})); \endcode
     */
    template<typename T2, int D2, int S2>
    void copy_from(Buffer<T2, D2, S2> src) {
        static_assert(!std::is_const_v<T>, "Cannot call copy_from() on a Buffer<const T>");
        assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
        assert(!src.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");

        Buffer<T, Dims, InClassDimStorage> dst(*this);

        static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2);
        assert(src.dimensions() == dst.dimensions());

        // Trim the copy to the region in common
        const int d = dimensions();
        for (int i = 0; i < d; i++) {
            int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
            int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
            if (max_coord < min_coord) {
                // The buffers do not overlap.
                return;
            }
            dst.crop(i, min_coord, max_coord - min_coord + 1);
            src.crop(i, min_coord, max_coord - min_coord + 1);
        }

        // If T is void, we need to do runtime dispatch to an
        // appropriately-typed lambda. We're copying, so we only care
        // about the element size. (If not, this should optimize away
        // into a static dispatch to the right-sized copy.)
        if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
            using MemType = uint8_t;
            auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
            auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
            using MemType = uint16_t;
            auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
            auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
            using MemType = uint32_t;
            auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
            auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
            using MemType = uint64_t;
            auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
            auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else {
            assert(false && "type().bytes() must be 1, 2, 4, or 8");
        }
        set_host_dirty();
    }

    /** Make an image that refers to a sub-range of this image along
     * the given dimension. Asserts that the crop region is within
     * the existing bounds: you cannot "crop outwards", even if you know there
     * is valid Buffer storage (e.g. because you already cropped inwards). */
    Buffer<T, Dims, InClassDimStorage> cropped(int d, int min, int extent) const {
        // Make a fresh copy of the underlying buffer (but not a fresh
        // copy of the allocation, if there is one).
        Buffer<T, Dims, InClassDimStorage> im = *this;

        // This guarantees the preexisting device ref is dropped if the
        // device_crop call fails and maintains the buffer in a consistent
        // state.
        im.device_deallocate();

        im.crop_host(d, min, extent);
        if (buf.device_interface != nullptr) {
            complete_device_crop(im);
        }
        return im;
    }

    /** Crop an image in-place along the given dimension. This does
     * not move any data around in memory - it just changes the min
     * and extent of the given dimension. */
    void crop(int d, int min, int extent) {
        // An optimization for non-device buffers. For the device case,
        // a temp buffer is required, so reuse the not-in-place version.
        // TODO(zalman|abadams): Are nop crops common enough to special
        // case the device part of the if to do nothing?
        if (buf.device_interface != nullptr) {
            *this = cropped(d, min, extent);
        } else {
            crop_host(d, min, extent);
        }
    }
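
    // For example (a sketch): take a 64x64 window whose top-left corner is
    // at (32, 32), without copying any pixels:
    //
    //     Buffer<uint8_t> tile = im.cropped(0, 32, 64).cropped(1, 32, 64);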

    /** Make an image that refers to a sub-rectangle of this image along
     * the first N dimensions. Asserts that the crop region is within
     * the existing bounds. The cropped image may drop any device handle
     * if the device_interface cannot accomplish the crop in-place. */
    Buffer<T, Dims, InClassDimStorage> cropped(const std::vector<std::pair<int, int>> &rect) const {
        // Make a fresh copy of the underlying buffer (but not a fresh
        // copy of the allocation, if there is one).
        Buffer<T, Dims, InClassDimStorage> im = *this;

        // This guarantees the preexisting device ref is dropped if the
        // device_crop call fails and maintains the buffer in a consistent
        // state.
        im.device_deallocate();

        im.crop_host(rect);
        if (buf.device_interface != nullptr) {
            complete_device_crop(im);
        }
        return im;
    }

    /** Crop an image in-place along the first N dimensions. This does
     * not move any data around in memory, nor does it free memory. It
     * just rewrites the min/extent of each dimension to refer to a
     * subregion of the same allocation. */
    void crop(const std::vector<std::pair<int, int>> &rect) {
        // An optimization for non-device buffers. For the device case,
        // a temp buffer is required, so reuse the not-in-place version.
        // TODO(zalman|abadams): Are nop crops common enough to special
        // case the device part of the if to do nothing?
        if (buf.device_interface != nullptr) {
            *this = cropped(rect);
        } else {
            crop_host(rect);
        }
    }

    /** Make an image which refers to the same data using
     * translated coordinates in the given dimension. Positive values
     * move the image data to the right or down relative to the
     * coordinate system. Drops any device handle. */
    Buffer<T, Dims, InClassDimStorage> translated(int d, int dx) const {
        Buffer<T, Dims, InClassDimStorage> im = *this;
        im.translate(d, dx);
        return im;
    }

    /** Translate an image in-place along one dimension by changing
     * how it is indexed. Does not move any data around in memory. */
    void translate(int d, int delta) {
        assert(d >= 0 && d < this->dimensions());
        device_deallocate();
        buf.dim[d].min += delta;
    }

    /** Make an image which refers to the same data translated along
     * the first N dimensions. */
    Buffer<T, Dims, InClassDimStorage> translated(const std::vector<int> &delta) const {
        Buffer<T, Dims, InClassDimStorage> im = *this;
        im.translate(delta);
        return im;
    }

    /** Translate an image along the first N dimensions by changing
     * how it is indexed. Does not move any data around in memory. */
    void translate(const std::vector<int> &delta) {
        device_deallocate();
        assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
        int limit = (int)delta.size();
        assert(limit <= dimensions());
        for (int i = 0; i < limit; i++) {
            translate(i, delta[i]);
        }
    }

    /** Set the min coordinate of an image in the first N dimensions. */
    // @{
    void set_min(const std::vector<int> &mins) {
        assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
        device_deallocate();
        for (size_t i = 0; i < mins.size(); i++) {
            buf.dim[i].min = mins[i];
        }
    }

    template<typename... Args>
    void set_min(Args... args) {
        set_min(std::vector<int>{args...});
    }
    // @}

    /** Test if a given coordinate is within the bounds of an image. */
    // @{
    bool contains(const std::vector<int> &coords) const {
        assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
        for (size_t i = 0; i < coords.size(); i++) {
            if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
                return false;
            }
        }
        return true;
    }

    template<typename... Args>
    bool contains(Args... args) const {
        return contains(std::vector<int>{args...});
    }
    // @}

    /** Make a buffer which refers to the same data in the same layout
     * using a swapped indexing order for the dimensions given. So
     * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
     * strongly that A.address_of(i, j) == B.address_of(j, i). */
    Buffer<T, Dims, InClassDimStorage> transposed(int d1, int d2) const {
        Buffer<T, Dims, InClassDimStorage> im = *this;
        im.transpose(d1, d2);
        return im;
    }

    /** Transpose a buffer in-place by changing how it is indexed. For
     * example, transpose(0, 1) on a two-dimensional buffer means that
     * the value referred to by coordinates (i, j) is now reached at
     * the coordinates (j, i), and vice versa. This is done by
     * reordering the per-dimension metadata rather than by moving
     * data around in memory, so other views of the same memory will
     * not see the data as having been transposed. */
    void transpose(int d1, int d2) {
        assert(d1 >= 0 && d1 < this->dimensions());
        assert(d2 >= 0 && d2 < this->dimensions());
        std::swap(buf.dim[d1], buf.dim[d2]);
    }
1628
1629 /** A generalized transpose: instead of swapping two dimensions,
1630 * pass a vector that lists each dimension index exactly once, in
1631 * the desired order. This does not move any data around in memory
1632 * - it just permutes how it is indexed. */
1633 void transpose(const std::vector<int> &order) {
1634 assert((int)order.size() == dimensions());
1635 if (dimensions() < 2) {
1636 // My, that was easy
1637 return;
1638 }
1639
1640 std::vector<int> order_sorted = order;
1641 for (size_t i = 1; i < order_sorted.size(); i++) {
1642 for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
1643 std::swap(order_sorted[j], order_sorted[j - 1]);
1644 transpose(j, j - 1);
1645 }
1646 }
1647 }
1648
1649 /** Make a buffer which refers to the same data in the same
1650 * layout using a different ordering of the dimensions. */
1651 Buffer<T, Dims, InClassDimStorage> transposed(const std::vector<int> &order) const {
1652 Buffer<T, Dims, InClassDimStorage> im = *this;
1653 im.transpose(order);
1654 return im;
1655 }
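    /* For example, a minimal sketch mirroring what make_interleaved does
     * internally (names and sizes illustrative):
     *
     \code
     Buffer<float, 3> im(3, 100, 100);  // created as (c, x, y)
     im.transpose({2, 0, 1});           // dimension 0 moves to position 2, etc.
     assert(im.dim(2).extent() == 3);   // channels are now the last dimension
     \endcode
     */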
1656
1657 /** Make a lower-dimensional buffer that refers to one slice of
1658 * this buffer. */
1659 Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1660 sliced(int d, int pos) const {
1661 static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1662 assert(dimensions() > 0);
1663
1664 Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)> im = *this;
1665
1666 // This guarantees the preexisting device ref is dropped if the
1667 // device_slice call fails and maintains the buffer in a consistent
1668 // state.
1669 im.device_deallocate();
1670
1671 im.slice_host(d, pos);
1672 if (buf.device_interface != nullptr) {
1673 complete_device_slice(im, d, pos);
1674 }
1675 return im;
1676 }
1677
1678 /** Make a lower-dimensional buffer that refers to one slice of this
1679 * buffer at the dimension's minimum. */
1680 Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1681 sliced(int d) const {
1682 static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1683 assert(dimensions() > 0);
1684
1685 return sliced(d, dim(d).min());
1686 }
1687
1688 /** Rewrite the buffer to refer to a single lower-dimensional
1689 * slice of itself along the given dimension at the given
1690 * coordinate. Does not move any data around or free the original
1691 * memory, so other views of the same data are unaffected. Can
1692 * only be called on a Buffer with dynamic dimensionality. */
1693 void slice(int d, int pos) {
1694 static_assert(Dims == AnyDims, "Cannot call slice() on a Buffer with static dimensionality.");
1695 assert(dimensions() > 0);
1696
1697 // An optimization for non-device buffers. For the device case,
1698 // a temp buffer is required, so reuse the not-in-place version.
1699 // TODO(zalman|abadams): Are nop slices common enough to special
1700 // case the device part of the if to do nothing?
1701 if (buf.device_interface != nullptr) {
1702 *this = sliced(d, pos);
1703 } else {
1704 slice_host(d, pos);
1705 }
1706 }
1707
1708 /** Slice a buffer in-place at the dimension's minimum. */
1709 void slice(int d) {
1710 slice(d, dim(d).min());
1711 }
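    /* For example, a minimal usage sketch (names and sizes illustrative):
     *
     \code
     Buffer<float, 3> im(100, 100, 3);
     Buffer<float, 2> green = im.sliced(2, 1);  // 2-d view of channel 1
     assert(&green(0, 0) == &im(0, 0, 1));      // same memory, fewer dims
     \endcode
     */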
1712
1713 /** Make a new buffer that views this buffer as a single slice in a
1714 * higher-dimensional space. The new dimension has extent one and
1715 * the given min. This operation is the opposite of slice. As an
1716 * example, the following condition is true:
1717 *
1718 \code
1719 im2 = im.embedded(1, 17);
1720 &im(x, y, c) == &im2(x, 17, y, c);
1721 \endcode
1722 */
1723 Buffer<T, (Dims == AnyDims ? AnyDims : Dims + 1)>
1724 embedded(int d, int pos = 0) const {
1725 Buffer<T, (Dims == AnyDims ? AnyDims : Dims + 1)> im(*this);
1726 im.embed(d, pos);
1727 return im;
1728 }
1729
1730 /** Embed a buffer in-place, increasing the
1731 * dimensionality. */
1732 void embed(int d, int pos = 0) {
1733 static_assert(Dims == AnyDims, "Cannot call embed() on a Buffer with static dimensionality.");
1734 assert(d >= 0 && d <= dimensions());
1735 add_dimension();
1736 translate(dimensions() - 1, pos);
1737 for (int i = dimensions() - 1; i > d; i--) {
1738 transpose(i, i - 1);
1739 }
1740 }
1741
1742 /** Add a new dimension with a min of zero and an extent of
1743 * one. The stride is the extent of the outermost dimension times
1744 * its stride. The new dimension is the last dimension. This is a
1745 * special case of embed. */
1746 void add_dimension() {
1747 static_assert(Dims == AnyDims, "Cannot call add_dimension() on a Buffer with static dimensionality.");
1748 const int dims = buf.dimensions;
1749 buf.dimensions++;
1750 if (buf.dim != shape) {
1751 // We're already on the heap. Reallocate.
1752 halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
1753 for (int i = 0; i < dims; i++) {
1754 new_shape[i] = buf.dim[i];
1755 }
1756 delete[] buf.dim;
1757 buf.dim = new_shape;
1758 } else if (dims == InClassDimStorage) {
1759 // Transition from the in-class storage to the heap
1760 make_shape_storage(buf.dimensions);
1761 for (int i = 0; i < dims; i++) {
1762 buf.dim[i] = shape[i];
1763 }
1764 } else {
1765 // We still fit in the class
1766 }
1767 buf.dim[dims] = {0, 1, 0};
1768 if (dims == 0) {
1769 buf.dim[dims].stride = 1;
1770 } else {
1771 buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
1772 }
1773 }
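    /* For example, a minimal usage sketch (names and sizes illustrative;
     * note that Buffer<int> defaults to dynamic dimensionality, which
     * add_dimension() requires):
     *
     \code
     Buffer<int> im(4, 6);
     im.add_dimension();
     assert(im.dimensions() == 3);
     assert(im.dim(2).extent() == 1);
     assert(im.dim(2).stride() == 4 * 6);  // extent * stride of the old last dim
     \endcode
     */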
1774
1775 /** Add a new dimension with a min of zero, an extent of one, and
1776 * the specified stride. The new dimension is the last
1777 * dimension. This is a special case of embed. */
1778 void add_dimension_with_stride(int s) {
1779 add_dimension();
1780 buf.dim[buf.dimensions - 1].stride = s;
1781 }
1782
1783 /** Methods for managing any GPU allocation. */
1784 // @{
1785 // Set the host dirty flag. Called by every operator()
1786 // access. Must be inlined so it can be hoisted out of loops.
1787 HALIDE_ALWAYS_INLINE
1788 void set_host_dirty(bool v = true) {
1789 assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
1790 buf.set_host_dirty(v);
1791 }
1792
1793 // Check if the device allocation is dirty. Called by
1794 // set_host_dirty, which is called by every accessor. Must be
1795 // inlined so it can be hoisted out of loops.
1796 HALIDE_ALWAYS_INLINE
1797 bool device_dirty() const {
1798 return buf.device_dirty();
1799 }
1800
1801 bool host_dirty() const {
1802 return buf.host_dirty();
1803 }
1804
1805 void set_device_dirty(bool v = true) {
1806 assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
1807 buf.set_device_dirty(v);
1808 }
1809
1810 int copy_to_host(void *ctx = nullptr) {
1811 if (device_dirty()) {
1812 return buf.device_interface->copy_to_host(ctx, &buf);
1813 }
1814 return halide_error_code_success;
1815 }
1816
1817 int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1818 if (host_dirty()) {
1819 return device_interface->copy_to_device(ctx, &buf, device_interface);
1820 }
1821 return halide_error_code_success;
1822 }
1823
1824 int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1825 return device_interface->device_malloc(ctx, &buf, device_interface);
1826 }
1827
1828 int device_free(void *ctx = nullptr) {
1829 if (dev_ref_count) {
1831 "Can't call device_free on an unmanaged or wrapped native device handle. "
1832 "Free the source allocation or call device_detach_native instead.");
1833 // Multiple people may be holding onto this dev field
1834 assert(dev_ref_count->count == 1 &&
1835 "Multiple Halide::Runtime::Buffer objects share this device "
1836 "allocation. Freeing it would create dangling references. "
1837 "Don't call device_free on Halide buffers that you have copied or "
1838 "passed by value.");
1839 }
1840 int ret = halide_error_code_success;
1841 if (buf.device_interface) {
1842 ret = buf.device_interface->device_free(ctx, &buf);
1843 }
1844 if (dev_ref_count) {
1845 delete dev_ref_count;
1846 dev_ref_count = nullptr;
1847 }
1848 return ret;
1849 }
1850
1851 int device_wrap_native(const struct halide_device_interface_t *device_interface,
1852 uint64_t handle, void *ctx = nullptr) {
1853 assert(device_interface);
1854 dev_ref_count = new DeviceRefCount;
1855 dev_ref_count->ownership = BufferDeviceOwnership::WrappedNative;
1856 return device_interface->wrap_native(ctx, &buf, handle, device_interface);
1857 }
1858
1859 int device_detach_native(void *ctx = nullptr) {
1860 assert(dev_ref_count &&
1862 "Only call device_detach_native on buffers wrapping a native "
1863 "device handle via device_wrap_native. This buffer was allocated "
1864 "using device_malloc, or is unmanaged. "
1865 "Call device_free or free the original allocation instead.");
1866 // Multiple people may be holding onto this dev field
1867 assert(dev_ref_count->count == 1 &&
1868 "Multiple Halide::Runtime::Buffer objects share this device "
1869 "allocation. Freeing it could create dangling references. "
1870 "Don't call device_detach_native on Halide buffers that you "
1871 "have copied or passed by value.");
1872 int ret = halide_error_code_success;
1873 if (buf.device_interface) {
1874 ret = buf.device_interface->detach_native(ctx, &buf);
1875 }
1876 delete dev_ref_count;
1877 dev_ref_count = nullptr;
1878 return ret;
1879 }
1880
1881 int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1882 return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
1883 }
1884
1885 int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1886 if (dev_ref_count) {
1888 "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
1889 "Free the source allocation or call device_detach_native instead.");
1890 // Multiple people may be holding onto this dev field
1891 assert(dev_ref_count->count == 1 &&
1892 "Multiple Halide::Runtime::Buffer objects share this device "
1893 "allocation. Freeing it would create dangling references. "
1894 "Don't call device_and_host_free on Halide buffers that you have copied or "
1895 "passed by value.");
1896 }
1897 int ret = halide_error_code_success;
1898 if (buf.device_interface) {
1899 ret = buf.device_interface->device_and_host_free(ctx, &buf);
1900 }
1901 if (dev_ref_count) {
1902 delete dev_ref_count;
1903 dev_ref_count = nullptr;
1904 }
1905 return ret;
1906 }
1907
1908 int device_sync(void *ctx = nullptr) {
1909 return buf.device_sync(ctx);
1910 }
1911
1912 bool has_device_allocation() const {
1913 return buf.device != 0;
1914 }
1915
1916 /** Return the method by which the device field is managed. */
1917 BufferDeviceOwnership device_ownership() const {
1918 if (dev_ref_count == nullptr) {
1919 return BufferDeviceOwnership::Allocated;
1920 }
1921 return dev_ref_count->ownership;
1922 }
1923 // @}
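    /* A typical host/device round trip, as a sketch only: it assumes the
     * Halide CUDA runtime is linked in and halide_cuda_device_interface()
     * (declared in HalideRuntimeCuda.h) is available; any other backend's
     * device interface works the same way:
     *
     \code
     Buffer<float, 2> im(100, 100);
     im.fill(0.0f);                                      // host writes set host_dirty
     im.copy_to_device(halide_cuda_device_interface());  // upload; clears host_dirty
     // ... run a pipeline that writes im on the GPU (sets device_dirty) ...
     im.copy_to_host();                                  // download before host reads
     \endcode
     */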
1924
1925 /** If you use the (x, y, c) indexing convention, then Halide
1926 * Buffers are stored planar by default. This function constructs
1927 * an interleaved RGB or RGBA image that can still be indexed
1928 * using (x, y, c). Passing it to a generator requires that the
1929 * generator has been compiled with support for interleaved (also
1930 * known as packed or chunky) memory layouts. */
1931 static Buffer<void, Dims, InClassDimStorage> make_interleaved(halide_type_t t, int width, int height, int channels) {
1932 static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1933 Buffer<void, Dims, InClassDimStorage> im(t, channels, width, height);
1934 // Note that this is equivalent to calling transpose({2, 0, 1}),
1935 // but slightly more efficient.
1936 im.transpose(0, 1);
1937 im.transpose(1, 2);
1938 return im;
1939 }
1940
1941 /** If you use the (x, y, c) indexing convention, then Halide
1942 * Buffers are stored planar by default. This function constructs
1943 * an interleaved RGB or RGBA image that can still be indexed
1944 * using (x, y, c). Passing it to a generator requires that the
1945 * generator has been compiled with support for interleaved (also
1946 * known as packed or chunky) memory layouts. */
1947 static Buffer<T, Dims, InClassDimStorage> make_interleaved(int width, int height, int channels) {
1948 return make_interleaved(static_halide_type(), width, height, channels);
1949 }
1950
1951 /** Wrap an existing interleaved image. */
1952 static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage>
1953 make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
1954 static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1955 Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> im(t, data, channels, width, height);
1956 im.transpose(0, 1);
1957 im.transpose(1, 2);
1958 return im;
1959 }
1960
1961 /** Wrap an existing interleaved image. */
1962 static Buffer<T, Dims, InClassDimStorage> make_interleaved(T *data, int width, int height, int channels) {
1963 return make_interleaved(static_halide_type(), data, width, height, channels);
1964 }
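    /* For example, the resulting strides of an interleaved image
     * (sizes illustrative):
     *
     \code
     auto im = Buffer<uint8_t, 3>::make_interleaved(640, 480, 3);
     assert(im.dim(2).stride() == 1);   // channels are adjacent in memory
     assert(im.dim(0).stride() == 3);   // one x step skips all channels
     \endcode
     */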
1965
1966 /** Make a zero-dimensional Buffer */
1967 static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> make_scalar(halide_type_t t) {
1968 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1969 Buffer<add_const_if_T_is_const<void>, AnyDims, InClassDimStorage> buf(t, 1);
1970 buf.slice(0, 0);
1971 return buf;
1972 }
1973
1974 /** Make a zero-dimensional Buffer */
1975 static Buffer<T, Dims, InClassDimStorage> make_scalar() {
1976 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1977 Buffer<T, AnyDims, InClassDimStorage> buf(1);
1978 buf.slice(0, 0);
1979 return buf;
1980 }
1981
1982 /** Make a zero-dimensional Buffer that points to non-owned, existing data */
1983 static Buffer<T, Dims, InClassDimStorage> make_scalar(T *data) {
1984 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1985 Buffer<T, AnyDims, InClassDimStorage> buf(data, 1);
1986 buf.slice(0, 0);
1987 return buf;
1988 }
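    /* For example, a minimal usage sketch (Buffer<int> has dynamic
     * dimensionality, so make_scalar() is legal on it):
     *
     \code
     auto s = Buffer<int>::make_scalar();
     s() = 42;                          // zero-dimensional access
     assert(s.dimensions() == 0 && s() == 42);
     \endcode
     */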
1989
1990 /** Make a buffer with the same shape and memory nesting order as
1991 * another buffer. It may have a different type. */
1992 template<typename T2, int D2, int S2>
1993 // NOLINTNEXTLINE(performance-unnecessary-value-param)
1994 static Buffer<T, Dims, InClassDimStorage> make_with_shape_of(Buffer<T2, D2, S2> src,
1995 void *(*allocate_fn)(size_t) = nullptr,
1996 void (*deallocate_fn)(void *) = nullptr) {
1997 // Note that src is taken by value because its dims are mutated
1998 // in-place by the helper. Do not change to taking it by reference.
1999 static_assert(Dims == D2 || Dims == AnyDims);
2000 const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
2001 return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
2002 allocate_fn, deallocate_fn);
2003 }
2004
2005private:
2006 static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
2007 int dimensions,
2008 halide_dimension_t *shape,
2009 void *(*allocate_fn)(size_t),
2010 void (*deallocate_fn)(void *)) {
2011 // Reorder the dimensions of src to have strides in increasing order
2012 std::vector<int> swaps;
2013 for (int i = dimensions - 1; i > 0; i--) {
2014 for (int j = i; j > 0; j--) {
2015 if (shape[j - 1].stride > shape[j].stride) {
2016 std::swap(shape[j - 1], shape[j]);
2017 swaps.push_back(j);
2018 }
2019 }
2020 }
2021
2022 // Rewrite the strides to be dense (this messes up src, which
2023 // is why we took it by value).
2024 for (int i = 0; i < dimensions; i++) {
2025 if (i == 0) {
2026 shape[i].stride = 1;
2027 } else {
2028 shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
2029 }
2030 }
2031
2032 // Undo the dimension reordering
2033 while (!swaps.empty()) {
2034 int j = swaps.back();
2035 std::swap(shape[j - 1], shape[j]);
2036 swaps.pop_back();
2037 }
2038
2039 // Use an explicit runtime type, and make dst a Buffer<void>, to allow
2040 // using this method with Buffer<void> for either src or dst.
2041 Buffer<> dst(dst_type, nullptr, dimensions, shape);
2042 dst.allocate(allocate_fn, deallocate_fn);
2043
2044 return dst;
2045 }
2046
2047 template<typename... Args>
2048 HALIDE_ALWAYS_INLINE
2049 ptrdiff_t
2050 offset_of(int d, int first, Args... rest) const {
2051#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2052 assert(first >= this->buf.dim[d].min);
2053 assert(first < this->buf.dim[d].min + this->buf.dim[d].extent);
2054#endif
2055 return offset_of(d + 1, rest...) + (ptrdiff_t)this->buf.dim[d].stride * (first - this->buf.dim[d].min);
2056 }
2057
2058 HALIDE_ALWAYS_INLINE
2059 ptrdiff_t offset_of(int d) const {
2060 return 0;
2061 }
2062
2063 template<typename... Args>
2064 HALIDE_ALWAYS_INLINE storage_T *address_of(Args... args) const {
2065 if (T_is_void) {
2066 return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
2067 } else {
2068 return (storage_T *)(this->buf.host) + offset_of(0, args...);
2069 }
2070 }
2071
2072 HALIDE_ALWAYS_INLINE
2073 ptrdiff_t offset_of(const int *pos) const {
2074 ptrdiff_t offset = 0;
2075 for (int i = this->dimensions() - 1; i >= 0; i--) {
2076#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2077 assert(pos[i] >= this->buf.dim[i].min);
2078 assert(pos[i] < this->buf.dim[i].min + this->buf.dim[i].extent);
2079#endif
2080 offset += (ptrdiff_t)this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
2081 }
2082 return offset;
2083 }
2084
2085 HALIDE_ALWAYS_INLINE
2086 storage_T *address_of(const int *pos) const {
2087 if (T_is_void) {
2088 return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
2089 } else {
2090 return (storage_T *)this->buf.host + offset_of(pos);
2091 }
2092 }
2093
2094public:
2095 /** Get a pointer to the address of the min coordinate. */
2096 T *data() const {
2097 return (T *)(this->buf.host);
2098 }
2099
2100 /** Access elements. Use im(...) to get a reference to an element,
2101 * and use &im(...) to get the address of an element. If you pass
2102 * fewer arguments than the buffer has dimensions, the rest are
2103 * treated as their min coordinate. The non-const versions set the
2104 * host_dirty flag to true.
2105 */
2106 //@{
2107 template<typename... Args,
2108 typename = std::enable_if_t<AllInts<Args...>::value>>
2109 HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
2110 static_assert(!T_is_void,
2111 "Cannot use operator() on Buffer<void> types");
2112 constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2113 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2114 assert(!device_dirty());
2115 return *((const not_void_T *)(address_of(first, rest...)));
2116 }
2117
2118 HALIDE_ALWAYS_INLINE
2119 const not_void_T &operator()() const {
2120 static_assert(!T_is_void,
2121 "Cannot use operator() on Buffer<void> types");
2122 constexpr int expected_dims = 0;
2123 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2124 assert(!device_dirty());
2125 return *((const not_void_T *)(data()));
2126 }
2127
2128 HALIDE_ALWAYS_INLINE
2129 const not_void_T &
2130 operator()(const int *pos) const {
2131 static_assert(!T_is_void,
2132 "Cannot use operator() on Buffer<void> types");
2133 assert(!device_dirty());
2134 return *((const not_void_T *)(address_of(pos)));
2135 }
2136
2137 template<typename... Args,
2138 typename = std::enable_if_t<AllInts<Args...>::value>>
2139 HALIDE_ALWAYS_INLINE not_void_T &operator()(int first, Args... rest) {
2140 static_assert(!T_is_void,
2141 "Cannot use operator() on Buffer<void> types");
2142 constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2143 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2144 set_host_dirty();
2145 return *((not_void_T *)(address_of(first, rest...)));
2146 }
2147
2148 HALIDE_ALWAYS_INLINE
2149 not_void_T &
2150 operator()() {
2151 static_assert(!T_is_void,
2152 "Cannot use operator() on Buffer<void> types");
2153 constexpr int expected_dims = 0;
2154 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2155 set_host_dirty();
2156 return *((not_void_T *)(data()));
2157 }
2158
2159 HALIDE_ALWAYS_INLINE
2160 not_void_T &
2161 operator()(const int *pos) {
2162 static_assert(!T_is_void,
2163 "Cannot use operator() on Buffer<void> types");
2164 set_host_dirty();
2165 return *((not_void_T *)(address_of(pos)));
2166 }
2167 // @}
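    /* For example, a minimal usage sketch (names and sizes illustrative;
     * the partial-coordinate form requires dynamic dimensionality):
     *
     \code
     Buffer<float> im(100, 100, 3);
     im(4, 10, 2) = 1.0f;               // non-const access sets host_dirty
     float *p = &im(4, 10, 2);          // address of an element
     assert(im(4, 10) == im(4, 10, 0)); // trailing coordinates default to min
     \endcode
     */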
2168
2169 /** Tests that all values in this buffer are equal to val. */
2170 bool all_equal(not_void_T val) const {
2171 bool all_equal = true;
2172 for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
2173 return all_equal;
2174 }
2175
2176 Buffer<T, Dims, InClassDimStorage> &fill(not_void_T val) {
2177 set_host_dirty();
2178 for_each_value([=](T &v) { v = val; });
2179 return *this;
2180 }
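    /* For example (names and sizes illustrative):
     *
     \code
     Buffer<float, 2> im(32, 32);
     im.fill(0.5f);
     assert(im.all_equal(0.5f));
     \endcode
     */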
2181
2182private:
2183 /** Helper functions for for_each_value. */
2184 // @{
2185 template<int N>
2186 struct for_each_value_task_dim {
2187 std::ptrdiff_t extent;
2188 std::ptrdiff_t stride[N];
2189 };
2190
2191 // Given an array of strides, and a bunch of pointers to pointers
2192 // (all of different types), advance the pointers using the
2193 // strides.
2194 template<typename Ptr, typename... Ptrs>
2195 HALIDE_ALWAYS_INLINE static void advance_ptrs(const std::ptrdiff_t *stride, Ptr &ptr, Ptrs &...ptrs) {
2196 ptr += *stride;
2197 advance_ptrs(stride + 1, ptrs...);
2198 }
2199
2200 HALIDE_ALWAYS_INLINE
2201 static void advance_ptrs(const std::ptrdiff_t *) {
2202 }
2203
2204 template<typename Fn, typename Ptr, typename... Ptrs>
2205 HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
2206 const for_each_value_task_dim<sizeof...(Ptrs) + 1> *t, Ptr ptr, Ptrs... ptrs) {
2207 if (d == 0) {
2208 if (innermost_strides_are_one) {
2209 Ptr end = ptr + t[0].extent;
2210 while (ptr != end) {
2211 f(*ptr++, (*ptrs++)...);
2212 }
2213 } else {
2214 for (std::ptrdiff_t i = t[0].extent; i != 0; i--) {
2215 f(*ptr, (*ptrs)...);
2216 advance_ptrs(t[0].stride, ptr, ptrs...);
2217 }
2218 }
2219 } else {
2220 for (std::ptrdiff_t i = t[d].extent; i != 0; i--) {
2221 for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptr, ptrs...);
2222 advance_ptrs(t[d].stride, ptr, ptrs...);
2223 }
2224 }
2225 }
2226
2227 // Return pair is <new_dimensions, innermost_strides_are_one>
2228 template<int N>
2229 HALIDE_NEVER_INLINE static std::pair<int, bool> for_each_value_prep(for_each_value_task_dim<N> *t,
2230 const halide_buffer_t **buffers) {
2231 const int dimensions = buffers[0]->dimensions;
2232 assert(dimensions > 0);
2233
2234 // Check the buffers all have clean host allocations
2235 for (int i = 0; i < N; i++) {
2236 if (buffers[i]->device) {
2237 assert(buffers[i]->host &&
2238 "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
2239 assert(!buffers[i]->device_dirty() &&
2240 "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
2241 } else {
2242 assert(buffers[i]->host &&
2243 "Buffer passed to for_each_value has no host or device allocation");
2244 }
2245 }
2246
2247 // Extract the strides in all the dimensions
2248 for (int i = 0; i < dimensions; i++) {
2249 for (int j = 0; j < N; j++) {
2250 assert(buffers[j]->dimensions == dimensions);
2251 assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
2252 buffers[j]->dim[i].min == buffers[0]->dim[i].min);
2253 const int s = buffers[j]->dim[i].stride;
2254 t[i].stride[j] = s;
2255 }
2256 t[i].extent = buffers[0]->dim[i].extent;
2257
2258 // Order the dimensions by stride, so that the traversal is cache-coherent.
2260 // Use the strides of the last buffer, because it is the source in copies.
2260 // It appears to be better to optimize read order than write order.
2261 for (int j = i; j > 0 && t[j].stride[N - 1] < t[j - 1].stride[N - 1]; j--) {
2262 std::swap(t[j], t[j - 1]);
2263 }
2264 }
2265
2266 // flatten dimensions where possible to make a larger inner
2267 // loop for autovectorization.
2268 int d = dimensions;
2269 for (int i = 1; i < d; i++) {
2270 bool flat = true;
2271 for (int j = 0; j < N; j++) {
2272 flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
2273 }
2274 if (flat) {
2275 t[i - 1].extent *= t[i].extent;
2276 for (int j = i; j < d - 1; j++) {
2277 t[j] = t[j + 1];
2278 }
2279 i--;
2280 d--;
2281 }
2282 }
2283
2284 // Note that we assert() that dimensions > 0 above
2285 // (our one-and-only caller will only call us that way)
2286 // so the unchecked access to t[0] should be safe.
2287 bool innermost_strides_are_one = true;
2288 for (int i = 0; i < N; i++) {
2289 innermost_strides_are_one &= (t[0].stride[i] == 1);
2290 }
2291
2292 return {d, innermost_strides_are_one};
2293 }
2294
2295 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2296 void for_each_value_impl(Fn &&f, Args &&...other_buffers) const {
2297 if (dimensions() > 0) {
2298 const size_t alloc_size = dimensions() * sizeof(for_each_value_task_dim<N>);
2299 for_each_value_task_dim<N> *t =
2300 (for_each_value_task_dim<N> *)HALIDE_ALLOCA(alloc_size);
2301 // Move the preparatory code into a non-templated helper to
2302 // save code size.
2303 const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
2304 auto [new_dims, innermost_strides_are_one] = for_each_value_prep(t, buffers);
2305 if (new_dims > 0) {
2306 for_each_value_helper(f, new_dims - 1,
2307 innermost_strides_are_one,
2308 t,
2309 data(), (other_buffers.data())...);
2310 return;
2311 }
2312 // else fall thru
2313 }
2314
2315 // zero-dimensional case
2316 f(*data(), (*other_buffers.data())...);
2317 }
2318 // @}
2319
2320public:
2321 /** Call a function on every value in the buffer, and the
2322 * corresponding values in some number of other buffers of the
2323 * same size. The function should take a reference, const
2324 * reference, or value of the correct type for each buffer. This
2325 * effectively lifts a function of scalars to an element-wise
2326 * function of buffers. This produces code that the compiler can
2327 * autovectorize. This is slightly cheaper than for_each_element,
2328 * because it does not need to track the coordinates.
2329 *
2330 * Note that constness of Buffers is preserved: a const Buffer<T> (for either
2331 * 'this' or the other-buffers arguments) will allow mutation of the
2332 * buffer contents, while a Buffer<const T> will not. Attempting to specify
2333 * a mutable reference for the lambda argument of a Buffer<const T>
2334 * will result in a compilation error. */
2335 // @{
2336 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2337 HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_value(Fn &&f, Args &&...other_buffers) const {
2338 for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2339 return *this;
2340 }
2341
2342 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2343 HALIDE_ALWAYS_INLINE
2344 Buffer<T, Dims, InClassDimStorage> &
2345 for_each_value(Fn &&f, Args &&...other_buffers) {
2346 for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2347 return *this;
2348 }
2349 // @}
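    /* For example, lifting scalar += to an element-wise operation over two
     * equally-shaped buffers (names and sizes illustrative):
     *
     \code
     Buffer<float, 2> a(64, 64), b(64, 64);
     a.fill(1.0f);
     b.fill(2.0f);
     a.for_each_value([](float &x, float y) { x += y; }, b);
     assert(a.all_equal(3.0f));
     \endcode
     */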
2350
2351private:
2352 // Helper functions for for_each_element
2353 struct for_each_element_task_dim {
2354 int min, max;
2355 };
2356
2357 /** If f is callable with this many args, call it. The first
2358 * argument is just to make the overloads distinct. Actual
2359 * overload selection is done using the enable_if. */
2360 template<typename Fn,
2361 typename... Args,
2362 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2363 HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
2364 f(args...);
2365 }
2366
2367 /** If the above overload is impossible, we add an outer loop over
2368 * an additional argument and try again. */
2369 template<typename Fn,
2370 typename... Args>
2371 HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
2372 for (int i = t[d].min; i <= t[d].max; i++) {
2373 for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
2374 }
2375 }
2376
2377 /** Determine the minimum number of arguments a callable can take
2378 * using the same trick. */
2379 template<typename Fn,
2380 typename... Args,
2381 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2382 HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
2383 return (int)(sizeof...(Args));
2384 }
2385
2386 /** The recursive version is only enabled up to a recursion limit
2387 * of 256. This catches callables that aren't callable with any
2388 * number of ints. */
2389 template<typename Fn,
2390 typename... Args>
2391 HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
2392 static_assert(sizeof...(args) <= 256,
2393 "Callable passed to for_each_element must accept either a const int *,"
2394 " or up to 256 ints. No such operator found. Expect infinite template recursion.");
2395 return num_args(0, std::forward<Fn>(f), 0, args...);
2396 }
2397
2398 /** A version where the callable takes a position array instead,
2399 * with compile-time recursion on the dimensionality. This
2400 * overload is preferred to the one below using the same int vs
2401 * double trick as above, but is impossible once d hits -1 using
2402 * std::enable_if. */
2403 template<int d,
2404 typename Fn,
2405 typename = std::enable_if_t<(d >= 0)>>
2406 HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2407 for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2408 for_each_element_array_helper<d - 1>(0, t, f, pos);
2409 }
2410 }
2411
2412 /** Base case for recursion above. */
2413 template<int d,
2414 typename Fn,
2415 typename = std::enable_if_t<(d < 0)>>
2416 HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2417 f(pos);
2418 }
2419
2420 /** A run-time-recursive version (instead of
2421 * compile-time-recursive) that requires the callable to take a
2422 * pointer to a position array instead. Dispatches to the
2423 * compile-time-recursive version once the dimensionality gets
2424 * small. */
2425 template<typename Fn>
2426 static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2427 if (d == -1) {
2428 f(pos);
2429 } else if (d == 0) {
2430 // Once the dimensionality gets small enough, dispatch to
2431 // a compile-time-recursive version for better codegen of
2432 // the inner loops.
2433 for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
2434 } else if (d == 1) {
2435 for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
2436 } else if (d == 2) {
2437 for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
2438 } else if (d == 3) {
2439 for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
2440 } else {
2441 for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2442 for_each_element_array(d - 1, t, f, pos);
2443 }
2444 }
2445 }
2446
2447 /** We now have two overloads for for_each_element. This one
2448 * triggers if the callable takes a const int *.
2449 */
2450 template<typename Fn,
2451 typename = decltype(std::declval<Fn>()((const int *)nullptr))>
2452 static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
2453 const int size = dims * sizeof(int);
2454 int *pos = (int *)HALIDE_ALLOCA(size);
2455 // At least one version of GCC will (incorrectly) report that pos "may be used uninitialized".
2456 // Add this memset to silence it.
2457 memset(pos, 0, size);
2458 for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
2459 }
2460
2461 /** This one triggers otherwise. It treats the callable as
2462 * something that takes some number of ints. */
2463 template<typename Fn>
2464 HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
2465 int args = num_args(0, std::forward<Fn>(f));
2466 assert(dims >= args);
2467 for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
2468 }
2469
2470 template<typename Fn>
2471 void for_each_element_impl(Fn &&f) const {
2472 for_each_element_task_dim *t =
2473 (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
2474 for (int i = 0; i < dimensions(); i++) {
2475 t[i].min = dim(i).min();
2476 t[i].max = dim(i).max();
2477 }
2478 for_each_element(0, dimensions(), t, std::forward<Fn>(f));
2479 }
2480
2481public:
2482 /** Call a function at each site in a buffer. This is likely to be
2483 * much slower than using Halide code to populate a buffer, but is
2484 * convenient for tests. If the function has more arguments than the
2485 * buffer has dimensions, the remaining arguments will be zero. If it
2486 * has fewer arguments than the buffer has dimensions then the last
2487 * few dimensions of the buffer are not iterated over. For example,
2488 * the following code exploits this to set a floating point RGB image
2489 * to red:
2490
2491 \code
2492 Buffer<float, 3> im(100, 100, 3);
2493 im.for_each_element([&](int x, int y) {
2494 im(x, y, 0) = 1.0f;
2495 im(x, y, 1) = 0.0f;
2496 im(x, y, 2) = 0.0f;
2497 });
2498 \endcode
2499
2500 * The compiled code is equivalent to writing a nested for loop,
2501 * and compilers are capable of optimizing it in the same way.
2502 *
2503 * If the callable can be called with an int * as the sole argument,
2504 * that version is called instead. Each location in the buffer is
2505 * passed to it in a coordinate array. This version is higher-overhead
2506 * than the variadic version, but is useful for writing generic code
2507 * that accepts buffers of arbitrary dimensionality. For example, the
2508 * following sets the value at all sites in an arbitrary-dimensional
2509 * buffer to their first coordinate:
2510
2511 \code
2512 im.for_each_element([&](const int *pos) {im(pos) = pos[0];});
2513 \endcode
2514
2515 * It is also possible to use for_each_element to iterate over entire
2516 * rows or columns by cropping the buffer to a single column or row
2517 * respectively and iterating over elements of the result. For example,
2518 * to set the diagonal of the image to 1 by iterating over the columns:
2519
2520 \code
2521 Buffer<float, 3> im(100, 100, 3);
2522 im.sliced(1, 0).for_each_element([&](int x, int c) {
2523 im(x, x, c) = 1.0f;
2524 });
2525 \endcode
2526
2527 * Or, assuming the memory layout is known to be dense per row, one can
2528 * memset each row of an image like so:
2529
2530 \code
2531 Buffer<float, 3> im(100, 100, 3);
2532 im.sliced(0, 0).for_each_element([&](int y, int c) {
2533 memset(&im(0, y, c), 0, sizeof(float) * im.width());
2534 });
2535 \endcode
2536
2537 */
2538 // @{
2539 template<typename Fn>
2540 HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_element(Fn &&f) const {
2541 for_each_element_impl(f);
2542 return *this;
2543 }
2544
2545 template<typename Fn>
2546 HALIDE_ALWAYS_INLINE
2547 Buffer<T, Dims, InClassDimStorage> &
2548 for_each_element(Fn &&f) {
2549 for_each_element_impl(f);
2550 return *this;
2551 }
2552 // @}
2553
2554private:
2555 template<typename Fn>
2556 struct FillHelper {
2557 Fn f;
2558 Buffer<T, Dims, InClassDimStorage> *buf;
2559
2560 template<typename... Args,
2561 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2562 void operator()(Args... args) {
2563 (*buf)(args...) = f(args...);
2564 }
2565
2566 FillHelper(Fn &&f, Buffer<T, Dims, InClassDimStorage> *buf)
2567 : f(std::forward<Fn>(f)), buf(buf) {
2568 }
2569 };
2570
2571public:
2572 /** Fill a buffer by evaluating a callable at every site. The
2573 * callable should look much like a callable passed to
2574 * for_each_element, but it should return the value that should be
2575 * stored to the coordinate corresponding to the arguments. */
2576 template<typename Fn,
2577 typename = std::enable_if_t<!std::is_arithmetic_v<std::decay_t<Fn>>>>
2578 Buffer<T, Dims, InClassDimStorage> &fill(Fn &&f) {
2579 // We'll go via for_each_element. We need a variadic wrapper lambda.
2580 FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
2581 return for_each_element(wrapper);
2582 }
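    /* For example, filling a buffer with a function of the coordinates
     * (names and sizes illustrative):
     *
     \code
     Buffer<int, 2> im(8, 8);
     im.fill([](int x, int y) { return x + 8 * y; });
     assert(im(3, 2) == 19);
     \endcode
     */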
2583
2584 /** Check if an input buffer passed to an extern stage is querying
2585 * bounds. Compared to doing the host pointer check directly,
2586 * this both adds clarity to the code and will facilitate moving to
2587 * another representation for bounds query arguments. */
2588 bool is_bounds_query() const {
2589 return buf.is_bounds_query();
2590 }
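    /* A sketch of how an extern stage might use this; the stage name and
     * the policy of requesting the same region as the output are
     * illustrative assumptions, not part of this API:
     *
     \code
     extern "C" int my_extern_stage(halide_buffer_t *in, halide_buffer_t *out) {
         if (Buffer<>(*in).is_bounds_query()) {
             // Halide is asking what region of 'in' is needed: request
             // the same region as the output and return.
             for (int i = 0; i < in->dimensions; i++) {
                 in->dim[i] = out->dim[i];
             }
             return 0;
         }
         // ... normal processing of in/out ...
         return 0;
     }
     \endcode
     */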
2591
2592 /** Convenient check to verify that all of the interesting bytes in the Buffer
2593 * are initialized under MSAN. Note that by default, we use for_each_value() here so that
2594 * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
2595 * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
2596 * the entire Buffer storage.) */
2597 void msan_check_mem_is_initialized(bool entire = false) const {
2598#if defined(__has_feature)
2599#if __has_feature(memory_sanitizer)
2600 if (entire) {
2601 __msan_check_mem_is_initialized(data(), size_in_bytes());
2602 } else {
2603 for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); });
2604 }
2605#endif
2606#endif
2607 }
2608};
2609
2610} // namespace Runtime
2611} // namespace Halide
2612
2613#undef HALIDE_ALLOCA
2614
2615#endif // HALIDE_RUNTIME_BUFFER_H