Halide 22.0.0
Halide compiler and libraries
HalideBuffer.h
1/** \file
2 * Defines a Buffer type that wraps halide_buffer_t and adds
3 * functionality, and methods for more conveniently iterating over the
4 * samples in a halide_buffer_t outside of Halide code. */
5
6#ifndef HALIDE_RUNTIME_BUFFER_H
7#define HALIDE_RUNTIME_BUFFER_H
8
9#include <algorithm>
10#include <atomic>
11#include <cassert>
12#include <cstdint>
13#include <cstdlib>
14#include <cstring>
15#include <limits>
16#include <memory>
17#include <type_traits>
18#include <vector>
19
20#ifdef __APPLE__
21#include <AvailabilityVersions.h>
22#include <TargetConditionals.h>
23#endif
24
25#if defined(__has_feature)
26#if __has_feature(memory_sanitizer)
27#include <sanitizer/msan_interface.h>
28#endif
29#endif
30
31#include "HalideRuntime.h"
32
33#ifdef _MSC_VER
34#include <malloc.h>
35#define HALIDE_ALLOCA _alloca
36#else
37#define HALIDE_ALLOCA __builtin_alloca
38#endif
39
40// gcc 5.1 has a false positive warning on this code
41#if __GNUC__ == 5 && __GNUC_MINOR__ == 1
42#pragma GCC diagnostic ignored "-Warray-bounds"
43#endif
44
45#ifndef HALIDE_RUNTIME_BUFFER_CHECK_INDICES
46#define HALIDE_RUNTIME_BUFFER_CHECK_INDICES 0
47#endif
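// Defining HALIDE_RUNTIME_BUFFER_CHECK_INDICES to 1 (for example, building with
// -DHALIDE_RUNTIME_BUFFER_CHECK_INDICES=1) enables the assertion-based index checks
// guarded by this macro in the access operators below.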
48
49#ifndef HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT
50// Conservatively align buffer allocations to 128 bytes by default.
51// This is enough alignment for all the platforms currently in use.
52// Redefine this in your compiler settings if you desire more/less alignment.
53#define HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT 128
54#endif
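// For example, building with -DHALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT=64 overrides
// the default 128-byte alignment used by allocate() below.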
55
57 "HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT must be a power of 2.");
58
59// Unfortunately, not all C++17 runtimes support aligned_alloc
60// (it may depend on OS/SDK version); this is provided as an opt-out
61// if you are compiling on a platform that doesn't provide a (good)
62// implementation. (Note that we actually use the C11 `::aligned_alloc()`
63// rather than the C++17 `std::aligned_alloc()` because at least one platform
64// we found supports the former but not the latter.)
65#ifndef HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
66
67// clang-format off
68#ifdef _WIN32
69
70 // Windows (regardless of which compiler) doesn't implement aligned_alloc(),
71 // even in C++17 mode, and has stated they probably never will, as the issue
72 // is in the incompatibility that free() needs to be able to free both pointers
73 // returned by malloc() and aligned_alloc(). So, always default it off here.
74 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
75
76#elif defined(__ANDROID_API__) && __ANDROID_API__ < 28
77
78 // Android doesn't provide aligned_alloc until API 28
79 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
80
81#elif defined(__APPLE__)
82
83 #if TARGET_OS_OSX && (__MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_15)
84
85 // macOS doesn't provide aligned_alloc until 10.15
86 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
87
88 #elif TARGET_OS_IPHONE && (__IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_14_0)
89
90 // iOS doesn't provide aligned_alloc until 14.0
91 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
92
93 #else
94
95 // Assume it's ok on all other Apple targets
96 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1
97
98 #endif
99
100#else
101
102 #if defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)
103
104 // ARM GNU-A baremetal compiler doesn't provide aligned_alloc as of 12.2
105 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
106
107 #else
108
109 // Not Windows, Android, or Apple: just assume it's ok
110 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1
111
112 #endif
113
114#endif
115// clang-format on
116
117#endif // HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
118
119namespace Halide {
120namespace Runtime {
121
122// Forward-declare our Buffer class
123template<typename T, int Dims, int InClassDimStorage>
124class Buffer;
125
126// A helper to check if a parameter pack is entirely implicitly
127// int-convertible to use with std::enable_if
128template<typename... Args>
129struct AllInts : std::false_type {};
130
131template<>
132struct AllInts<> : std::true_type {};
133
134template<typename T, typename... Args>
135struct AllInts<T, Args...> {
136 static const bool value = std::is_convertible_v<T, int> && AllInts<Args...>::value;
137};
138
139// Floats and doubles are technically implicitly int-convertible, but
140// doing so produces a warning we treat as an error, so just disallow
141// it here.
142template<typename... Args>
143struct AllInts<float, Args...> : std::false_type {};
144
145template<typename... Args>
146struct AllInts<double, Args...> : std::false_type {};
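// For instance (illustrative only): AllInts<int, long>::value is true, while
// AllInts<int, float>::value and AllInts<double>::value are false.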
147
148namespace Internal {
149// A helper to detect if there are any zeros in a container
150template<typename Container>
151bool any_zero(const Container &c) {
152 for (int i : c) {
153 if (i == 0) {
154 return true;
155 }
156 }
157 return false;
158}
159
160struct DefaultAllocatorFns {
161    static inline void *(*default_allocate_fn)(size_t) = nullptr;
162 static inline void (*default_deallocate_fn)(void *) = nullptr;
163};
164} // namespace Internal
165
166/** A struct acting as a header for allocations owned by the Buffer
167 * class itself. */
168struct AllocationHeader {
169    void (*deallocate_fn)(void *);
170 std::atomic<int> ref_count;
171
172 // Note that ref_count always starts at 1
173 explicit AllocationHeader(void (*deallocate_fn)(void *))
174        : deallocate_fn(deallocate_fn), ref_count(1) {
175    }
176};
177
178/** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */
179enum struct BufferDeviceOwnership : int {
180 Allocated, ///> halide_device_free will be called when device ref count goes to zero
181 WrappedNative, ///> halide_device_detach_native will be called when device ref count goes to zero
182 Unmanaged, ///> No free routine will be called when device ref count goes to zero
183 AllocatedDeviceAndHost, ///> Call device_and_host_free when DevRefCount goes to zero.
184 Cropped, ///> Call halide_device_release_crop when DevRefCount goes to zero.
185};
186
187/** A similar struct for managing device allocations. */
188struct DeviceRefCount {
189    // This is only ever constructed when there's something to manage,
190 // so start at one.
191 std::atomic<int> count{1};
192    BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated};
193};
194
195constexpr int AnyDims = -1;
196
197/** A templated Buffer class that wraps halide_buffer_t and adds
198 * functionality. When using Halide from C++, this is the preferred
199 * way to create input and output buffers. The overhead of using this
200 * class relative to a naked halide_buffer_t is minimal - it uses another
201 * ~16 bytes on the stack, and does no dynamic allocations when using
202 * it to represent existing memory of a known maximum dimensionality.
203 *
204 * The template parameter T is the element type. For buffers where the
205 * element type is unknown, or may vary, use void or const void.
206 *
207 * The template parameter Dims is the number of dimensions. For buffers where
208 * the dimensionality is unknown at compile time, or may vary, use AnyDims.
209 *
210 * InClassDimStorage is the maximum number of dimensions that can be represented
211 * using space inside the class itself. Set it to the maximum dimensionality
212 * you expect this buffer to be. If the actual dimensionality exceeds
213 * this, heap storage is allocated to track the shape of the buffer.
214 * InClassDimStorage defaults to 4, which should cover nearly all usage.
215 *
216 * The class optionally allocates and owns memory for the image using
217 * a shared pointer allocated with the provided allocator. If they are
218 * null, malloc and free are used. Any device-side allocation is
219 * considered as owned if and only if the host-side allocation is
220 * owned. */
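//
// A brief usage sketch (names such as `width` and `height` below are placeholders):
//
//     Halide::Runtime::Buffer<float> img(800, 600);                // allocates and owns 800x600 floats
//     Halide::Runtime::Buffer<uint8_t, 3> rgb(width, height, 3);   // dimensionality fixed at 3
//     Halide::Runtime::Buffer<void> dyn(halide_type_t(halide_type_uint, 8), 640, 480);  // runtime element type
//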
221template<typename T = void,
222 int Dims = AnyDims,
223 int InClassDimStorage = (Dims == AnyDims ? 4 : std::max(Dims, 1))>
224class Buffer {
225 /** The underlying halide_buffer_t */
226 halide_buffer_t buf = {};
227
228 /** Some in-class storage for shape of the dimensions. */
229    halide_dimension_t shape[InClassDimStorage];
230
231 /** The allocation owned by this Buffer. NULL if the Buffer does not
232 * own the memory. */
233 AllocationHeader *alloc = nullptr;
234
235 /** A reference count for the device allocation owned by this
236 * buffer. */
237 mutable DeviceRefCount *dev_ref_count = nullptr;
238
239 /** True if T is of type void or const void */
240 static const bool T_is_void = std::is_same_v<std::remove_const_t<T>, void>;
241
242 /** A type function that adds a const qualifier if T is a const type. */
243 template<typename T2>
244 using add_const_if_T_is_const = std::conditional_t<std::is_const_v<T>, const T2, T2>;
245
246 /** T unless T is (const) void, in which case (const)
247 * uint8_t. Useful for providing return types for operator() */
248 using not_void_T = std::conditional_t<T_is_void,
249 add_const_if_T_is_const<uint8_t>,
250 T>;
251
252 /** T with constness removed. Useful for return type of copy(). */
253 using not_const_T = std::remove_const_t<T>;
254
255 /** The type the elements are stored as. Equal to not_void_T
256 * unless T is a pointer, in which case uint64_t. Halide stores
257 * all pointer types as uint64s internally, even on 32-bit
258 * systems. */
259 using storage_T = std::conditional_t<std::is_pointer_v<T>, uint64_t, not_void_T>;
260
261public:
262 /** True if the Halide type is not void (or const void). */
263 static constexpr bool has_static_halide_type = !T_is_void;
264
265 /** Get the Halide type of T. Callers should not use the result if
266 * has_static_halide_type is false. */
267    static constexpr halide_type_t static_halide_type() {
268        return halide_type_of<std::remove_cv_t<not_void_T>>();
269    }
270
271 /** Does this Buffer own the host memory it refers to? */
272 bool owns_host_memory() const {
273 return alloc != nullptr;
274 }
275
276 static constexpr bool has_static_dimensions = (Dims != AnyDims);
277
278 /** Callers should not use the result if
279 * has_static_dimensions is false. */
280 static constexpr int static_dimensions() {
281 return Dims;
282 }
283
284 static_assert(!has_static_dimensions || static_dimensions() >= 0);
285
286private:
287 /** Increment the reference count of any owned allocation */
288 void incref() const {
289 if (owns_host_memory()) {
290 alloc->ref_count++;
291 }
292 if (buf.device) {
293 if (!dev_ref_count) {
294 // I seem to have a non-zero dev field but no
295 // reference count for it. I must have been given a
296 // device allocation by a Halide pipeline, and have
297 // never been copied from since. Take sole ownership
298 // of it.
299 dev_ref_count = new DeviceRefCount;
300 }
301 dev_ref_count->count++;
302 }
303 }
304
305 // Note that this is called "cropped" but can also encompass a slice/embed
306 // operation as well.
307 struct DevRefCountCropped : DeviceRefCount {
308 // We will only store Buffers that have a dynamic number of dimensions.
309        // Buffers that are cropped or sliced from need to first be converted to
310 // one with variable size. This is required because we cannot possibly
311 // know what the actual dimensionality is of the buffer this is a
312 // crop or slice from. Since cropping a sliced buffer is also possible,
313 // no optimizations can be made for cropped buffers either.
314 Buffer<T, AnyDims> cropped_from;
315 explicit DevRefCountCropped(const Buffer<T, AnyDims> &cropped_from)
316 : cropped_from(cropped_from) {
317 ownership = BufferDeviceOwnership::Cropped;
318 }
319 };
320
321 /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */
322 void crop_from(const Buffer<T, AnyDims> &cropped_from) {
323 assert(dev_ref_count == nullptr);
324 dev_ref_count = new DevRefCountCropped(cropped_from);
325 }
326
327 /** Decrement the reference count of any owned allocation and free host
328 * and device memory if it hits zero. Sets alloc to nullptr. */
329 void decref(bool device_only = false) {
330 if (owns_host_memory() && !device_only) {
331 int new_count = --(alloc->ref_count);
332 if (new_count == 0) {
333 void (*fn)(void *) = alloc->deallocate_fn;
334 alloc->~AllocationHeader();
335 fn(alloc);
336 }
337 buf.host = nullptr;
338 alloc = nullptr;
339 set_host_dirty(false);
340 }
341 int new_count = 0;
342 if (dev_ref_count) {
343 new_count = --(dev_ref_count->count);
344 }
345 if (new_count == 0) {
346 if (buf.device) {
347 assert(!(alloc && device_dirty()) &&
348 "Implicitly freeing a dirty device allocation while a host allocation still lives. "
349 "Call device_free explicitly if you want to drop dirty device-side data. "
350 "Call copy_to_host explicitly if you want the data copied to the host allocation "
351 "before the device allocation is freed.");
352 int result = halide_error_code_success;
353 if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
354 result = buf.device_interface->detach_native(nullptr, &buf);
355 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
356 result = buf.device_interface->device_and_host_free(nullptr, &buf);
357 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
358 result = buf.device_interface->device_release_crop(nullptr, &buf);
359 } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
360 result = buf.device_interface->device_free(nullptr, &buf);
361 }
362 // No reasonable way to return the error, but we can at least assert-fail in debug builds.
363 assert((result == halide_error_code_success) && "device_interface call returned a nonzero result in Buffer::decref()");
364 (void)result;
365 }
366 if (dev_ref_count) {
367 if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
368 delete (DevRefCountCropped *)dev_ref_count;
369 } else {
370 delete dev_ref_count;
371 }
372 }
373 }
374 dev_ref_count = nullptr;
375 buf.device = 0;
376 buf.device_interface = nullptr;
377 }
378
379 void free_shape_storage() {
380 if (buf.dim != shape) {
381 delete[] buf.dim;
382 buf.dim = nullptr;
383 }
384 }
385
386 template<int DimsSpecified>
387 void make_static_shape_storage() {
388 static_assert(Dims == AnyDims || Dims == DimsSpecified,
389 "Number of arguments to Buffer() does not match static dimensionality");
390        buf.dimensions = DimsSpecified;
391        if constexpr (Dims == AnyDims) {
392 if constexpr (DimsSpecified <= InClassDimStorage) {
393 buf.dim = shape;
394 } else {
395 static_assert(DimsSpecified >= 1);
396                buf.dim = new halide_dimension_t[DimsSpecified];
397            }
398 } else {
399 static_assert(InClassDimStorage >= Dims);
400 buf.dim = shape;
401 }
402 }
403
404 void make_shape_storage(const int dimensions) {
405 if (Dims != AnyDims && Dims != dimensions) {
406 assert(false && "Number of arguments to Buffer() does not match static dimensionality");
407 }
408 // This should usually be inlined, so if dimensions is statically known,
409 // we can skip the call to new
410 buf.dimensions = dimensions;
411 buf.dim = (dimensions <= InClassDimStorage) ? shape : new halide_dimension_t[dimensions];
412 }
413
414 void copy_shape_from(const halide_buffer_t &other) {
415 // All callers of this ensure that buf.dimensions == other.dimensions.
416 make_shape_storage(other.dimensions);
417 std::copy(other.dim, other.dim + other.dimensions, buf.dim);
418 }
419
420 template<typename T2, int D2, int S2>
421 void move_shape_from(Buffer<T2, D2, S2> &&other) {
422 if (other.shape == other.buf.dim) {
423 copy_shape_from(other.buf);
424 } else {
425 buf.dim = other.buf.dim;
426 other.buf.dim = nullptr;
427 }
428 other.buf = halide_buffer_t();
429 }
430
431 /** Initialize the shape from a halide_buffer_t. */
432 void initialize_from_buffer(const halide_buffer_t &b,
433 BufferDeviceOwnership ownership) {
434 memcpy(&buf, &b, sizeof(halide_buffer_t));
435 copy_shape_from(b);
436 if (b.device) {
437 dev_ref_count = new DeviceRefCount;
438 dev_ref_count->ownership = ownership;
439 }
440 }
441
442 /** Initialize the shape from an array of ints */
443 void initialize_shape(const int *sizes) {
444 for (int i = 0; i < buf.dimensions; i++) {
445 buf.dim[i].min = 0;
446 buf.dim[i].extent = sizes[i];
447 if (i == 0) {
448 buf.dim[i].stride = 1;
449 } else {
450 buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
451 }
452 }
453 }
454
455 /** Initialize the shape from a vector of extents */
456 void initialize_shape(const std::vector<int> &sizes) {
457 assert(buf.dimensions == (int)sizes.size());
458 initialize_shape(sizes.data());
459 }
460
461 /** Initialize the shape from the static shape of an array */
462 template<typename Array, size_t N>
463 void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
464 buf.dim[next].min = 0;
465 buf.dim[next].extent = (int)N;
466 if (next == 0) {
467 buf.dim[next].stride = 1;
468 } else {
469 initialize_shape_from_array_shape(next - 1, vals[0]);
470 buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
471 }
472 }
473
474 /** Base case for the template recursion above. */
475 template<typename T2>
476 void initialize_shape_from_array_shape(int, const T2 &) {
477 }
478
479 /** Get the dimensionality of a multi-dimensional C array */
480 template<typename Array, size_t N>
481 static int dimensionality_of_array(Array (&vals)[N]) {
482 return dimensionality_of_array(vals[0]) + 1;
483 }
484
485 template<typename T2>
486 static int dimensionality_of_array(const T2 &) {
487 return 0;
488 }
489
490 /** Get the underlying halide_type_t of an array's element type. */
491 template<typename Array, size_t N>
492 static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
493 return scalar_type_of_array(vals[0]);
494 }
495
496 template<typename T2>
497 static halide_type_t scalar_type_of_array(const T2 &) {
498        return halide_type_of<std::remove_cv_t<T2>>();
499    }
500
501 /** Crop a single dimension without handling device allocation. */
502 void crop_host(int d, int min, int extent) {
503 assert(dim(d).min() <= min);
504 assert(dim(d).max() >= min + extent - 1);
505 ptrdiff_t shift = min - dim(d).min();
506 if (buf.host != nullptr) {
507 buf.host += (shift * dim(d).stride()) * type().bytes();
508 }
509 buf.dim[d].min = min;
510 buf.dim[d].extent = extent;
511 }
512
513 /** Crop as many dimensions as are in rect, without handling device allocation. */
514 void crop_host(const std::vector<std::pair<int, int>> &rect) {
515 assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
516 int limit = (int)rect.size();
517 assert(limit <= dimensions());
518 for (int i = 0; i < limit; i++) {
519 crop_host(i, rect[i].first, rect[i].second);
520 }
521 }
522
523 void complete_device_crop(Buffer<T, Dims, InClassDimStorage> &result_host_cropped) const {
524 assert(buf.device_interface != nullptr);
525        if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == halide_error_code_success) {
526            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
527 // is it possible to get to this point without incref having run at least once since
528 // the device field was set? (I.e. in the internal logic of crop. incref might have been
529 // called.)
530 if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
531 result_host_cropped.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
532 } else {
533 result_host_cropped.crop_from(*this);
534 }
535 }
536 }
537
538 /** slice a single dimension without handling device allocation. */
539 void slice_host(int d, int pos) {
540 static_assert(Dims == AnyDims);
541 assert(dimensions() > 0);
542 assert(d >= 0 && d < dimensions());
543 assert(pos >= dim(d).min() && pos <= dim(d).max());
544 buf.dimensions--;
545 ptrdiff_t shift = pos - buf.dim[d].min;
546 if (buf.host != nullptr) {
547 buf.host += (shift * buf.dim[d].stride) * type().bytes();
548 }
549 for (int i = d; i < buf.dimensions; i++) {
550 buf.dim[i] = buf.dim[i + 1];
551 }
552 buf.dim[buf.dimensions] = {0, 0, 0};
553 }
554
555 void complete_device_slice(Buffer<T, AnyDims, InClassDimStorage> &result_host_sliced, int d, int pos) const {
556 assert(buf.device_interface != nullptr);
557        if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == halide_error_code_success) {
558            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
559 // is it possible to get to this point without incref having run at least once since
560 // the device field was set? (I.e. in the internal logic of slice. incref might have been
561 // called.)
562 if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
563 // crop_from() is correct here, despite the fact that we are slicing.
564 result_host_sliced.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
565 } else {
566 // crop_from() is correct here, despite the fact that we are slicing.
567 result_host_sliced.crop_from(*this);
568 }
569 }
570 }
571
572public:
573 typedef T ElemType;
574
575 /** Read-only access to the shape */
576 class Dimension {
577 const halide_dimension_t &d;
578
579 public:
580 /** The lowest coordinate in this dimension */
581        HALIDE_ALWAYS_INLINE int min() const {
582            return d.min;
583 }
584
585 /** The number of elements in memory you have to step over to
586 * increment this coordinate by one. */
587        HALIDE_ALWAYS_INLINE int stride() const {
588            return d.stride;
589 }
590
591 /** The extent of the image along this dimension */
592        HALIDE_ALWAYS_INLINE int extent() const {
593            return d.extent;
594 }
595
596 /** The highest coordinate in this dimension */
597        HALIDE_ALWAYS_INLINE int max() const {
598            return min() + extent() - 1;
599 }
600
601 /** An iterator class, so that you can iterate over
602 * coordinates in a dimension using a range-based for loop. */
603 struct iterator {
604 int val;
605 int operator*() const {
606 return val;
607 }
608 bool operator!=(const iterator &other) const {
609 return val != other.val;
610 }
611            iterator &operator++() {
612                val++;
613 return *this;
614 }
615 };
616
617 /** An iterator that points to the min coordinate */
618        HALIDE_ALWAYS_INLINE iterator begin() const {
619            return {min()};
620 }
621
622 /** An iterator that points to one past the max coordinate */
623        HALIDE_ALWAYS_INLINE iterator end() const {
624            return {min() + extent()};
625 }
626
627 explicit Dimension(const halide_dimension_t &dim)
628 : d(dim) {
629 }
630 };
631
632 /** Access the shape of the buffer */
633    HALIDE_ALWAYS_INLINE Dimension dim(int i) const {
634        assert(i >= 0 && i < this->dimensions());
635 return Dimension(buf.dim[i]);
636 }
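// For example, a dimension's coordinate range can be traversed with a range-based
// for loop (a sketch; `im` stands for any allocated Buffer):
//
//     for (int y : im.dim(1)) {
//         for (int x : im.dim(0)) {
//             // visit im(x, y)
//         }
//     }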
637
638 /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
639 // @{
640 int min(int i) const {
641 return dim(i).min();
642 }
643 int extent(int i) const {
644 return dim(i).extent();
645 }
646 int stride(int i) const {
647 return dim(i).stride();
648 }
649 // @}
650
651 /** The total number of elements this buffer represents. Equal to
652 * the product of the extents */
653 size_t number_of_elements() const {
654 return buf.number_of_elements();
655 }
656
657 /** Get the dimensionality of the buffer. */
658 int dimensions() const {
659 if constexpr (has_static_dimensions) {
660 return Dims;
661 } else {
662 return buf.dimensions;
663 }
664 }
665
666 /** Get the type of the elements. */
667    halide_type_t type() const {
668        return buf.type;
669 }
670
671 /** A pointer to the element with the lowest address. If all
672 * strides are positive, equal to the host pointer. */
673 T *begin() const {
674 assert(buf.host != nullptr); // Cannot call begin() on an unallocated Buffer.
675 return (T *)buf.begin();
676 }
677
678 /** A pointer to one beyond the element with the highest address. */
679 T *end() const {
680 assert(buf.host != nullptr); // Cannot call end() on an unallocated Buffer.
681 return (T *)buf.end();
682 }
683
684 /** The total number of bytes spanned by the data in memory. */
685 size_t size_in_bytes() const {
686 return buf.size_in_bytes();
687 }
688
689 /** Reset the Buffer to be equivalent to a default-constructed Buffer
690 * of the same static type (if any); Buffer<void> will have its runtime
691 * type reset to uint8. */
692 void reset() {
693 *this = Buffer();
694 }
695
696    Buffer()
697        : shape() {
698 buf.type = static_halide_type();
699        // If Dims is statically known, create storage for that many dimensions;
700        // otherwise, make a zero-dimensional buffer.
701 constexpr int buf_dimensions = (Dims == AnyDims) ? 0 : Dims;
702        make_static_shape_storage<buf_dimensions>();
703    }
704
705 /** Make a Buffer from a halide_buffer_t */
706 explicit Buffer(const halide_buffer_t &buf,
707                    BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) {
708        assert(T_is_void || buf.type == static_halide_type());
709 initialize_from_buffer(buf, ownership);
710 }
711
712 /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
713 template<typename T2, int D2, int S2>
714 friend class Buffer;
715
716private:
717 template<typename T2, int D2, int S2>
718 static void static_assert_can_convert_from() {
719 static_assert((!std::is_const_v<T2> || std::is_const_v<T>),
720 "Can't convert from a Buffer<const T> to a Buffer<T>");
721 static_assert(std::is_same_v<std::remove_const_t<T>, std::remove_const_t<T2>> ||
723 "type mismatch constructing Buffer");
724 static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2,
725 "Can't convert from a Buffer with static dimensionality to a Buffer with different static dimensionality");
726 }
727
728public:
729    static void set_default_allocate_fn(void *(*allocate_fn)(size_t)) {
730        Internal::DefaultAllocatorFns::default_allocate_fn = allocate_fn;
731    }
732    static void set_default_deallocate_fn(void (*deallocate_fn)(void *)) {
733        Internal::DefaultAllocatorFns::default_deallocate_fn = deallocate_fn;
734 }
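// Example (sketch): route all default Buffer allocations through custom hooks.
// `my_alloc` and `my_free` are hypothetical functions with malloc/free-style signatures.
//
//     Halide::Runtime::Buffer<>::set_default_allocate_fn(my_alloc);
//     Halide::Runtime::Buffer<>::set_default_deallocate_fn(my_free);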
735
736 /** Determine if a Buffer<T, Dims, InClassDimStorage> can be constructed from some other Buffer type.
737 * If this can be determined at compile time, fail with a static assert; otherwise
738 * return a boolean based on runtime typing. */
739 template<typename T2, int D2, int S2>
740    static bool can_convert_from(const Buffer<T2, D2, S2> &other) {
741        static_assert_can_convert_from<T2, D2, S2>();
742        if (Buffer<T2, D2, S2>::T_is_void && !T_is_void) {
743 if (other.type() != static_halide_type()) {
744 return false;
745 }
746 }
747 if (Dims != AnyDims) {
748 if (other.dimensions() != Dims) {
749 return false;
750 }
751 }
752 return true;
753 }
754
755    /** Fail an assertion at runtime or compile-time if a Buffer<T, Dims, InClassDimStorage>
756 * cannot be constructed from some other Buffer type. */
757 template<typename T2, int D2, int S2>
758    static void assert_can_convert_from(const Buffer<T2, D2, S2> &other) {
759        // Explicitly call static_assert_can_convert_from() here so
760 // that we always get compile-time checking, even if compiling with
761 // assertions disabled.
762        static_assert_can_convert_from<T2, D2, S2>();
763        assert(can_convert_from(other));
764 }
765
766 /** Copy constructor. Does not copy underlying data. */
767    Buffer(const Buffer<T, Dims, InClassDimStorage> &other)
768        : buf(other.buf),
769 alloc(other.alloc) {
770 other.incref();
771 dev_ref_count = other.dev_ref_count;
772 copy_shape_from(other.buf);
773 }
774
775 /** Construct a Buffer from a Buffer of different dimensionality
776 * and type. Asserts that the type and dimensionality matches (at runtime,
777 * if one of the types is void). Note that this constructor is
778 * implicit. This, for example, lets you pass things like
779     * Buffer<T> or Buffer<const void> to functions expecting
780 * Buffer<const T>. */
781 template<typename T2, int D2, int S2>
782    Buffer(const Buffer<T2, D2, S2> &other)
783        : buf(other.buf),
784 alloc(other.alloc) {
785 assert_can_convert_from(other);
786 other.incref();
787 dev_ref_count = other.dev_ref_count;
788 copy_shape_from(other.buf);
789 }
790
791 /** Move constructor */
792    Buffer(Buffer<T, Dims, InClassDimStorage> &&other) noexcept
793        : buf(other.buf),
794 alloc(other.alloc),
795 dev_ref_count(other.dev_ref_count) {
796 other.dev_ref_count = nullptr;
797 other.alloc = nullptr;
798 move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
799 }
800
801 /** Move-construct a Buffer from a Buffer of different
802 * dimensionality and type. Asserts that the types match (at
803 * runtime if one of the types is void). */
804 template<typename T2, int D2, int S2>
805    Buffer(Buffer<T2, D2, S2> &&other)
806        : buf(other.buf),
807 alloc(other.alloc),
808 dev_ref_count(other.dev_ref_count) {
809 assert_can_convert_from(other);
810 other.dev_ref_count = nullptr;
811 other.alloc = nullptr;
812 move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
813 }
814
815 /** Assign from another Buffer of possibly-different
816 * dimensionality and type. Asserts that the types match (at
817 * runtime if one of the types is void). */
818 template<typename T2, int D2, int S2>
819    Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T2, D2, S2> &other) {
820        if ((const void *)this == (const void *)&other) {
821 return *this;
822 }
823 assert_can_convert_from(other);
824 other.incref();
825 decref();
826 dev_ref_count = other.dev_ref_count;
827 alloc = other.alloc;
828 free_shape_storage();
829 buf = other.buf;
830 copy_shape_from(other.buf);
831 return *this;
832 }
833
834 /** Standard assignment operator */
835    Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T, Dims, InClassDimStorage> &other) {
836        // The cast to void* here is just to satisfy clang-tidy
837 if ((const void *)this == (const void *)&other) {
838 return *this;
839 }
840 other.incref();
841 decref();
842 dev_ref_count = other.dev_ref_count;
843 alloc = other.alloc;
844 free_shape_storage();
845 buf = other.buf;
846 copy_shape_from(other.buf);
847 return *this;
848 }
849
850 /** Move from another Buffer of possibly-different
851 * dimensionality and type. Asserts that the types match (at
852 * runtime if one of the types is void). */
853 template<typename T2, int D2, int S2>
854    Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T2, D2, S2> &&other) {
855        assert_can_convert_from(other);
856 decref();
857 alloc = other.alloc;
858 other.alloc = nullptr;
859 dev_ref_count = other.dev_ref_count;
860 other.dev_ref_count = nullptr;
861 free_shape_storage();
862 buf = other.buf;
863 move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
864 return *this;
865 }
866
867 /** Standard move-assignment operator */
868    Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T, Dims, InClassDimStorage> &&other) noexcept {
869        decref();
870 alloc = other.alloc;
871 other.alloc = nullptr;
872 dev_ref_count = other.dev_ref_count;
873 other.dev_ref_count = nullptr;
874 free_shape_storage();
875 buf = other.buf;
876 move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
877 return *this;
878 }
879
880    /** Check that the product of the extents fits in memory. */
881    void check_overflow() {
882        size_t size = type().bytes();
883 for (int i = 0; i < dimensions(); i++) {
884 size *= dim(i).extent();
885 }
886 // We allow 2^31 or 2^63 bytes, so drop the top bit.
887 size = (size << 1) >> 1;
888 for (int i = 0; i < dimensions(); i++) {
889 size /= dim(i).extent();
890 }
891 assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
892 }
893
894 /** Allocate memory for this Buffer. Drops the reference to any
895 * owned memory. */
896 void allocate(void *(*allocate_fn)(size_t) = nullptr,
897 void (*deallocate_fn)(void *) = nullptr) {
898 // Drop any existing allocation
899 deallocate();
900
901 // Conservatively align images to (usually) 128 bytes. This is enough
902 // alignment for all the platforms we might use. Also ensure that the allocation
903 // is such that the logical size is an integral multiple of 128 bytes (or a bit more).
904 constexpr size_t alignment = HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT;
905
906 const auto align_up = [=](size_t value) -> size_t {
907 return (value + alignment - 1) & ~(alignment - 1);
908 };
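        // For example, with the default 128-byte alignment: align_up(1) == 128,
        // align_up(128) == 128, and align_up(129) == 256.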
909
910 size_t size = size_in_bytes();
911
912#if HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
913 // Only use aligned_alloc() if no custom allocators are specified.
914        if (!allocate_fn && !deallocate_fn) {
915            // As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes
916 // on any supported platform, so we will just overallocate by 'alignment'
917 // so that the user storage also starts at an aligned point. This is a bit
918 // wasteful, but probably not a big deal.
919 static_assert(sizeof(AllocationHeader) <= alignment);
920 void *alloc_storage = ::aligned_alloc(alignment, align_up(size) + alignment);
921            assert((uintptr_t)alloc_storage == align_up((uintptr_t)alloc_storage));
922            alloc = new (alloc_storage) AllocationHeader(free);
923 buf.host = (uint8_t *)((uintptr_t)alloc_storage + alignment);
924 return;
925 }
926 // else fall thru
927#endif
928 if (!allocate_fn) {
929            allocate_fn = Internal::DefaultAllocatorFns::default_allocate_fn;
930            if (!allocate_fn) {
931                allocate_fn = malloc;
932            }
933 }
934 if (!deallocate_fn) {
935            deallocate_fn = Internal::DefaultAllocatorFns::default_deallocate_fn;
936            if (!deallocate_fn) {
937 deallocate_fn = free;
938 }
939 }
940
941 static_assert(sizeof(AllocationHeader) <= alignment);
942
943 // malloc() and friends must return a pointer aligned to at least alignof(std::max_align_t);
944 // make sure this is OK for AllocationHeader, since it always goes at the start
945 static_assert(alignof(AllocationHeader) <= alignof(std::max_align_t));
946
947 const size_t requested_size = align_up(size + alignment +
948 std::max(0, (int)sizeof(AllocationHeader) -
949 (int)sizeof(std::max_align_t)));
950        void *alloc_storage = allocate_fn(requested_size);
951        alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
952 uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
953        buf.host = (uint8_t *)align_up((uintptr_t)unaligned_ptr);
954    }
955
956 /** Drop reference to any owned host or device memory, possibly
957 * freeing it, if this buffer held the last reference to
958 * it. Retains the shape of the buffer. Does nothing if this
959 * buffer did not allocate its own memory. */
960 void deallocate() {
961 decref();
962 }
963
964 /** Drop reference to any owned device memory, possibly freeing it
965 * if this buffer held the last reference to it. Asserts that
966 * device_dirty is false. */
967    void device_deallocate() {
968        decref(true);
969 }
970
971 /** Allocate a new image of the given size with a runtime
972 * type. Only used when you do know what size you want but you
973 * don't know statically what type the elements are. Pass zeros
974 * to make a buffer suitable for bounds query calls. */
975 template<typename... Args,
976 typename = std::enable_if_t<AllInts<Args...>::value>>
977 Buffer(halide_type_t t, int first, Args... rest) {
978 if (!T_is_void) {
979 assert(static_halide_type() == t);
980 }
981 int extents[] = {first, (int)rest...};
982 buf.type = t;
983 constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
984        make_static_shape_storage<buf_dimensions>();
985        initialize_shape(extents);
986 if (!Internal::any_zero(extents)) {
987 check_overflow();
988 allocate();
989 }
990 }
991
992 /** Allocate a new image of the given size. Pass zeros to make a
993 * buffer suitable for bounds query calls. */
994 // @{
995
996 // The overload with one argument is 'explicit', so that
997 // (say) int is not implicitly convertible to Buffer<int>
998 explicit Buffer(int first) {
999 static_assert(!T_is_void,
1000 "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
1001 int extents[] = {first};
1002 buf.type = static_halide_type();
1003 constexpr int buf_dimensions = 1;
1004        make_static_shape_storage<buf_dimensions>();
1005        initialize_shape(extents);
1006 if (first != 0) {
1007 check_overflow();
1008 allocate();
1009 }
1010 }
1011
1012 template<typename... Args,
1013 typename = std::enable_if_t<AllInts<Args...>::value>>
1014 Buffer(int first, int second, Args... rest) {
1015 static_assert(!T_is_void,
1016 "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
1017 int extents[] = {first, second, (int)rest...};
1018 buf.type = static_halide_type();
1019 constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
1020        make_static_shape_storage<buf_dimensions>();
1021        initialize_shape(extents);
1022 if (!Internal::any_zero(extents)) {
1023 check_overflow();
1024 allocate();
1025 }
1026 }
1027 // @}
1028
1029 /** Allocate a new image of unknown type using a vector of ints as the size. */
1030 Buffer(halide_type_t t, const std::vector<int> &sizes) {
1031 if (!T_is_void) {
1032 assert(static_halide_type() == t);
1033 }
1034 buf.type = t;
1035 // make_shape_storage() will do a runtime check that dimensionality matches.
1036 make_shape_storage((int)sizes.size());
1037 initialize_shape(sizes);
1038 if (!Internal::any_zero(sizes)) {
1039 check_overflow();
1040 allocate();
1041 }
1042 }
1043
1044 /** Allocate a new image of known type using a vector of ints as the size. */
1045 explicit Buffer(const std::vector<int> &sizes)
1046 : Buffer(static_halide_type(), sizes) {
1047 }
1048
1049private:
1050 // Create a copy of the sizes vector, ordered as specified by order.
1051 static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
1052 assert(order.size() == sizes.size());
1053 std::vector<int> ordered_sizes(sizes.size());
1054 for (size_t i = 0; i < sizes.size(); ++i) {
1055 ordered_sizes[i] = sizes.at(order[i]);
1056 }
1057 return ordered_sizes;
1058 }
1059
1060public:
1061 /** Allocate a new image of unknown type using a vector of ints as the size and
1062 * a vector of indices indicating the storage order for each dimension. The
1063 * length of the sizes vector and the storage-order vector must match. For instance,
1064 * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
1065 Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
1066 : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
1067 transpose(storage_order);
1068 }
1069
1070 Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
1071 : Buffer(static_halide_type(), sizes, storage_order) {
1072 }
1073
1074    /** Make a Buffer that refers to a statically sized array. Does not
1075 * take ownership of the data, and does not set the host_dirty flag. */
1076 template<typename Array, size_t N>
1077 explicit Buffer(Array (&vals)[N]) {
1078 const int buf_dimensions = dimensionality_of_array(vals);
1079 buf.type = scalar_type_of_array(vals);
1080 buf.host = (uint8_t *)vals;
1081 make_shape_storage(buf_dimensions);
1082 initialize_shape_from_array_shape(buf.dimensions - 1, vals);
1083 }
1084
1085    /** Initialize a Buffer of runtime type from a pointer and some
1086 * sizes. Assumes dense row-major packing and a min coordinate of
1087 * zero. Does not take ownership of the data and does not set the
1088 * host_dirty flag. */
1089 template<typename... Args,
1090 typename = std::enable_if_t<AllInts<Args...>::value>>
1091 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&...rest) {
1092 if (!T_is_void) {
1093 assert(static_halide_type() == t);
1094 }
1095 int extents[] = {first, (int)rest...};
1096 buf.type = t;
1097 buf.host = (uint8_t *)const_cast<void *>(data);
1098 constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1099        make_static_shape_storage<buf_dimensions>();
1100        initialize_shape(extents);
1101 }
1102
1103    /** Initialize a Buffer from a pointer and some sizes. Assumes
1104 * dense row-major packing and a min coordinate of zero. Does not
1105 * take ownership of the data and does not set the host_dirty flag. */
1106 template<typename... Args,
1107 typename = std::enable_if_t<AllInts<Args...>::value>>
1108 explicit Buffer(T *data, int first, Args &&...rest) {
1109 int extents[] = {first, (int)rest...};
1110 buf.type = static_halide_type();
1111 buf.host = (uint8_t *)const_cast<std::remove_const_t<T> *>(data);
1112 constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1113        make_static_shape_storage<buf_dimensions>();
1114        initialize_shape(extents);
1115 }
1116
1117    /** Initialize a Buffer from a pointer and a vector of
1118 * sizes. Assumes dense row-major packing and a min coordinate of
1119 * zero. Does not take ownership of the data and does not set the
1120 * host_dirty flag. */
1121 explicit Buffer(T *data, const std::vector<int> &sizes) {
1122 buf.type = static_halide_type();
1123 buf.host = (uint8_t *)const_cast<std::remove_const_t<T> *>(data);
1124 make_shape_storage((int)sizes.size());
1125 initialize_shape(sizes);
1126 }
1127
1128    /** Initialize a Buffer of runtime type from a pointer and a
1129 * vector of sizes. Assumes dense row-major packing and a min
1130 * coordinate of zero. Does not take ownership of the data and
1131 * does not set the host_dirty flag. */
1132 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
1133 if (!T_is_void) {
1134 assert(static_halide_type() == t);
1135 }
1136 buf.type = t;
1137 buf.host = (uint8_t *)const_cast<void *>(data);
1138 make_shape_storage((int)sizes.size());
1139 initialize_shape(sizes);
1140 }
1141
1142    /** Initialize a Buffer from a pointer to the min coordinate and
1143 * an array describing the shape. Does not take ownership of the
1144 * data, and does not set the host_dirty flag. */
1145 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
1146 if (!T_is_void) {
1147 assert(static_halide_type() == t);
1148 }
1149 buf.type = t;
1150 buf.host = (uint8_t *)const_cast<void *>(data);
1151 make_shape_storage(d);
1152 for (int i = 0; i < d; i++) {
1153 buf.dim[i] = shape[i];
1154 }
1155 }
1156
1157 /** Initialize a Buffer from a pointer to the min coordinate and
1158 * a vector describing the shape. Does not take ownership of the
1159 * data, and does not set the host_dirty flag. */
1160 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
1161 const std::vector<halide_dimension_t> &shape)
1162 : Buffer(t, data, (int)shape.size(), shape.data()) {
1163 }
1164
1165    /** Initialize a Buffer from a pointer to the min coordinate and
1166 * an array describing the shape. Does not take ownership of the
1167 * data and does not set the host_dirty flag. */
1168 explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
1169 buf.type = static_halide_type();
1170 buf.host = (uint8_t *)const_cast<std::remove_const_t<T> *>(data);
1171 make_shape_storage(d);
1172 for (int i = 0; i < d; i++) {
1173 buf.dim[i] = shape[i];
1174 }
1175 }
1176
1177 /** Initialize a Buffer from a pointer to the min coordinate and
1178 * a vector describing the shape. Does not take ownership of the
1179 * data, and does not set the host_dirty flag. */
1180 explicit Buffer(T *data, const std::vector<halide_dimension_t> &shape)
1181 : Buffer(data, (int)shape.size(), shape.data()) {
1182 }
1183
1184 /** Destructor. Will release any underlying owned allocation if
1185 * this is the last reference to it. Will assert fail if there are
1186 * weak references to this Buffer outstanding. */
1187    ~Buffer() {
1188        decref();
1189 free_shape_storage();
1190 }
1191
1192 /** Get a pointer to the raw halide_buffer_t this wraps. */
1193 // @{
1194    halide_buffer_t *raw_buffer() {
1195        return &buf;
1196 }
1197
1198    const halide_buffer_t *raw_buffer() const {
1199        return &buf;
1200 }
1201 // @}
1202
1203 /** Provide a cast operator to halide_buffer_t *, so that
1204 * instances can be passed directly to Halide filters. */
1205 operator halide_buffer_t *() {
1206 return &buf;
1207 }
1208
1209 /** Return a typed reference to this Buffer. Useful for converting
1210 * a reference to a Buffer<void> to a reference to, for example, a
1211 * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1212     * You can also optionally specify a new value for Dims; this is useful
1213 * mainly for removing the dimensionality constraint on a Buffer with
1214 * explicit dimensionality. Does a runtime assert if the source buffer type
1215 * is void or the new dimensionality is incompatible. */
1216 template<typename T2, int D2 = Dims>
1217    HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> &as() & {
1218        Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
1219        return *((Buffer<T2, D2, InClassDimStorage> *)this);
1220    }
1221
1222 /** Return a const typed reference to this Buffer. Useful for converting
1223 * a reference to a Buffer<void> to a reference to, for example, a
1224 * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1225     * You can also optionally specify a new value for Dims; this is useful
1226 * mainly for removing the dimensionality constraint on a Buffer with
1227 * explicit dimensionality. Does a runtime assert if the source buffer type
1228 * is void or the new dimensionality is incompatible. */
1229 template<typename T2, int D2 = Dims>
1230    HALIDE_ALWAYS_INLINE const Buffer<T2, D2, InClassDimStorage> &as() const & {
1231        Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
1232        return *((const Buffer<T2, D2, InClassDimStorage> *)this);
1233    }
1234
1235 /** Return an rval reference to this Buffer. Useful for converting
1236 * a reference to a Buffer<void> to a reference to, for example, a
1237 * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1238     * You can also optionally specify a new value for Dims; this is useful
1239 * mainly for removing the dimensionality constraint on a Buffer with
1240 * explicit dimensionality. Does a runtime assert if the source buffer type
1241 * is void or the new dimensionality is incompatible. */
1242 template<typename T2, int D2 = Dims>
1243    HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> as() && {
1244        Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
1245        return *((Buffer<T2, D2, InClassDimStorage> *)this);
1246    }
1247
1248 /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
1249 * to recapitulate the type argument. */
1250 // @{
1251    HALIDE_ALWAYS_INLINE
1252    Buffer<std::add_const_t<T>, Dims, InClassDimStorage> &as_const() & {
1253        // Note that we can skip the assert_can_convert_from(), since T -> const T
1254 // conversion is always legal.
1255 return *reinterpret_cast<Buffer<std::add_const_t<T>, Dims, InClassDimStorage> *>(this);
1256 }
1257
1258    HALIDE_ALWAYS_INLINE
1259    const Buffer<std::add_const_t<T>, Dims, InClassDimStorage> &as_const() const & {
1260        return *reinterpret_cast<const Buffer<std::add_const_t<T>, Dims, InClassDimStorage> *>(this);
1261 }
1262
1263    HALIDE_ALWAYS_INLINE
1264    Buffer<std::add_const_t<T>, Dims, InClassDimStorage> as_const() && {
1265        return *reinterpret_cast<Buffer<std::add_const_t<T>, Dims, InClassDimStorage> *>(this);
1266    }
1267    // @}
1268
1269 /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<const T>& when
1270 * passing arguments */
1271 template<typename T2 = T, typename = std::enable_if_t<!std::is_const_v<T2>>>
1272    operator Buffer<std::add_const_t<T2>, Dims, InClassDimStorage> &() & {
1273        return as_const();
1274 }
1275
1276 /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<void>& when
1277 * passing arguments */
1278 template<typename TVoid,
1279 typename T2 = T,
1280 typename = std::enable_if_t<std::is_same_v<TVoid, void> &&
1281 !std::is_void_v<T2> &&
1282 !std::is_const_v<T2>>>
1283    operator Buffer<TVoid, Dims, InClassDimStorage> &() & {
1284        return as<TVoid, Dims>();
1285 }
1286
1287 /** Add some syntactic sugar to allow autoconversion from Buffer<const T> to Buffer<const void>& when
1288 * passing arguments */
1289 template<typename TVoid,
1290 typename T2 = T,
1291 typename = std::enable_if_t<std::is_same_v<TVoid, void> &&
1292 !std::is_void_v<T2> &&
1293 std::is_const_v<T2>>>
1294    operator Buffer<const TVoid, Dims, InClassDimStorage> &() & {
1295        return as<const TVoid, Dims>();
1296    }
1297
1298 /** Conventional names for the first three dimensions. */
1299 // @{
1300 int width() const {
1301 return (dimensions() > 0) ? dim(0).extent() : 1;
1302 }
1303 int height() const {
1304 return (dimensions() > 1) ? dim(1).extent() : 1;
1305 }
1306 int channels() const {
1307 return (dimensions() > 2) ? dim(2).extent() : 1;
1308 }
1309 // @}
1310
1311 /** Conventional names for the min and max value of each dimension */
1312 // @{
1313 int left() const {
1314 return dim(0).min();
1315 }
1316
1317 int right() const {
1318 return dim(0).max();
1319 }
1320
1321 int top() const {
1322 return dim(1).min();
1323 }
1324
1325 int bottom() const {
1326 return dim(1).max();
1327 }
1328 // @}
1329
1330 /** Make a new image which is a deep copy of this image. Use crop
1331 * or slice followed by copy to make a copy of only a portion of
1332 * the image. The new image has the same nesting order of dimensions
1333 * (e.g. channels innermost), but resets the strides to the default
1334 * (each stride is the product of the extents of the inner dimensions).
1335 * Note that this means any strides of zero get broadcast into a non-zero stride.
1336 *
1337 * Note that the returned Buffer is always of a non-const type T (ie:
1338 *
1339 * Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
1340 *
1341 * which is always safe, since we are making a deep copy. (The caller
1342 * can easily cast it back to Buffer<const T> if desired, which is
1343 * always safe and free.)
1344 */
1345    Buffer<not_const_T, Dims, InClassDimStorage> copy(void *(*allocate_fn)(size_t) = nullptr,
1346                                                      void (*deallocate_fn)(void *) = nullptr) const {
1347        Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_with_shape_of(*this, allocate_fn, deallocate_fn);
1348        dst.copy_from(*this);
1349 return dst;
1350 }
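// For example (sketch; `src` stands for any allocated Buffer<const float>):
//
//     Halide::Runtime::Buffer<float> dup = src.copy();   // deep copy with default dense strides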
1351
1352 /** Like copy(), but the copy is created in interleaved memory layout
1353 * (vs. keeping the same memory layout as the original). Requires that 'this'
1354 * has exactly 3 dimensions.
1355 */
1356    Buffer<not_const_T, Dims, InClassDimStorage> copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr,
1357                                                                     void (*deallocate_fn)(void *) = nullptr) const {
1358 static_assert(Dims == AnyDims || Dims == 3);
1359 assert(dimensions() == 3);
1360        Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_interleaved(nullptr, width(), height(), channels());
1361        dst.set_min(min(0), min(1), min(2));
1362 dst.allocate(allocate_fn, deallocate_fn);
1363 dst.copy_from(*this);
1364 return dst;
1365 }
1366
1367 /** Like copy(), but the copy is created in planar memory layout
1368 * (vs. keeping the same memory layout as the original).
1369 */
1370    Buffer<not_const_T, Dims, InClassDimStorage> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
1371                                                                void (*deallocate_fn)(void *) = nullptr) const {
1372 std::vector<int> mins, extents;
1373 const int dims = dimensions();
1374 mins.reserve(dims);
1375 extents.reserve(dims);
1376 for (int d = 0; d < dims; ++d) {
1377 mins.push_back(dim(d).min());
1378 extents.push_back(dim(d).extent());
1379 }
1380        Buffer<not_const_T, Dims, InClassDimStorage> dst(nullptr, extents);
1381        dst.set_min(mins);
1382 dst.allocate(allocate_fn, deallocate_fn);
1383 dst.copy_from(*this);
1384 return dst;
1385 }
1386
1387 /** Make a copy of the Buffer which shares the underlying host and/or device
1388 * allocations as the existing Buffer. This is purely syntactic sugar for
1389 * cases where you have a const reference to a Buffer but need a temporary
1390 * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
1391 * inline way to create a temporary. \code
1392 * void call_my_func(const Buffer<const uint8_t>& input) {
1393 * my_func(input.alias(), output);
1394 * }\endcode
1395 */
1396    Buffer<T, Dims, InClassDimStorage> alias() const {
1397        return *this;
1398 }
1399
1400 /** Fill a Buffer with the values at the same coordinates in
1401 * another Buffer. Restricts itself to coordinates contained
1402 * within the intersection of the two buffers. If the two Buffers
1403 * are not in the same coordinate system, you will need to
1404 * translate the argument Buffer first. E.g. if you're blitting a
1405 * sprite onto a framebuffer, you'll want to translate the sprite
1406 * to the correct location first like so: \code
1407 * framebuffer.copy_from(sprite.translated({x, y})); \endcode
1408 */
1409 template<typename T2, int D2, int S2>
1410    void copy_from(Buffer<T2, D2, S2> src) {
1411        static_assert(!std::is_const_v<T>, "Cannot call copy_from() on a Buffer<const T>");
1412 assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
1413 assert(!src.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");
1414
1415        Buffer<T, Dims, InClassDimStorage> dst(*this);
1416
1417 static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2);
1418 assert(src.dimensions() == dst.dimensions());
1419
1420 // Trim the copy to the region in common
1421 const int d = dimensions();
1422 for (int i = 0; i < d; i++) {
1423 int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
1424 int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
1425 if (max_coord < min_coord) {
1426 // The buffers do not overlap.
1427 return;
1428 }
1429 dst.crop(i, min_coord, max_coord - min_coord + 1);
1430 src.crop(i, min_coord, max_coord - min_coord + 1);
1431 }
1432
1433 // If T is void, we need to do runtime dispatch to an
1434 // appropriately-typed lambda. We're copying, so we only care
1435 // about the element size. (If not, this should optimize away
1436 // into a static dispatch to the right-sized copy.)
1437 if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
1438 using MemType = uint8_t;
1439 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1440 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1441 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1442 } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
1443 using MemType = uint16_t;
1444 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1445 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1446 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1447 } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
1448 using MemType = uint32_t;
1449 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1450 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1451 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1452 } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
1453 using MemType = uint64_t;
1454 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1455 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1456 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1457 } else {
1458 assert(false && "type().bytes() must be 1, 2, 4, or 8");
1459 }
1460 set_host_dirty();
1461 }
1462
1463 /** Make an image that refers to a sub-range of this image along
1464 * the given dimension. Asserts that the crop region is within
1465 * the existing bounds: you cannot "crop outwards", even if you know there
1466 * is valid Buffer storage (e.g. because you already cropped inwards). */
1467 Buffer<T, Dims, InClassDimStorage> cropped(int d, int min, int extent) const {
1468 // Make a fresh copy of the underlying buffer (but not a fresh
1469 // copy of the allocation, if there is one).
1470        Buffer<T, Dims, InClassDimStorage> im = *this;
1471
1472        // This guarantees the preexisting device ref is dropped if the
1473 // device_crop call fails and maintains the buffer in a consistent
1474 // state.
1475 im.device_deallocate();
1476
1477 im.crop_host(d, min, extent);
1478 if (buf.device_interface != nullptr) {
1479 complete_device_crop(im);
1480 }
1481 return im;
1482 }
1483
1484 /** Crop an image in-place along the given dimension. This does
1485 * not move any data around in memory - it just changes the min
1486 * and extent of the given dimension. */
1487 void crop(int d, int min, int extent) {
1488 // An optimization for non-device buffers. For the device case,
1489 // a temp buffer is required, so reuse the not-in-place version.
1490 // TODO(zalman|abadams): Are nop crops common enough to special
1491 // case the device part of the if to do nothing?
1492 if (buf.device_interface != nullptr) {
1493 *this = cropped(d, min, extent);
1494 } else {
1495 crop_host(d, min, extent);
1496 }
1497 }
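// For example (sketch; `im` stands for any allocated 2-D Buffer): restrict the view to a
// 100-wide window of dimension 0 starting at coordinate 10, without copying any data:
//
//     auto window = im.cropped(0, 10, 100);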
1498
1499 /** Make an image that refers to a sub-rectangle of this image along
1500 * the first N dimensions. Asserts that the crop region is within
1501 * the existing bounds. The cropped image may drop any device handle
1502 * if the device_interface cannot accomplish the crop in-place. */
1503 Buffer<T, Dims, InClassDimStorage> cropped(const std::vector<std::pair<int, int>> &rect) const {
1504 // Make a fresh copy of the underlying buffer (but not a fresh
1505 // copy of the allocation, if there is one).
1506        Buffer<T, Dims, InClassDimStorage> im = *this;
1507
1508        // This guarantees the preexisting device ref is dropped if the
1509 // device_crop call fails and maintains the buffer in a consistent
1510 // state.
1511 im.device_deallocate();
1512
1513 im.crop_host(rect);
1514 if (buf.device_interface != nullptr) {
1515 complete_device_crop(im);
1516 }
1517 return im;
1518 }
1519
1520 /** Crop an image in-place along the first N dimensions. This does
1521 * not move any data around in memory, nor does it free memory. It
1522 * just rewrites the min/extent of each dimension to refer to a
1523 * subregion of the same allocation. */
1524 void crop(const std::vector<std::pair<int, int>> &rect) {
1525 // An optimization for non-device buffers. For the device case,
1526 // a temp buffer is required, so reuse the not-in-place version.
1527 // TODO(zalman|abadams): Are nop crops common enough to special
1528 // case the device part of the if to do nothing?
1529 if (buf.device_interface != nullptr) {
1530 *this = cropped(rect);
1531 } else {
1532 crop_host(rect);
1533 }
1534 }
1535
1536    /** Make an image which refers to the same data using
1537 * translated coordinates in the given dimension. Positive values
1538 * move the image data to the right or down relative to the
1539 * coordinate system. Drops any device handle. */
1540    Buffer<T, Dims, InClassDimStorage> translated(int d, int dx) const {
1541        Buffer<T, Dims, InClassDimStorage> im = *this;
1542        im.translate(d, dx);
1543 return im;
1544 }
1545
1546 /** Translate an image in-place along one dimension by changing
1547 * how it is indexed. Does not move any data around in memory. */
1548 void translate(int d, int delta) {
1549 assert(d >= 0 && d < this->dimensions());
1550 device_deallocate();
1551 buf.dim[d].min += delta;
1552 }
1553
1554 /** Make an image which refers to the same data translated along
1555 * the first N dimensions. */
1556 Buffer<T, Dims, InClassDimStorage> translated(const std::vector<int> &delta) const {
1557        Buffer<T, Dims, InClassDimStorage> im = *this;
1558        im.translate(delta);
1559 return im;
1560 }
1561
1562 /** Translate an image along the first N dimensions by changing
1563 * how it is indexed. Does not move any data around in memory. */
1564 void translate(const std::vector<int> &delta) {
1565 device_deallocate();
1566 assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
1567 int limit = (int)delta.size();
1568 assert(limit <= dimensions());
1569 for (int i = 0; i < limit; i++) {
1570 translate(i, delta[i]);
1571 }
1572 }
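// For example (sketch): sprite.translated({dx, dy}) yields a view of `sprite` whose
// mins are shifted by (dx, dy); the underlying data is untouched.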
1573
1574 /** Set the min coordinate of an image in the first N dimensions. */
1575 // @{
1576 void set_min(const std::vector<int> &mins) {
1577 assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
1578 device_deallocate();
1579 for (size_t i = 0; i < mins.size(); i++) {
1580 buf.dim[i].min = mins[i];
1581 }
1582 }
1583
1584 template<typename... Args>
1585 void set_min(Args... args) {
1586 set_min(std::vector<int>{args...});
1587 }
1588 // @}
1589
1590 /** Test if a given coordinate is within the bounds of an image. */
1591 // @{
1592 bool contains(const std::vector<int> &coords) const {
1593 assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
1594 for (size_t i = 0; i < coords.size(); i++) {
1595 if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
1596 return false;
1597 }
1598 }
1599 return true;
1600 }
1601
1602 template<typename... Args>
1603 bool contains(Args... args) const {
1604 return contains(std::vector<int>{args...});
1605 }
1606 // @}
1607
1608 /** Make a buffer which refers to the same data in the same layout
1609 * using a swapped indexing order for the dimensions given. So
1610 * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
1611 * strongly that A.address_of(i, j) == B.address_of(j, i). */
1612    Buffer<T, Dims, InClassDimStorage> transposed(int d1, int d2) const {
1613        Buffer<T, Dims, InClassDimStorage> im = *this;
1614        im.transpose(d1, d2);
1615 return im;
1616 }
1617
1618 /** Transpose a buffer in-place by changing how it is indexed. For
1619 * example, transpose(0, 1) on a two-dimensional buffer means that
1620 * the value referred to by coordinates (i, j) is now reached at
1621 * the coordinates (j, i), and vice versa. This is done by
1622 * reordering the per-dimension metadata rather than by moving
1623 * data around in memory, so other views of the same memory will
1624 * not see the data as having been transposed. */
1625 void transpose(int d1, int d2) {
1626 assert(d1 >= 0 && d1 < this->dimensions());
1627 assert(d2 >= 0 && d2 < this->dimensions());
1628 std::swap(buf.dim[d1], buf.dim[d2]);
1629 }
1630
1631 /** A generalized transpose: instead of swapping two dimensions,
1632 * pass a vector that lists each dimension index exactly once, in
1633 * the desired order. This does not move any data around in memory
1634 * - it just permutes how it is indexed. */
1635 void transpose(const std::vector<int> &order) {
1636 assert((int)order.size() == dimensions());
1637 if (dimensions() < 2) {
1638 // My, that was easy
1639 return;
1640 }
1641
1642 std::vector<int> order_sorted = order;
1643 for (size_t i = 1; i < order_sorted.size(); i++) {
1644 for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
1645 std::swap(order_sorted[j], order_sorted[j - 1]);
1646 transpose(j, j - 1);
1647 }
1648 }
1649 }
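/** Example (illustrative sketch, not part of the original header): the vector
 * form of transpose only permutes the per-dimension metadata. Mirroring the
 * make_interleaved() note below, transposing a (c, x, y) buffer with
 * {2, 0, 1} yields (x, y, c) indexing over the same memory:
 *
 \code
 Buffer<float, 3> chunky(3, 640, 480);             // indexed (c, x, y)
 Buffer<float, 3> view = chunky.transposed({2, 0, 1});
 assert(&view(10, 20, 1) == &chunky(1, 10, 20));   // now indexed (x, y, c)
 \endcode
 */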
1650
1651 /** Make a buffer which refers to the same data in the same
1652 * layout using a different ordering of the dimensions. */
1653 Buffer<T, Dims, InClassDimStorage> transposed(const std::vector<int> &order) const {
1654 Buffer<T, Dims, InClassDimStorage> im = *this;
1655 im.transpose(order);
1656 return im;
1657 }
1658
1659 /** Make a lower-dimensional buffer that refers to one slice of
1660 * this buffer. */
1661 Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1662 sliced(int d, int pos) const {
1663 static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1664 assert(dimensions() > 0);
1665
1666 Buffer<T, AnyDims> im = *this;
1667
1668 // This guarantees the preexisting device ref is dropped if the
1669 // device_slice call fails and maintains the buffer in a consistent
1670 // state.
1671 im.device_deallocate();
1672
1673 im.slice_host(d, pos);
1674 if (buf.device_interface != nullptr) {
1675 complete_device_slice(im, d, pos);
1676 }
1677 return im;
1678 }
1679
1680 /** Make a lower-dimensional buffer that refers to one slice of this
1681 * buffer at the dimension's minimum. */
1682 Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1683 sliced(int d) const {
1684 static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1685 assert(dimensions() > 0);
1686
1687 return sliced(d, dim(d).min());
1688 }
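/** Example (illustrative sketch, not part of the original header): slicing out
 * one channel of a 3D image yields a 2D view of the same storage:
 *
 \code
 Buffer<float, 3> im(100, 100, 3);
 Buffer<float, 2> green = im.sliced(2, 1);   // fix dimension 2 at c == 1
 assert(&green(7, 9) == &im(7, 9, 1));
 \endcode
 */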
1689
1690 /** Rewrite the buffer to refer to a single lower-dimensional
1691 * slice of itself along the given dimension at the given
1692 * coordinate. Does not move any data around or free the original
1693 * memory, so other views of the same data are unaffected. Can
1694 * only be called on a Buffer with dynamic dimensionality. */
1695 void slice(int d, int pos) {
1696 static_assert(Dims == AnyDims, "Cannot call slice() on a Buffer with static dimensionality.");
1697 assert(dimensions() > 0);
1698
1699 // An optimization for non-device buffers. For the device case,
1700 // a temp buffer is required, so reuse the not-in-place version.
1701 // TODO(zalman|abadams): Are nop slices common enough to special
1702 // case the device part of the if to do nothing?
1703 if (buf.device_interface != nullptr) {
1704 *this = sliced(d, pos);
1705 } else {
1706 slice_host(d, pos);
1707 }
1708 }
1709
1710 /** Slice a buffer in-place at the dimension's minimum. */
1711 void slice(int d) {
1712 slice(d, dim(d).min());
1713 }
1714
1715 /** Make a new buffer that views this buffer as a single slice in a
1716 * higher-dimensional space. The new dimension has extent one and
1717 * the given min. This operation is the opposite of slice. As an
1718 * example, the following condition is true:
1719 *
1720 \code
1721 im2 = im.embedded(1, 17);
1722 &im(x, y, c) == &im2(x, 17, y, c);
1723 \endcode
1724 */
1725 Buffer<T, (Dims == AnyDims ? AnyDims : Dims + 1)>
1726 embedded(int d, int pos = 0) const {
1727 Buffer<T, AnyDims> im = *this;
1728 im.embed(d, pos);
1729 return im;
1730 }
1731
1732 /** Embed a buffer in-place, increasing the
1733 * dimensionality. */
1734 void embed(int d, int pos = 0) {
1735 static_assert(Dims == AnyDims, "Cannot call embed() on a Buffer with static dimensionality.");
1736 assert(d >= 0 && d <= dimensions());
1737 add_dimension();
1738 translate(dimensions() - 1, pos);
1739 for (int i = dimensions() - 1; i > d; i--) {
1740 transpose(i, i - 1);
1741 }
1742 }
1743
1744 /** Add a new dimension with a min of zero and an extent of
1745 * one. The stride is the extent of the outermost dimension times
1746 * its stride. The new dimension is the last dimension. This is a
1747 * special case of embed. */
1748 void add_dimension() {
1749 static_assert(Dims == AnyDims, "Cannot call add_dimension() on a Buffer with static dimensionality.");
1750 const int dims = buf.dimensions;
1751 buf.dimensions++;
1752 if (buf.dim != shape) {
1753 // We're already on the heap. Reallocate.
1754 halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
1755 for (int i = 0; i < dims; i++) {
1756 new_shape[i] = buf.dim[i];
1757 }
1758 delete[] buf.dim;
1759 buf.dim = new_shape;
1760 } else if (dims == InClassDimStorage) {
1761 // Transition from the in-class storage to the heap
1762 make_shape_storage(buf.dimensions);
1763 for (int i = 0; i < dims; i++) {
1764 buf.dim[i] = shape[i];
1765 }
1766 } else {
1767 // We still fit in the class
1768 }
1769 buf.dim[dims] = {0, 1, 0};
1770 if (dims == 0) {
1771 buf.dim[dims].stride = 1;
1772 } else {
1773 buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
1774 }
1775 }
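/** Example (illustrative sketch, not part of the original header): the new
 * dimension has extent one and a stride equal to the span of the buffer so
 * far:
 *
 \code
 Buffer<int> b(8, 8);   // dynamic dimensionality
 b.add_dimension();
 assert(b.dimensions() == 3);
 assert(b.dim(2).extent() == 1 && b.dim(2).stride() == 64);
 \endcode
 */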
1776
1777 /** Add a new dimension with a min of zero, an extent of one, and
1778 * the specified stride. The new dimension is the last
1779 * dimension. This is a special case of embed. */
1780 void add_dimension_with_stride(int s) {
1781 add_dimension();
1782 buf.dim[buf.dimensions - 1].stride = s;
1783 }
1784
1785 /** Methods for managing any GPU allocation. */
1786 // @{
1787 // Set the host dirty flag. Called by every operator()
1788 // access. Must be inlined so it can be hoisted out of loops.
1789 HALIDE_ALWAYS_INLINE
1790 void set_host_dirty(bool v = true) {
1791 assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
1792 buf.set_host_dirty(v);
1793 }
1794
1795 // Check if the device allocation is dirty. Called by
1796 // set_host_dirty, which is called by every accessor. Must be
1797 // inlined so it can be hoisted out of loops.
1798 HALIDE_ALWAYS_INLINE
1799 bool device_dirty() const {
1800 return buf.device_dirty();
1801 }
1802
1803 bool host_dirty() const {
1804 return buf.host_dirty();
1805 }
1806
1807 void set_device_dirty(bool v = true) {
1808 assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
1809 buf.set_device_dirty(v);
1810 }
1811
1812 int copy_to_host(void *ctx = nullptr) {
1813 if (device_dirty()) {
1814 return buf.device_interface->copy_to_host(ctx, &buf);
1815 }
1816 return halide_error_code_success;
1817 }
1818
1819 int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1820 if (host_dirty()) {
1821 return device_interface->copy_to_device(ctx, &buf, device_interface);
1822 }
1823 return halide_error_code_success;
1824 }
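/** Example (illustrative sketch, not part of the original header; `my_pipeline`
 * and `gpu_interface` are hypothetical stand-ins for a compiled pipeline and a
 * halide_device_interface_t pointer): the dirty flags drive copies between
 * host and device:
 *
 \code
 Buffer<float, 2> in(256, 256), out(256, 256);
 in.fill(1.0f);                      // writes set host_dirty
 in.copy_to_device(gpu_interface);   // copies because the host is dirty
 my_pipeline(in, out);               // may leave out dirty on the device
 out.copy_to_host();                 // copies back before reading on the host
 \endcode
 */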
1825
1826 int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1827 return device_interface->device_malloc(ctx, &buf, device_interface);
1828 }
1829
1830 int device_free(void *ctx = nullptr) {
1831 if (dev_ref_count) {
1832 assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated &&
1833 "Can't call device_free on an unmanaged or wrapped native device handle. "
1834 "Free the source allocation or call device_detach_native instead.");
1835 // Multiple people may be holding onto this dev field
1836 assert(dev_ref_count->count == 1 &&
1837 "Multiple Halide::Runtime::Buffer objects share this device "
1838 "allocation. Freeing it would create dangling references. "
1839 "Don't call device_free on Halide buffers that you have copied or "
1840 "passed by value.");
1841 }
1842 int ret = halide_error_code_success;
1843 if (buf.device_interface) {
1844 ret = buf.device_interface->device_free(ctx, &buf);
1845 }
1846 if (dev_ref_count) {
1847 delete dev_ref_count;
1848 dev_ref_count = nullptr;
1849 }
1850 return ret;
1851 }
1852
1853 int device_wrap_native(const struct halide_device_interface_t *device_interface,
1854 uint64_t handle, void *ctx = nullptr) {
1855 assert(device_interface);
1856 dev_ref_count = new DeviceRefCount;
1857 dev_ref_count->ownership = BufferDeviceOwnership::WrappedNative;
1858 return device_interface->wrap_native(ctx, &buf, handle, device_interface);
1859 }
1860
1861 int device_detach_native(void *ctx = nullptr) {
1862 assert(dev_ref_count &&
1863 dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative &&
1864 "Only call device_detach_native on buffers wrapping a native "
1865 "device handle via device_wrap_native. This buffer was allocated "
1866 "using device_malloc, or is unmanaged. "
1867 "Call device_free or free the original allocation instead.");
1868 // Multiple people may be holding onto this dev field
1869 assert(dev_ref_count->count == 1 &&
1870 "Multiple Halide::Runtime::Buffer objects share this device "
1871 "allocation. Freeing it could create dangling references. "
1872 "Don't call device_detach_native on Halide buffers that you "
1873 "have copied or passed by value.");
1874 int ret = halide_error_code_success;
1875 if (buf.device_interface) {
1876 ret = buf.device_interface->detach_native(ctx, &buf);
1877 }
1878 delete dev_ref_count;
1879 dev_ref_count = nullptr;
1880 return ret;
1881 }
1882
1883 int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1884 return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
1885 }
1886
1887 int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1888 if (dev_ref_count) {
1889 assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost &&
1890 "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
1891 "Free the source allocation or call device_detach_native instead.");
1892 // Multiple people may be holding onto this dev field
1893 assert(dev_ref_count->count == 1 &&
1894 "Multiple Halide::Runtime::Buffer objects share this device "
1895 "allocation. Freeing it would create dangling references. "
1896 "Don't call device_and_host_free on Halide buffers that you have copied or "
1897 "passed by value.");
1898 }
1899 int ret = halide_error_code_success;
1900 if (buf.device_interface) {
1901 ret = buf.device_interface->device_and_host_free(ctx, &buf);
1902 }
1903 if (dev_ref_count) {
1904 delete dev_ref_count;
1905 dev_ref_count = nullptr;
1906 }
1907 return ret;
1908 }
1909
1910 int device_sync(void *ctx = nullptr) {
1911 return buf.device_sync(ctx);
1912 }
1913
1914 bool has_device_allocation() const {
1915 return buf.device != 0;
1916 }
1917
1918 /** Return the method by which the device field is managed. */
1919 BufferDeviceOwnership device_ownership() const {
1920 if (dev_ref_count == nullptr) {
1921 return BufferDeviceOwnership::Unmanaged;
1922 }
1923 return dev_ref_count->ownership;
1924 }
1925 // @}
1926
1927 /** If you use the (x, y, c) indexing convention, then Halide
1928 * Buffers are stored planar by default. This function constructs
1929 * an interleaved RGB or RGBA image that can still be indexed
1930 * using (x, y, c). Passing it to a generator requires that the
1931 * generator has been compiled with support for interleaved (also
1932 * known as packed or chunky) memory layouts. */
1933 static Buffer<void, Dims, InClassDimStorage> make_interleaved(halide_type_t t, int width, int height, int channels) {
1934 static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1935 Buffer<void, Dims, InClassDimStorage> im(t, channels, width, height);
1936 // Note that this is equivalent to calling transpose({2, 0, 1}),
1937 // but slightly more efficient.
1938 im.transpose(0, 1);
1939 im.transpose(1, 2);
1940 return im;
1941 }
1942
1943 /** If you use the (x, y, c) indexing convention, then Halide
1944 * Buffers are stored planar by default. This function constructs
1945 * an interleaved RGB or RGBA image that can still be indexed
1946 * using (x, y, c). Passing it to a generator requires that the
1947 * generator has been compiled with support for interleaved (also
1948 * known as packed or chunky) memory layouts. */
1949 static Buffer<T, Dims, InClassDimStorage> make_interleaved(int width, int height, int channels) {
1950 return make_interleaved(static_halide_type(), width, height, channels);
1951 }
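/** Example (illustrative sketch, not part of the original header): in an
 * interleaved buffer the channel dimension has stride 1, so the channels of a
 * pixel sit next to each other in memory:
 *
 \code
 auto im = Buffer<uint8_t, 3>::make_interleaved(640, 480, 3);
 assert(im.dim(2).stride() == 1);   // c is innermost in memory
 assert(im.dim(0).stride() == 3);   // x steps over one whole pixel
 \endcode
 */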
1952
1953 /** Wrap an existing interleaved image. */
1954 static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage>
1955 make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
1956 static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1957 Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> im(t, data, channels, width, height);
1958 im.transpose(0, 1);
1959 im.transpose(1, 2);
1960 return im;
1961 }
1962
1963 /** Wrap an existing interleaved image. */
1964 static Buffer<T, Dims, InClassDimStorage> make_interleaved(T *data, int width, int height, int channels) {
1965 return make_interleaved(static_halide_type(), data, width, height, channels);
1966 }
1967
1968 /** Make a zero-dimensional Buffer */
1969 static Buffer<T, Dims, InClassDimStorage> make_scalar() {
1970 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1971 Buffer<T, AnyDims, InClassDimStorage> buf(1);
1972 buf.slice(0, 0);
1973 return buf;
1974 }
1975
1976 /** Make a zero-dimensional Buffer */
1977 static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> make_scalar(halide_type_t t) {
1978 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1979 Buffer<add_const_if_T_is_const<void>, AnyDims, InClassDimStorage> buf(t, 1);
1980 buf.slice(0, 0);
1981 return buf;
1982 }
1983
1984 /** Make a zero-dimensional Buffer that points to non-owned, existing data */
1985 static Buffer<T, Dims, InClassDimStorage> make_scalar(T *data) {
1986 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1987 Buffer<T, AnyDims, InClassDimStorage> buf(data, 1);
1988 buf.slice(0, 0);
1989 return buf;
1990 }
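/** Example (illustrative sketch, not part of the original header): a
 * zero-dimensional Buffer holds a single value and is accessed with no
 * coordinates:
 *
 \code
 auto s = Buffer<int>::make_scalar();
 s() = 7;
 assert(s() == 7 && s.dimensions() == 0);
 \endcode
 */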
1991
1992 /** Make a buffer with the same shape and memory nesting order as
1993 * another buffer. It may have a different type. */
1994 template<typename T2, int D2, int S2>
1995 // NOLINTNEXTLINE(performance-unnecessary-value-param)
1996 static Buffer<T, Dims, InClassDimStorage> make_with_shape_of(Buffer<T2, D2, S2> src,
1997 void *(*allocate_fn)(size_t) = nullptr,
1998 void (*deallocate_fn)(void *) = nullptr) {
1999 // Note that src is taken by value because its dims are mutated
2000 // in-place by the helper. Do not change to taking it by reference.
2001 static_assert(Dims == D2 || Dims == AnyDims);
2002 const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
2003 return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
2004 allocate_fn, deallocate_fn);
2005 }
2006
2007private:
2008 static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
2009 int dimensions,
2010 halide_dimension_t *shape,
2011 void *(*allocate_fn)(size_t),
2012 void (*deallocate_fn)(void *)) {
2013 // Reorder the dimensions of src to have strides in increasing order
2014 std::vector<int> swaps;
2015 for (int i = dimensions - 1; i > 0; i--) {
2016 for (int j = i; j > 0; j--) {
2017 if (shape[j - 1].stride > shape[j].stride) {
2018 std::swap(shape[j - 1], shape[j]);
2019 swaps.push_back(j);
2020 }
2021 }
2022 }
2023
2024 // Rewrite the strides to be dense (this messes up src, which
2025 // is why we took it by value).
2026 for (int i = 0; i < dimensions; i++) {
2027 if (i == 0) {
2028 shape[i].stride = 1;
2029 } else {
2030 shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
2031 }
2032 }
2033
2034 // Undo the dimension reordering
2035 while (!swaps.empty()) {
2036 int j = swaps.back();
2037 std::swap(shape[j - 1], shape[j]);
2038 swaps.pop_back();
2039 }
2040
2041 // Use an explicit runtime type, and make dst a Buffer<void>, to allow
2042 // using this method with Buffer<void> for either src or dst.
2043 Buffer<> dst(dst_type, nullptr, dimensions, shape);
2044 dst.allocate(allocate_fn, deallocate_fn);
2045
2046 return dst;
2047 }
2048
2049 template<typename... Args>
2050 HALIDE_ALWAYS_INLINE
2051 ptrdiff_t
2052 offset_of(int d, int first, Args... rest) const {
2053#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2054 assert(first >= this->buf.dim[d].min);
2055 assert(first < this->buf.dim[d].min + this->buf.dim[d].extent);
2056#endif
2057 return offset_of(d + 1, rest...) + (ptrdiff_t)this->buf.dim[d].stride * (first - this->buf.dim[d].min);
2058 }
2059
2060 HALIDE_ALWAYS_INLINE
2061 ptrdiff_t offset_of(int d) const {
2062 return 0;
2063 }
2064
2065 template<typename... Args>
2066 HALIDE_ALWAYS_INLINE storage_T *address_of(Args... args) const {
2067 if (T_is_void) {
2068 return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
2069 } else {
2070 return (storage_T *)(this->buf.host) + offset_of(0, args...);
2071 }
2072 }
2073
2074 HALIDE_ALWAYS_INLINE
2075 ptrdiff_t offset_of(const int *pos) const {
2076 ptrdiff_t offset = 0;
2077 for (int i = this->dimensions() - 1; i >= 0; i--) {
2078#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2079 assert(pos[i] >= this->buf.dim[i].min);
2080 assert(pos[i] < this->buf.dim[i].min + this->buf.dim[i].extent);
2081#endif
2082 offset += (ptrdiff_t)this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
2083 }
2084 return offset;
2085 }
2086
2087 HALIDE_ALWAYS_INLINE
2088 storage_T *address_of(const int *pos) const {
2089 if (T_is_void) {
2090 return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
2091 } else {
2092 return (storage_T *)this->buf.host + offset_of(pos);
2093 }
2094 }
2095
2096public:
2097 /** Get a pointer to the address of the min coordinate. */
2098 T *data() const {
2099 return (T *)(this->buf.host);
2100 }
2101
2102 /** Access elements. Use im(...) to get a reference to an element,
2103 * and use &im(...) to get the address of an element. If you pass
2104 * fewer arguments than the buffer has dimensions, the rest are
2105 * treated as their min coordinate. The non-const versions set the
2106 * host_dirty flag to true.
2107 */
2108 //@{
2109 template<typename... Args,
2110 typename = std::enable_if_t<AllInts<Args...>::value>>
2111 HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
2112 static_assert(!T_is_void,
2113 "Cannot use operator() on Buffer<void> types");
2114 constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2115 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2116 assert(!device_dirty());
2117 return *((const not_void_T *)(address_of(first, rest...)));
2118 }
2119
2120 HALIDE_ALWAYS_INLINE
2121 const not_void_T &operator()() const {
2122 static_assert(!T_is_void,
2123 "Cannot use operator() on Buffer<void> types");
2124 constexpr int expected_dims = 0;
2125 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2126 assert(!device_dirty());
2127 return *((const not_void_T *)(data()));
2128 }
2129
2130 HALIDE_ALWAYS_INLINE
2131 const not_void_T &
2132 operator()(const int *pos) const {
2133 static_assert(!T_is_void,
2134 "Cannot use operator() on Buffer<void> types");
2135 assert(!device_dirty());
2136 return *((const not_void_T *)(address_of(pos)));
2137 }
2138
2139 template<typename... Args,
2140 typename = std::enable_if_t<AllInts<Args...>::value>>
2141 HALIDE_ALWAYS_INLINE not_void_T &operator()(int first, Args... rest) {
2142 static_assert(!T_is_void,
2143 "Cannot use operator() on Buffer<void> types");
2144 constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2145 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2146 set_host_dirty();
2147 return *((not_void_T *)(address_of(first, rest...)));
2148 }
2149
2150 HALIDE_ALWAYS_INLINE
2151 not_void_T &
2152 operator()() {
2153 static_assert(!T_is_void,
2154 "Cannot use operator() on Buffer<void> types");
2155 constexpr int expected_dims = 0;
2156 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2157 set_host_dirty();
2158 return *((not_void_T *)(data()));
2159 }
2160
2161 HALIDE_ALWAYS_INLINE
2162 not_void_T &
2163 operator()(const int *pos) {
2164 static_assert(!T_is_void,
2165 "Cannot use operator() on Buffer<void> types");
2166 set_host_dirty();
2167 return *((not_void_T *)(address_of(pos)));
2168 }
2169 // @}
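/** Example (illustrative sketch, not part of the original header): reading,
 * writing, and taking the address of an element with operator():
 *
 \code
 Buffer<uint16_t, 2> im(4, 4);
 im(1, 2) = 100;            // write; sets host_dirty
 uint16_t v = im(1, 2);     // read back
 uint16_t *p = &im(1, 2);   // address of that element
 assert(v == 100 && *p == 100);
 \endcode
 */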
2170
2171 /** Tests that all values in this buffer are equal to val. */
2172 bool all_equal(not_void_T val) const {
2173 bool all_equal = true;
2174 for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
2175 return all_equal;
2176 }
2177
2178 Buffer<T, Dims, InClassDimStorage> &fill(not_void_T val) {
2179 set_host_dirty();
2180 for_each_value([=](T &v) { v = val; });
2181 return *this;
2182 }
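/** Example (illustrative sketch, not part of the original header): filling
 * with a constant and checking the result:
 *
 \code
 Buffer<float, 2> im(32, 32);
 im.fill(0.5f);
 assert(im.all_equal(0.5f));
 \endcode
 */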
2183
2184private:
2185 /** Helper functions for for_each_value. */
2186 // @{
2187 template<int N>
2188 struct for_each_value_task_dim {
2189 std::ptrdiff_t extent;
2190 std::ptrdiff_t stride[N];
2191 };
2192
2193 // Given an array of strides, and a bunch of pointers to pointers
2194 // (all of different types), advance the pointers using the
2195 // strides.
2196 template<typename Ptr, typename... Ptrs>
2197 HALIDE_ALWAYS_INLINE static void advance_ptrs(const std::ptrdiff_t *stride, Ptr &ptr, Ptrs &...ptrs) {
2198 ptr += *stride;
2199 advance_ptrs(stride + 1, ptrs...);
2200 }
2201
2202 HALIDE_ALWAYS_INLINE
2203 static void advance_ptrs(const std::ptrdiff_t *) {
2204 }
2205
2206 template<typename Fn, typename Ptr, typename... Ptrs>
2207 HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
2208 const for_each_value_task_dim<sizeof...(Ptrs) + 1> *t, Ptr ptr, Ptrs... ptrs) {
2209 if (d == 0) {
2210 if (innermost_strides_are_one) {
2211 Ptr end = ptr + t[0].extent;
2212 while (ptr != end) {
2213 f(*ptr++, (*ptrs++)...);
2214 }
2215 } else {
2216 for (std::ptrdiff_t i = t[0].extent; i != 0; i--) {
2217 f(*ptr, (*ptrs)...);
2218 advance_ptrs(t[0].stride, ptr, ptrs...);
2219 }
2220 }
2221 } else {
2222 for (std::ptrdiff_t i = t[d].extent; i != 0; i--) {
2223 for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptr, ptrs...);
2224 advance_ptrs(t[d].stride, ptr, ptrs...);
2225 }
2226 }
2227 }
2228
2229 // Return pair is <new_dimensions, innermost_strides_are_one>
2230 template<int N>
2231 HALIDE_NEVER_INLINE static std::pair<int, bool> for_each_value_prep(for_each_value_task_dim<N> *t,
2232 const halide_buffer_t **buffers) {
2233 const int dimensions = buffers[0]->dimensions;
2234 assert(dimensions > 0);
2235
2236 // Check the buffers all have clean host allocations
2237 for (int i = 0; i < N; i++) {
2238 if (buffers[i]->device) {
2239 assert(buffers[i]->host &&
2240 "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
2241 assert(!buffers[i]->device_dirty() &&
2242 "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
2243 } else {
2244 assert(buffers[i]->host &&
2245 "Buffer passed to for_each_value has no host or device allocation");
2246 }
2247 }
2248
2249 // Extract the strides in all the dimensions
2250 for (int i = 0; i < dimensions; i++) {
2251 for (int j = 0; j < N; j++) {
2252 assert(buffers[j]->dimensions == dimensions);
2253 assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
2254 buffers[j]->dim[i].min == buffers[0]->dim[i].min);
2255 const int s = buffers[j]->dim[i].stride;
2256 t[i].stride[j] = s;
2257 }
2258 t[i].extent = buffers[0]->dim[i].extent;
2259
2260 // Order the dimensions by stride, so that the traversal is cache-coherent.
2261 // Use the last dimension for this, because this is the source in copies.
2262 // It appears to be better to optimize read order than write order.
2263 for (int j = i; j > 0 && t[j].stride[N - 1] < t[j - 1].stride[N - 1]; j--) {
2264 std::swap(t[j], t[j - 1]);
2265 }
2266 }
2267
2268 // flatten dimensions where possible to make a larger inner
2269 // loop for autovectorization.
2270 int d = dimensions;
2271 for (int i = 1; i < d; i++) {
2272 bool flat = true;
2273 for (int j = 0; j < N; j++) {
2274 flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
2275 }
2276 if (flat) {
2277 t[i - 1].extent *= t[i].extent;
2278 for (int j = i; j < d - 1; j++) {
2279 t[j] = t[j + 1];
2280 }
2281 i--;
2282 d--;
2283 }
2284 }
2285
2286 // Note that we assert() that dimensions > 0 above
2287 // (our one-and-only caller will only call us that way)
2288 // so the unchecked access to t[0] should be safe.
2289 bool innermost_strides_are_one = true;
2290 for (int i = 0; i < N; i++) {
2291 innermost_strides_are_one &= (t[0].stride[i] == 1);
2292 }
2293
2294 return {d, innermost_strides_are_one};
2295 }
2296
2297 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2298 void for_each_value_impl(Fn &&f, Args &&...other_buffers) const {
2299 if (dimensions() > 0) {
2300 const size_t alloc_size = dimensions() * sizeof(for_each_value_task_dim<N>);
2301 Buffer<>::for_each_value_task_dim<N> *t =
2302 (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA(alloc_size);
2303 // Move the preparatory code into a non-templated helper to
2304 // save code size.
2305 const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
2306 auto [new_dims, innermost_strides_are_one] = Buffer<>::for_each_value_prep(t, buffers);
2307 if (new_dims > 0) {
2308 Buffer<>::for_each_value_helper(f, new_dims - 1,
2309 innermost_strides_are_one,
2310 t,
2311 data(), (other_buffers.data())...);
2312 return;
2313 }
2314 // else fall thru
2315 }
2316
2317 // zero-dimensional case
2318 f(*data(), (*other_buffers.data())...);
2319 }
2320 // @}
2321
2322public:
2323 /** Call a function on every value in the buffer, and the
2324 * corresponding values in some number of other buffers of the
2325 * same size. The function should take a reference, const
2326 * reference, or value of the correct type for each buffer. This
2327 * effectively lifts a function of scalars to an element-wise
2328 * function of buffers. This produces code that the compiler can
2329 * autovectorize. This is slightly cheaper than for_each_element,
2330 * because it does not need to track the coordinates.
2331 *
2332 * Note that constness of Buffers is preserved: a const Buffer<T> (for either
2333 * 'this' or the other-buffers arguments) will allow mutation of the
2334 * buffer contents, while a Buffer<const T> will not. Attempting to specify
2335 * a mutable reference for the lambda argument of a Buffer<const T>
2336 * will result in a compilation error. */
2337 // @{
2338 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2339 HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_value(Fn &&f, Args &&...other_buffers) const {
2340 for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2341 return *this;
2342 }
2343
2344 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2345 HALIDE_ALWAYS_INLINE
2346 Buffer<T, Dims, InClassDimStorage> &
2347 for_each_value(Fn &&f, Args &&...other_buffers) {
2348 for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2349 return *this;
2350 }
2351 // @}
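/** Example (illustrative sketch, not part of the original header): an
 * element-wise sum of two buffers into a third, written as a scalar lambda:
 *
 \code
 Buffer<float, 2> a(64, 64), b(64, 64), sum(64, 64);
 a.fill(1.0f);
 b.fill(2.0f);
 sum.for_each_value([](float &s, float x, float y) { s = x + y; }, a, b);
 assert(sum.all_equal(3.0f));
 \endcode
 */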
2352
2353private:
2354 // Helper functions for for_each_element
2355 struct for_each_element_task_dim {
2356 int min, max;
2357 };
2358
2359 /** If f is callable with this many args, call it. The first
2360 * argument is just to make the overloads distinct. Actual
2361 * overload selection is done using the enable_if. */
2362 template<typename Fn,
2363 typename... Args,
2364 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2365 HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
2366 f(args...);
2367 }
2368
2369 /** If the above overload is impossible, we add an outer loop over
2370 * an additional argument and try again. */
2371 template<typename Fn,
2372 typename... Args>
2373 HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
2374 for (int i = t[d].min; i <= t[d].max; i++) {
2375 for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
2376 }
2377 }
2378
2379 /** Determine the minimum number of arguments a callable can take
2380 * using the same trick. */
2381 template<typename Fn,
2382 typename... Args,
2383 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2384 HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
2385 return (int)(sizeof...(Args));
2386 }
2387
2388 /** The recursive version is only enabled up to a recursion limit
2389 * of 256. This catches callables that aren't callable with any
2390 * number of ints. */
2391 template<typename Fn,
2392 typename... Args>
2393 HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
2394 static_assert(sizeof...(args) <= 256,
2395 "Callable passed to for_each_element must accept either a const int *,"
2396 " or up to 256 ints. No such operator found. Expect infinite template recursion.");
2397 return num_args(0, std::forward<Fn>(f), 0, args...);
2398 }
2399
2400 /** A version where the callable takes a position array instead,
2401 * with compile-time recursion on the dimensionality. This
2402 * overload is preferred to the one below using the same int vs
2403 * double trick as above, but is impossible once d hits -1 using
2404 * std::enable_if. */
2405 template<int d,
2406 typename Fn,
2407 typename = std::enable_if_t<(d >= 0)>>
2408 HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2409 for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2410 for_each_element_array_helper<d - 1>(0, t, f, pos);
2411 }
2412 }
2413
2414 /** Base case for recursion above. */
2415 template<int d,
2416 typename Fn,
2417 typename = std::enable_if_t<(d < 0)>>
2418 HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2419 f(pos);
2420 }
2421
2422 /** A run-time-recursive version (instead of
2423 * compile-time-recursive) that requires the callable to take a
2424 * pointer to a position array instead. Dispatches to the
2425 * compile-time-recursive version once the dimensionality gets
2426 * small. */
2427 template<typename Fn>
2428 static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2429 if (d == -1) {
2430 f(pos);
2431 } else if (d == 0) {
2432 // Once the dimensionality gets small enough, dispatch to
2433 // a compile-time-recursive version for better codegen of
2434 // the inner loops.
2435 for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
2436 } else if (d == 1) {
2437 for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
2438 } else if (d == 2) {
2439 for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
2440 } else if (d == 3) {
2441 for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
2442 } else {
2443 for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2444 for_each_element_array(d - 1, t, f, pos);
2445 }
2446 }
2447 }
2448
2449 /** We now have two overloads for for_each_element. This one
2450 * triggers if the callable takes a const int *.
2451 */
2452 template<typename Fn,
2453 typename = decltype(std::declval<Fn>()((const int *)nullptr))>
2454 static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
2455 const int size = dims * sizeof(int);
2456 int *pos = (int *)HALIDE_ALLOCA(size);
2457 // At least one version of GCC will (incorrectly) report that pos "may be used uninitialized".
2458 // Add this memset to silence it.
2459 memset(pos, 0, size);
2460 for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
2461 }
2462
2463 /** This one triggers otherwise. It treats the callable as
2464 * something that takes some number of ints. */
2465 template<typename Fn>
2466 HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
2467 int args = num_args(0, std::forward<Fn>(f));
2468 assert(dims >= args);
2469 for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
2470 }
2471
2472 template<typename Fn>
2473 void for_each_element_impl(Fn &&f) const {
2474 for_each_element_task_dim *t =
2475 (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
2476 for (int i = 0; i < dimensions(); i++) {
2477 t[i].min = dim(i).min();
2478 t[i].max = dim(i).max();
2479 }
2480 for_each_element(0, dimensions(), t, std::forward<Fn>(f));
2481 }
2482
2483public:
2484 /** Call a function at each site in a buffer. This is likely to be
2485 * much slower than using Halide code to populate a buffer, but is
2486 * convenient for tests. If the function has more arguments than the
2487 * buffer has dimensions, the remaining arguments will be zero. If it
2488 * has fewer arguments than the buffer has dimensions then the last
2489 * few dimensions of the buffer are not iterated over. For example,
2490 * the following code exploits this to set a floating point RGB image
2491 * to red:
2492
2493 \code
2494 Buffer<float, 3> im(100, 100, 3);
2495 im.for_each_element([&](int x, int y) {
2496 im(x, y, 0) = 1.0f;
2497 im(x, y, 1) = 0.0f;
2498 im(x, y, 2) = 0.0f;
2499 });
2500 \endcode
2501
2502 * The compiled code is equivalent to writing a nested for loop,
2503 * and compilers are capable of optimizing it in the same way.
2504 *
2505 * If the callable can be called with an int * as the sole argument,
2506 * that version is called instead. Each location in the buffer is
2507 * passed to it in a coordinate array. This version is higher-overhead
2508 * than the variadic version, but is useful for writing generic code
2509 * that accepts buffers of arbitrary dimensionality. For example, the
2510 * following sets the value at all sites in an arbitrary-dimensional
2511 * buffer to their first coordinate:
2512
2513 \code
2514 im.for_each_element([&](const int *pos) {im(pos) = pos[0];});
2515 \endcode
2516
2517 * It is also possible to use for_each_element to iterate over entire
2518 * rows or columns by cropping the buffer to a single column or row
2519 * respectively and iterating over elements of the result. For example,
2520 * to set the diagonal of the image to 1 by iterating over the columns:
2521
2522 \code
2523 Buffer<float, 3> im(100, 100, 3);
2524 im.sliced(1, 0).for_each_element([&](int x, int c) {
2525 im(x, x, c) = 1.0f;
2526 });
2527 \endcode
2528
2529 * Or, assuming the memory layout is known to be dense per row, one can
2530 * memset each row of an image like so:
2531
2532 \code
2533 Buffer<float, 3> im(100, 100, 3);
2534 im.sliced(0, 0).for_each_element([&](int y, int c) {
2535 memset(&im(0, y, c), 0, sizeof(float) * im.width());
2536 });
2537 \endcode
2538
2539 */
2540 // @{
2541 template<typename Fn>
2542 HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_element(Fn &&f) const {
2543 for_each_element_impl(f);
2544 return *this;
2545 }
2546
2547 template<typename Fn>
2548 HALIDE_ALWAYS_INLINE
2549 Buffer<T, Dims, InClassDimStorage> &
2550 for_each_element(Fn &&f) {
2551 for_each_element_impl(f);
2552 return *this;
2553 }
2554 // @}
2555
2556private:
2557 template<typename Fn>
2558 struct FillHelper {
2559 Fn f;
2560 Buffer<T, Dims, InClassDimStorage> *buf;
2561
2562 template<typename... Args,
2563 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2564 void operator()(Args... args) {
2565 (*buf)(args...) = f(args...);
2566 }
2567
2568 FillHelper(Fn &&f, Buffer<T, Dims, InClassDimStorage> *buf)
2569 : f(std::forward<Fn>(f)), buf(buf) {
2570 }
2571 };
2572
2573public:
2574 /** Fill a buffer by evaluating a callable at every site. The
2575 * callable should look much like a callable passed to
2576 * for_each_element, but it should return the value that should be
2577 * stored to the coordinate corresponding to the arguments. */
2578 template<typename Fn,
2579 typename = std::enable_if_t<!std::is_arithmetic_v<std::decay_t<Fn>>>>
2580 Buffer<T, Dims, InClassDimStorage> &fill(Fn &&f) {
2581 // We'll go via for_each_element. We need a variadic wrapper lambda.
2582 FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
2583 return for_each_element(wrapper);
2584 }
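/** Example (illustrative sketch, not part of the original header): filling a
 * buffer from a function of the coordinates, here a horizontal gradient:
 *
 \code
 Buffer<float, 2> im(256, 256);
 im.fill([](int x, int y) { return x / 255.0f; });
 assert(im(255, 0) == 1.0f);
 \endcode
 */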
2585
2586 /** Check if an input buffer passed to an extern stage is a bounds
2587 * query. Compared to doing the host pointer check directly,
2588 * this both adds clarity to code and will facilitate moving to
2589 * another representation for bounds query arguments. */
2590 bool is_bounds_query() const {
2591 return buf.is_bounds_query();
2592 }
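/** Example (illustrative sketch, not part of the original header;
 * `my_extern_stage` is a hypothetical extern definition, and it assumes the
 * input should cover the same region as the output): during a bounds query an
 * extern stage fills in the region it needs instead of producing data:
 *
 \code
 extern "C" int my_extern_stage(halide_buffer_t *in, halide_buffer_t *out) {
     Halide::Runtime::Buffer<float> input(*in);
     if (input.is_bounds_query()) {
         for (int i = 0; i < input.dimensions(); i++) {
             in->dim[i].min = out->dim[i].min;
             in->dim[i].extent = out->dim[i].extent;
         }
         return 0;
     }
     // ... otherwise read from in and write to out ...
     return 0;
 }
 \endcode
 */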
2593
2594 /** Convenient check to verify that all of the interesting bytes in the Buffer
2595 * are initialized under MSAN. Note that by default, we use for_each_value() here so that
2596 * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
2597 * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
2598 * the entire Buffer storage.) */
2599 void msan_check_mem_is_initialized(bool entire = false) const {
2600#if defined(__has_feature)
2601#if __has_feature(memory_sanitizer)
2602 if (entire) {
2603 __msan_check_mem_is_initialized(data(), size_in_bytes());
2604 } else {
2605 for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); });
2606 }
2607#endif
2608#endif
2609 }
2610};
2611
2612} // namespace Runtime
2613} // namespace Halide
2614
2615#undef HALIDE_ALLOCA
2616
2617 #endif // HALIDE_RUNTIME_BUFFER_H